diff --git a/.github/workflows/markdown-links-check.yml b/.github/workflows/markdown-links-check.yml
new file mode 100644
index 000000000..833b99ec8
--- /dev/null
+++ b/.github/workflows/markdown-links-check.yml
@@ -0,0 +1,36 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# A workflow to check whether a PR introduces broken hyperlinks
+name: Check Markdown links
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+
+jobs:
+  markdown-link-check:
+    runs-on: ubuntu-latest
+    steps:
+    - name: work around permission issue
+      run: git config --global --add safe.directory /github/workspace
+    - uses: actions/checkout@master
+    - uses: gaurav-nelson/github-action-markdown-link-check@v1
+      with:
+        max-depth: -1
+        use-verbose-mode: 'yes'
+        config-file: '.github/workflows/markdown-links-check/markdown-links-check-config.json'
+        base-branch: 'dev'
+
+
\ No newline at end of file
diff --git a/.github/workflows/markdown-links-check/markdown-links-check-config.json b/.github/workflows/markdown-links-check/markdown-links-check-config.json
new file mode 100644
index 000000000..ec4af8ca8
--- /dev/null
+++ b/.github/workflows/markdown-links-check/markdown-links-check-config.json
@@ -0,0 +1,17 @@
+{
+  "ignorePatterns": [
+    {
+      "pattern": "https://github.com/NVIDIA/spark-rapids-tools/issues/*"
+    },
+    {
+      "pattern": "http://localhost*"
+    },
+    {
+      "pattern": "https://www.nvidia.com/en-us/security/pgp-key"
+    }
+  ],
+  "timeout": "15s",
+  "retryOn429": true,
+  "retryCount": 30,
+  "aliveStatusCodes": [200, 403]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 42f142176..ac1a5dcfb 100644
--- a/README.md
+++ b/README.md
@@ -4,12 +4,15 @@ This repo provides the tools to use [RAPIDS Accelerator for Apache Spark](https:
 
 ## Catalog
 
-- [RAPIDS core tools](/core): Tools that help developers getting the most out of their Apache Spark applications
+- [RAPIDS core tools](./core): Tools that help developers get the most out of their Apache
+  Spark applications
   without any code change:
   - Report acceleration potential of RAPIDS Accelerator for Apache Spark on a set of Spark applications.
   - Generate comprehensive profiling analysis for Apache Sparks executing on accelerated GPU instances. This information
     can be used to further tune and optimize the application.
-- [spark-rapids-user-tools](/user_tools): A simple wrapper process around cloud service providers to run
-  [RAPIDS core tools](/core) across multiple cloud platforms. In addition, the output educates the users on
+- [spark-rapids-user-tools](./user_tools): A simple wrapper process around cloud service
+  providers to run
+  [RAPIDS core tools](./core) across multiple cloud platforms. In addition, the output educates
+  the users on
   the cost savings and acceleration potential of RAPIDS Accelerator for Apache Spark and makes
   recommendations to tune the application performance based on the cluster shape.
diff --git a/core/pom.xml b/core/pom.xml index 0e2a28274..d7c04d332 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-tools_2.12 RAPIDS Accelerator for Apache Spark tools RAPIDS Accelerator for Apache Spark tools - 23.10.1 + 23.10.2-SNAPSHOT jar http://github.com/NVIDIA/spark-rapids-tools diff --git a/core/src/main/resources/operatorsScore-databricks-aws.csv b/core/src/main/resources/operatorsScore-databricks-aws.csv index 25296214c..394437c13 100644 --- a/core/src/main/resources/operatorsScore-databricks-aws.csv +++ b/core/src/main/resources/operatorsScore-databricks-aws.csv @@ -105,6 +105,7 @@ Expm1,2.45 First,2.45 Flatten,2.45 Floor,2.45 +FormatNumber,2.45 FromUTCTimestamp,2.45 FromUnixTime,2.45 GetArrayItem,2.45 @@ -175,6 +176,7 @@ Not,2.45 NthValue,2.45 OctetLength,2.45 Or,2.45 +Percentile,2.45 PercentRank,2.45 PivotFirst,2.45 Pmod,2.45 @@ -214,6 +216,7 @@ SortOrder,2.45 SparkPartitionID,2.45 SpecifiedWindowFrame,2.45 Sqrt,2.45 +Stack,2.45 StartsWith,2.45 StddevPop,2.45 StddevSamp,2.45 @@ -229,6 +232,7 @@ StringTranslate,2.45 StringTrim,2.45 StringTrimLeft,2.45 StringTrimRight,2.45 +StructsToJson,2.45 Substring,2.45 SubstringIndex,2.45 Subtract,2.45 diff --git a/core/src/main/resources/operatorsScore-databricks-azure.csv b/core/src/main/resources/operatorsScore-databricks-azure.csv index 1e04e16c8..86daf247d 100644 --- a/core/src/main/resources/operatorsScore-databricks-azure.csv +++ b/core/src/main/resources/operatorsScore-databricks-azure.csv @@ -105,6 +105,7 @@ Expm1,2.73 First,2.73 Flatten,2.73 Floor,2.73 +FormatNumber,2.73 FromUTCTimestamp,2.73 FromUnixTime,2.73 GetArrayItem,2.73 @@ -175,6 +176,7 @@ Not,2.73 NthValue,2.73 OctetLength,2.73 Or,2.73 +Percentile,2.73 PercentRank,2.73 PivotFirst,2.73 Pmod,2.73 @@ -214,6 +216,7 @@ SortOrder,2.73 SparkPartitionID,2.73 SpecifiedWindowFrame,2.73 Sqrt,2.73 +Stack,2.73 StartsWith,2.73 StddevPop,2.73 StddevSamp,2.73 @@ -229,6 +232,7 @@ StringTranslate,2.73 StringTrim,2.73 StringTrimLeft,2.73 StringTrimRight,2.73 +StructsToJson,2.73 Substring,2.73 SubstringIndex,2.73 Subtract,2.73 diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv index 1426aa047..e1d3678d4 100644 --- a/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv @@ -98,6 +98,7 @@ Expm1,3.74 First,3.74 Flatten,3.74 Floor,3.74 +FormatNumber,3.74 FromUTCTimestamp,3.74 FromUnixTime,3.74 GetArrayItem,3.74 @@ -168,6 +169,7 @@ Not,3.74 NthValue,3.74 OctetLength,3.74 Or,3.74 +Percentile,3.74 PercentRank,3.74 PivotFirst,3.74 Pmod,3.74 @@ -207,6 +209,7 @@ SortOrder,3.74 SparkPartitionID,3.74 SpecifiedWindowFrame,3.74 Sqrt,3.74 +Stack,3.74 StartsWith,3.74 StddevPop,3.74 StddevSamp,3.74 @@ -222,6 +225,7 @@ StringTranslate,3.74 StringTrim,3.74 StringTrimLeft,3.74 StringTrimRight,3.74 +StructsToJson,3.74 Substring,3.74 SubstringIndex,3.74 Subtract,3.74 diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv index 3083cbe8b..2777068b7 100644 --- a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv @@ -98,6 +98,7 @@ Expm1,3.65 First,3.65 Flatten,3.65 Floor,3.65 +FormatNumber,3.65 FromUTCTimestamp,3.65 FromUnixTime,3.65 GetArrayItem,3.65 @@ -168,6 +169,7 @@ Not,3.65 NthValue,3.65 OctetLength,3.65 Or,3.65 +Percentile,3.65 PercentRank,3.65 PivotFirst,3.65 Pmod,3.65 @@ -207,6 
+209,7 @@ SortOrder,3.65 SparkPartitionID,3.65 SpecifiedWindowFrame,3.65 Sqrt,3.65 +Stack,3.65 StartsWith,3.65 StddevPop,3.65 StddevSamp,3.65 @@ -222,6 +225,7 @@ StringTranslate,3.65 StringTrim,3.65 StringTrimLeft,3.65 StringTrimRight,3.65 +StructsToJson,3.65 Substring,3.65 SubstringIndex,3.65 Subtract,3.65 diff --git a/core/src/main/resources/operatorsScore-dataproc-l4.csv b/core/src/main/resources/operatorsScore-dataproc-l4.csv index e61959ea6..ad371bb8f 100644 --- a/core/src/main/resources/operatorsScore-dataproc-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-l4.csv @@ -105,6 +105,7 @@ Expm1,4.16 First,4.16 Flatten,4.16 Floor,4.16 +FormatNumber,4.16 FromUTCTimestamp,4.16 FromUnixTime,4.16 GetArrayItem,4.16 @@ -175,6 +176,7 @@ Not,4.16 NthValue,4.16 OctetLength,4.16 Or,4.16 +Percentile,4.16 PercentRank,4.16 PivotFirst,4.16 Pmod,4.16 @@ -214,6 +216,7 @@ SortOrder,4.16 SparkPartitionID,4.16 SpecifiedWindowFrame,4.16 Sqrt,4.16 +Stack,4.16 StartsWith,4.16 StddevPop,4.16 StddevSamp,4.16 @@ -229,6 +232,7 @@ StringTranslate,4.16 StringTrim,4.16 StringTrimLeft,4.16 StringTrimRight,4.16 +StructsToJson,4.16 Substring,4.16 SubstringIndex,4.16 Subtract,4.16 diff --git a/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv b/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv index 493f1154c..fbe0d057e 100644 --- a/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv @@ -98,6 +98,7 @@ Expm1,4.25 First,4.25 Flatten,4.25 Floor,4.25 +FormatNumber,4.25 FromUTCTimestamp,4.25 FromUnixTime,4.25 GetArrayItem,4.25 @@ -168,6 +169,7 @@ Not,4.25 NthValue,4.25 OctetLength,4.25 Or,4.25 +Percentile,4.25 PercentRank,4.25 PivotFirst,4.25 Pmod,4.25 @@ -207,6 +209,7 @@ SortOrder,4.25 SparkPartitionID,4.25 SpecifiedWindowFrame,4.25 Sqrt,4.25 +Stack,4.25 StartsWith,4.25 StddevPop,4.25 StddevSamp,4.25 @@ -222,6 +225,7 @@ StringTranslate,4.25 StringTrim,4.25 StringTrimLeft,4.25 StringTrimRight,4.25 +StructsToJson,4.25 Substring,4.25 SubstringIndex,4.25 Subtract,4.25 diff --git a/core/src/main/resources/operatorsScore-dataproc-t4.csv b/core/src/main/resources/operatorsScore-dataproc-t4.csv index 3be159955..c6f3a7654 100644 --- a/core/src/main/resources/operatorsScore-dataproc-t4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-t4.csv @@ -105,6 +105,7 @@ Expm1,4.88 First,4.88 Flatten,4.88 Floor,4.88 +FormatNumber,4.88 FromUTCTimestamp,4.88 FromUnixTime,4.88 GetArrayItem,4.88 @@ -175,6 +176,7 @@ Not,4.88 NthValue,4.88 OctetLength,4.88 Or,4.88 +Percentile,4.88 PercentRank,4.88 PivotFirst,4.88 Pmod,4.88 @@ -214,6 +216,7 @@ SortOrder,4.88 SparkPartitionID,4.88 SpecifiedWindowFrame,4.88 Sqrt,4.88 +Stack,4.88 StartsWith,4.88 StddevPop,4.88 StddevSamp,4.88 @@ -229,6 +232,7 @@ StringTranslate,4.88 StringTrim,4.88 StringTrimLeft,4.88 StringTrimRight,4.88 +StructsToJson,4.88 Substring,4.88 SubstringIndex,4.88 Subtract,4.88 diff --git a/core/src/main/resources/operatorsScore-emr-a10.csv b/core/src/main/resources/operatorsScore-emr-a10.csv index b7557b7ef..91a2a840d 100644 --- a/core/src/main/resources/operatorsScore-emr-a10.csv +++ b/core/src/main/resources/operatorsScore-emr-a10.csv @@ -105,6 +105,7 @@ Expm1,2.59 First,2.59 Flatten,2.59 Floor,2.59 +FormatNumber,2.59 FromUTCTimestamp,2.59 FromUnixTime,2.59 GetArrayItem,2.59 @@ -175,6 +176,7 @@ Not,2.59 NthValue,2.59 OctetLength,2.59 Or,2.59 +Percentile,2.59 PercentRank,2.59 PivotFirst,2.59 Pmod,2.59 @@ -214,6 +216,7 @@ SortOrder,2.59 
SparkPartitionID,2.59 SpecifiedWindowFrame,2.59 Sqrt,2.59 +Stack,2.59 StartsWith,2.59 StddevPop,2.59 StddevSamp,2.59 @@ -229,6 +232,7 @@ StringTranslate,2.59 StringTrim,2.59 StringTrimLeft,2.59 StringTrimRight,2.59 +StructsToJson,2.59 Substring,2.59 SubstringIndex,2.59 Subtract,2.59 diff --git a/core/src/main/resources/operatorsScore-emr-t4.csv b/core/src/main/resources/operatorsScore-emr-t4.csv index 05f293080..8d34a914c 100644 --- a/core/src/main/resources/operatorsScore-emr-t4.csv +++ b/core/src/main/resources/operatorsScore-emr-t4.csv @@ -105,6 +105,7 @@ Expm1,2.07 First,2.07 Flatten,2.07 Floor,2.07 +FormatNumber,2.07 FromUTCTimestamp,2.07 FromUnixTime,2.07 GetArrayItem,2.07 @@ -175,6 +176,7 @@ Not,2.07 NthValue,2.07 OctetLength,2.07 Or,2.07 +Percentile,2.07 PercentRank,2.07 PivotFirst,2.07 Pmod,2.07 @@ -214,6 +216,7 @@ SortOrder,2.07 SparkPartitionID,2.07 SpecifiedWindowFrame,2.07 Sqrt,2.07 +Stack,2.07 StartsWith,2.07 StddevPop,2.07 StddevSamp,2.07 @@ -229,6 +232,7 @@ StringTranslate,2.07 StringTrim,2.07 StringTrimLeft,2.07 StringTrimRight,2.07 +StructsToJson,2.07 Substring,2.07 SubstringIndex,2.07 Subtract,2.07 diff --git a/core/src/main/resources/operatorsScore.csv b/core/src/main/resources/operatorsScore-onprem.csv similarity index 98% rename from core/src/main/resources/operatorsScore.csv rename to core/src/main/resources/operatorsScore-onprem.csv index 3903479a6..50ec61028 100644 --- a/core/src/main/resources/operatorsScore.csv +++ b/core/src/main/resources/operatorsScore-onprem.csv @@ -110,6 +110,7 @@ Expm1,4 First,4 Flatten,4 Floor,4 +FormatNumber,4 FromUTCTimestamp,4 FromUnixTime,4 GetArrayItem,4 @@ -180,6 +181,7 @@ Not,4 NthValue,4 OctetLength,4 Or,4 +Percentile,4 PercentRank,4 PivotFirst,4 Pmod,4 @@ -219,6 +221,7 @@ SortOrder,4 SparkPartitionID,4 SpecifiedWindowFrame,4 Sqrt,4 +Stack,4 StartsWith,4 StddevPop,4 StddevSamp,4 @@ -234,6 +237,7 @@ StringTranslate,4 StringTrim,4 StringTrimLeft,4 StringTrimRight,4 +StructsToJson,4 Substring,4 SubstringIndex,4 Subtract,4 diff --git a/core/src/main/resources/supportedExecs.csv b/core/src/main/resources/supportedExecs.csv index 130b0657a..f5a3fe7c4 100644 --- a/core/src/main/resources/supportedExecs.csv +++ b/core/src/main/resources/supportedExecs.csv @@ -19,7 +19,7 @@ HashAggregateExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,S,S,PS,NS,PS,PS,PS,NS ObjectHashAggregateExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,S,S,PS,NS,PS,PS,PS,NS SortAggregateExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,S,S,PS,NS,PS,PS,PS,NS InMemoryTableScanExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,PS,PS,PS,NS -DataWritingCommandExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,PS,NS,PS,NS,PS,PS,PS,NS +DataWritingCommandExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,PS,NS,S,NS,PS,PS,PS,NS ExecutedCommandExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,S,S,S,S,PS,PS,PS,S BatchScanExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,S,NS,S,NS,PS,PS,PS,NS BroadcastExchangeExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS diff --git a/core/src/main/resources/supportedExprs.csv b/core/src/main/resources/supportedExprs.csv index 40b15c5ee..f6d4ee1fc 100644 --- a/core/src/main/resources/supportedExprs.csv +++ b/core/src/main/resources/supportedExprs.csv @@ -19,8 +19,8 @@ Add,S,`+`,None,AST,rhs,NA,NS,NS,S,S,S,S,NA,NA,NA,NS,NA,NA,NS,NA,NA,NA,NA Add,S,`+`,None,AST,result,NA,NS,NS,S,S,S,S,NA,NA,NA,NS,NA,NA,NS,NA,NA,NA,NA Alias,S, ,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS Alias,S, 
,None,project,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS -Alias,S, ,None,AST,input,S,S,S,S,S,S,S,S,PS,NS,NS,NS,NS,NS,NS,NS,NS,NS -Alias,S, ,None,AST,result,S,S,S,S,S,S,S,S,PS,NS,NS,NS,NS,NS,NS,NS,NS,NS +Alias,S, ,None,AST,input,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,NS,NS,NS,NS +Alias,S, ,None,AST,result,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,NS,NS,NS,NS And,S,`and`,None,project,lhs,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA And,S,`and`,None,project,rhs,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA And,S,`and`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA @@ -79,7 +79,7 @@ Atanh,S,`atanh`,None,project,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA, Atanh,S,`atanh`,None,AST,input,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Atanh,S,`atanh`,None,AST,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA AttributeReference,S, ,None,project,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS -AttributeReference,S, ,None,AST,result,S,S,S,S,S,S,S,S,PS,NS,NS,NS,NS,NS,NS,NS,NS,NS +AttributeReference,S, ,None,AST,result,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,NS,NS,NS,NS BRound,S,`bround`,None,project,value,NA,S,S,S,S,PS,PS,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA BRound,S,`bround`,None,project,scale,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA BRound,S,`bround`,None,project,result,NA,S,S,S,S,S,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA @@ -191,8 +191,8 @@ EqualNullSafe,S,`<=>`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, EqualTo,S,`=`; `==`,None,project,lhs,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS EqualTo,S,`=`; `==`,None,project,rhs,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS EqualTo,S,`=`; `==`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -EqualTo,S,`=`; `==`,None,AST,lhs,S,S,S,S,S,NS,NS,S,PS,NS,NS,NS,NS,NS,NS,NA,NS,NS -EqualTo,S,`=`; `==`,None,AST,rhs,S,S,S,S,S,NS,NS,S,PS,NS,NS,NS,NS,NS,NS,NA,NS,NS +EqualTo,S,`=`; `==`,None,AST,lhs,S,S,S,S,S,NS,NS,S,PS,S,NS,NS,NS,NS,NS,NA,NS,NS +EqualTo,S,`=`; `==`,None,AST,rhs,S,S,S,S,S,NS,NS,S,PS,S,NS,NS,NS,NS,NS,NA,NS,NS EqualTo,S,`=`; `==`,None,AST,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Exp,S,`exp`,None,project,input,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Exp,S,`exp`,None,project,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA @@ -208,6 +208,9 @@ Flatten,S,`flatten`,None,project,input,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Flatten,S,`flatten`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA Floor,S,`floor`,None,project,input,NA,NA,NA,NA,S,NA,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA Floor,S,`floor`,None,project,result,NA,NA,NA,NA,S,NA,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA +FormatNumber,S,`format_number`,None,project,x,NA,S,S,S,S,S,S,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA +FormatNumber,S,`format_number`,None,project,d,NA,NA,NA,PS,NA,NA,NA,NA,NA,NS,NA,NA,NA,NA,NA,NA,NA,NA +FormatNumber,S,`format_number`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA FromUTCTimestamp,S,`from_utc_timestamp`,None,project,timestamp,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA FromUTCTimestamp,S,`from_utc_timestamp`,None,project,timezone,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA FromUTCTimestamp,S,`from_utc_timestamp`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA @@ -233,14 +236,14 @@ GetTimestamp,S, ,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,N GreaterThan,S,`>`,None,project,lhs,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS 
GreaterThan,S,`>`,None,project,rhs,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS GreaterThan,S,`>`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -GreaterThan,S,`>`,None,AST,lhs,S,S,S,S,S,NS,NS,S,PS,NS,NS,NS,NS,NS,NS,NA,NS,NS -GreaterThan,S,`>`,None,AST,rhs,S,S,S,S,S,NS,NS,S,PS,NS,NS,NS,NS,NS,NS,NA,NS,NS +GreaterThan,S,`>`,None,AST,lhs,S,S,S,S,S,NS,NS,S,PS,S,NS,NS,NS,NS,NS,NA,NS,NS +GreaterThan,S,`>`,None,AST,rhs,S,S,S,S,S,NS,NS,S,PS,S,NS,NS,NS,NS,NS,NA,NS,NS GreaterThan,S,`>`,None,AST,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA GreaterThanOrEqual,S,`>=`,None,project,lhs,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS GreaterThanOrEqual,S,`>=`,None,project,rhs,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS GreaterThanOrEqual,S,`>=`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -GreaterThanOrEqual,S,`>=`,None,AST,lhs,S,S,S,S,S,NS,NS,S,PS,NS,NS,NS,NS,NS,NS,NA,NS,NS -GreaterThanOrEqual,S,`>=`,None,AST,rhs,S,S,S,S,S,NS,NS,S,PS,NS,NS,NS,NS,NS,NS,NA,NS,NS +GreaterThanOrEqual,S,`>=`,None,AST,lhs,S,S,S,S,S,NS,NS,S,PS,S,NS,NS,NS,NS,NS,NA,NS,NS +GreaterThanOrEqual,S,`>=`,None,AST,rhs,S,S,S,S,S,NS,NS,S,PS,S,NS,NS,NS,NS,NS,NA,NS,NS GreaterThanOrEqual,S,`>=`,None,AST,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Greatest,S,`greatest`,None,project,param,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS Greatest,S,`greatest`,None,project,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS @@ -273,7 +276,7 @@ IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,N IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonToStructs,NS,`from_json`,This is disabled by default because parsing JSON from a column has a large number of issues and should be considered beta quality right now.,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because parsing JSON from a column has a large number of issues and should be considered beta quality right now.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,NS,NA +JsonToStructs,NS,`from_json`,This is disabled by default because parsing JSON from a column has a large number of issues and should be considered beta quality right now.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA JsonTuple,S,`json_tuple`,None,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,S,`json_tuple`,None,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,S,`json_tuple`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA @@ -301,20 +304,20 @@ Length,S,`length`; `character_length`; `char_length`,None,project,result,NA,NA,N LessThan,S,`<`,None,project,lhs,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS LessThan,S,`<`,None,project,rhs,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS LessThan,S,`<`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -LessThan,S,`<`,None,AST,lhs,S,S,S,S,S,NS,NS,S,PS,NS,NS,NS,NS,NS,NS,NA,NS,NS -LessThan,S,`<`,None,AST,rhs,S,S,S,S,S,NS,NS,S,PS,NS,NS,NS,NS,NS,NS,NA,NS,NS +LessThan,S,`<`,None,AST,lhs,S,S,S,S,S,NS,NS,S,PS,S,NS,NS,NS,NS,NS,NA,NS,NS +LessThan,S,`<`,None,AST,rhs,S,S,S,S,S,NS,NS,S,PS,S,NS,NS,NS,NS,NS,NA,NS,NS LessThan,S,`<`,None,AST,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA LessThanOrEqual,S,`<=`,None,project,lhs,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS 
LessThanOrEqual,S,`<=`,None,project,rhs,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,PS,NS LessThanOrEqual,S,`<=`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -LessThanOrEqual,S,`<=`,None,AST,lhs,S,S,S,S,S,NS,NS,S,PS,NS,NS,NS,NS,NS,NS,NA,NS,NS -LessThanOrEqual,S,`<=`,None,AST,rhs,S,S,S,S,S,NS,NS,S,PS,NS,NS,NS,NS,NS,NS,NA,NS,NS +LessThanOrEqual,S,`<=`,None,AST,lhs,S,S,S,S,S,NS,NS,S,PS,S,NS,NS,NS,NS,NS,NA,NS,NS +LessThanOrEqual,S,`<=`,None,AST,rhs,S,S,S,S,S,NS,NS,S,PS,S,NS,NS,NS,NS,NS,NA,NS,NS LessThanOrEqual,S,`<=`,None,AST,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Like,S,`like`,None,project,src,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA Like,S,`like`,None,project,search,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA Like,S,`like`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Literal,S, ,None,project,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,S,PS,PS,PS,NS -Literal,S, ,None,AST,result,S,S,S,S,S,S,S,S,PS,NS,NS,NS,NS,NS,NS,NS,NS,NS +Literal,S, ,None,AST,result,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,NS,NS,NS,NS Log,S,`ln`,None,project,input,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Log,S,`ln`,None,project,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Log10,S,`log10`,None,project,input,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA @@ -487,6 +490,9 @@ Sqrt,S,`sqrt`,None,project,input,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA, Sqrt,S,`sqrt`,None,project,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Sqrt,S,`sqrt`,None,AST,input,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Sqrt,S,`sqrt`,None,AST,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +Stack,S,`stack`,None,project,n,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +Stack,S,`stack`,None,project,expr,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,PS,PS,NS +Stack,S,`stack`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA StartsWith,S, ,None,project,src,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA StartsWith,S, ,None,project,search,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA StartsWith,S, ,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA @@ -533,6 +539,8 @@ StringTrimLeft,S,`ltrim`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA, StringTrimRight,S,`rtrim`,None,project,src,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA StringTrimRight,S,`rtrim`,None,project,trimStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA StringTrimRight,S,`rtrim`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA +StructsToJson,NS,`to_json`,This is disabled by default because to_json support is experimental. See compatibility guide for more information.,project,struct,S,S,S,S,S,S,S,NA,NA,S,NA,NA,NA,NA,S,S,S,NA +StructsToJson,NS,`to_json`,This is disabled by default because to_json support is experimental. 
See compatibility guide for more information.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA Substring,S,`substr`; `substring`,None,project,str,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NS,NA,NA,NA,NA,NA Substring,S,`substr`; `substring`,None,project,pos,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Substring,S,`substr`; `substring`,None,project,len,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA @@ -665,6 +673,14 @@ Min,S,`min`,None,reduction,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS Min,S,`min`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS Min,S,`min`,None,window,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS Min,S,`min`,None,window,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS +Percentile,S,`percentile`,None,aggregation,input,NA,S,S,S,S,S,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +Percentile,S,`percentile`,None,aggregation,percentage,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA +Percentile,S,`percentile`,None,aggregation,frequency,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA +Percentile,S,`percentile`,None,aggregation,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA +Percentile,S,`percentile`,None,reduction,input,NA,S,S,S,S,S,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +Percentile,S,`percentile`,None,reduction,percentage,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA +Percentile,S,`percentile`,None,reduction,frequency,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA +Percentile,S,`percentile`,None,reduction,result,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA PivotFirst,S, ,None,aggregation,pivotColumn,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NS,NS,NS PivotFirst,S, ,None,aggregation,valueColumn,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NS,NS,NS PivotFirst,S, ,None,aggregation,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NS,NS,NS diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/Platform.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/Platform.scala new file mode 100644 index 000000000..67a36e8b9 --- /dev/null +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/Platform.scala @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.nvidia.spark.rapids.tool + +import scala.annotation.tailrec + +import org.apache.spark.internal.Logging + +/** + * Utility object containing constants for various platform names. + */ +object PlatformNames { + val DATABRICKS_AWS = "databricks-aws" + val DATABRICKS_AZURE = "databricks-azure" + val DATAPROC = "dataproc" + val DATAPROC_GKE_L4 = "dataproc-gke-l4" + val DATAPROC_GKE_T4 = "dataproc-gke-t4" + val DATAPROC_L4 = "dataproc-l4" + val DATAPROC_SL_L4 = "dataproc-serverless-l4" + val DATAPROC_T4 = "dataproc-t4" + val EMR = "emr" + val EMR_A10 = "emr-a10" + val EMR_T4 = "emr-t4" + val ONPREM = "onprem" + val DEFAULT: String = ONPREM + + /** + * Return a list of all platform names. 
+ */ + def getAllNames: List[String] = List( + DATABRICKS_AWS, DATABRICKS_AZURE, DATAPROC, DATAPROC_GKE_L4, DATAPROC_GKE_T4, + DATAPROC_L4, DATAPROC_SL_L4, DATAPROC_T4, EMR, EMR_A10, EMR_T4, ONPREM + ) +} + +/** + * Represents a platform and its associated recommendations. + * + * @param platformName Name of the platform. See [[PlatformNames]] for supported platform names. + */ +class Platform(platformName: String) { + /** + * Recommendations to be excluded from the list of recommendations. + * These have the highest priority. + */ + val recommendationsToExclude: Seq[String] = Seq.empty + /** + * Recommendations to be included in the final list of recommendations. + * These properties should be specific to the platform and not general Spark properties. + * For example: "spark.databricks.optimizer.dynamicFilePruning" for the Databricks platform. + * + * Represented as a tuple of (propertyKey, propertyValue). + */ + val recommendationsToInclude: Seq[(String, String)] = Seq.empty + /** + * Dynamically calculates the recommendation for a specific Spark property by invoking + * the appropriate function based on `sparkProperty`. + * TODO: Implement this function and integrate with existing code in AutoTuner + * + * @param sparkProperty The Spark property for which the recommendation is calculated. + * @param args Variable list of arguments passed to the calculation function for dynamic + * processing. + * @return Optional string containing the recommendation, or `None` if unavailable. + */ + def getRecommendation(sparkProperty: String, args: Any*): Option[String] = None + + /** + * Checks if the `property` is valid: + * 1. It should not be in exclusion list + * OR + * 2. It should be in the inclusion list + */ + def isValidRecommendation(property: String): Boolean = { + !recommendationsToExclude.contains(property) || + recommendationsToInclude.map(_._1).contains(property) + } + + /** + * Checks if the `comment` is valid: + * 1. It should not have any property from the exclusion list + */ + def isValidComment(comment: String): Boolean = { + recommendationsToExclude.forall(excluded => !comment.contains(excluded)) + } + + def getName: String = platformName + + def getOperatorScoreFile: String = { + s"operatorsScore-$platformName.csv" + } +} + +class DatabricksPlatform(platformType: String) extends Platform(platformType) { + override val recommendationsToExclude: Seq[String] = Seq( + "spark.executor.cores", + "spark.executor.instances", + "spark.executor.memory", + "spark.executor.memoryOverhead" + ) + override val recommendationsToInclude: Seq[(String, String)] = Seq( + ("spark.databricks.optimizer.dynamicFilePruning", "false") + ) +} + +class DataprocPlatform(platformType: String) extends Platform(platformType) + +class EmrPlatform(platformType: String) extends Platform(platformType) + +class OnPremPlatform extends Platform(PlatformNames.ONPREM) + +/** + * Factory for creating instances of different platforms. + * This factory supports various platforms and provides methods for creating + * corresponding platform instances. + */ +object PlatformFactory extends Logging { + /** + * Creates an instance of a platform based on the specified platform key. + * If platform key is not defined, returns an instance of onprem platform. + * + * @param platformKey The key representing the desired platform. + * @return An instance of the specified platform. + * @throws IllegalArgumentException if the specified platform key is not supported. 
+ */ + @tailrec + def createInstance(platformKey: String = PlatformNames.DEFAULT): Platform = { + platformKey match { + case PlatformNames.DATABRICKS_AWS | PlatformNames.DATABRICKS_AZURE => + new DatabricksPlatform(platformKey) + case PlatformNames.DATAPROC | PlatformNames.DATAPROC_T4 => + // if no GPU specified, then default to dataproc-t4 for backward compatibility + new DataprocPlatform(PlatformNames.DATAPROC_T4) + case PlatformNames.DATAPROC_L4 | PlatformNames.DATAPROC_SL_L4 | + PlatformNames.DATAPROC_GKE_L4 | PlatformNames.DATAPROC_GKE_T4 => + new DataprocPlatform(platformKey) + case PlatformNames.EMR | PlatformNames.EMR_T4 => + // if no GPU specified, then default to emr-t4 for backward compatibility + new EmrPlatform(PlatformNames.EMR_T4) + case PlatformNames.EMR_A10 => new EmrPlatform(PlatformNames.EMR_A10) + case PlatformNames.ONPREM => new OnPremPlatform + case p if p.isEmpty => + logInfo(s"Platform is not specified. Using ${PlatformNames.DEFAULT} " + + "as default.") + PlatformFactory.createInstance(PlatformNames.DEFAULT) + case _ => throw new IllegalArgumentException(s"Unsupported platform: $platformKey. " + + s"Options include ${PlatformNames.getAllNames.mkString(", ")}.") + } + } +} diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala index 96829df86..b809d9521 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala @@ -39,7 +39,9 @@ class ExecInfo( val children: Option[Seq[ExecInfo]], // only one level deep val stages: Set[Int] = Set.empty, val shouldRemove: Boolean = false, - val unsupportedExprs: Array[String] = Array.empty) { + val unsupportedExprs: Array[String] = Array.empty, + val dataSet: Boolean = false, + val udf: Boolean = false) { private def childrenToString = { val str = children.map { c => c.map(" " + _.toString).mkString("\n") @@ -76,7 +78,8 @@ object SQLPlanParser extends Logging { val windowFunctionPattern = """(\w+)\(""".r - val ignoreExpressions = Array("any", "cast", "decimal", "decimaltype", "every", "some", + val ignoreExpressions = Array("any", "cast", "ansi_cast", "decimal", "decimaltype", "every", + "some", "merge_max", "merge_min", "merge_sum", "merge_count", "merge_avg", "merge_first", "list", // current_database does not cause any CPU fallbacks "current_database", @@ -301,13 +304,12 @@ object SQLPlanParser extends Logging { } val stagesInNode = getStagesInSQLNode(node, app) val supported = execInfos.isSupported && !ds && !containsUDF - // shouldRemove is set to true if the exec is a member of "execsToBeRemoved" or if the node // is a duplicate val removeFlag = execInfos.shouldRemove || isDupNode || execsToBeRemoved.contains(node.name) Seq(new ExecInfo(execInfos.sqlID, execInfos.exec, execInfos.expr, execInfos.speedupFactor, execInfos.duration, execInfos.nodeId, supported, execInfos.children, - stagesInNode, removeFlag, execInfos.unsupportedExprs)) + stagesInNode, removeFlag, execInfos.unsupportedExprs, ds, containsUDF)) } } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala index 9b78a5b41..5419df9c6 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/AutoTuner.scala @@ -25,6 +25,7 @@ import 
scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.util.matching.Regex +import com.nvidia.spark.rapids.tool.{Platform, PlatformFactory} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, Path} import org.yaml.snakeyaml.{DumperOptions, LoaderOptions, Yaml} @@ -329,7 +330,8 @@ class RecommendationEntry(val name: String, class AutoTuner( val clusterProps: ClusterProperties, val appInfoProvider: AppSummaryInfoBaseProvider, - val platform: String) extends Logging { + val platform: Platform, + unsupportedOperators: Seq[DriverLogUnsupportedOperators]) extends Logging { import AutoTuner._ @@ -343,23 +345,15 @@ class AutoTuner( private val limitedLogicRecommendations: mutable.HashSet[String] = mutable.HashSet[String]() // When enabled, the profiler recommendations should only include updated settings. private var filterByUpdatedPropertiesEnabled: Boolean = true - val selectedPlatform: Platform = platform match { - case "databricks" => new DatabricksPlatform() - case "dataproc" => new DataprocPlatform() - case "emr" => new EmrPlatform() - case "onprem" => new OnPremPlatform() - } private def isCalculationEnabled(prop: String) : Boolean = { !limitedLogicRecommendations.contains(prop) } def getPropertyValue(key: String): Option[String] = { - val fromProfile = appInfoProvider.getProperty(key) - fromProfile match { - case None => Option(clusterProps.softwareProperties.get(key)) - case Some(_) => fromProfile - } + val fromProfile = Option(appInfoProvider).flatMap(_.getProperty(key)) + // If the value is not found above, fallback to cluster properties + fromProfile.orElse(Option(clusterProps.softwareProperties.get(key))) } def initRecommendations(): Unit = { @@ -824,6 +818,20 @@ class AutoTuner( appendRecommendation("spark.sql.shuffle.partitions", s"$shufflePartitions") } + /** + * Analyzes unsupported driver logs and generates recommendations for configuration properties. 
+ */ + private def recommendFromDriverLogs(): Unit = { + // Iterate through unsupported operators' reasons and check for matching properties + unsupportedOperators.map(_.reason).foreach { operatorReason => + recommendationsFromDriverLogs.collect { + case (config, recommendedValue) if operatorReason.contains(config) => + appendRecommendation(config, recommendedValue) + appendComment(commentForExperimentalConfig(config)) + } + } + } + def appendOptionalComment(lookup: String, comment: String): Unit = { if (!skippedRecommendations.contains(lookup)) { appendComment(comment) @@ -912,7 +920,7 @@ class AutoTuner( limitedSeq.foreach(_ => limitedLogicRecommendations.add(_)) } skipList.foreach(skipSeq => skipSeq.foreach(_ => skippedRecommendations.add(_))) - skippedRecommendations ++= selectedPlatform.recommendationsToExclude + skippedRecommendations ++= platform.recommendationsToExclude initRecommendations() calculateJobLevelRecommendations() if (processPropsAndCheck) { @@ -922,10 +930,13 @@ class AutoTuner( addDefaultComments() } // add all platform specific recommendations - selectedPlatform.recommendationsToInclude.foreach { + platform.recommendationsToInclude.foreach { case (property, value) => appendRecommendation(property, value) } } + if (unsupportedOperators.nonEmpty) { + recommendFromDriverLogs() + } (toRecommendationsProfileResult, toCommentProfileResult) } } @@ -975,6 +986,8 @@ object AutoTuner extends Logging { val DEF_READ_SIZE_THRESHOLD = 100 * 1024L * 1024L * 1024L val DEFAULT_WORKER_INFO_PATH = "./worker_info.yaml" val SUPPORTED_SIZE_UNITS: Seq[String] = Seq("b", "k", "m", "g", "t", "p") + private val DOC_URL: String = "https://nvidia.github.io/spark-rapids/docs/" + + "additional-functionality/advanced_configs.html#advanced-configuration" val commentsForMissingProps: Map[String, String] = Map( "spark.executor.memory" -> @@ -1022,15 +1035,27 @@ object AutoTuner extends Logging { " If the Spark RAPIDS jar is being bundled with your Spark\n" + " distribution, this step is not needed.") ) + + // Recommended values for specific unsupported configurations + private val recommendationsFromDriverLogs: Map[String, String] = Map( + "spark.rapids.sql.incompatibleDateFormats.enabled" -> "true" + ) + + def commentForExperimentalConfig(config: String): String = { + s"Using $config does not guarantee to produce the same results as CPU. " + + s"Please refer to $DOC_URL." 
+ } + // the plugin jar is in the form of rapids-4-spark_scala_binary-(version)-*.jar val pluginJarRegEx: Regex = "rapids-4-spark_\\d\\.\\d+-(\\d{2}\\.\\d{2}\\.\\d+).*\\.jar".r private def handleException( ex: Exception, appInfo: AppSummaryInfoBaseProvider, - platform: String): AutoTuner = { + platform: Platform, + unsupportedOperators: Seq[DriverLogUnsupportedOperators]): AutoTuner = { logError("Exception: " + ex.getStackTrace.mkString("Array(", ", ", ")")) - val tuning = new AutoTuner(new ClusterProperties(), appInfo, platform) + val tuning = new AutoTuner(new ClusterProperties(), appInfo, platform, unsupportedOperators) val msg = ex match { case cEx: ConstructorException => cEx.getContext case _ => if (ex.getCause != null) ex.getCause.toString else ex.toString @@ -1080,26 +1105,30 @@ object AutoTuner extends Logging { def buildAutoTunerFromProps( clusterProps: String, singleAppProvider: AppSummaryInfoBaseProvider, - platform: String = Profiler.DEFAULT_PLATFORM): AutoTuner = { + platform: Platform = PlatformFactory.createInstance(), + unsupportedOperators: Seq[DriverLogUnsupportedOperators] = Seq.empty): AutoTuner = { try { val clusterPropsOpt = loadClusterPropertiesFromContent(clusterProps) - new AutoTuner(clusterPropsOpt.getOrElse(new ClusterProperties()), singleAppProvider, platform) + new AutoTuner(clusterPropsOpt.getOrElse(new ClusterProperties()), singleAppProvider, platform, + unsupportedOperators) } catch { case e: Exception => - handleException(e, singleAppProvider, platform) + handleException(e, singleAppProvider, platform, unsupportedOperators) } } def buildAutoTuner( filePath: String, singleAppProvider: AppSummaryInfoBaseProvider, - platform: String = Profiler.DEFAULT_PLATFORM): AutoTuner = { + platform: Platform = PlatformFactory.createInstance(), + unsupportedOperators: Seq[DriverLogUnsupportedOperators] = Seq.empty): AutoTuner = { try { val clusterPropsOpt = loadClusterProps(filePath) - new AutoTuner(clusterPropsOpt.getOrElse(new ClusterProperties()), singleAppProvider, platform) + new AutoTuner(clusterPropsOpt.getOrElse(new ClusterProperties()), singleAppProvider, platform, + unsupportedOperators) } catch { case e: Exception => - handleException(e, singleAppProvider, platform) + handleException(e, singleAppProvider, platform, unsupportedOperators) } } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Platform.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Platform.scala deleted file mode 100644 index 46c6bc8e0..000000000 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Platform.scala +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.nvidia.spark.rapids.tool.profiling - -/** - * Represents a platform and its associated recommendations. - */ -class Platform { - /** - * Recommendations to be excluded from the list of recommendations. - * These have the highest priority. 
- */ - val recommendationsToExclude: Seq[String] = Seq.empty - /** - * Recommendations to be included in the final list of recommendations. - * These properties should be specific to the platform and not general Spark properties. - * For example: "spark.databricks.optimizer.dynamicFilePruning" for the Databricks platform. - * - * Represented as a tuple of (propertyKey, propertyValue). - */ - val recommendationsToInclude: Seq[(String, String)] = Seq.empty - /** - * Dynamically calculates the recommendation for a specific Spark property by invoking - * the appropriate function based on `sparkProperty`. - * TODO: Implement this function and integrate with existing code in AutoTuner - * - * @param sparkProperty The Spark property for which the recommendation is calculated. - * @param args Variable list of arguments passed to the calculation function for dynamic - * processing. - * @return Optional string containing the recommendation, or `None` if unavailable. - */ - def getRecommendation(sparkProperty: String, args: Any*): Option[String] = None - - /** - * Checks if the `property` is valid: - * 1. It should not be in exclusion list - * OR - * 2. It should be in the inclusion list - */ - def isValidRecommendation(property: String): Boolean = { - !recommendationsToExclude.contains(property) || - recommendationsToInclude.map(_._1).contains(property) - } - - /** - * Checks if the `comment` is valid: - * 1. It should not have any property from the exclusion list - */ - def isValidComment(comment: String): Boolean = { - recommendationsToExclude.forall(excluded => !comment.contains(excluded)) - } -} - -class DatabricksPlatform extends Platform { - override val recommendationsToExclude: Seq[String] = Seq( - "spark.executor.cores", - "spark.executor.instances", - "spark.executor.memory", - "spark.executor.memoryOverhead" - ) - override val recommendationsToInclude: Seq[(String, String)] = Seq( - ("spark.databricks.optimizer.dynamicFilePruning", "false") - ) -} - -class DataprocPlatform extends Platform {} - -class EmrPlatform extends Platform {} - -class OnPremPlatform extends Platform {} diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala index b1044a4ed..839e3789e 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala @@ -15,6 +15,7 @@ */ package com.nvidia.spark.rapids.tool.profiling +import com.nvidia.spark.rapids.tool.PlatformNames import org.rogach.scallop.{ScallopConf, ScallopOption} import org.rogach.scallop.exceptions.ScallopException @@ -70,9 +71,9 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/* val platform: ScallopOption[String] = opt[String](required = false, descr = "Cluster platform where Spark GPU workloads were executed. Options include " + - "onprem, dataproc, emr, databricks." + - " Default is onprem.", - default = Some(Profiler.DEFAULT_PLATFORM)) + s"${PlatformNames.getAllNames.mkString(", ")}. 
" + + s"Default is ${PlatformNames.DEFAULT}.", + default = Some(PlatformNames.DEFAULT)) val generateTimeline: ScallopOption[Boolean] = opt[Boolean](required = false, descr = "Write an SVG graph out for the full application timeline.") diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileMain.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileMain.scala index d839f76a8..71759a9ae 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileMain.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileMain.scala @@ -77,10 +77,10 @@ object ProfileMain extends Logging { } val profiler = new Profiler(hadoopConf, appArgs, enablePB) - profiler.profile(eventLogFsFiltered) - if (driverLog.nonEmpty){ - profiler.profileDriver(driverLog) + if (driverLog.nonEmpty) { + profiler.profileDriver(driverLog, eventLogFsFiltered.isEmpty) } + profiler.profile(eventLogFsFiltered) (0, filteredLogs.size) } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala index 44528966b..a3fff6067 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.util.control.NonFatal import com.nvidia.spark.rapids.ThreadFactoryBuilder -import com.nvidia.spark.rapids.tool.{EventLogInfo, EventLogPathProcessor} +import com.nvidia.spark.rapids.tool.{EventLogInfo, EventLogPathProcessor, PlatformFactory} import org.apache.hadoop.conf.Configuration import org.apache.spark.internal.Logging @@ -124,15 +124,21 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea progressBar.foreach(_.finishAll()) } - def profileDriver(driverLogInfos: String): Unit = { + def profileDriver(driverLogInfos: String, eventLogsEmpty: Boolean): Unit = { val profileOutputWriter = new ProfileOutputWriter(s"$outputDir/driver", Profiler.DRIVER_LOG_NAME, numOutputRows, true) - try { val driverLogProcessor = new DriverLogProcessor(driverLogInfos) - val unsupportedDrivers = driverLogProcessor.processDriverLog() + val unsupportedDriverOperators = driverLogProcessor.processDriverLog() profileOutputWriter.write(s"Unsupported operators in driver log", - unsupportedDrivers) + unsupportedDriverOperators) + if (eventLogsEmpty && useAutoTuner) { + // Since event logs are empty, AutoTuner will not run while processing event logs. + // We need to run it here explicitly. + val (properties, comments) = runAutoTuner(None, unsupportedDriverOperators) + profileOutputWriter.writeText("\n### A. Recommended Configuration ###\n") + profileOutputWriter.writeText(Profiler.getAutoTunerResultsAsString(properties, comments)) + } } finally { profileOutputWriter.close() } @@ -403,6 +409,26 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea appLogPath, ioAnalysisMetrics), compareRes) } + /** + * A wrapper method to run the AutoTuner. + * @param appInfo Summary of the application for tuning. 
+ * @param unsupportedDriverOperators List of unsupported operators from driver log + */ + private def runAutoTuner(appInfo: Option[ApplicationSummaryInfo], + unsupportedDriverOperators: Seq[DriverLogUnsupportedOperators]) + : (Seq[RecommendedPropertyResult], Seq[RecommendedCommentResult]) = { + val appInfoProvider = appInfo.map(new SingleAppSummaryInfoProvider(_)).orNull + val workerInfoPath = appArgs.workerInfo.getOrElse(AutoTuner.DEFAULT_WORKER_INFO_PATH) + val platform = appArgs.platform() + val autoTuner: AutoTuner = AutoTuner.buildAutoTuner(workerInfoPath, appInfoProvider, + PlatformFactory.createInstance(platform), unsupportedDriverOperators) + + // The autotuner allows skipping some properties, + // e.g., getRecommendedProperties(Some(Seq("spark.executor.instances"))) skips the + // recommendation related to executor instances. + autoTuner.getRecommendedProperties() + } + def writeOutput(profileOutputWriter: ProfileOutputWriter, appsSum: Seq[ApplicationSummaryInfo], outputCombined: Boolean, comparedRes: Option[CompareSummaryInfo] = None): Unit = { @@ -464,7 +490,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea } else { appsSum } - sums.foreach { app => + sums.foreach { app: ApplicationSummaryInfo => profileOutputWriter.writeText("### A. Information Collected ###") profileOutputWriter.write("Application Information", app.appInfo) profileOutputWriter.write("Application Log Path Mapping", app.appLogPath) @@ -510,14 +536,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea Some("Unsupported SQL Ops")) if (useAutoTuner) { - val workerInfoPath = appArgs.workerInfo.getOrElse(AutoTuner.DEFAULT_WORKER_INFO_PATH) - val platform = appArgs.platform.getOrElse(Profiler.DEFAULT_PLATFORM) - val autoTuner: AutoTuner = AutoTuner.buildAutoTuner(workerInfoPath, - new SingleAppSummaryInfoProvider(app), platform) - // the autotuner allows skipping some properties - // e.g. getRecommendedProperties(Some(Seq("spark.executor.instances"))) skips the - // recommendation related to executor instances. - val (properties, comments) = autoTuner.getRecommendedProperties() + val (properties, comments) = runAutoTuner(Some(app), Seq.empty) profileOutputWriter.writeText("\n### D. 
Recommended Configuration ###\n") profileOutputWriter.writeText(Profiler.getAutoTunerResultsAsString(properties, comments)) } @@ -548,7 +567,6 @@ object Profiler { val COMPARE_LOG_FILE_NAME_PREFIX = "rapids_4_spark_tools_compare" val COMBINED_LOG_FILE_NAME_PREFIX = "rapids_4_spark_tools_combined" val SUBDIR = "rapids_4_spark_profile" - val DEFAULT_PLATFORM = "onprem" def getAutoTunerResultsAsString(props: Seq[RecommendedPropertyResult], comments: Seq[RecommendedCommentResult]): String = { diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala index 7baf5455e..0ff5bd614 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeChecker.scala @@ -20,6 +20,7 @@ import scala.collection.mutable.{ArrayBuffer,HashMap} import scala.io.{BufferedSource, Source} import scala.util.control.NonFatal +import com.nvidia.spark.rapids.tool.{Platform, PlatformFactory} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} @@ -32,7 +33,7 @@ import org.apache.spark.internal.Logging * by the plugin which lists the formats and types supported. * The class also supports a custom speedup factor file as input. */ -class PluginTypeChecker(platform: String = "onprem", +class PluginTypeChecker(platform: Platform = PlatformFactory.createInstance(), speedupFactorFile: Option[String] = None) extends Logging { private val NS = "NS" @@ -44,16 +45,6 @@ class PluginTypeChecker(platform: String = "onprem", private val NA = "NA" private val DEFAULT_DS_FILE = "supportedDataSource.csv" - private val OPERATORS_SCORE_FILE_ONPREM = "operatorsScore.csv" - private val OPERATORS_SCORE_FILE_DATAPROC_T4 = "operatorsScore-dataproc-t4.csv" - private val OPERATORS_SCORE_FILE_DATAPROC_L4 = "operatorsScore-dataproc-l4.csv" - private val OPERATORS_SCORE_FILE_DATAPROC_SL_L4 = "operatorsScore-dataproc-serverless-l4.csv" - private val OPERATORS_SCORE_FILE_DATAPROC_GKE_T4 = "operatorsScore-dataproc-gke-t4.csv" - private val OPERATORS_SCORE_FILE_DATAPROC_GKE_L4 = "operatorsScore-dataproc-gke-l4.csv" - private val OPERATORS_SCORE_FILE_EMR_T4 = "operatorsScore-emr-t4.csv" - private val OPERATORS_SCORE_FILE_EMR_A10 = "operatorsScore-emr-a10.csv" - private val OPERATORS_SCORE_FILE_DATABRICKS_AWS = "operatorsScore-databricks-aws.csv" - private val OPERATORS_SCORE_FILE_DATABRICKS_AZURE = "operatorsScore-databricks-azure.csv" private val SUPPORTED_EXECS_FILE = "supportedExecs.csv" private val SUPPORTED_EXPRS_FILE = "supportedExprs.csv" @@ -101,20 +92,7 @@ class PluginTypeChecker(platform: String = "onprem", speedupFactorFile match { case None => logInfo(s"Reading operators scores with platform: $platform") - val file = platform match { - // if no GPU specified, then default to dataproc-t4 for backward compatibility - case "dataproc-t4" | "dataproc" => OPERATORS_SCORE_FILE_DATAPROC_T4 - case "dataproc-l4" => OPERATORS_SCORE_FILE_DATAPROC_L4 - case "dataproc-serverless-l4" => OPERATORS_SCORE_FILE_DATAPROC_SL_L4 - case "dataproc-gke-t4" => OPERATORS_SCORE_FILE_DATAPROC_GKE_T4 - case "dataproc-gke-l4" => OPERATORS_SCORE_FILE_DATAPROC_GKE_L4 - // if no GPU specified, then default to emr-t4 for backward compatibility - case "emr-t4" | "emr" => OPERATORS_SCORE_FILE_EMR_T4 - case "emr-a10" => OPERATORS_SCORE_FILE_EMR_A10 - case "databricks-aws" => 
OPERATORS_SCORE_FILE_DATABRICKS_AWS - case "databricks-azure" => OPERATORS_SCORE_FILE_DATABRICKS_AZURE - case _ => OPERATORS_SCORE_FILE_ONPREM - } + val file = platform.getOperatorScoreFile val source = Source.fromResource(file) readSupportedOperators(source, "score").map(x => (x._1, x._2.toDouble)) case Some(file) => diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualOutputWriter.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualOutputWriter.scala index d4eff43fc..bd1b91654 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualOutputWriter.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualOutputWriter.scala @@ -24,7 +24,7 @@ import com.nvidia.spark.rapids.tool.profiling.ProfileUtils.replaceDelimiter import com.nvidia.spark.rapids.tool.qualification.QualOutputWriter.{CLUSTER_ID, CLUSTER_ID_STR_SIZE, JOB_ID, JOB_ID_STR_SIZE, RUN_NAME, RUN_NAME_STR_SIZE, TEXT_DELIMITER} import org.apache.hadoop.conf.Configuration -import org.apache.spark.sql.rapids.tool.ToolUtils +import org.apache.spark.sql.rapids.tool.{IgnoreExecs, ToolUtils} import org.apache.spark.sql.rapids.tool.qualification.{EstimatedPerSQLSummaryInfo, EstimatedSummaryInfo, QualificationAppInfo, QualificationSummaryInfo, StatusSummaryInfo} import org.apache.spark.sql.rapids.tool.util._ @@ -151,13 +151,38 @@ class QualOutputWriter(outputDir: String, reportReadSchema: Boolean, val csvFileWriter = new ToolTextFileWriter(outputDir, s"${QualOutputWriter.LOGFILE_NAME}_unsupportedOperators.csv", "Unsupported Operators CSV Report", hadoopConf) - val headersAndSizes = QualOutputWriter.getUnsupportedOperatorsHeaderStringsAndSizes(sums) - csvFileWriter.write(QualOutputWriter.constructOutputRowFromMap(headersAndSizes, - QualOutputWriter.CSV_DELIMITER, false)) - sums.foreach { sum => - val rows = QualOutputWriter.constructUnsupportedOperatorsInfo(sum, headersAndSizes, - QualOutputWriter.CSV_DELIMITER, false) - rows.foreach(row => csvFileWriter.write(row)) + try { + val headersAndSizes = QualOutputWriter.getUnsupportedOperatorsHeaderStringsAndSizes(sums) + csvFileWriter.write(QualOutputWriter.constructOutputRowFromMap(headersAndSizes, + QualOutputWriter.CSV_DELIMITER, false)) + sums.foreach { sum => + val rows = QualOutputWriter.constructUnsupportedOperatorsInfo(sum, headersAndSizes, + QualOutputWriter.CSV_DELIMITER, false) + rows.foreach(row => csvFileWriter.write(row)) + } + } finally { + csvFileWriter.close() + } + } + + def writeUnsupportedOperatorsDetailedStageCSVReport( + sums: Seq[QualificationSummaryInfo], + order: String): Unit = { + val csvFileWriter = new ToolTextFileWriter(outputDir, + s"${QualOutputWriter.LOGFILE_NAME}_unsupportedOperatorsStageDuration.csv", + "Unsupported Operators StageDuration CSV Report", hadoopConf) + try { + val headersAndSizes = + QualOutputWriter.getUnsupportedOperatorsStageDurationsHeaderStringsAndSizes(sums) + csvFileWriter.write(QualOutputWriter.constructOutputRowFromMap(headersAndSizes, + QualOutputWriter.CSV_DELIMITER, false)) + sums.foreach { sum => + val rows = QualOutputWriter.constructUnsupportedStagesDurationInfo(sum, headersAndSizes, + QualOutputWriter.CSV_DELIMITER, false) + rows.foreach(row => csvFileWriter.write(row)) + } + } finally { + csvFileWriter.close() } } @@ -361,6 +386,7 @@ object QualOutputWriter { val SQL_DUR_STR = "SQL DF Duration" val TASK_DUR_STR = "SQL Dataframe Task Duration" val STAGE_DUR_STR = "Stage Task Duration" + val STAGE_WALLCLOCK_DUR_STR = "Stage Duration" val 
POT_PROBLEM_STR = "Potential Problems" val EXEC_CPU_PERCENT_STR = "Executor CPU Time Percent" val APP_DUR_ESTIMATED_STR = "App Duration Estimated" @@ -400,6 +426,7 @@ object QualOutputWriter { val UNSUPPORTED_TYPE = "Unsupported Type" val DETAILS = "Details" val NOTES = "Notes" + val IGNORE_OPERATOR = "Ignore Operator" val RUN_NAME = "RunName" val ESTIMATED_FREQUENCY = "Estimated Job Frequency (monthly)" val ML_FUNCTIONS = "ML Functions" @@ -540,11 +567,25 @@ object QualOutputWriter { APP_ID_STR -> QualOutputWriter.getAppIdSize(appInfos), UNSUPPORTED_TYPE -> UNSUPPORTED_TYPE.size, DETAILS -> DETAILS.size, - NOTES -> NOTES.size + NOTES -> NOTES.size, + IGNORE_OPERATOR -> IGNORE_OPERATOR.size ) detailedHeaderAndFields } + def getUnsupportedOperatorsStageDurationsHeaderStringsAndSizes( + appInfos: Seq[QualificationSummaryInfo]): LinkedHashMap[String, Int] = { + val detailedHeaderAndFields = LinkedHashMap[String, Int]( + APP_ID_STR -> QualOutputWriter.getAppIdSize(appInfos), + UNSUPPORTED_TYPE -> UNSUPPORTED_TYPE.size, + STAGE_ID_STR -> STAGE_ID_STR.size, + STAGE_WALLCLOCK_DUR_STR -> STAGE_WALLCLOCK_DUR_STR.size, + APP_DUR_STR -> APP_DUR_STR.size, + SPEEDUP_BUCKET_STR -> SPEEDUP_BUCKET_STR_SIZE, + IGNORE_OPERATOR -> IGNORE_OPERATOR.size + ) + detailedHeaderAndFields + } def getDetailedHeaderStringsAndSizes(appInfos: Seq[QualificationSummaryInfo], reportReadSchema: Boolean): LinkedHashMap[String, Int] = { @@ -886,7 +927,7 @@ object QualOutputWriter { } } - def constructUnsupportedOperatorsInfo( + def constructUnsupportedStagesDurationInfo( sumInfo: QualificationSummaryInfo, headersAndSizes: LinkedHashMap[String, Int], delimiter: String = TEXT_DELIMITER, @@ -895,81 +936,130 @@ object QualOutputWriter { val reformatCSVFunc: String => String = if (reformatCSV) str => StringUtils.reformatCSVString(str) else str => stringIfempty(str) val appId = sumInfo.appId - val readFormat = sumInfo.readFileFormatAndTypesNotSupported - val writeFormat = sumInfo.writeDataFormat - val unsupportedExecs = sumInfo.unSupportedExecs - val unsupportedExprs = sumInfo.unSupportedExprs - val unsupportedExecExprsMap = sumInfo.unsupportedExecstoExprsMap - val unsupportedOperatorsOutputRows = new ArrayBuffer[String]() + val appDuration = sumInfo.sparkSqlDFWallClockDuration + val recommendation = sumInfo.estimatedInfo.recommendation - if (readFormat.nonEmpty) { - val unsupportedReadFormatRows = readFormat.map { format => - val readFormatAndType = format.split("\\[") - val readFormat = readFormatAndType(0) - val readType = if (readFormatAndType.size > 1) { - s"Types not supported - ${readFormatAndType(1).replace("]", "")}" - } else { + sumInfo.stageInfo.collect { + case info if info.unsupportedExecs.nonEmpty => + val stageAppDuration = info.stageWallclockDuration + val allUnsupportedExecs = info.unsupportedExecs + if (allUnsupportedExecs.nonEmpty) { + allUnsupportedExecs.map { unsupportedExecsStr => + // Ignore operator is a boolean value which indicates if the operator should be + // considered for GPU acceleration or not. If the value is true, the operator will + // be ignored. 
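+              // For example (illustrative values only), each row emitted below looks roughly like:
+              //   "app-20231016123456-0001","CollectLimit",4,1850,96000,Not Recommended,true
+              // following the header order defined above: App ID, Unsupported Type, Stage ID,
+              // Stage Duration, App Duration, Recommendation, Ignore Operator.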
+ val ignoreUnsupportedExec = if ( + IgnoreExecs.getAllIgnoreExecs.contains(unsupportedExecsStr)) { + IgnoreExecs.True + } else { + IgnoreExecs.False + } + val data = ListBuffer[(String, Int)]( + reformatCSVFunc(appId) -> headersAndSizes(APP_ID_STR), + reformatCSVFunc(unsupportedExecsStr) -> headersAndSizes(UNSUPPORTED_TYPE), + info.stageId.toString -> headersAndSizes(STAGE_ID_STR), + stageAppDuration.toString -> headersAndSizes(STAGE_WALLCLOCK_DUR_STR), + appDuration.toString -> headersAndSizes(APP_DUR_STR), + recommendation -> headersAndSizes(SPEEDUP_BUCKET_STR), + ignoreUnsupportedExec -> headersAndSizes(IGNORE_OPERATOR) + ) + constructOutputRow(data, delimiter, prettyPrint) + }.mkString + } + else { "" } - val data = ListBuffer( - reformatCSVFunc(appId) -> headersAndSizes(APP_ID_STR), - reformatCSVFunc("Read")-> headersAndSizes(UNSUPPORTED_TYPE), - reformatCSVFunc(readFormat) -> headersAndSizes(DETAILS), - reformatCSVFunc(readType) -> headersAndSizes(NOTES) - ) - constructOutputRow(data, delimiter, prettyPrint) + } + } + + def constructUnsupportedOperatorsInfo( + sumInfo: QualificationSummaryInfo, + headersAndSizes: LinkedHashMap[String, Int], + delimiter: String = TEXT_DELIMITER, + prettyPrint: Boolean, + reformatCSV: Boolean = true): Seq[String] = { + + val reformatCSVFunc: String => String = + if (reformatCSV) str => StringUtils.reformatCSVString(str) else str => stringIfempty(str) + + val appId = reformatCSVFunc(sumInfo.appId) + val unsupportedOperatorsOutputRows = new ArrayBuffer[String]() + val unsupportedExprs = sumInfo.unSupportedExprs + val allExecs = getAllExecsFromPlan(sumInfo.planInfo) + val dataSetExecs = allExecs.collect { case x if x.dataSet => x.exec } + val udfExecs = allExecs.collect { case x if x.udf => x.exec } + + def createUnsupportedRow(exec: String, execType: String, notes: String, + ignoreOperator: String = IgnoreExecs.False): String = { + val data = ListBuffer( + appId -> headersAndSizes(APP_ID_STR), + reformatCSVFunc(execType) -> headersAndSizes(UNSUPPORTED_TYPE), + reformatCSVFunc(exec) -> headersAndSizes(DETAILS), + reformatCSVFunc(notes) -> headersAndSizes(NOTES), + reformatCSVFunc(ignoreOperator) -> headersAndSizes(IGNORE_OPERATOR) + ) + constructOutputRow(data, delimiter, prettyPrint) + } + + val readFormatRows = sumInfo.readFileFormatAndTypesNotSupported.map { format => + val readFormatAndType = format.split("\\[") + val readFormat = readFormatAndType(0) + val readType = if (readFormatAndType.size > 1) { + s"Types not supported - ${readFormatAndType(1).replace("]", "")}" + } else { + "" } - unsupportedOperatorsOutputRows ++= unsupportedReadFormatRows + createUnsupportedRow(readFormat,"Read", readType) } - if (unsupportedExecs.nonEmpty) { - val unsupportedExecRows = unsupportedExecs.split(";").map { exec => - val data = ListBuffer( - reformatCSVFunc(appId) -> headersAndSizes(APP_ID_STR), - reformatCSVFunc("Exec") -> headersAndSizes(UNSUPPORTED_TYPE), - reformatCSVFunc(exec) -> headersAndSizes(DETAILS), - reformatCSVFunc("") -> headersAndSizes(NOTES) - ) - constructOutputRow(data, delimiter, prettyPrint) + unsupportedOperatorsOutputRows ++= readFormatRows + + // Unsupported Execs and Execs that are not supported due to unsupported expressions, or if + // the operation is from a dataset, or if the operation contains a UDF. 
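+    // For example (illustrative values only): if unSupportedExecs is
+    //   "CollectLimit;SampleExec;ProjectExec"
+    // and ProjectExec appears in unsupportedExecstoExprsMap because of an unsupported
+    // expression, then only CollectLimit and SampleExec are reported by this block, with
+    // CollectLimit flagged Ignore Operator = true since it is in IgnoreExecs.getAllIgnoreExecs.
+    // ProjectExec, dataset execs and UDF execs each get their own rows with explanatory
+    // notes further below.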
+ val unsupportedExecExprsMap = sumInfo.unsupportedExecstoExprsMap + val unsupportedExecsSet = sumInfo.unSupportedExecs.split(";").toSet + val unsupportedExecsFiltered = unsupportedExecsSet.filterNot(unsupportedExecExprsMap.contains) + val actualunsupportedExecs = unsupportedExecsFiltered.filterNot(x => dataSetExecs.contains(x) + || udfExecs.contains(x) || unsupportedExecExprsMap.contains(x)) + val unsupportedExecRows = actualunsupportedExecs.map { exec => + // If the exec is in the ignore list, then set the ignore operator to true. + if (IgnoreExecs.getAllIgnoreExecs.contains(exec)) { + createUnsupportedRow(exec, "Exec", "", IgnoreExecs.True) + } else { + createUnsupportedRow(exec, "Exec", "", IgnoreExecs.False) } - unsupportedOperatorsOutputRows ++= unsupportedExecRows } - if (unsupportedExecExprsMap.nonEmpty) { - val unsupportedExecExprMapRows = unsupportedExecExprsMap.map { case (exec, exprs) => - val data = ListBuffer( - reformatCSVFunc(appId) -> headersAndSizes(APP_ID_STR), - reformatCSVFunc("Exec") -> headersAndSizes(UNSUPPORTED_TYPE), - reformatCSVFunc(exec) -> headersAndSizes(DETAILS), - reformatCSVFunc(s"$exec Exec is not supported as expressions are " + - s"not supported - `$exprs`") -> headersAndSizes(NOTES) - ) - constructOutputRow(data, delimiter, prettyPrint) - }.toArray - unsupportedOperatorsOutputRows ++= unsupportedExecExprMapRows + unsupportedOperatorsOutputRows ++= unsupportedExecRows + + val unsupportedDatasetExecRows = dataSetExecs.map { exec => + createUnsupportedRow(exec, "Exec", s"$exec Exec is not supported as " + + s"this operation is from dataset which is not supported") + } + unsupportedOperatorsOutputRows ++= unsupportedDatasetExecRows + + val unsupportedUdfExecRows = udfExecs.map { exec => + createUnsupportedRow(exec, "Exec", s"$exec Exec is " + + s"not supported as it contains UDF which is not supported") } + unsupportedOperatorsOutputRows ++= unsupportedUdfExecRows + + val unsupportedExecExprMapRows = sumInfo.unsupportedExecstoExprsMap.map { case (exec, exprs) => + createUnsupportedRow(exec, "Exec", s"$exec Exec is not" + + s" supported as expressions are not supported - `$exprs`") + }.toArray + unsupportedOperatorsOutputRows ++= unsupportedExecExprMapRows + if (unsupportedExprs.nonEmpty) { - val unsupportedExprRows = unsupportedExprs.split(";").map { expr => - val data = ListBuffer( - reformatCSVFunc(appId) -> headersAndSizes(APP_ID_STR), - reformatCSVFunc("Expression") -> headersAndSizes(UNSUPPORTED_TYPE), - reformatCSVFunc(expr) -> headersAndSizes(DETAILS), - reformatCSVFunc("") -> headersAndSizes(NOTES) - ) - constructOutputRow(data, delimiter, prettyPrint) + val unsupportedExprRows = sumInfo.unSupportedExprs.split(";").map { expr => + createUnsupportedRow(expr, "Expression", "") } unsupportedOperatorsOutputRows ++= unsupportedExprRows } - if (writeFormat.nonEmpty) { - val unsupportedwriteFormatRows = writeFormat.map { format => - val data = ListBuffer( - reformatCSVFunc(appId) -> headersAndSizes(APP_ID_STR), - reformatCSVFunc("Write") -> headersAndSizes(UNSUPPORTED_TYPE), - reformatCSVFunc(format) -> headersAndSizes(DETAILS), - reformatCSVFunc("") -> headersAndSizes(NOTES) - ) - constructOutputRow(data, delimiter, prettyPrint) - } - unsupportedOperatorsOutputRows ++= unsupportedwriteFormatRows + + val unsupportedWriteFormatRows = sumInfo.writeDataFormat.map { format => + createUnsupportedRow(format, "Write", "") } + unsupportedOperatorsOutputRows ++= unsupportedWriteFormatRows + unsupportedOperatorsOutputRows } diff --git 
a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala index e733d73c2..0286cd582 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala @@ -93,6 +93,7 @@ class Qualification(outputDir: String, numRows: Int, hadoopConf: Configuration, qWriter.writeExecReport(allAppsSum, order) qWriter.writeStageReport(allAppsSum, order) qWriter.writeUnsupportedOperatorsCSVReport(allAppsSum, order) + qWriter.writeUnsupportedOperatorsDetailedStageCSVReport(allAppsSum, order) val appStatusResult = generateStatusSummary(appStatusReporter.asScala.values.toSeq) qWriter.writeStatusReport(appStatusResult, order) if (mlOpsEnabled) { diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala index d998cb0ff..c72477034 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala @@ -15,6 +15,7 @@ */ package com.nvidia.spark.rapids.tool.qualification +import com.nvidia.spark.rapids.tool.PlatformNames import org.rogach.scallop.{ScallopConf, ScallopOption} import org.rogach.scallop.exceptions.ScallopException @@ -155,10 +156,9 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/* val platform: ScallopOption[String] = opt[String](required = false, descr = "Cluster platform where Spark CPU workloads were executed. Options include " + - "onprem, dataproc-t4, dataproc-l4, dataproc-serverless-l4, dataproc-gke-t4, " + - "dataproc-gke-l4, emr-t4, emr-a10, databricks-aws, and databricks-azure. Default " + - "is onprem.", - default = Some("onprem")) + s"${PlatformNames.getAllNames.mkString(", ")}. 
" + + s"Default is ${PlatformNames.DEFAULT}.", + default = Some(PlatformNames.DEFAULT)) val speedupFactorFile: ScallopOption[String] = opt[String](required = false, descr = "Custom speedup factor file used to get estimated GPU speedup that is specific " + diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationMain.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationMain.scala index cb8a3c583..454b27695 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationMain.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationMain.scala @@ -16,7 +16,7 @@ package com.nvidia.spark.rapids.tool.qualification -import com.nvidia.spark.rapids.tool.EventLogPathProcessor +import com.nvidia.spark.rapids.tool.{EventLogPathProcessor, PlatformFactory} import org.apache.spark.internal.Logging import org.apache.spark.sql.rapids.tool.AppFilterImpl @@ -58,14 +58,16 @@ object QualificationMain extends Logging { val order = appArgs.order.getOrElse("desc") val uiEnabled = appArgs.htmlReport.getOrElse(false) val reportSqlLevel = appArgs.perSql.getOrElse(false) - val platform = appArgs.platform.getOrElse("onprem") + val platform = appArgs.platform() val mlOpsEnabled = appArgs.mlFunctions.getOrElse(false) val penalizeTransitions = appArgs.penalizeTransitions.getOrElse(true) val hadoopConf = RapidsToolsConfUtil.newHadoopConf val pluginTypeChecker = try { - new PluginTypeChecker(platform, appArgs.speedupFactorFile.toOption) + new PluginTypeChecker( + PlatformFactory.createInstance(platform), + appArgs.speedupFactorFile.toOption) } catch { case ie: IllegalStateException => logError("Error creating the plugin type checker!", ie) diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala index 89deb7f1b..faeaf78c2 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala @@ -80,10 +80,14 @@ object ToolUtils extends Logging { val targetEx = i.getTargetException if (targetEx != null) { targetEx match { - case j: com.fasterxml.jackson.core.JsonParseException => + case j: com.fasterxml.jackson.core.io.JsonEOFException => + // Spark3.41+ embeds JsonEOFException in the InvocationTargetException + // We need to show a warning message instead of failing the entire app. + logWarning(s"Incomplete eventlog, ${j.getMessage}") + case k: com.fasterxml.jackson.core.JsonParseException => // this is a parser error thrown by spark-3.4+ which indicates the log is // malformed - throw j + throw k case z: ClassNotFoundException if z.getMessage != null => logWarning(s"ClassNotFoundException while parsing an event: ${z.getMessage}") case t: Throwable => @@ -94,10 +98,15 @@ object ToolUtils extends Logging { // Normally it should not happen that invocation target is null. logError(s"Unknown exception while parsing an event", i) } - case j: com.fasterxml.jackson.core.JsonParseException => + case j: com.fasterxml.jackson.core.io.JsonEOFException => + // Note that JsonEOFException is child of JsonParseException + // In case the eventlog is incomplete (i.e., inprogress), we show a warning message + // because we do not want to cause the entire app to fail. 
+ logWarning(s"Incomplete eventlog, ${j.getMessage}") + case k: com.fasterxml.jackson.core.JsonParseException => // this is a parser error thrown by version prior to spark-3.4+ which indicates the // log is malformed - throw j + throw k } None } @@ -317,6 +326,37 @@ object SQLMetricsStats { } } +object IgnoreExecs { + // AdaptiveSparkPlan is not a real exec. It is a wrapper for the whole plan. + private val AdaptiveSparkPlan = "AdaptiveSparkPlan" + // Collect Limit replacement can be slower on the GPU. Disabled by default. + private val CollectLimit = "CollectLimit" + private val ScanExistingRDD = "Scan ExistingRDD" + private val ExistingRDD = "ExistingRDD" + // Some DDL's and table commands which can be ignored + private val ExecuteCreateViewCommand = "Execute CreateViewCommand" + private val LocalTableScan = "LocalTableScan" + private val ExecuteCreateDatabaseCommand = "Execute CreateDatabaseCommand" + private val ExecuteDropDatabaseCommand = "Execute DropDatabaseCommand" + private val ExecuteCreateTableAsSelectCommand = "Execute CreateTableAsSelectCommand" + private val ExecuteCreateTableCommand = "Execute CreateTableCommand" + private val ExecuteDropTableCommand = "Execute DropTableCommand" + private val ExecuteCreateDataSourceTableAsSelectCommand = "Execute " + + "CreateDataSourceTableAsSelectCommand" + private val SetCatalogAndNamespace = "SetCatalogAndNamespace" + private val ExecuteSetCommand = "Execute SetCommand" + + + val True = "true" + val False = "false" + + def getAllIgnoreExecs: Set[String] = Set(AdaptiveSparkPlan, CollectLimit, ScanExistingRDD, + ExecuteCreateViewCommand, ExistingRDD, LocalTableScan, ExecuteCreateTableCommand, + ExecuteDropTableCommand, ExecuteCreateDatabaseCommand, ExecuteDropDatabaseCommand, + ExecuteCreateTableAsSelectCommand, ExecuteCreateDataSourceTableAsSelectCommand, + SetCatalogAndNamespace, ExecuteSetCommand) +} + object MlOps { val sparkml = "spark.ml." val xgBoost = "spark.XGBoost" diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/qualification/QualificationAppInfo.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/qualification/QualificationAppInfo.scala index 61377d0b3..a58d1e646 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/qualification/QualificationAppInfo.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/qualification/QualificationAppInfo.scala @@ -31,7 +31,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent} import org.apache.spark.sql.execution.SparkPlanInfo import org.apache.spark.sql.execution.ui.SparkPlanGraph -import org.apache.spark.sql.rapids.tool.{AppBase, GpuEventLogException, SupportedMLFuncsName, ToolUtils} +import org.apache.spark.sql.rapids.tool.{AppBase, GpuEventLogException, IgnoreExecs, SupportedMLFuncsName, ToolUtils} class QualificationAppInfo( eventLogInfo: Option[EventLogInfo], @@ -296,6 +296,7 @@ class QualificationAppInfo( val allSpeedupFactorAvg = SQLPlanParser.averageSpeedup(execInfos.map(_.speedupFactor)) val allFlattenedExecs = flattenedExecs(execInfos) val numUnsupported = allFlattenedExecs.filterNot(_.isSupported) + val unsupportedExecs = numUnsupported.map(_.exec) // if we have unsupported try to guess at how much time. For now divide // time by number of execs and give each one equal weight val eachExecTime = allStageTaskTime / allFlattenedExecs.size @@ -339,8 +340,13 @@ class QualificationAppInfo( eachStageUnsupported } + // Get stage info for the given stageId. 
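+        // stageIdToInfo is keyed by (stageId, attemptId), so the wall-clock duration below is
+        // summed over every attempt of this stage (missing durations count as 0).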
+ val stageInfos = stageIdToInfo.filterKeys { case (id, _) => id == stageId } + val wallclockStageDuration = stageInfos.values.map(x => x.duration.getOrElse(0L)).sum + StageQualSummaryInfo(stageId, allSpeedupFactorAvg, stageTaskTime, - finalEachStageUnsupported, numTransitions, transitionsTime, estimated) + finalEachStageUnsupported, numTransitions, transitionsTime, estimated, + wallclockStageDuration, unsupportedExecs) }.toSet } @@ -459,7 +465,7 @@ class QualificationAppInfo( c.filterNot(_.shouldRemove) } new ExecInfo(e.sqlID, e.exec, e.expr, e.speedupFactor, e.duration, - e.nodeId, e.isSupported, filteredChildren, e.stages, e.shouldRemove) + e.nodeId, e.isSupported, filteredChildren, e.stages, e.shouldRemove, e.unsupportedExprs) } val filteredPlanInfos = execFilteredChildren.filterNot(_.shouldRemove) p.copy(execInfo = filteredPlanInfos) @@ -586,8 +592,8 @@ class QualificationAppInfo( e.children.map(x => x.filterNot(_.isSupported)) }.flatten topLevelExecs ++ childrenExecs - }.map(_.exec).toSet.mkString(";").trim.replaceAll("\n", "") - .replace(",", ":") + }.map(_.exec).toSet.mkString(";").trim.replaceAll("\n", "").replace(",", ":") + // Get all the unsupported Expressions from the plan val unSupportedExprs = origPlanInfos.map(_.execInfo.flatMap( _.unsupportedExprs)).flatten.filter(_.nonEmpty).toSet.mkString(";") @@ -636,6 +642,9 @@ class QualificationAppInfo( 1 } + val wallClockSqlDFToUse = QualificationAppInfo.wallClockSqlDataFrameToUse( + sparkSQLDFWallClockDuration, appDuration) + val estimatedInfo = QualificationAppInfo.calculateEstimatedInfoSummary(estimatedGPURatio, sparkSQLDFWallClockDuration, appDuration, taskSpeedupFactor, appName, appId, sqlIdsWithFailures.nonEmpty, mlSpeedup, unSupportedExecs, unSupportedExprs, @@ -646,8 +655,8 @@ class QualificationAppInfo( notSupportFormatAndTypesString, getAllReadFileFormats, writeFormat, allComplexTypes, nestedComplexTypes, longestSQLDuration, sqlDataframeTaskDuration, nonSQLTaskDuration, unsupportedSQLTaskDuration, supportedSQLTaskDuration, - taskSpeedupFactor, info.sparkUser, info.startTime, origPlanInfos, - perSqlStageSummary.map(_.stageSum).flatten, estimatedInfo, perSqlInfos, + taskSpeedupFactor, info.sparkUser, info.startTime, wallClockSqlDFToUse, + origPlanInfos, perSqlStageSummary.map(_.stageSum).flatten, estimatedInfo, perSqlInfos, unSupportedExecs, unSupportedExprs, clusterTags, allClusterTagsMap, mlFunctions, mlTotalStageDuration, unsupportedExecExprsMap) } @@ -861,6 +870,7 @@ case class QualificationSummaryInfo( taskSpeedupFactor: Double, user: String, startTime: Long, + sparkSqlDFWallClockDuration: Long, planInfo: Seq[PlanInfo], stageInfo: Seq[StageQualSummaryInfo], estimatedInfo: EstimatedAppInfo, @@ -881,7 +891,9 @@ case class StageQualSummaryInfo( unsupportedTaskDur: Long, numTransitions: Int, transitionTime: Long, - estimated: Boolean = false) + estimated: Boolean = false, + stageWallclockDuration: Long = 0, + unsupportedExecs: Seq[String] = Seq.empty) object QualificationAppInfo extends Logging { // define recommendation constants @@ -926,19 +938,19 @@ object QualificationAppInfo extends Logging { } } + def wallClockSqlDataFrameToUse(sqlDataFrameDuration: Long, appDuration: Long): Long = { + // If our app duration is shorter than our sql duration, estimate the sql duration down + // to app duration + math.min(sqlDataFrameDuration, appDuration) + } + // Summarize and estimate based on wall clock times def calculateEstimatedInfoSummary(estimatedRatio: Double, sqlDataFrameDuration: Long, appDuration: Long, sqlSpeedupFactor: 
Double, appName: String, appId: String, hasFailures: Boolean, mlSpeedupFactor: Option[MLFuncsSpeedupAndDuration] = None, unsupportedExecs: String = "", unsupportedExprs: String = "", allClusterTagsMap: Map[String, String] = Map.empty[String, String]): EstimatedAppInfo = { - val sqlDataFrameDurationToUse = if (sqlDataFrameDuration > appDuration) { - // our app duration is shorter then our sql duration, estimate the sql duration down - // to app duration - appDuration - } else { - sqlDataFrameDuration - } + val sqlDataFrameDurationToUse = wallClockSqlDataFrameToUse(sqlDataFrameDuration, appDuration) // get the average speedup and duration for ML funcs supported on GPU val (mlSpeedup, mlDuration) = if (mlSpeedupFactor.isDefined) { diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala index 36831386b..755c4f6c8 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/AutoTunerSuite.scala @@ -21,6 +21,7 @@ import java.util import scala.collection.JavaConverters._ import scala.collection.mutable +import com.nvidia.spark.rapids.tool.{PlatformFactory, PlatformNames} import org.scalatest.{BeforeAndAfterEach, FunSuite} import org.scalatest.Matchers.convertToAnyShouldWrapper import org.yaml.snakeyaml.{DumperOptions, Yaml} @@ -1283,16 +1284,17 @@ class AutoTunerSuite extends FunSuite with BeforeAndAfterEach with Logging { assert(expectedResults == autoTunerOutput) } - test("test recommendations for databricks platform argument") { + test("test recommendations for databricks-aws platform argument") { val databricksWorkerInfo = buildWorkerInfoAsString() + val platform = PlatformFactory.createInstance(PlatformNames.DATABRICKS_AWS) val autoTuner = AutoTuner.buildAutoTunerFromProps(databricksWorkerInfo, - getGpuAppMockInfoProvider, "databricks") + getGpuAppMockInfoProvider, platform) val (properties, comments) = autoTuner.getRecommendedProperties() // Assert recommendations are excluded in properties - assert(properties.map(_.property).forall(autoTuner.selectedPlatform.isValidRecommendation)) + assert(properties.map(_.property).forall(autoTuner.platform.isValidRecommendation)) // Assert recommendations are skipped in comments - assert(comments.map(_.comment).forall(autoTuner.selectedPlatform.isValidComment)) + assert(comments.map(_.comment).forall(autoTuner.platform.isValidComment)) } // When spark is running as a standalone, the memoryOverhead should not be listed as a @@ -1357,4 +1359,152 @@ class AutoTunerSuite extends FunSuite with BeforeAndAfterEach with Logging { // scalastyle:on line.size.limit assert(expectedResults == autoTunerOutput) } + + test("Recommendations generated for unsupported operators from driver logs only") { + val customProps = mutable.LinkedHashMap( + "spark.executor.cores" -> "8", + "spark.executor.memory" -> "47222m", + "spark.rapids.sql.concurrentGpuTasks" -> "2", + "spark.task.resource.gpu.amount" -> "0.0625") + val unsupportedDriverOperators = Seq( + DriverLogUnsupportedOperators( + "FromUnixTime", 1, + "Only UTC zone id is supported. Actual default zone id: America/Los_Angeles; " + + "CORRECTED format 'yyyyMMdd' on the GPU is not guaranteed to produce the same " + + "results as Spark on CPU. 
Set spark.rapids.sql.incompatibleDateFormats.enabled=true " + + "to force onto GPU.") + ) + val workerInfo = buildWorkerInfoAsString(Some(customProps)) + val autoTuner: AutoTuner = AutoTuner.buildAutoTunerFromProps(workerInfo, null, + PlatformFactory.createInstance(), unsupportedDriverOperators) + val (properties, comments) = autoTuner.getRecommendedProperties() + val autoTunerOutput = Profiler.getAutoTunerResultsAsString(properties, comments) + // scalastyle:off line.size.limit + val expectedResults = + s"""| + |Spark Properties: + |--conf spark.rapids.sql.incompatibleDateFormats.enabled=true + | + |Comments: + |- 'spark.rapids.sql.incompatibleDateFormats.enabled' was not set. + |- AutoTuner recommendations only support eventlogs generated by Spark applications utilizing RAPIDS Accelerator for Apache Spark + |- RAPIDS Accelerator for Apache Spark jar is missing in "spark.plugins". Please refer to https://docs.nvidia.com/spark-rapids/user-guide/latest/getting-started/overview.html + |- ${AutoTuner.commentForExperimentalConfig("spark.rapids.sql.incompatibleDateFormats.enabled")} + |""".stripMargin + // scalastyle:on line.size.limit + assert(expectedResults == autoTunerOutput) + } + + test("Recommendations generated for unsupported operators from driver and event logs") { + val customProps = mutable.LinkedHashMap( + "spark.executor.cores" -> "8", + "spark.executor.memory" -> "47222m", + "spark.rapids.sql.concurrentGpuTasks" -> "2", + "spark.task.resource.gpu.amount" -> "0.0625") + val unsupportedDriverOperators = Seq( + DriverLogUnsupportedOperators( + "FromUnixTime", 1, + "Only UTC zone id is supported. Actual default zone id: America/Los_Angeles; " + + "CORRECTED format 'yyyyMMdd' on the GPU is not guaranteed to produce the same " + + "results as Spark on CPU. Set spark.rapids.sql.incompatibleDateFormats.enabled=true " + + "to force onto GPU.") + ) + val workerInfo = buildWorkerInfoAsString(Some(customProps)) + val autoTuner: AutoTuner = AutoTuner.buildAutoTunerFromProps(workerInfo, + getGpuAppMockInfoProvider, PlatformFactory.createInstance(), unsupportedDriverOperators) + val (properties, comments) = autoTuner.getRecommendedProperties() + val autoTunerOutput = Profiler.getAutoTunerResultsAsString(properties, comments) + // scalastyle:off line.size.limit + val expectedResults = + s"""| + |Spark Properties: + |--conf spark.executor.cores=16 + |--conf spark.executor.instances=8 + |--conf spark.executor.memory=32768m + |--conf spark.executor.memoryOverhead=8396m + |--conf spark.rapids.memory.pinnedPool.size=4096m + |--conf spark.rapids.shuffle.multiThreaded.reader.threads=16 + |--conf spark.rapids.shuffle.multiThreaded.writer.threads=16 + |--conf spark.rapids.sql.incompatibleDateFormats.enabled=true + |--conf spark.rapids.sql.multiThreadedRead.numThreads=20 + |--conf spark.shuffle.manager=com.nvidia.spark.rapids.spark311.RapidsShuffleManager + |--conf spark.sql.adaptive.advisoryPartitionSizeInBytes=128m + |--conf spark.sql.adaptive.coalescePartitions.minPartitionNum=128 + |--conf spark.sql.files.maxPartitionBytes=512m + |--conf spark.sql.shuffle.partitions=200 + | + |Comments: + |- 'spark.executor.instances' was not set. + |- 'spark.executor.memoryOverhead' was not set. + |- 'spark.rapids.memory.pinnedPool.size' was not set. + |- 'spark.rapids.shuffle.multiThreaded.reader.threads' was not set. + |- 'spark.rapids.shuffle.multiThreaded.writer.threads' was not set. + |- 'spark.rapids.sql.incompatibleDateFormats.enabled' was not set. 
+ |- 'spark.rapids.sql.multiThreadedRead.numThreads' was not set. + |- 'spark.shuffle.manager' was not set. + |- 'spark.sql.adaptive.advisoryPartitionSizeInBytes' was not set. + |- 'spark.sql.adaptive.coalescePartitions.minPartitionNum' was not set. + |- 'spark.sql.adaptive.enabled' should be enabled for better performance. + |- 'spark.sql.files.maxPartitionBytes' was not set. + |- 'spark.sql.shuffle.partitions' was not set. + |- ${AutoTuner.classPathComments("rapids.jars.missing")} + |- ${AutoTuner.classPathComments("rapids.shuffle.jars")} + |- ${AutoTuner.commentForExperimentalConfig("spark.rapids.sql.incompatibleDateFormats.enabled")} + |""".stripMargin + // scalastyle:on line.size.limit + assert(expectedResults == autoTunerOutput) + } + + + test("Recommendations generated for empty unsupported operators from driver logs only") { + val customProps = mutable.LinkedHashMap( + "spark.executor.cores" -> "8", + "spark.executor.memory" -> "47222m", + "spark.rapids.sql.concurrentGpuTasks" -> "2", + "spark.task.resource.gpu.amount" -> "0.0625") + val workerInfo = buildWorkerInfoAsString(Some(customProps)) + val autoTuner: AutoTuner = AutoTuner.buildAutoTunerFromProps(workerInfo, null, + PlatformFactory.createInstance(), Seq.empty) + val (properties, comments) = autoTuner.getRecommendedProperties() + val autoTunerOutput = Profiler.getAutoTunerResultsAsString(properties, comments) + // scalastyle:off line.size.limit + val expectedResults = + s"""|Cannot recommend properties. See Comments. + | + |Comments: + |- AutoTuner recommendations only support eventlogs generated by Spark applications utilizing RAPIDS Accelerator for Apache Spark + |- RAPIDS Accelerator for Apache Spark jar is missing in "spark.plugins". Please refer to https://docs.nvidia.com/spark-rapids/user-guide/latest/getting-started/overview.html + |""".stripMargin + // scalastyle:on line.size.limit + assert(expectedResults == autoTunerOutput) + } + + test("Recommendations not generated for unsupported operators from driver logs") { + // This test does not generate any recommendations for the unsupported operator 'Literal' + val customProps = mutable.LinkedHashMap( + "spark.executor.cores" -> "8", + "spark.executor.memory" -> "47222m", + "spark.rapids.sql.concurrentGpuTasks" -> "2", + "spark.task.resource.gpu.amount" -> "0.0625") + val unsupportedDriverOperators = Seq( + DriverLogUnsupportedOperators( + "Literal", 3, + "expression Literal 1700518632630000 produces an unsupported type TimestampType") + ) + val workerInfo = buildWorkerInfoAsString(Some(customProps)) + val autoTuner: AutoTuner = AutoTuner.buildAutoTunerFromProps(workerInfo, null, + PlatformFactory.createInstance(), unsupportedDriverOperators) + val (properties, comments) = autoTuner.getRecommendedProperties() + val autoTunerOutput = Profiler.getAutoTunerResultsAsString(properties, comments) + // scalastyle:off line.size.limit + val expectedResults = + s"""|Cannot recommend properties. See Comments. + | + |Comments: + |- AutoTuner recommendations only support eventlogs generated by Spark applications utilizing RAPIDS Accelerator for Apache Spark + |- RAPIDS Accelerator for Apache Spark jar is missing in "spark.plugins". 
Please refer to https://docs.nvidia.com/spark-rapids/user-guide/latest/getting-started/overview.html + |""".stripMargin + // scalastyle:on line.size.limit + assert(expectedResults == autoTunerOutput) + } } diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala index 61e8acf40..9a3640986 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/PluginTypeCheckerSuite.scala @@ -19,7 +19,7 @@ package com.nvidia.spark.rapids.tool.qualification import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} -import com.nvidia.spark.rapids.tool.ToolTestUtils +import com.nvidia.spark.rapids.tool.{PlatformFactory, PlatformNames, ToolTestUtils} import com.nvidia.spark.rapids.tool.planparser.DataWritingCommandExecParser import org.scalatest.FunSuite @@ -153,68 +153,33 @@ class PluginTypeCheckerSuite extends FunSuite with Logging { assert(result(2) == "ORC") } - test("supported operator score from onprem") { - val checker = new PluginTypeChecker("onprem") - assert(checker.getSpeedupFactor("UnionExec") == 3.0) - assert(checker.getSpeedupFactor("Ceil") == 4) - } - - test("supported operator score from dataproc-t4") { - val checker = new PluginTypeChecker("dataproc-t4") - assert(checker.getSpeedupFactor("UnionExec") == 4.88) - assert(checker.getSpeedupFactor("Ceil") == 4.88) - } - - test("supported operator score from emr-t4") { - val checker = new PluginTypeChecker("emr-t4") - assert(checker.getSpeedupFactor("UnionExec") == 2.07) - assert(checker.getSpeedupFactor("Ceil") == 2.07) - } - - test("supported operator score from databricks-aws") { - val checker = new PluginTypeChecker("databricks-aws") - assert(checker.getSpeedupFactor("UnionExec") == 2.45) - assert(checker.getSpeedupFactor("Ceil") == 2.45) - } - - test("supported operator score from databricks-azure") { - val checker = new PluginTypeChecker("databricks-azure") - assert(checker.getSpeedupFactor("UnionExec") == 2.73) - assert(checker.getSpeedupFactor("Ceil") == 2.73) - } - - test("supported operator score from dataproc-serverless-l4") { - val checker = new PluginTypeChecker("dataproc-serverless-l4") - assert(checker.getSpeedupFactor("WindowExec") == 4.25) - assert(checker.getSpeedupFactor("Ceil") == 4.25) - } - - test("supported operator score from dataproc-l4") { - val checker = new PluginTypeChecker("dataproc-l4") - assert(checker.getSpeedupFactor("UnionExec") == 4.16) - assert(checker.getSpeedupFactor("Ceil") == 4.16) - } - - test("supported operator score from dataproc-gke-t4") { - val checker = new PluginTypeChecker("dataproc-gke-t4") - assert(checker.getSpeedupFactor("WindowExec") == 3.65) - assert(checker.getSpeedupFactor("Ceil") == 3.65) - } - - test("supported operator score from dataproc-gke-l4") { - val checker = new PluginTypeChecker("dataproc-gke-l4") - assert(checker.getSpeedupFactor("WindowExec") == 3.74) - assert(checker.getSpeedupFactor("Ceil") == 3.74) - } - - test("supported operator score from emr-a10") { - val checker = new PluginTypeChecker("emr-a10") - assert(checker.getSpeedupFactor("UnionExec") == 2.59) - assert(checker.getSpeedupFactor("Ceil") == 2.59) + val platformSpeedupEntries: Seq[(String, Map[String, Double])] = Seq( + (PlatformNames.ONPREM, Map("UnionExec" -> 3.0, "Ceil" -> 4.0)), + (PlatformNames.DATAPROC_T4, Map("UnionExec" -> 4.88, "Ceil" 
-> 4.88)), + (PlatformNames.EMR_T4, Map("UnionExec" -> 2.07, "Ceil" -> 2.07)), + (PlatformNames.DATABRICKS_AWS, Map("UnionExec" -> 2.45, "Ceil" -> 2.45)), + (PlatformNames.DATABRICKS_AZURE, Map("UnionExec" -> 2.73, "Ceil" -> 2.73)), + (PlatformNames.DATAPROC_SL_L4, Map("WindowExec" -> 4.25, "Ceil" -> 4.25)), + (PlatformNames.DATAPROC_L4, Map("UnionExec" -> 4.16, "Ceil" -> 4.16)), + (PlatformNames.DATAPROC_GKE_T4, Map("WindowExec" -> 3.65, "Ceil" -> 3.65)), + (PlatformNames.DATAPROC_GKE_L4, Map("WindowExec" -> 3.74, "Ceil" -> 3.74)), + (PlatformNames.EMR_A10, Map("UnionExec" -> 2.59, "Ceil" -> 2.59)) + ) + + platformSpeedupEntries.foreach { case (platformName, speedupMap) => + test(s"supported operator score from $platformName") { + val platform = PlatformFactory.createInstance(platformName) + val checker = new PluginTypeChecker(platform) + speedupMap.foreach { case (operator, speedup) => + assert(checker.getSpeedupFactor(operator) == speedup) + } + } } test("supported operator score from custom speedup factor file") { - val speedupFactorFile = ToolTestUtils.getTestResourcePath("operatorsScore-databricks-azure.csv") + // Using databricks azure speedup factor as custom file + val platform = PlatformFactory.createInstance(PlatformNames.DATABRICKS_AZURE) + val speedupFactorFile = ToolTestUtils.getTestResourcePath(platform.getOperatorScoreFile) val checker = new PluginTypeChecker(speedupFactorFile=Some(speedupFactorFile)) assert(checker.getSpeedupFactor("SortExec") == 13.11) assert(checker.getSpeedupFactor("FilterExec") == 3.14) diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala index 307dcb00a..447c203c9 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala @@ -16,14 +16,14 @@ package com.nvidia.spark.rapids.tool.qualification -import java.io.File +import java.io.{File, PrintWriter} import java.util.concurrent.TimeUnit.NANOSECONDS import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.io.Source import com.nvidia.spark.rapids.BaseTestSuite -import com.nvidia.spark.rapids.tool.{EventLogPathProcessor, StatusReportCounts, ToolTestUtils} +import com.nvidia.spark.rapids.tool.{EventLogPathProcessor, PlatformNames, StatusReportCounts, ToolTestUtils} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.ml.feature.PCA @@ -460,6 +460,67 @@ class QualificationSuite extends BaseTestSuite { runQualificationTest(logFiles, "nds_q86_test_expectation.csv", expectedStatus = expectedStatus) } + test("incomplete json file does not cause entire app to fail") { + // The purpose of this test is to make sure that the app is not skipped when the JSON parser + // encounters an unexpected EOF. 
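+    // (an unexpected EOF here means the log's last JSON record is cut off mid-object; both
+    // eventlogs written below end with such a truncated SparkListenerEnvironmentUpdate record)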
+ // There are two cases to evaluate: + // 1- An eventlog that has an end-to-end application but for some reason the EOF is incorrect + // 2- An eventlog of an unfinished app (missing SparkListenerApplicationEnd) + + TrampolineUtil.withTempDir { eventLogDir => + // generate the original eventlog + val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, + "WholeStageFilterProject") { spark => + import spark.implicits._ + val df = spark.sparkContext.makeRDD(1 to 100, 3).toDF + val df2 = spark.sparkContext.makeRDD(1 to 100, 3).toDF + df.select($"value" as "a") + .join(df2.select($"value" as "b"), $"a" === $"b") + .filter("(((b < 100) AND (a > 50)) OR (a = 0))") + .sort($"b") + } + // create the following files: + // 1- inprogress eventlog that does not contain "SparkListenerApplicationEnd" (unfinished) + // 2- inprogress eventlog with a terminated app (incomplete) + val unfinishedLog = new File(s"$eventLogDir/unfinished.inprogress") + val incompleteLog = new File(s"$eventLogDir/eventlog.inprogress") + val pwList = Array(new PrintWriter(unfinishedLog), new PrintWriter(incompleteLog)) + val bufferedSource = Source.fromFile(eventLog) + try { + val allEventLines = bufferedSource.getLines.toList + val selectedLines: List[String] = allEventLines.dropRight(1) + selectedLines.foreach { line => + pwList.foreach(pw => pw.println(line)) + } + // add the "SparkListenerApplicationEnd" to the incompleteLog + pwList(1).println(allEventLines.last) + pwList.foreach( pw => + pw.print("{\"Event\":\"SparkListenerEnvironmentUpdate\"," + + "\"JVM Information\":{\"Java Home:") + ) + } finally { + bufferedSource.close() + pwList.foreach(pw => pw.close()) + } + // All the eventlogs should be parsed successfully + // Status counts: 3 SUCCESS, 0 FAILURE, 0 UNKNOWN + val logFiles = Array(eventLog, incompleteLog.getAbsolutePath, unfinishedLog.getAbsolutePath) + // test Qualification + val outpath = new File(s"$eventLogDir/output_folder") + val allArgs = Array( + "--output-directory", + outpath.getAbsolutePath()) + + val appArgs = new QualificationArgs(allArgs ++ logFiles) + val (exit, appSum) = QualificationMain.mainInternal(appArgs) + assert(exit == 0) + assert(appSum.size == 3) + // test Profiler + val apps = ToolTestUtils.processProfileApps(logFiles, sparkSession) + assert(apps.size == 3) + } + } + test("spark2 eventlog") { val profileLogDir = ToolTestUtils.getTestResourcePath("spark-events-profiling") val log = s"$profileLogDir/spark2-eventlog.zstd" @@ -1150,7 +1211,10 @@ class QualificationSuite extends BaseTestSuite { val filename = s"$outpath/rapids_4_spark_qualification_output/" + s"rapids_4_spark_qualification_output_unsupportedOperators.csv" + val stageDurationFile = s"$outpath/rapids_4_spark_qualification_output/" + + s"rapids_4_spark_qualification_output_unsupportedOperatorsStageDuration.csv" val inputSource = Source.fromFile(filename) + val unsupportedStageDuration = Source.fromFile(stageDurationFile) try { val lines = inputSource.getLines.toSeq // 1 for header, 1 for values @@ -1166,6 +1230,11 @@ class QualificationSuite extends BaseTestSuite { assert(lines.size == expLinesSize) assert(lines.head.contains("App ID,Unsupported Type,")) assert(lines(1).contains("\"Read\",\"JSON\",\"Types not supported - bigint:int\"")) + + val stageDurationLines = unsupportedStageDuration.getLines.toSeq + assert(stageDurationLines.head.contains("" + + "Stage Duration,App Duration,Recommendation")) + assert(stageDurationLines(1).contains("Not Recommended")) } finally { inputSource.close() } @@ -1337,292 +1406,29 @@ 
class QualificationSuite extends BaseTestSuite { spark.sql("SELECT id, hour(current_timestamp()), second(to_timestamp(timestamp)) FROM t1") } - // run the qualification tool for onprem - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "onprem", - eventLog)) - - val (exit, sumInfo) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for emr. It should default to emr-t4. - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "emr", - eventLog)) - - val (exit, sumInfo) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for emr-t4 - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "emr-t4", - eventLog)) - - val (exit, sumInfo) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for emr-a10 - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "emr-a10", - eventLog)) - - val (exit, sumInfo) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for dataproc. 
It should default to dataproc-t4 - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "dataproc", - eventLog)) - - val (exit, sumInfo) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for dataproc-t4 - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "dataproc-t4", - eventLog)) - - val (exit, sumInfo) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for dataproc-l4 - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "dataproc-l4", - eventLog)) - - val (exit, sumInfo) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for dataproc-serverless-l4 - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "dataproc-serverless-l4", - eventLog)) - - val (exit, sumInfo) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for dataproc-gke-t4 - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "dataproc-gke-t4", - eventLog)) - - val (exit, _) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - 
createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for dataproc-gke-l4 - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "dataproc-gke-l4", - eventLog)) - - val (exit, _) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for databricks-aws - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "databricks-aws", - eventLog)) - - val (exit, sumInfo) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) - - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() - - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) - } - - // run the qualification tool for databricks-azure - TrampolineUtil.withTempDir { outpath => - val appArgs = new QualificationArgs(Array( - "--output-directory", - outpath.getAbsolutePath, - "--platform", - "databricks-azure", - eventLog)) + PlatformNames.getAllNames.foreach { platform => + // run the qualification tool for each platform + TrampolineUtil.withTempDir { outPath => + val appArgs = new QualificationArgs(Array( + "--output-directory", + outPath.getAbsolutePath, + "--platform", + platform, + eventLog)) - val (exit, sumInfo) = - QualificationMain.mainInternal(appArgs) - assert(exit == 0) + val (exit, _) = QualificationMain.mainInternal(appArgs) + assert(exit == 0) - // the code above that runs the Spark query stops the Sparksession - // so create a new one to read in the csv file - createSparkSession() + // the code above that runs the Spark query stops the Spark Session, + // so create a new one to read in the csv file + createSparkSession() - // validate that the SQL description in the csv file escapes commas properly - val outputResults = s"$outpath/rapids_4_spark_qualification_output/" + - s"rapids_4_spark_qualification_output.csv" - val outputActual = readExpectedFile(new File(outputResults)) - assert(outputActual.collect().size == 1) + // validate that the SQL description in the csv file escapes commas properly + val outputResults = s"$outPath/rapids_4_spark_qualification_output/" + + s"rapids_4_spark_qualification_output.csv" + val outputActual = readExpectedFile(new File(outputResults)) + assert(outputActual.collect().length == 1) + } } } } diff --git a/user_tools/custom_speedup_factors/operatorsList.csv 
b/user_tools/custom_speedup_factors/operatorsList.csv index 7327d8054..986b88ed7 100644 --- a/user_tools/custom_speedup_factors/operatorsList.csv +++ b/user_tools/custom_speedup_factors/operatorsList.csv @@ -98,6 +98,7 @@ Expm1 First Flatten Floor +FormatNumber FromUTCTimestamp FromUnixTime GetArrayItem @@ -168,6 +169,7 @@ Not NthValue OctetLength Or +Percentile PercentRank PivotFirst Pmod @@ -207,6 +209,7 @@ SortOrder SparkPartitionID SpecifiedWindowFrame Sqrt +Stack StartsWith StddevPop StddevSamp @@ -222,6 +225,7 @@ StringTranslate StringTrim StringTrimLeft StringTrimRight +StructsToJson Substring SubstringIndex Subtract diff --git a/user_tools/docs/user-tools-databricks-aws.md b/user_tools/docs/user-tools-databricks-aws.md index 8e94e654d..2e9198af4 100644 --- a/user_tools/docs/user-tools-databricks-aws.md +++ b/user_tools/docs/user-tools-databricks-aws.md @@ -43,7 +43,7 @@ Before running any command, you can set environment variables to specify configu - RAPIDS variables have a naming pattern `RAPIDS_USER_TOOLS_*`: - `RAPIDS_USER_TOOLS_CACHE_FOLDER`: specifies the location of a local directory that the RAPIDS-cli uses to store and cache the downloaded resources. The default is `/var/tmp/spark_rapids_user_tools_cache`. Note that caching the resources locally has an impact on the total execution time of the command. - `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY`: specifies the location of a local directory that the RAPIDS-cli uses to generate the output. The wrapper CLI arguments override that environment variable (`--local_folder` for Qualification). -- For Databricks CLI, some environment variables can be set and picked by the RAPIDS-user tools such as: `DATABRICKS_CONFIG_FILE`, `DATABRICKS_HOST` and `DATABRICKS_TOKEN`. See the description of the variables in [Environment variables](https://docs.databricks.com/en/dev-tools/auth.html#environment-variables-and-fields-for-client-unified-authentication). +- For Databricks CLI, some environment variables can be set and picked by the RAPIDS-user tools such as: `DATABRICKS_CONFIG_FILE`, `DATABRICKS_HOST` and `DATABRICKS_TOKEN`. See the description of the variables in [Environment variables](https://docs.databricks.com/en/dev-tools/auth/index.html#environment-variables-and-fields-for-client-unified-authentication). - For AWS CLI, some environment variables can be set and picked by the RAPIDS-user tools such as: `AWS_SHARED_CREDENTIALS_FILE`, `AWS_CONFIG_FILE`, `AWS_REGION`, `AWS_DEFAULT_REGION`, `AWS_PROFILE` and `AWS_DEFAULT_OUTPUT`. See the full list of variables in [aws-cli-configure-envvars](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html). ## Qualification command diff --git a/user_tools/docs/user-tools-databricks-azure.md b/user_tools/docs/user-tools-databricks-azure.md index 2605b70e8..96cf6888e 100644 --- a/user_tools/docs/user-tools-databricks-azure.md +++ b/user_tools/docs/user-tools-databricks-azure.md @@ -47,7 +47,7 @@ Before running any command, you can set environment variables to specify configu - RAPIDS variables have a naming pattern `RAPIDS_USER_TOOLS_*`: - `RAPIDS_USER_TOOLS_CACHE_FOLDER`: specifies the location of a local directory that the RAPIDS-cli uses to store and cache the downloaded resources. The default is `/var/tmp/spark_rapids_user_tools_cache`. Note that caching the resources locally has an impact on the total execution time of the command. - `RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY`: specifies the location of a local directory that the RAPIDS-cli uses to generate the output. 
The wrapper CLI arguments override that environment variable (`--local_folder` for Qualification). -- For Databricks CLI, some environment variables can be set and picked up by the RAPIDS-user tools such as: `DATABRICKS_CONFIG_FILE`, `DATABRICKS_HOST` and `DATABRICKS_TOKEN`. See the description of the variables in [Environment variables](https://docs.databricks.com/en/dev-tools/auth.html#environment-variables-and-fields-for-client-unified-authentication). +- For Databricks CLI, some environment variables can be set and picked up by the RAPIDS-user tools such as: `DATABRICKS_CONFIG_FILE`, `DATABRICKS_HOST` and `DATABRICKS_TOKEN`. See the description of the variables in [Environment variables](https://docs.databricks.com/en/dev-tools/auth/index.html#environment-variables-and-fields-for-client-unified-authentication). - For Azure CLI, some environment variables can be set and picked up by the RAPIDS-user tools such as: `AZURE_CONFIG_FILE` and `AZURE_DEFAULTS_LOCATION`. ## Qualification command diff --git a/user_tools/pyproject.toml b/user_tools/pyproject.toml index 803708c94..a9797e3b2 100644 --- a/user_tools/pyproject.toml +++ b/user_tools/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "pylint-pydantic==0.3.0", # used for common API to access remote filesystems like local/s3/gcs/hdfs # this will include numpy - "pyarrow==12.0.1", + "pyarrow==14.0.1", # used for ADLS filesystem implementation # Issue-568: use 12.17.0 as the new 12.18.0 causes an error in runtime "azure-storage-blob==12.17.0", @@ -49,7 +49,7 @@ dynamic=["entry-points", "version"] [project.scripts] spark_rapids_user_tools = "spark_rapids_pytools.wrapper:main" -ascli = "spark_rapids_tools.cmdli.tools_cli:main" +spark_rapids = "spark_rapids_tools.cmdli.tools_cli:main" [tool.setuptools] package-dir = {"" = "src"} diff --git a/user_tools/src/spark_rapids_pytools/__init__.py b/user_tools/src/spark_rapids_pytools/__init__.py index 8672e1571..ecf7a8978 100644 --- a/user_tools/src/spark_rapids_pytools/__init__.py +++ b/user_tools/src/spark_rapids_pytools/__init__.py @@ -16,5 +16,5 @@ from spark_rapids_pytools.build import get_version -VERSION = '23.10.1' +VERSION = '23.10.2' __version__ = get_version(VERSION) diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py b/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py index c2fc5f19b..7f6b5abe8 100644 --- a/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py +++ b/user_tools/src/spark_rapids_pytools/cloud_api/onprem.py @@ -70,9 +70,8 @@ def get_platform_name(self) -> str: This used to get the lower case of the platform of the runtime. :return: the name of the platform of the runtime in lower_case. 
""" - if self.platform is not None: - if self.platform == 'dataproc': - self_id = CspEnv.DATAPROC + if self.platform is not None and self.platform == 'dataproc': + self_id = CspEnv.DATAPROC else: self_id = self.type_id return CspEnv.pretty_print(self_id) diff --git a/user_tools/src/spark_rapids_pytools/common/utilities.py b/user_tools/src/spark_rapids_pytools/common/utilities.py index fb2666cd7..5ac5cdd97 100644 --- a/user_tools/src/spark_rapids_pytools/common/utilities.py +++ b/user_tools/src/spark_rapids_pytools/common/utilities.py @@ -45,6 +45,7 @@ class Utils: """Utility class used to enclose common helpers and utilities.""" + warning_issued = False @classmethod def gen_random_string(cls, str_length: int) -> str: @@ -208,6 +209,26 @@ def gen_multiline_str(cls, *items) -> str: def get_os_name(cls) -> str: return os.uname().sysname + @classmethod + def get_value_or_pop(cls, provided_value, options_dict, short_flag, default_value=None): + """ + Gets a value or pops it from the provided options dictionary if the value is not explicitly provided. + + :param provided_value: The value to return if not None. + :param options_dict: Dictionary containing options. + :param short_flag: Flag to look for in options_dict. + :param default_value: The default value to return if the target_key is not found. Defaults to None. + :return: provided_value or the value from options_dict or the default_value. + """ + if provided_value is not None: + return provided_value + if short_flag in options_dict: + if not cls.warning_issued: + cls.warning_issued = True + print('Warning: Instead of using short flags for argument, consider providing the value directly.') + return options_dict.pop(short_flag) + return default_value + class ToolLogging: """Holds global utilities used for logging.""" diff --git a/user_tools/src/spark_rapids_pytools/rapids/profiling.py b/user_tools/src/spark_rapids_pytools/rapids/profiling.py index 0c6e1cfcc..2c31af406 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/profiling.py +++ b/user_tools/src/spark_rapids_pytools/rapids/profiling.py @@ -247,7 +247,7 @@ def _process_output(self): self.__generate_report_with_recommendations() def _init_rapids_arg_list(self) -> List[str]: - return self._create_autotuner_rapids_args() + return super()._init_rapids_arg_list() + self._create_autotuner_rapids_args() @dataclass diff --git a/user_tools/src/spark_rapids_pytools/rapids/qualification.py b/user_tools/src/spark_rapids_pytools/rapids/qualification.py index 7b5ed96b1..7ed1d3cff 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/qualification.py +++ b/user_tools/src/spark_rapids_pytools/rapids/qualification.py @@ -770,10 +770,6 @@ def _write_summary(self): if wrapper_out_content is not None: print(Utils.gen_multiline_str(wrapper_out_content)) - def _init_rapids_arg_list(self) -> List[str]: - # TODO: Make sure we add this argument only for jar versions 23.02+ - return ['--platform', self.ctxt.platform.get_platform_name().replace('_', '-')] - def _generate_section_lines(self, sec_conf: dict) -> List[str]: # TODO: we may like to show the scripts even when the gpu-cluster is not defined # this requires that we allow to generate the script without the gpu-cluster diff --git a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py index 7fbbd9b09..41214f65c 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py +++ b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py @@ -756,7 +756,8 @@ def 
_process_local_job_submission_args(self): self.ctxt.update_job_args(job_args) def _init_rapids_arg_list(self) -> List[str]: - return [] + # TODO: Make sure we add this argument only for jar versions 23.02+ + return ['--platform', self.ctxt.platform.get_platform_name().replace('_', '-')] @timeit('Building Job Arguments and Executing Job CMD') # pylint: disable=too-many-function-args def _prepare_local_job_arguments(self): diff --git a/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py index 8411eb6db..a62ad9286 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py @@ -16,7 +16,7 @@ """Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on DATABRICKS_AWS.""" from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode -from spark_rapids_pytools.common.utilities import ToolLogging +from spark_rapids_pytools.common.utilities import Utils, ToolLogging from spark_rapids_pytools.rapids.diagnostic import Diagnostic from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal from spark_rapids_pytools.rapids.qualification import QualFilterApp, QualificationAsLocal, QualGpuClusterReshapeType @@ -40,8 +40,8 @@ def qualification(cpu_cluster: str = None, filter_apps: str = QualFilterApp.tostring(QualFilterApp.SAVINGS), gpu_cluster_recommendation: str = QualGpuClusterReshapeType.tostring( QualGpuClusterReshapeType.get_default()), - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, cpu_discount: int = None, gpu_discount: int = None, global_discount: int = None, @@ -105,6 +105,15 @@ def qualification(cpu_cluster: str = None, For more details on Qualification tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-qualification-tool.html#qualification-tool-options """ + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + profile = Utils.get_value_or_pop(profile, rapids_options, 'p') + aws_profile = Utils.get_value_or_pop(aws_profile, rapids_options, 'a') + remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r') + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + filter_apps = Utils.get_value_or_pop(filter_apps, rapids_options, 'f') + tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't') + local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. 
ToolLogging.enable_debug_mode() @@ -150,8 +159,8 @@ def profiling(gpu_cluster: str = None, remote_folder: str = None, tools_jar: str = None, credentials_file: str = None, - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, **rapids_options) -> None: """ The Profiling tool analyzes both CPU or GPU generated event logs and generates information @@ -192,6 +201,17 @@ def profiling(gpu_cluster: str = None, For more details on Profiling tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-profiling-tool.html#profiling-tool-options """ + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + profile = Utils.get_value_or_pop(profile, rapids_options, 'p') + aws_profile = Utils.get_value_or_pop(aws_profile, rapids_options, 'a') + credentials_file = Utils.get_value_or_pop(credentials_file, rapids_options, 'c') + gpu_cluster = Utils.get_value_or_pop(gpu_cluster, rapids_options, 'g') + remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r') + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't') + worker_info = Utils.get_value_or_pop(worker_info, rapids_options, 'w') + local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. ToolLogging.enable_debug_mode() diff --git a/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py index 197c42a04..f29582f20 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/databricks_azure_wrapper.py @@ -16,7 +16,7 @@ """Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on DATABRICKS_AZURE.""" from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode -from spark_rapids_pytools.common.utilities import ToolLogging +from spark_rapids_pytools.common.utilities import Utils, ToolLogging from spark_rapids_pytools.rapids.diagnostic import Diagnostic from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal from spark_rapids_pytools.rapids.qualification import QualFilterApp, QualificationAsLocal, QualGpuClusterReshapeType @@ -39,8 +39,8 @@ def qualification(cpu_cluster: str = None, filter_apps: str = QualFilterApp.tostring(QualFilterApp.SAVINGS), gpu_cluster_recommendation: str = QualGpuClusterReshapeType.tostring( QualGpuClusterReshapeType.get_default()), - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, cpu_discount: int = None, gpu_discount: int = None, global_discount: int = None, @@ -103,6 +103,14 @@ def qualification(cpu_cluster: str = None, For more details on Qualification tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-qualification-tool.html#qualification-tool-options """ + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + profile = Utils.get_value_or_pop(profile, rapids_options, 'p') + remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r') + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + filter_apps = 
Utils.get_value_or_pop(filter_apps, rapids_options, 'f') + tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't') + local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. ToolLogging.enable_debug_mode() @@ -146,8 +154,8 @@ def profiling(gpu_cluster: str = None, remote_folder: str = None, tools_jar: str = None, credentials_file: str = None, - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, **rapids_options) -> None: """ The Profiling tool analyzes both CPU or GPU generated event logs and generates information @@ -186,6 +194,16 @@ def profiling(gpu_cluster: str = None, For more details on Profiling tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-profiling-tool.html#profiling-tool-options """ + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + profile = Utils.get_value_or_pop(profile, rapids_options, 'p') + credentials_file = Utils.get_value_or_pop(credentials_file, rapids_options, 'c') + gpu_cluster = Utils.get_value_or_pop(gpu_cluster, rapids_options, 'g') + remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r') + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't') + worker_info = Utils.get_value_or_pop(worker_info, rapids_options, 'w') + local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. ToolLogging.enable_debug_mode() diff --git a/user_tools/src/spark_rapids_pytools/wrappers/dataproc_gke_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/dataproc_gke_wrapper.py index 251347a28..23868aa77 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/dataproc_gke_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/dataproc_gke_wrapper.py @@ -16,7 +16,7 @@ from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode -from spark_rapids_pytools.common.utilities import ToolLogging +from spark_rapids_pytools.common.utilities import Utils, ToolLogging from spark_rapids_pytools.rapids.qualification import QualFilterApp, QualificationAsLocal, QualGpuClusterReshapeType @@ -36,8 +36,8 @@ def qualification(cpu_cluster: str = None, filter_apps: str = QualFilterApp.tostring(QualFilterApp.SAVINGS), gpu_cluster_recommendation: str = QualGpuClusterReshapeType.tostring( QualGpuClusterReshapeType.get_default()), - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, cpu_discount: int = None, gpu_discount: int = None, global_discount: int = None, @@ -100,6 +100,13 @@ def qualification(cpu_cluster: str = None, For more details on Qualification tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-qualification-tool.html#qualification-tool-options """ + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r') + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + filter_apps = Utils.get_value_or_pop(filter_apps, rapids_options, 'f') + tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't') + local_folder 
= Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. ToolLogging.enable_debug_mode() diff --git a/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py index a02fee408..8736d9cd2 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/dataproc_wrapper.py @@ -16,7 +16,7 @@ from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode -from spark_rapids_pytools.common.utilities import ToolLogging +from spark_rapids_pytools.common.utilities import Utils, ToolLogging from spark_rapids_pytools.rapids.bootstrap import Bootstrap from spark_rapids_pytools.rapids.diagnostic import Diagnostic from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal @@ -39,8 +39,8 @@ def qualification(cpu_cluster: str = None, filter_apps: str = QualFilterApp.tostring(QualFilterApp.SAVINGS), gpu_cluster_recommendation: str = QualGpuClusterReshapeType.tostring( QualGpuClusterReshapeType.get_default()), - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, cpu_discount: int = None, gpu_discount: int = None, global_discount: int = None, @@ -102,6 +102,13 @@ def qualification(cpu_cluster: str = None, For more details on Qualification tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-qualification-tool.html#qualification-tool-options """ + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r') + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + filter_apps = Utils.get_value_or_pop(filter_apps, rapids_options, 'f') + tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't') + local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. ToolLogging.enable_debug_mode() @@ -143,8 +150,8 @@ def profiling(gpu_cluster: str = None, remote_folder: str = None, tools_jar: str = None, credentials_file: str = None, - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, **rapids_options) -> None: """ The Profiling tool analyzes both CPU or GPU generated event logs and generates information @@ -183,6 +190,15 @@ def profiling(gpu_cluster: str = None, For more details on Profiling tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-profiling-tool.html#profiling-tool-options """ + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + credentials_file = Utils.get_value_or_pop(credentials_file, rapids_options, 'c') + gpu_cluster = Utils.get_value_or_pop(gpu_cluster, rapids_options, 'g') + remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r') + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't') + worker_info = Utils.get_value_or_pop(worker_info, rapids_options, 'w') + local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. 
ToolLogging.enable_debug_mode() diff --git a/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py index fea22b044..9cad16338 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/emr_wrapper.py @@ -16,7 +16,7 @@ """Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on AWS-EMR.""" from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode -from spark_rapids_pytools.common.utilities import ToolLogging +from spark_rapids_pytools.common.utilities import Utils, ToolLogging from spark_rapids_pytools.rapids.bootstrap import Bootstrap from spark_rapids_pytools.rapids.diagnostic import Diagnostic from spark_rapids_pytools.rapids.qualification import QualFilterApp, QualificationAsLocal, \ @@ -40,8 +40,8 @@ def qualification(cpu_cluster: str = None, filter_apps: str = QualFilterApp.tostring(QualFilterApp.SAVINGS), gpu_cluster_recommendation: str = QualGpuClusterReshapeType.tostring( QualGpuClusterReshapeType.get_default()), - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, cpu_discount: int = None, gpu_discount: int = None, global_discount: int = None, @@ -100,6 +100,14 @@ def qualification(cpu_cluster: str = None, For more details on Qualification tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-qualification-tool.html#qualification-tool-options """ + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + profile = Utils.get_value_or_pop(profile, rapids_options, 'p') + remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r') + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + filter_apps = Utils.get_value_or_pop(filter_apps, rapids_options, 'f') + tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't') + local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. 
ToolLogging.enable_debug_mode() @@ -140,8 +148,8 @@ def profiling(gpu_cluster: str = None, local_folder: str = None, remote_folder: str = None, tools_jar: str = None, - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, **rapids_options) -> None: """ The Profiling tool analyzes both CPU or GPU generated event logs and generates information @@ -177,6 +185,15 @@ def profiling(gpu_cluster: str = None, For more details on Profiling tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-profiling-tool.html#profiling-tool-options """ + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + profile = Utils.get_value_or_pop(profile, rapids_options, 'p') + gpu_cluster = Utils.get_value_or_pop(gpu_cluster, rapids_options, 'g') + remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r') + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't') + worker_info = Utils.get_value_or_pop(worker_info, rapids_options, 'w') + local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. ToolLogging.enable_debug_mode() diff --git a/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py b/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py index 4d50b0c28..048f3582a 100644 --- a/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py +++ b/user_tools/src/spark_rapids_pytools/wrappers/onprem_wrapper.py @@ -16,7 +16,7 @@ """Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on On-Prem cluster.""" from spark_rapids_tools import CspEnv from spark_rapids_pytools.cloud_api.sp_types import DeployMode -from spark_rapids_pytools.common.utilities import ToolLogging +from spark_rapids_pytools.common.utilities import Utils, ToolLogging from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal from spark_rapids_pytools.rapids.qualification import QualFilterApp, QualificationAsLocal, QualGpuClusterReshapeType @@ -36,8 +36,8 @@ def qualification(cpu_cluster: str = None, target_platform: str = None, gpu_cluster_recommendation: str = QualGpuClusterReshapeType.tostring( QualGpuClusterReshapeType.get_default()), - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, cpu_discount: int = None, gpu_discount: int = None, global_discount: int = None, @@ -80,6 +80,11 @@ def qualification(cpu_cluster: str = None, For more details on Qualification tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-qualification-tool.html#qualification-tool-options """ + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + filter_apps = Utils.get_value_or_pop(filter_apps, rapids_options, 'f') + local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. 
ToolLogging.enable_debug_mode() @@ -132,8 +137,8 @@ def profiling(worker_info: str = None, eventlogs: str = None, local_folder: str = None, tools_jar: str = None, - jvm_heap_size: int = 24, - verbose: bool = False, + jvm_heap_size: int = None, + verbose: bool = None, **rapids_options) -> None: """ The Profiling tool analyzes both CPU or GPU generated event logs and generates information @@ -158,7 +163,12 @@ def profiling(worker_info: str = None, For more details on Profiling tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-profiling-tool.html#profiling-tool-options """ - + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) + jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', 24) + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't') + worker_info = Utils.get_value_or_pop(worker_info, rapids_options, 'w') + local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l') if verbose: # when debug is set to true set it in the environment. ToolLogging.enable_debug_mode() diff --git a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py index 67624ac7a..6605ecb3b 100644 --- a/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py +++ b/user_tools/src/spark_rapids_tools/cmdli/argprocessor.py @@ -168,11 +168,21 @@ def detect_platform_from_eventlogs_prefix(self): self.p_args['toolArgs']['platform'] = map_storage_to_platform[storage_type] def validate_onprem_with_cluster_name(self): - if self.platform == CspEnv.ONPREM: + # this field has already been populated during initialization + selected_platform = self.p_args['toolArgs']['platform'] + if selected_platform == CspEnv.ONPREM: raise PydanticCustomError( 'invalid_argument', f'Cannot run cluster by name with platform [{CspEnv.ONPREM}]\n Error:') + def validate_onprem_with_cluster_props_without_eventlogs(self): + # this field has already been populated during initialization + selected_platform = self.p_args['toolArgs']['platform'] + if selected_platform == CspEnv.ONPREM: + raise PydanticCustomError( + 'invalid_argument', + f'Cannot run cluster by properties with platform [{CspEnv.ONPREM}] without event logs\n Error:') + def init_extra_arg_cases(self) -> list: return [] @@ -202,21 +212,24 @@ def define_extra_arg_cases(self): def build_tools_args(self) -> dict: pass - def apply_arg_cases(self): - for curr_cases in [self.rejected, self.detected, self.extra]: + def apply_arg_cases(self, cases_list: list): + for curr_cases in cases_list: for case_key, case_value in curr_cases.items(): if any(ArgValueCase.array_equal(self.argv_cases, case_i) for case_i in case_value['cases']): # debug the case key self.logger.info('...applying argument case: %s', case_key) case_value['callable']() + def apply_all_arg_cases(self): + self.apply_arg_cases([self.rejected, self.detected, self.extra]) + def validate_arguments(self): self.init_tool_args() self.init_arg_cases() self.define_invalid_arg_cases() self.define_detection_cases() self.define_extra_arg_cases() - self.apply_arg_cases() + self.apply_all_arg_cases() def get_or_set_platform(self) -> CspEnv: if self.p_args['toolArgs']['platform'] is None: @@ -224,17 +237,14 @@ def get_or_set_platform(self) -> CspEnv: runtime_platform = CspEnv.get_default() else: runtime_platform = self.p_args['toolArgs']['platform'] - self.post_platform_assignment_validation(runtime_platform) + 
self.post_platform_assignment_validation() return runtime_platform - def post_platform_assignment_validation(self, assigned_platform): - # do some validation after we decide the cluster type - if self.argv_cases[1] == ArgValueCase.VALUE_A: - if assigned_platform == CspEnv.ONPREM: - # it is not allowed to run cluster_by_name on an OnPrem platform - raise PydanticCustomError( - 'invalid_argument', - f'Cannot run cluster by name with platform [{CspEnv.ONPREM}]\n Error:') + def post_platform_assignment_validation(self): + # Update argv_cases to reflect the platform + self.argv_cases[0] = ArgValueCase.VALUE_A + # Any validation post platform assignment should be done here + self.apply_arg_cases([self.rejected, self.extra]) @dataclass @@ -278,6 +288,13 @@ def define_invalid_arg_cases(self): [ArgValueCase.VALUE_A, ArgValueCase.VALUE_A, ArgValueCase.IGNORE] ] } + self.rejected['Cluster By Properties Cannot go with OnPrem'] = { + 'valid': False, + 'callable': partial(self.validate_onprem_with_cluster_props_without_eventlogs), + 'cases': [ + [ArgValueCase.VALUE_A, ArgValueCase.VALUE_B, ArgValueCase.UNDEFINED] + ] + } def define_detection_cases(self): self.detected['Define Platform from Cluster Properties file'] = { diff --git a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py index 238f83a89..5a63aebe5 100644 --- a/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py +++ b/user_tools/src/spark_rapids_tools/cmdli/tools_cli.py @@ -19,7 +19,7 @@ from spark_rapids_tools.enums import QualGpuClusterReshapeType from spark_rapids_tools.utils.util import gen_app_banner, init_environment -from spark_rapids_pytools.common.utilities import ToolLogging +from spark_rapids_pytools.common.utilities import Utils, ToolLogging from spark_rapids_pytools.rapids.bootstrap import Bootstrap from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal from spark_rapids_pytools.rapids.qualification import QualificationAsLocal @@ -48,7 +48,7 @@ def qualification(self, global_discount: int = None, gpu_cluster_recommendation: str = QualGpuClusterReshapeType.tostring( QualGpuClusterReshapeType.get_default()), - verbose: bool = False, + verbose: bool = None, **rapids_options): """The Qualification cmd provides estimated running costs and speedups by migrating Apache Spark applications to GPU accelerated clusters. @@ -105,6 +105,11 @@ def qualification(self, For more details on Qualification tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-qualification-tool.html#qualification-tool-options """ + platform = Utils.get_value_or_pop(platform, rapids_options, 'p') + target_platform = Utils.get_value_or_pop(target_platform, rapids_options, 't') + output_folder = Utils.get_value_or_pop(output_folder, rapids_options, 'o') + filter_apps = Utils.get_value_or_pop(filter_apps, rapids_options, 'f') + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) if verbose: ToolLogging.enable_debug_mode() init_environment('qual') @@ -133,7 +138,7 @@ def profiling(self, cluster: str = None, platform: str = None, output_folder: str = None, - verbose: bool = False, + verbose: bool = None, **rapids_options): """The Profiling cmd provides information which can be used for debugging and profiling Apache Spark applications running on accelerated GPU cluster. 
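The `Utils.get_value_or_pop(...)` calls repeated across the wrappers and this CLI resolve each option in a fixed order: an explicitly provided keyword argument wins, otherwise a short flag captured in the catch-all `rapids_options` dict is popped (emitting a one-time warning), otherwise the default applies. A standalone sketch mirroring the helper added to `common/utilities.py` earlier in this diff; only the behaviour is taken from that helper, the surrounding names are illustrative:

_warning_issued = False

def get_value_or_pop(provided_value, options_dict, short_flag, default_value=None):
    global _warning_issued
    if provided_value is not None:
        return provided_value                   # explicit argument wins
    if short_flag in options_dict:
        if not _warning_issued:
            _warning_issued = True
            print('Warning: Instead of using short flags for argument, '
                  'consider providing the value directly.')
        return options_dict.pop(short_flag)     # consume the short flag
    return default_value                        # fall back to the default

# e.g. a short flag collected into **rapids_options resolves only when the
# long form was not given explicitly:
rapids_options = {'v': True}
verbose = get_value_or_pop(None, rapids_options, 'v', False)   # -> True; 'v' is popped
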
@@ -159,6 +164,11 @@ def profiling(self, For more details on Profiling tool options, please visit https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-profiling-tool.html#profiling-tool-options """ + eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e') + cluster = Utils.get_value_or_pop(cluster, rapids_options, 'c') + platform = Utils.get_value_or_pop(platform, rapids_options, 'p') + output_folder = Utils.get_value_or_pop(output_folder, rapids_options, 'o') + verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False) if verbose: ToolLogging.enable_debug_mode() init_environment('prof') diff --git a/user_tools/src/spark_rapids_tools/utils/util.py b/user_tools/src/spark_rapids_tools/utils/util.py index e8ed7e05d..8ff5a1975 100644 --- a/user_tools/src/spark_rapids_tools/utils/util.py +++ b/user_tools/src/spark_rapids_tools/utils/util.py @@ -91,7 +91,7 @@ def to_snake_case(word: str) -> str: def dump_tool_usage(tool_name: Optional[str], raise_sys_exit: Optional[bool] = True): imported_module = __import__('spark_rapids_tools.cmdli', globals(), locals(), ['ToolsCLI']) wrapper_clzz = getattr(imported_module, 'ToolsCLI') - help_name = 'ascli' + help_name = 'spark_rapids' usage_cmd = f'{tool_name} -- --help' try: fire.Fire(wrapper_clzz(), name=help_name, command=usage_cmd) diff --git a/user_tools/tests/spark_rapids_tools_ut/conftest.py b/user_tools/tests/spark_rapids_tools_ut/conftest.py index e29cefc3c..145355f24 100644 --- a/user_tools/tests/spark_rapids_tools_ut/conftest.py +++ b/user_tools/tests/spark_rapids_tools_ut/conftest.py @@ -16,7 +16,7 @@ import sys -import pytest # pylint: disable=import-error +import pytest # pylint: disable=import-error def get_test_resources_path(): @@ -46,9 +46,10 @@ def gen_cpu_cluster_props(): # all csps except onprem csps = ['dataproc', 'dataproc_gke', 'emr', 'databricks_aws', 'databricks_azure'] all_csps = csps + ['onprem'] +autotuner_prop_path = 'worker_info.yaml' -class SparkRapidsToolsUT: # pylint: disable=too-few-public-methods +class SparkRapidsToolsUT: # pylint: disable=too-few-public-methods @pytest.fixture(autouse=True) def get_ut_data_dir(self): diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/worker_info.yaml b/user_tools/tests/spark_rapids_tools_ut/resources/worker_info.yaml new file mode 100644 index 000000000..d9aaa14d5 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/resources/worker_info.yaml @@ -0,0 +1,19 @@ +system: + numCores: 32 + memory: 212992MiB + numWorkers: 5 +gpu: + memory: 15109MiB + count: 4 + name: T4 +softwareProperties: + spark.driver.maxResultSize: 7680m + spark.driver.memory: 15360m + spark.executor.cores: '8' + spark.executor.instances: '2' + spark.executor.memory: 47222m + spark.executorEnv.OPENBLAS_NUM_THREADS: '1' + spark.scheduler.mode: FAIR + spark.sql.cbo.enabled: 'true' + spark.ui.port: '0' + spark.yarn.am.memory: 640m diff --git a/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py b/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py index 0e5d496d0..76e694c81 100644 --- a/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py +++ b/user_tools/tests/spark_rapids_tools_ut/test_tool_argprocessor.py @@ -15,16 +15,16 @@ """Test Tool argument validators""" import dataclasses +import warnings from collections import defaultdict from typing import Dict, Callable, List -import fire import pytest # pylint: disable=import-error from spark_rapids_tools import CspEnv from spark_rapids_tools.cmdli.argprocessor import AbsToolUserArgModel, 
ArgValueCase from spark_rapids_tools.enums import QualFilterApp -from .conftest import SparkRapidsToolsUT, all_cpu_cluster_props, csp_cpu_cluster_props, csps +from .conftest import SparkRapidsToolsUT, autotuner_prop_path, all_cpu_cluster_props, all_csps @dataclasses.dataclass @@ -52,6 +52,7 @@ def decorator(func_cb: Callable): triplet_test_registry[obj_k] = argv_obj argv_obj.tests.append(func_cb.__name__) return func_cb + return decorator @@ -74,112 +75,238 @@ def validate_args_w_savings_disabled(tool_name: str, t_args: dict): # filterApps should be set to savings assert t_args['filterApps'] == QualFilterApp.SPEEDUPS - @pytest.mark.parametrize('tool_name', ['qualification', 'profiling', 'bootstrap']) - @register_triplet_test([ArgValueCase.IGNORE, ArgValueCase.UNDEFINED, ArgValueCase.UNDEFINED]) - def test_no_args(self, tool_name): - fire.core.Display = lambda lines, out: out.write('\n'.join(lines) + '\n') - with pytest.raises(SystemExit) as pytest_wrapped_e: - AbsToolUserArgModel.create_tool_args(tool_name) - assert pytest_wrapped_e.type == SystemExit + @staticmethod + def create_tool_args_should_pass(tool_name: str, platform=None, cluster=None, eventlogs=None): + return AbsToolUserArgModel.create_tool_args(tool_name, + platform=platform, + cluster=cluster, + eventlogs=eventlogs) - @pytest.mark.parametrize('tool_name', ['qualification', 'profiling', 'bootstrap']) - @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A, ArgValueCase.UNDEFINED]) - def test_cluster__name_no_hints(self, tool_name): - fire.core.Display = lambda lines, out: out.write('\n'.join(lines) + '\n') + @staticmethod + def create_tool_args_should_fail(tool_name: str, platform=None, cluster=None, eventlogs=None): with pytest.raises(SystemExit) as pytest_wrapped_e: - AbsToolUserArgModel.create_tool_args(tool_name, cluster='mycluster') + AbsToolUserArgModel.create_tool_args(tool_name, + platform=platform, + cluster=cluster, + eventlogs=eventlogs) assert pytest_wrapped_e.type == SystemExit - @pytest.mark.parametrize('tool_name', ['qualification', 'profiling']) - @pytest.mark.parametrize('csp,prop_path', all_cpu_cluster_props) - @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.VALUE_B, ArgValueCase.VALUE_A]) - def test_with_eventlogs(self, get_ut_data_dir, tool_name, csp, prop_path): - cluster_prop_file = f'{get_ut_data_dir}/{prop_path}' - tool_args = AbsToolUserArgModel.create_tool_args(tool_name, - cluster=f'{cluster_prop_file}', - eventlogs=f'{get_ut_data_dir}/eventlogs') - assert tool_args['runtimePlatform'] == CspEnv(csp) - # for qualification, passing the cluster properties should be enabled unless it is - # onprem platform that requires target_platform - if CspEnv(csp) != CspEnv.ONPREM: - self.validate_args_w_savings_enabled(tool_name, tool_args) + @staticmethod + def validate_tool_args(tool_name: str, tool_args: dict, cost_savings_enabled, expected_platform): + assert tool_args['runtimePlatform'] == CspEnv(expected_platform) + if cost_savings_enabled: + TestToolArgProcessor.validate_args_w_savings_enabled(tool_name, tool_args) else: - self.validate_args_w_savings_disabled(tool_name, tool_args) + TestToolArgProcessor.validate_args_w_savings_disabled(tool_name, tool_args) @pytest.mark.parametrize('tool_name', ['qualification', 'profiling']) - @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A]) - def test_no_cluster_props(self, get_ut_data_dir, tool_name): - # all eventlogs are stored on local path. There is no way to find which cluster - # we refer to. 
- tool_args = AbsToolUserArgModel.create_tool_args(tool_name, - eventlogs=f'{get_ut_data_dir}/eventlogs') - assert tool_args['runtimePlatform'] == CspEnv.ONPREM - # for qualification, cost savings should be disabled - self.validate_args_w_savings_disabled(tool_name, tool_args) + @pytest.mark.parametrize('csp', all_csps) + @register_triplet_test([ArgValueCase.VALUE_A, ArgValueCase.UNDEFINED, ArgValueCase.UNDEFINED]) + @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.UNDEFINED, ArgValueCase.UNDEFINED]) + def test_with_platform(self, tool_name, csp): + # should fail: platform provided; cannot run with platform only + self.create_tool_args_should_fail(tool_name, platform=csp) + + # should fail: platform not provided; cannot run with no args + self.create_tool_args_should_fail(tool_name=tool_name) @pytest.mark.parametrize('tool_name', ['qualification', 'profiling']) - @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A, ArgValueCase.VALUE_A]) - @register_triplet_test([ArgValueCase.VALUE_A, ArgValueCase.VALUE_A, ArgValueCase.IGNORE]) - def test_onprem_disallow_cluster_by_name(self, get_ut_data_dir, tool_name): - # onprem platform cannot run when the cluster is by_name - with pytest.raises(SystemExit) as pytest_exit_e: - AbsToolUserArgModel.create_tool_args(tool_name, - cluster='my_cluster', - eventlogs=f'{get_ut_data_dir}/eventlogs') - assert pytest_exit_e.type == SystemExit - with pytest.raises(SystemExit) as pytest_wrapped_e: - AbsToolUserArgModel.create_tool_args(tool_name, - platform='onprem', - cluster='my_cluster') - assert pytest_wrapped_e.type == SystemExit + @pytest.mark.parametrize('csp', all_csps) + @register_triplet_test([ArgValueCase.VALUE_A, ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A]) + @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A]) + def test_with_platform_with_eventlogs(self, get_ut_data_dir, tool_name, csp): + # should pass: platform and event logs are provided + tool_args = self.create_tool_args_should_pass(tool_name, + platform=csp, + eventlogs=f'{get_ut_data_dir}/eventlogs') + # for qualification, cost savings should be disabled because cluster is not provided + self.validate_tool_args(tool_name=tool_name, tool_args=tool_args, + cost_savings_enabled=False, + expected_platform=csp) + + # should pass: platform not provided; event logs are provided + tool_args = self.create_tool_args_should_pass(tool_name, + eventlogs=f'{get_ut_data_dir}/eventlogs') + # for qualification, cost savings should be disabled because cluster is not provided + self.validate_tool_args(tool_name=tool_name, tool_args=tool_args, + cost_savings_enabled=False, + expected_platform=CspEnv.ONPREM) @pytest.mark.parametrize('tool_name', ['qualification', 'profiling']) - @pytest.mark.parametrize('csp', csps) + @pytest.mark.parametrize('csp', all_csps) + @register_triplet_test([ArgValueCase.VALUE_A, ArgValueCase.VALUE_A, ArgValueCase.VALUE_A]) @register_triplet_test([ArgValueCase.VALUE_A, ArgValueCase.VALUE_A, ArgValueCase.UNDEFINED]) - def test_cluster_name_no_eventlogs(self, tool_name, csp): - # Missing eventlogs should be accepted for all CSPs (except onPrem) - # because the eventlogs can be retrieved from the cluster - tool_args = AbsToolUserArgModel.create_tool_args(tool_name, - platform=csp, - cluster='my_cluster') - assert tool_args['runtimePlatform'] == CspEnv(csp) - self.validate_args_w_savings_enabled(tool_name, tool_args) + def test_with_platform_with_cluster_name_with_eventlogs(self, get_ut_data_dir, tool_name, csp): + if 
CspEnv(csp) != CspEnv.ONPREM: + # should pass: platform, cluster name and eventlogs are provided + tool_args = self.create_tool_args_should_pass(tool_name, + platform=csp, + cluster='my_cluster', + eventlogs=f'{get_ut_data_dir}/eventlogs') + # for qualification, cost savings should be enabled because cluster is provided + self.validate_tool_args(tool_name=tool_name, tool_args=tool_args, + cost_savings_enabled=True, + expected_platform=csp) + + # should pass: event logs not provided; missing eventlogs should be accepted for + # all CSPs (except onPrem) because the event logs can be retrieved from the cluster + tool_args = self.create_tool_args_should_pass(tool_name, + platform=csp, + cluster='my_cluster') + # for qualification, cost savings should be enabled because cluster is provided + self.validate_tool_args(tool_name=tool_name, tool_args=tool_args, + cost_savings_enabled=True, + expected_platform=csp) + else: + # should fail: platform, cluster name and eventlogs are provided; onprem platform + # cannot run when the cluster is by name + self.create_tool_args_should_fail(tool_name, + platform=csp, + cluster='my_cluster', + eventlogs=f'{get_ut_data_dir}/eventlogs') + + # should fail: event logs not provided; onprem platform cannot run when the cluster is by name + self.create_tool_args_should_fail(tool_name, + platform=csp, + cluster='my_cluster') @pytest.mark.parametrize('tool_name', ['qualification', 'profiling']) - @pytest.mark.parametrize('csp,prop_path', csp_cpu_cluster_props) + @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A, ArgValueCase.VALUE_A]) + @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A, ArgValueCase.UNDEFINED]) + def test_with_cluster_name_with_eventlogs(self, get_ut_data_dir, tool_name): + # should fail: eventlogs provided; defaults platform to onprem, cannot run when the cluster is by name + self.create_tool_args_should_fail(tool_name, + cluster='my_cluster', + eventlogs=f'{get_ut_data_dir}/eventlogs') + + # should fail: eventlogs not provided; defaults platform to onprem, cannot run when the cluster is by name + self.create_tool_args_should_fail(tool_name, + cluster='my_cluster') + + @pytest.mark.parametrize('tool_name', ['qualification', 'profiling']) + @pytest.mark.parametrize('csp,prop_path', all_cpu_cluster_props) + @register_triplet_test([ArgValueCase.VALUE_A, ArgValueCase.VALUE_B, ArgValueCase.UNDEFINED]) @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.VALUE_B, ArgValueCase.UNDEFINED]) - def test_cluster_props_no_eventlogs(self, get_ut_data_dir, tool_name, csp, prop_path): - # Missing eventlogs should be accepted for all CSPs (except onPrem) - # because the eventlogs can be retrieved from the cluster + def test_with_platform_with_cluster_props(self, get_ut_data_dir, tool_name, csp, prop_path): cluster_prop_file = f'{get_ut_data_dir}/{prop_path}' - tool_args = AbsToolUserArgModel.create_tool_args(tool_name, - cluster=f'{cluster_prop_file}') - assert tool_args['runtimePlatform'] == CspEnv(csp) - self.validate_args_w_savings_enabled(tool_name, tool_args) + if CspEnv(csp) != CspEnv.ONPREM: + # should pass: platform provided; missing eventlogs should be accepted for all CSPs (except onPrem) + # because the eventlogs can be retrieved from the cluster properties + tool_args = self.create_tool_args_should_pass(tool_name, + platform=csp, + cluster=cluster_prop_file) + # for qualification, cost savings should be enabled because cluster is provided + self.validate_tool_args(tool_name=tool_name, tool_args=tool_args, + 
cost_savings_enabled=True, + expected_platform=csp) + + # should pass: platform not provided; missing eventlogs should be accepted for all CSPs (except onPrem) + # because the eventlogs can be retrieved from the cluster properties + tool_args = self.create_tool_args_should_pass(tool_name, + cluster=cluster_prop_file) + # for qualification, cost savings should be enabled because cluster is provided + self.validate_tool_args(tool_name=tool_name, tool_args=tool_args, + cost_savings_enabled=True, + expected_platform=csp) + else: + # should fail: onprem platform cannot retrieve eventlogs from cluster properties + self.create_tool_args_should_fail(tool_name, + platform=csp, + cluster=cluster_prop_file) + + # should fail: platform not provided; defaults platform to onprem, cannot retrieve eventlogs from + # cluster properties + self.create_tool_args_should_fail(tool_name, + cluster=cluster_prop_file) @pytest.mark.parametrize('tool_name', ['qualification', 'profiling']) - @register_triplet_test([ArgValueCase.IGNORE, ArgValueCase.UNDEFINED, ArgValueCase.UNDEFINED]) - def test_cluster_props_no_eventlogs_on_prem(self, capsys, tool_name): - # Missing eventlogs is not accepted for onPrem - with pytest.raises(SystemExit) as pytest_wrapped_e: - AbsToolUserArgModel.create_tool_args(tool_name, - platform='onprem') - assert pytest_wrapped_e.type == SystemExit - captured = capsys.readouterr() - # Verify there is no URL in error message except for the one from the documentation - assert 'https://' not in captured.err or 'docs.nvidia.com' in captured.err + @pytest.mark.parametrize('csp,prop_path', all_cpu_cluster_props) + @register_triplet_test([ArgValueCase.VALUE_A, ArgValueCase.VALUE_B, ArgValueCase.VALUE_A]) + @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.VALUE_B, ArgValueCase.VALUE_A]) + def test_with_platform_with_cluster_props_with_eventlogs(self, get_ut_data_dir, tool_name, csp, prop_path): + # should pass: platform, cluster properties and eventlogs are provided + cluster_prop_file = f'{get_ut_data_dir}/{prop_path}' + tool_args = self.create_tool_args_should_pass(tool_name, + platform=csp, + cluster=cluster_prop_file, + eventlogs=f'{get_ut_data_dir}/eventlogs') + # for qualification, cost savings should be enabled because cluster is provided (except for onprem) + self.validate_tool_args(tool_name=tool_name, tool_args=tool_args, + cost_savings_enabled=CspEnv(csp) != CspEnv.ONPREM, + expected_platform=csp) + + # should pass: platform not provided; cluster properties and eventlogs are provided + tool_args = self.create_tool_args_should_pass(tool_name, + cluster=cluster_prop_file, + eventlogs=f'{get_ut_data_dir}/eventlogs') + # for qualification, cost savings should be enabled because cluster is provided (except for onprem) + self.validate_tool_args(tool_name=tool_name, tool_args=tool_args, + cost_savings_enabled=CspEnv(csp) != CspEnv.ONPREM, + expected_platform=csp) + + @pytest.mark.parametrize('tool_name', ['profiling']) + @pytest.mark.parametrize('csp', all_csps) + @pytest.mark.parametrize('prop_path', [autotuner_prop_path]) + @register_triplet_test([ArgValueCase.VALUE_A, ArgValueCase.VALUE_C, ArgValueCase.UNDEFINED]) + @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.VALUE_C, ArgValueCase.UNDEFINED]) + def test_with_platform_with_autotuner(self, get_ut_data_dir, tool_name, csp, prop_path): + # should fail: platform provided; autotuner needs eventlogs + autotuner_prop_file = f'{get_ut_data_dir}/{prop_path}' + self.create_tool_args_should_fail(tool_name, + platform=csp, + 
cluster=autotuner_prop_file) + + # should fail: platform not provided; autotuner needs eventlogs + self.create_tool_args_should_fail(tool_name, + cluster=autotuner_prop_file) + + @pytest.mark.parametrize('tool_name', ['profiling']) + @pytest.mark.parametrize('csp', all_csps) + @pytest.mark.parametrize('prop_path', [autotuner_prop_path]) + @register_triplet_test([ArgValueCase.VALUE_A, ArgValueCase.VALUE_C, ArgValueCase.VALUE_A]) + @register_triplet_test([ArgValueCase.UNDEFINED, ArgValueCase.VALUE_C, ArgValueCase.VALUE_A]) + def test_with_platform_with_autotuner_with_eventlogs(self, get_ut_data_dir, tool_name, csp, prop_path): + # should pass: platform, autotuner properties and eventlogs are provided + autotuner_prop_file = f'{get_ut_data_dir}/{prop_path}' + tool_args = self.create_tool_args_should_pass(tool_name, + platform=csp, + cluster=autotuner_prop_file, + eventlogs=f'{get_ut_data_dir}/eventlogs') + # cost savings should be disabled for profiling + self.validate_tool_args(tool_name=tool_name, tool_args=tool_args, + cost_savings_enabled=False, + expected_platform=csp) + + # should pass: platform not provided; autotuner properties and eventlogs are provided + tool_args = self.create_tool_args_should_pass(tool_name, + cluster=autotuner_prop_file, + eventlogs=f'{get_ut_data_dir}/eventlogs') + # cost savings should be disabled for profiling + self.validate_tool_args(tool_name=tool_name, tool_args=tool_args, + cost_savings_enabled=False, + expected_platform=CspEnv.ONPREM) - @pytest.mark.skip(reason='Unit tests are not completed yet') def test_arg_cases_coverage(self): - args_keys = [ - [ArgValueCase.IGNORE, ArgValueCase.UNDEFINED, ArgValueCase.UNDEFINED], - [ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A, ArgValueCase.UNDEFINED], - [ArgValueCase.VALUE_A, ArgValueCase.VALUE_A, ArgValueCase.IGNORE], - [ArgValueCase.UNDEFINED, ArgValueCase.VALUE_B, ArgValueCase.IGNORE], - [ArgValueCase.UNDEFINED, ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A], - [ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A, ArgValueCase.VALUE_A], - [ArgValueCase.IGNORE, ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A] - ] - - for arg_key in args_keys: - assert str(arg_key) in triplet_test_registry + """ + This test ensures that above tests have covered all possible states of the `platform`, `cluster`, + and `event logs` fields. + + Possible States: + - platform:`undefined` or `actual value`. + - cluster: `undefined`, `cluster name`, `cluster property file` or `auto tuner file`. + - event logs: `undefined` or `actual value`. + """ + arg_platform_cases = [ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A] + arg_cluster_cases = [ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A, ArgValueCase.VALUE_B, ArgValueCase.VALUE_C] + arg_eventlogs_cases = [ArgValueCase.UNDEFINED, ArgValueCase.VALUE_A] + + all_args_keys = [str([p, c, e]) for p in arg_platform_cases for c in arg_cluster_cases for e in + arg_eventlogs_cases] + args_covered = set(triplet_test_registry.keys()) + args_not_covered = set(all_args_keys) - args_covered + + if args_not_covered: + # cases not covered + args_not_covered_str = '\n'.join(args_not_covered) + warnings.warn(f'Cases not covered:\n{args_not_covered_str}') + warnings.warn(f'Coverage of all argument cases: {len(args_covered)}/{len(all_args_keys)}')
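
The closing coverage check enumerates every combination of the three argument states (platform, cluster, event logs) and warns when a combination has no registered test. A self-contained sketch of the same idea, using a stand-in enum and a hand-written registry in place of the real `ArgValueCase` and `triplet_test_registry`:

import itertools
import warnings
from enum import Enum

class Case(Enum):
    UNDEFINED = 0
    VALUE_A = 1
    VALUE_B = 2
    VALUE_C = 3

# triplets that hypothetical tests have registered: (platform, cluster, eventlogs)
covered = {
    str([Case.UNDEFINED, Case.UNDEFINED, Case.UNDEFINED]),
    str([Case.VALUE_A, Case.VALUE_A, Case.VALUE_A]),
}

platform_cases = [Case.UNDEFINED, Case.VALUE_A]
cluster_cases = [Case.UNDEFINED, Case.VALUE_A, Case.VALUE_B, Case.VALUE_C]
eventlog_cases = [Case.UNDEFINED, Case.VALUE_A]

all_keys = {str(list(t))
            for t in itertools.product(platform_cases, cluster_cases, eventlog_cases)}
missing = all_keys - covered
if missing:
    warnings.warn('Cases not covered:\n' + '\n'.join(sorted(missing)))
warnings.warn(f'Coverage of all argument cases: {len(covered & all_keys)}/{len(all_keys)}')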