Commit

Allow spark dependency to be configured dynamically (#1326)
* Allow spark dependency to be configured dynamically

Signed-off-by: Ahmed Hussein <[email protected]>

Fixes #1316

Allow user-tools to pick the Spark dependencies based on a runtime
env_var. The value follows the same format as `buildver` in the Scala pom
file. Currently, 333 and 350 (the default) are supported.
If the user specifies an invalid value, a warning message is emitted and
the process then fails when running the Java cmd (a follow-up commit below
changes this to raise an error instead).

**Changes**

- Add a dependency key to the platform config-file
- A platform can define its own default dependency version using the
  `activeBuildVer` key
- Add a default `SPARK_DEP_VERSION` in the `__init__.py` (the final name
  after the renames below) to allow upgrades of the Spark release during
  official releases
- Read an env_var `RAPIDS_USER_TOOLS_SPARK_DEP_VERSION` (the final name
  after the renames below) to pick the correct dependency; see the usage
  sketch after this list
- Currently, only `333` and `350` are supported; the default is `350`
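As a quick illustration of the override (a minimal sketch; the CLI entry
point and its arguments are assumptions for illustration, not part of this
commit):

```python
import os
import subprocess

# Pick the Spark 3.3.3 dependency set instead of the default 3.5.0.
# The env_var takes precedence over the platform config and the packaged default.
env = dict(os.environ, RAPIDS_USER_TOOLS_SPARK_DEP_VERSION='333')

# Hypothetical invocation of the qualification tool; the exact CLI name
# and arguments are an assumption for illustration only.
subprocess.run(['spark_rapids_user_tools', 'onprem', 'qualification'],
               env=env, check=True)
```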

* remove value used to test

Signed-off-by: Ahmed Hussein <[email protected]>

* Change behavior to give precedence to the env_var

Signed-off-by: Ahmed Hussein <[email protected]>

* rename env_var and raise error for invalid dep version

Signed-off-by: Ahmed Hussein <[email protected]>

* Rename buildver to spark_dep

Signed-off-by: Ahmed Hussein <[email protected]>

---------

Signed-off-by: Ahmed Hussein <[email protected]>
amahussein authored Sep 6, 2024
1 parent 20d0c32 commit 4747d14
Showing 10 changed files with 296 additions and 244 deletions.
5 changes: 4 additions & 1 deletion user_tools/src/spark_rapids_pytools/__init__.py
@@ -14,7 +14,10 @@

"""init file of the spark_rapids_pytools package."""

from spark_rapids_pytools.build import get_version
from spark_rapids_pytools.build import get_version, get_spark_dep_version

VERSION = '24.08.2'
# defines the default runtime build version for the user tools environment
SPARK_DEP_VERSION = '350'
__version__ = get_version(VERSION)
__spark_dep_version__ = get_spark_dep_version()
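For completeness, a minimal sketch of reading the resolved values from an
installed package (assumes the package is importable; note that
`__spark_dep_version__` is resolved at import time by
`get_spark_dep_version()`, so the env_var must be set before the first
import):

```python
import os

# Must be set before importing the package, since __spark_dep_version__
# is computed once, at import time.
os.environ['RAPIDS_USER_TOOLS_SPARK_DEP_VERSION'] = '333'

import spark_rapids_pytools

print(spark_rapids_pytools.__version__)            # e.g. '24.08.2'
print(spark_rapids_pytools.__spark_dep_version__)  # '333' (env_var override)
```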
22 changes: 20 additions & 2 deletions user_tools/src/spark_rapids_pytools/build.py
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
import os


def get_version(main=None):
def get_version(main: str = None) -> str:
    if main is None:
        # pylint: disable=import-outside-toplevel
        from spark_rapids_pytools import VERSION as main
@@ -27,3 +27,21 @@ def get_version(main=None):
    if nightly == '1':
        suffix = '.dev' + datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    return main + suffix


def get_spark_dep_version(spark_dep_arg: str = None) -> str:
    """
    Get the runtime SPARK build_version for the user tools environment.
    Note that the env_var always has precedence over the input argument and the default value.
    :param spark_dep_arg: optional argument to specify the build version
    :return: the first value set in the following order:
        1- the env_var RAPIDS_USER_TOOLS_SPARK_DEP_VERSION
        2- the input spark_dep_arg
        3- the default value SPARK_DEP_VERSION
    """
    if spark_dep_arg is None:
        # pylint: disable=import-outside-toplevel
        from spark_rapids_pytools import SPARK_DEP_VERSION
        spark_dep_arg = SPARK_DEP_VERSION
    # the env_var should have precedence because this is how the user can override the default configs
    return os.environ.get('RAPIDS_USER_TOOLS_SPARK_DEP_VERSION', spark_dep_arg)
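As a usage sketch of the precedence order above (a standalone re-creation
for illustration; the real function is the one in this diff):

```python
import os

SPARK_DEP_VERSION = '350'  # packaged default, as defined in __init__.py

def resolve_spark_dep(spark_dep_arg: str = None) -> str:
    # Precedence: 1) env_var, 2) explicit argument (e.g. activeBuildVer
    # from the platform config), 3) packaged default.
    if spark_dep_arg is None:
        spark_dep_arg = SPARK_DEP_VERSION
    return os.environ.get('RAPIDS_USER_TOOLS_SPARK_DEP_VERSION', spark_dep_arg)

assert resolve_spark_dep() == '350'        # nothing set: packaged default
assert resolve_spark_dep('333') == '333'   # config value wins over default
os.environ['RAPIDS_USER_TOOLS_SPARK_DEP_VERSION'] = '333'
assert resolve_spark_dep('350') == '333'   # env_var overrides everything
```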
22 changes: 17 additions & 5 deletions user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
@@ -25,14 +25,15 @@
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from logging import Logger
from typing import Any, Callable, Dict, List
from typing import Any, Callable, Dict, List, Optional

import yaml

import spark_rapids_pytools
from spark_rapids_pytools import get_spark_dep_version
from spark_rapids_pytools.cloud_api.sp_types import get_platform, \
ClusterBase, DeployMode, NodeHWInfo
from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer
from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer, AbstractPropertiesContainer
from spark_rapids_pytools.common.sys_storage import FSUtil, FileVerifier
from spark_rapids_pytools.common.utilities import ToolLogging, Utils, ToolsSpinner
from spark_rapids_pytools.rapids.rapids_job import RapidsJobPropContainer
@@ -389,6 +390,19 @@ def _calculate_spark_settings(self, worker_info: NodeHWInfo) -> dict:
        }
        return res

    @classmethod
    def get_rapids_tools_dependencies(cls, deploy_mode: str, json_props: AbstractPropertiesContainer) -> Optional[list]:
        """
        Get the tools dependencies from the platform configuration.
        """
        # allow defining default buildver per platform
        buildver_from_conf = json_props.get_value_silent('dependencies', 'deployMode', deploy_mode, 'activeBuildVer')
        active_buildver = get_spark_dep_version(buildver_from_conf)
        depend_arr = json_props.get_value_silent('dependencies', 'deployMode', deploy_mode, active_buildver)
        if depend_arr is None:
            raise ValueError(f'Invalid SPARK dependency version [{active_buildver}]')
        return depend_arr


@dataclass
class RapidsJarTool(RapidsTool):
@@ -581,9 +595,7 @@ def cache_all_dependencies(dep_arr: List[dict]):

        # TODO: Verify the downloaded file by checking their MD5
        deploy_mode = DeployMode.tostring(self.ctxt.get_deploy_mode())
        depend_arr = self.ctxt.platform.configs.get_value_silent('dependencies',
                                                                 'deployMode',
                                                                 deploy_mode)
        depend_arr = self.get_rapids_tools_dependencies(deploy_mode, self.ctxt.platform.configs)
        if depend_arr:
            dep_list = cache_all_dependencies(depend_arr)
            if any(dep_item is None for dep_item in dep_list):
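To illustrate how the new lookup walks the reshaped platform configs
(diffed below), here is a self-contained sketch that substitutes a plain
dict for the `AbstractPropertiesContainer`; the dict contents and the
inline default are trimmed stand-ins:

```python
import os

# Trimmed stand-in for a platform config file after this change.
platform_conf = {
    'dependencies': {
        'deployMode': {
            'LOCAL': {
                # 'activeBuildVer': '333',  # optional per-platform default
                '350': [{'name': 'Apache Spark', 'uri': '<spark-3.5.0 archive>'}],
                '333': [{'name': 'Apache Spark', 'uri': '<spark-3.3.3 archive>'}],
            }
        }
    }
}

def get_dependencies(deploy_mode: str) -> list:
    local = platform_conf['dependencies']['deployMode'][deploy_mode]
    # Platform default (may be None); the env_var takes precedence,
    # and the packaged default ('350') is the last resort.
    buildver_from_conf = local.get('activeBuildVer')
    active_buildver = os.environ.get('RAPIDS_USER_TOOLS_SPARK_DEP_VERSION',
                                     buildver_from_conf or '350')
    depend_arr = local.get(active_buildver)
    if depend_arr is None:
        raise ValueError(f'Invalid SPARK dependency version [{active_buildver}]')
    return depend_arr

print(get_dependencies('LOCAL'))  # the 350 list unless the env_var overrides it
```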
@@ -1,58 +1,61 @@
{
"dependencies": {
"deployMode": {
"LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
"sha1": "a65839fbf1869f81a1632e09f415e586922e4f80",
"size": 962685
},
{
"name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
"sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea",
"size": 280645251
}
],
"SPARK333-LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
"sha1": "a65839fbf1869f81a1632e09f415e586922e4f80",
"size": 962685
},
{
"name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
"sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea",
"size": 280645251
}
]
"LOCAL": {
"//activeBuildVer": "Define this key in order to set the default buildVer for that platform",
"350": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
"sha1": "a65839fbf1869f81a1632e09f415e586922e4f80",
"size": 962685
},
{
"name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
"sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea",
"size": 280645251
}
],
"333": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
"sha1": "a65839fbf1869f81a1632e09f415e586922e4f80",
"size": 962685
},
{
"name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
"sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea",
"size": 280645251
}
]
}
}
},
"environment": {
@@ -1,42 +1,45 @@
{
"dependencies": {
"deployMode": {
"LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
"sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e",
"size": 574116
}
],
"SPARK333-LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
"sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e",
"size": 574116
}
]
"LOCAL": {
"//activeBuildVer": "Define this key in order to set the default buildVer for that platform",
"350": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
"sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e",
"size": 574116
}
],
"333": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
"sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e",
"size": 574116
}
]
}
}
},
"environment": {
@@ -370,4 +373,3 @@
"minWorkerNodes": 2
}
}

75 changes: 39 additions & 36 deletions user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
@@ -1,42 +1,45 @@
{
"dependencies": {
"deployMode": {
"LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "GCS Connector Hadoop3",
"uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar",
"type": "jar",
"md5": "2ee6ad7215304cf5da8e731afb36ad72",
"sha1": "3bea6d5e62663a2a5c03d8ca44dff4921aeb3170",
"size": 39359477
}
],
"SPARK333-LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "GCS Connector Hadoop3",
"uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar",
"type": "jar",
"md5": "41aea3add826dfbf3384a2c638148709",
"sha1": "06438f562692ff8fae5e8555eba2b9f95cb74f66",
"size": 38413466
}
]
"LOCAL": {
"//activeBuildVer": "Define this key in order to set the default buildVer for that platform",
"350": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "GCS Connector Hadoop3",
"uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar",
"type": "jar",
"md5": "2ee6ad7215304cf5da8e731afb36ad72",
"sha1": "3bea6d5e62663a2a5c03d8ca44dff4921aeb3170",
"size": 39359477
}
],
"333": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "GCS Connector Hadoop3",
"uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar",
"type": "jar",
"md5": "41aea3add826dfbf3384a2c638148709",
"sha1": "06438f562692ff8fae5e8555eba2b9f95cb74f66",
"size": 38413466
}
]
}
}
},
"environment": {