diff --git a/.github/workflows/license-header-check.yml b/.github/workflows/license-header-check.yml new file mode 100644 index 00000000000..e7f62399436 --- /dev/null +++ b/.github/workflows/license-header-check.yml @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# A workflow to check copyright/license header +name: license header check + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + license-header-check: + runs-on: ubuntu-latest + if: "!contains(github.event.pull_request.title, '[bot]')" + steps: + - name: Get checkout depth + run: | + echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV + + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: ${{ env.PR_FETCH_DEPTH }} + + - name: license-header-check + uses: NVIDIA/spark-rapids-common/license-header-check@main + with: + included_file_patterns: | + *.yml, + *.yaml, + *.sh, + *.xml, + *.properties, + *.scala, + *.py, + build/*, + *.cpp, + *Dockerfile*, + *Jenkinsfile*, + *.ini, + *.java, + *.fbs + excluded_file_patterns: | + *target/*, + thirdparty/*, + sql-plugin/src/main/java/com/nvidia/spark/rapids/format/* + \ No newline at end of file diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml index 0aca7bc3655..b58799c6110 100644 --- a/.github/workflows/mvn-verify-check.yml +++ b/.github/workflows/mvn-verify-check.yml @@ -53,7 +53,8 @@ jobs: id: generateCacheKey run: | set -x - cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')" + depsSHA1=$(. .github/workflows/mvn-verify-check/get-deps-sha1.sh 2.12) + cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}" echo "dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT - name: Cache local Maven repository id: cache @@ -165,7 +166,8 @@ jobs: id: generateCacheKey run: | set -x - cacheKey="${{ runner.os }}-maven-scala213-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')" + depsSHA1=$(. .github/workflows/mvn-verify-check/get-deps-sha1.sh 2.13) + cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}" echo "scala213dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT - name: Cache local Maven repository id: cache diff --git a/.github/workflows/mvn-verify-check/get-deps-sha1.sh b/.github/workflows/mvn-verify-check/get-deps-sha1.sh new file mode 100755 index 00000000000..aa7129bd3ef --- /dev/null +++ b/.github/workflows/mvn-verify-check/get-deps-sha1.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +scala_ver=${1:-"2.12"} +base_URL="https://oss.sonatype.org/service/local/artifact/maven/resolve" +project_jni="spark-rapids-jni" +project_private="rapids-4-spark-private_${scala_ver}" + +jni_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-jni.version -DforceStdout) +private_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-private.version -DforceStdout) + +# If a snapshot lookup fails, fall back to today's date so the cache key still rotates daily (assumed intent). +jni_sha1=$(curl -s -H "Accept: application/json" \ + "${base_URL}?r=snapshots&g=com.nvidia&a=${project_jni}&v=${jni_ver}&c=&e=jar&wt=json" \ + | jq -r .data.sha1) || jni_sha1=$(date +'%Y-%m-%d') +private_sha1=$(curl -s -H "Accept: application/json" \ + "${base_URL}?r=snapshots&g=com.nvidia&a=${project_private}&v=${private_ver}&c=&e=jar&wt=json" \ + | jq -r .data.sha1) || private_sha1=$(date +'%Y-%m-%d') + +sha1md5=$(echo -n "${jni_sha1}_${private_sha1}" | md5sum | awk '{print $1}') + +echo $sha1md5 diff --git a/.github/workflows/mvn-verify-check/populate-daily-cache.sh b/.github/workflows/mvn-verify-check/populate-daily-cache.sh index b93cd0b6b49..d4e9b07d1a7 100755 --- a/.github/workflows/mvn-verify-check/populate-daily-cache.sh +++ b/.github/workflows/mvn-verify-check/populate-daily-cache.sh @@ -14,22 +14,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -set -x -max_retry=3; delay=30; i=1 +set -e +set -o pipefail + if [[ $SCALA_VER == '2.12' ]]; then pom='pom.xml' elif [[ $SCALA_VER == '2.13' ]]; then pom='scala2.13/pom.xml' fi + +max_retry=3; delay=30; i=1 while true; do + buildvers=($(python build/get_buildvers.py no_snapshots $pom | tr -d ',')) && { - python build/get_buildvers.py "no_snapshots.buildvers" $pom | tr -d ',' | \ - xargs -n 1 -I {} bash -c \ - "mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver={} de.qaware.maven:go-offline-maven-plugin:resolve-dependencies" - + for buildver in "${buildvers[@]}"; do + mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver=$buildver de.qaware.maven:go-offline-maven-plugin:resolve-dependencies + done + } && { # compile base versions to cache scala compiler and compiler bridge - mvn $COMMON_MVN_FLAGS --file $pom \ - process-test-resources -pl sql-plugin-api -am + mvn $COMMON_MVN_FLAGS --file $pom process-test-resources -pl sql-plugin-api -am } && break || { if [[ $i -le $max_retry ]]; then echo "mvn command failed. Retry $i/$max_retry."; ((i++)); sleep $delay; ((delay=delay*2)) @@ -37,4 +40,4 @@ while true; do echo "mvn command failed. Exit 1"; exit 1 fi } -done \ No newline at end of file +done diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 83b30747abd..e4077ee5994 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -127,15 +127,15 @@ mvn -pl dist -PnoSnapshots package -DskipTests Verify that shim-specific classes are hidden from a conventional classloader.
```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl Error: class not found: com.nvidia.spark.rapids.shims.SparkShimImpl ``` However, its bytecode can be loaded if the class name is prefixed with `spark3XY`, even though that prefix is not contained in the declared package name: ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 -Warning: File dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +Warning: File dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` @@ -178,7 +178,7 @@ mvn package -pl dist -am -Dbuildver=340 -DallowConventionalDistJar=true Verify `com.nvidia.spark.rapids.shims.SparkShimImpl` is conventionally loadable: ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` diff --git a/README.md b/README.md index 94b73565190..61914e49df0 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ as a `provided` dependency.
com.nvidia rapids-4-spark_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT provided ``` diff --git a/aggregator/pom.xml b/aggregator/pom.xml index c7a6c220247..a47745776bc 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.12 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT aggregator diff --git a/api_validation/pom.xml b/api_validation/pom.xml index cddcf0c1ce1..f3339375806 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT api_validation diff --git a/build/get_buildvers.py b/build/get_buildvers.py index 5fe864670b5..263003ea99f 100644 --- a/build/get_buildvers.py +++ b/build/get_buildvers.py @@ -34,7 +34,7 @@ def _get_buildvers(buildvers, pom_file, logger=None): else: no_snapshots.append(release) excluded_shims = pom.find(".//pom:dyn.shim.excluded.releases", ns) - if excluded_shims is not None: + if excluded_shims is not None and excluded_shims.text: for removed_shim in [x.strip() for x in excluded_shims.text.split(",")]: if removed_shim in snapshots: snapshots.remove(removed_shim) diff --git a/datagen/README.md b/datagen/README.md index 022cc2f1eba..1c49c8db58e 100644 --- a/datagen/README.md +++ b/datagen/README.md @@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0. After this the jar should be at `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar` -for example a Spark 3.3.0 jar for the 24.12.0 release would be -`target/datagen_2.12-24.12.0-spark330.jar` +for example a Spark 3.3.0 jar for the 25.02.0 release would be +`target/datagen_2.12-25.02.0-spark330.jar` To get a spark shell with this you can run ```shell -spark-shell --jars target/datagen_2.12-24.12.0-spark330.jar +spark-shell --jars target/datagen_2.12-25.02.0-spark330.jar ``` After that you should be good to go. 
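For a quick smoke test of the jar once the shell is up, a minimal generation run could look like the sketch below. The entry points shown (`DBGen().addTable(name, schemaString, rowCount)` and `toDF`) are assumptions about the generator's API rather than something this excerpt documents, so defer to the examples elsewhere in this README if they differ:

```scala
// Inside the spark-shell launched above with the datagen jar on --jars.
import org.apache.spark.sql.tests.datagen.DBGen

val gen = DBGen()                                          // assumed entry point
val table = gen.addTable("data", "a string, b byte", 1000) // name, schema string, row count
table.toDF(spark).show(5)                                  // spark: the shell's SparkSession
```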
diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md index a728ad9a13e..8e692173f5f 100644 --- a/datagen/ScaleTest.md +++ b/datagen/ScaleTest.md @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME -./target/datagen_2.12-24.12.0-SNAPSHOT-spark332.jar \ +./target/datagen_2.12-25.02.0-SNAPSHOT-spark332.jar \ 1 \ 10 \ parquet \ diff --git a/datagen/pom.xml b/datagen/pom.xml index 9bdf897cfd7..fc2d8bc677c 100644 --- a/datagen/pom.xml +++ b/datagen/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.12 Data Generator Tools for generating large amounts of data - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT datagen diff --git a/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala b/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala index 2884968660d..3480718dbc7 100644 --- a/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala +++ b/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala @@ -24,6 +24,6 @@ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.internal.ExpressionUtils.{column, expression} object DataGenExprShims { - def columnToExpr(c: Column): Expression = c - def exprToColumn(e: Expression): Column = e + def columnToExpr(c: Column): Expression = expression(c) + def exprToColumn(e: Expression): Column = column(e) } diff --git a/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala b/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala index 0c212d6842a..5b26943a541 100644 --- a/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala +++ b/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala @@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionS import org.apache.spark.sql.execution.exchange.Exchange import org.apache.spark.sql.execution.metric.{SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter} import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, ShuffledBatchRDD} +import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase.createAdditionalExchangeMetrics import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.ThreadUtils @@ -71,22 +72,11 @@ case class GpuOptimizeWriteExchangeExec( private[sql] lazy val readMetrics = SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext) - override lazy val additionalMetrics: Map[String, GpuMetric] = Map( - "dataSize" -> createSizeMetric(ESSENTIAL_LEVEL, "data size"), - "dataReadSize" -> createSizeMetric(MODERATE_LEVEL, "data read size"), - "rapidsShuffleSerializationTime" -> - createNanoTimingMetric(DEBUG_LEVEL, "rs. serialization time"), - "rapidsShuffleDeserializationTime" -> - createNanoTimingMetric(DEBUG_LEVEL, "rs. deserialization time"), - "rapidsShuffleWriteTime" -> - createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. 
shuffle write time"), - "rapidsShuffleCombineTime" -> - createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle combine time"), - "rapidsShuffleWriteIoTime" -> - createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle write io time"), - "rapidsShuffleReadTime" -> - createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle read time") - ) ++ GpuMetric.wrap(readMetrics) ++ GpuMetric.wrap(writeMetrics) + override lazy val additionalMetrics : Map[String, GpuMetric] = { + createAdditionalExchangeMetrics(this) ++ + GpuMetric.wrap(readMetrics) ++ + GpuMetric.wrap(writeMetrics) + } override lazy val allMetrics: Map[String, GpuMetric] = { Map( @@ -98,7 +88,7 @@ case class GpuOptimizeWriteExchangeExec( } private lazy val serializer: Serializer = - new GpuColumnarBatchSerializer(gpuLongMetric("dataSize"), + new GpuColumnarBatchSerializer(allMetrics, child.output.map(_.dataType).toArray, RapidsConf.SHUFFLE_KUDO_SERIALIZER_ENABLED.get(child.conf)) diff --git a/delta-lake/delta-20x/pom.xml b/delta-lake/delta-20x/pom.xml index 1d41911c767..ba5443a7be2 100644 --- a/delta-lake/delta-20x/pom.xml +++ b/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-20x diff --git a/delta-lake/delta-21x/pom.xml b/delta-lake/delta-21x/pom.xml index 7514088ca3a..602686d79ab 100644 --- a/delta-lake/delta-21x/pom.xml +++ b/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-21x diff --git a/delta-lake/delta-22x/pom.xml b/delta-lake/delta-22x/pom.xml index 2ed0ea3b159..7867c573607 100644 --- a/delta-lake/delta-22x/pom.xml +++ b/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-22x diff --git a/delta-lake/delta-23x/pom.xml b/delta-lake/delta-23x/pom.xml index 1daebdd0efb..f537de0be36 100644 --- a/delta-lake/delta-23x/pom.xml +++ b/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-23x diff --git a/delta-lake/delta-24x/pom.xml b/delta-lake/delta-24x/pom.xml index 36ec92b70c0..443681b6cb3 100644 --- a/delta-lake/delta-24x/pom.xml +++ b/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT 
../delta-lake/delta-24x diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml index 95f54c6807c..4812c9d0097 100644 --- a/delta-lake/delta-spark330db/pom.xml +++ b/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.12 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark330db diff --git a/delta-lake/delta-spark332db/pom.xml b/delta-lake/delta-spark332db/pom.xml index 4d792ee1ca5..306553caa43 100644 --- a/delta-lake/delta-spark332db/pom.xml +++ b/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.12 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark332db diff --git a/delta-lake/delta-spark341db/pom.xml b/delta-lake/delta-spark341db/pom.xml index 4b229e2e5b5..c7b4a4e2738 100644 --- a/delta-lake/delta-spark341db/pom.xml +++ b/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/delta-lake/delta-spark350db143/pom.xml b/delta-lake/delta-spark350db143/pom.xml index 1bca394b67c..1e166244e1e 100644 --- a/delta-lake/delta-spark350db143/pom.xml +++ b/delta-lake/delta-spark350db143/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark350db143_2.12 RAPIDS Accelerator for Apache Spark Databricks 14.3 Delta Lake Support Databricks 14.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/delta-lake/delta-stub/pom.xml b/delta-lake/delta-stub/pom.xml index 6d0471f9f01..31b8e03b366 100644 --- a/delta-lake/delta-stub/pom.xml +++ b/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.12 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-stub diff --git a/dist/pom.xml b/dist/pom.xml index d628dd4ba3b..b34292a25cd 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.12 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md index 07346a5b850..5519e56b419 100644 --- a/docs/additional-functionality/advanced_configs.md +++ 
b/docs/additional-functionality/advanced_configs.md @@ -33,6 +33,7 @@ Name | Description | Default Value | Applicable at spark.rapids.filecache.blockPathRegexp|A regular expression to decide which paths will not be cached when the file cache is enabled. If a path is blocked by this regexp but is allowed by spark.rapids.filecache.allowPathRegexp, then the path is blocked.|None|Startup spark.rapids.filecache.checkStale|Controls whether the cache is checked for being out of date with respect to the input file. When enabled, the data that has been cached locally for a file will be invalidated if the file is updated after being cached. This feature is only necessary if an input file for a Spark application can be changed during the lifetime of the application. If an individual input file will not be overwritten during the Spark application then performance may be improved by setting this to false.|true|Startup spark.rapids.filecache.maxBytes|Controls the maximum amount of data that will be cached locally. If left unspecified, it will use half of the available disk space detected on startup for the configured Spark local disks.|None|Startup +spark.rapids.filecache.minimumFreeSpace.bytes|Specify the minimum amount of free space to keep on the Spark local disks. When the amount of free space on the Spark local disks drops below this value, cache data will be removed automatically to free disk space. A zero or negative value disables this feature. Note that if multiple Spark applications are running on the same node, or other applications are writing heavily to the same disks, the file cache may not drop cached data in time, which can cause full-disk errors. Increase this value in that case.|32212254720|Startup spark.rapids.filecache.useChecksums|Whether to write out and verify checksums for the cached local files.|false|Startup spark.rapids.gpu.resourceName|The name of the Spark resource that represents a GPU that you want the plugin to use if using custom resources with Spark.|gpu|Startup spark.rapids.memory.gpu.allocFraction|The fraction of available (free) GPU memory that should be allocated for pooled memory. 
This must be less than or equal to the maximum limit configured via spark.rapids.memory.gpu.maxAllocFraction, and greater than or equal to the minimum limit configured via spark.rapids.memory.gpu.minAllocFraction.|1.0|Startup @@ -385,6 +386,8 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.ToUnixTimestamp|`to_unix_timestamp`|Returns the UNIX timestamp of the given time|true|None| spark.rapids.sql.expression.TransformKeys|`transform_keys`|Transform keys in a map using a transform function|true|None| spark.rapids.sql.expression.TransformValues|`transform_values`|Transform values in a map using a transform function|true|None| +spark.rapids.sql.expression.TruncDate|`trunc`|Truncate the date to the unit specified by the given string format|true|None| +spark.rapids.sql.expression.TruncTimestamp|`date_trunc`|Truncate the timestamp to the unit specified by the given string format|true|None| spark.rapids.sql.expression.UnaryMinus|`negative`|Negate a numeric value|true|None| spark.rapids.sql.expression.UnaryPositive|`positive`|A numeric value with a + in front of it|true|None| spark.rapids.sql.expression.UnboundedFollowing$| |Special boundary for a window frame, indicating all rows following the current row|true|None| diff --git a/docs/configs.md b/docs/configs.md index 7f9544496c4..04aecb41f02 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports. On startup use: `--conf [conf key]=[conf value]`. For example: ``` -${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar \ +${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.concurrentGpuTasks=2 ``` diff --git a/docs/dev/idea-code-style-settings.xml b/docs/dev/idea-code-style-settings.xml index 165d30dde06..9f5c3c100dc 100644 --- a/docs/dev/idea-code-style-settings.xml +++ b/docs/dev/idea-code-style-settings.xml @@ -1,3 +1,19 @@ + +
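The two expressions added to the table above correspond to Spark's built-in `trunc` and `date_trunc` SQL functions. As a minimal sketch of exercising them from the `spark-shell` session shown above (assuming the plugin jar is on the classpath and the plugin is enabled):

```scala
// trunc(date, fmt) truncates a DATE down to the unit named by fmt, e.g. 'MONTH'.
// date_trunc(fmt, ts) truncates a TIMESTAMP down to the unit named by fmt, e.g. 'HOUR'.
spark.sql(
  """SELECT trunc(DATE'2025-02-14', 'MONTH') AS month_start,
    |       date_trunc('HOUR', TIMESTAMP'2025-02-14 10:39:45') AS hour_start
    |""".stripMargin).show()
```

When the plugin handles the query, these map to the GPU-backed `TruncDate`/`TruncTimestamp` implementations and should return the same results as CPU Spark.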