Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a stub to store Spark StageInfo #1525

Merged
merged 2 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@
<maven.artifact.version>3.9.0</maven.artifact.version>
<scala.javac.args>-Xlint:all,-serial,-path,-try</scala.javac.args>
<rapids.shade.package>com.nvidia.shaded.spark</rapids.shade.package>
<benchmarks.checkpoints>noOp</benchmarks.checkpoints>
<jsoup.version>1.16.1</jsoup.version>
<!-- properties used for DeltaLake -->
<delta10x.version>1.0.1</delta10x.version>
Expand Down
3 changes: 2 additions & 1 deletion core/src/main/resources/configs/build.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -23,3 +23,4 @@ build.spark.version=${spark.version}
build.hadoop.version=${hadoop.version}
build.java.version=${java.version}
build.scala.version=${scala.version}
build.benchmarks.checkpoints=${benchmarks.checkpoints}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.rapids.tool.benchmarks

import org.apache.spark.sql.rapids.tool.util.RuntimeUtil

/**
 * Developer-oriented checkpoint implementation that pulls runtime performance
 * metrics for Tools. Disabled by default; it is wired in only when the build
 * property `benchmarks.checkpoints` is set accordingly.
 */
class DevRuntimeCheckpoint extends RuntimeCheckpointTrait {
  /**
   * Inserts a memory marker identified by `label`: forces a GC cycle, grabs the
   * JVM heap statistics and prints them to stdout.
   * @param label the label for the memory marker
   */
  override def insertMemoryMarker(label: String): Unit = {
    val heapStats = RuntimeUtil.getJVMHeapInfo(runGC = true)
    val report = heapStats.mkString("\n")
    // scalastyle:off println
    println(s"Memory Marker: $label, $report")
    // scalastyle:on println
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.rapids.tool.benchmarks

import scala.annotation.nowarn

/**
 * No-op implementation of the checkpoint interface.
 * This is the default used in production and normal builds, so checkpoints add
 * zero overhead unless the dev implementation is explicitly enabled.
 */
class NoOpRuntimeCheckpoint extends RuntimeCheckpointTrait {
  /**
   * Intentionally does nothing.
   * @param label ignored (annotated with `@nowarn` to silence the unused-parameter warning)
   */
  override def insertMemoryMarker(@nowarn label: String): Unit = {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.rapids.tool.benchmarks

/**
 * API for inserting checkpoints in runtime.
 * This is used for debugging and benchmarking purposes. Implementations decide
 * what a checkpoint does: the no-op variant discards it, while the dev variant
 * reports JVM heap information.
 */
trait RuntimeCheckpointTrait {
  /**
   * Insert a memory marker with the given label.
   * @param label the label for the memory marker; implementations may use it to
   *              tag the reported metrics
   */
  def insertMemoryMarker(label: String): Unit
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.rapids.tool.benchmarks

import org.apache.spark.sql.rapids.tool.util.RapidsToolsConfUtil

/**
 * The global runtime injector that will be used to insert checkpoints during runtime.
 * This is used to pull some performance metrics related to Tools.
 */
object RuntimeInjector extends RuntimeCheckpointTrait {
  /**
   * Initializes the runtime injector based on the build property
   * `build.benchmarks.checkpoints` (populated from the Maven property
   * `benchmarks.checkpoints`).
   * @return the dev checkpoint implementation when the property value contains
   *         "dev"; the no-op implementation otherwise (including when the
   *         property is missing).
   */
  private def loadRuntimeCheckPoint(): RuntimeCheckpointTrait = {
    val buildProps = RapidsToolsConfUtil.loadBuildProperties
    // Properties.getProperty returns null when the key is absent; wrap in Option
    // to avoid an NPE and fall back to the no-op implementation in that case.
    val checkpointsProp = Option(buildProps.getProperty("build.benchmarks.checkpoints"))
    if (checkpointsProp.exists(_.contains("dev"))) {
      // The benchmark injection is enabled.
      new DevRuntimeCheckpoint
    } else { // loads the noOp implementation by default
      new NoOpRuntimeCheckpoint
    }
  }

  // Lazily initialized so the build properties are read only on first use.
  private lazy val runtimeCheckpoint: RuntimeCheckpointTrait = loadRuntimeCheckPoint()

  override def insertMemoryMarker(label: String): Unit = {
    runtimeCheckpoint.insertMemoryMarker(label)
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.deploy.history.{EventLogFileReader, EventLogFileWriter}
import org.apache.spark.internal.Logging
import org.apache.spark.rapids.tool.benchmarks.RuntimeInjector
import org.apache.spark.scheduler.{SparkListenerEvent, StageInfo}
import org.apache.spark.sql.execution.SparkPlanInfo
import org.apache.spark.sql.execution.ui.SparkPlanGraphNode
Expand Down Expand Up @@ -492,6 +493,7 @@ abstract class AppBase(
/**
 * Runs the full event-processing pipeline for this application: parses the
 * events, executes post-completion processing, then records a benchmark
 * checkpoint (a no-op unless dev checkpoints are enabled at build time).
 */
def processEvents(): Unit = {
  processEventsInternal()
  postCompletion()
  RuntimeInjector.insertMemoryMarker("Post processing events")
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,6 +20,7 @@ import com.nvidia.spark.rapids.tool.profiling.ProfileUtils

import org.apache.spark.scheduler.StageInfo
import org.apache.spark.sql.rapids.tool.annotation.{Calculated, Since, WallClock}
import org.apache.spark.sql.rapids.tool.util.stubs.StageInfoStub

/**
* StageModel is a class to store the information of a stage.
Expand All @@ -31,16 +32,16 @@ import org.apache.spark.sql.rapids.tool.annotation.{Calculated, Since, WallClock
@Since("24.02.3")
class StageModel private(sInfo: StageInfo) {

var stageInfo: StageInfo = _
var stageInfo: StageInfoStub = _
updateInfo(sInfo)

/**
* @param newStageInfo
* @return a new StageInfo object.
* TODO: https://github.com/NVIDIA/spark-rapids-tools/issues/1260
*/
private def initStageInfo(newStageInfo: StageInfo): StageInfo = {
newStageInfo
private def initStageInfo(newStageInfo: StageInfo): StageInfoStub = {
StageInfoStub.fromStageInfo(newStageInfo)
Copy link
Collaborator

@sayedbilalbari sayedbilalbari Feb 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@amahussein Currently we are reassigning the StageInfo object and updating the StageModel class with the incoming StageInfo object.
Now that we are using a stub and creating a new object, could we reuse the existing stub object when updating StageModel and just update its fields? Currently we allocate a new stub in all cases.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mmm, yeah it is possible we do only update to some fields that get changed.
The idea that:

  • This update should only happen a single time, when the stage is completed. This implies that it is not a very frequent event.
  • Updating some fields could lead to bugs. When we extend this object in the future, the dev will have to make sure that they are handling the fields correctly (which one could be updated vs which one are not).
  • allocating the new object in that case made the code look easier especially to maintain moving fwd.

}

@WallClock
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -103,4 +103,16 @@ object RuntimeUtil extends Logging {
}
}.toMap
}

/**
 * Captures a snapshot of the JVM heap sizes.
 * @param runGC when true (the default), requests a garbage-collection cycle first;
 *              note that `System.gc()` is only a hint to the JVM.
 * @return a map from metric name to its value rendered as a string:
 *         maximum, total and free heap sizes in bytes.
 */
def getJVMHeapInfo(runGC: Boolean = true): Map[String, String] = {
  if (runGC) {
    System.gc()
  }
  val rt = Runtime.getRuntime
  Seq(
    "jvm.heap.max" -> rt.maxMemory(),
    "jvm.heap.total" -> rt.totalMemory(),
    "jvm.heap.free" -> rt.freeMemory()
  ).map { case (key, bytes) => key -> bytes.toString }.toMap
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.rapids.tool.util.stubs

import org.apache.spark.scheduler.StageInfo
import org.apache.spark.sql.rapids.tool.annotation.ToolsReflection

/**
 * Version-agnostic holder for the `StageInfo` fields that Tools actually uses.
 * Copying into this stub avoids holding a reference to Spark's `StageInfo`,
 * whose constructor signature differs across Spark versions.
 */
@ToolsReflection("Common",
"StageInfo is a common class used in all versions of Spark but the constructor signature is" +
" different across versions.")
case class StageInfoStub(
stageId: Int,
attemptId: Int,
name: String,
numTasks: Int,
details: String,
/** When this stage was submitted from the DAGScheduler to a TaskScheduler. */
submissionTime: Option[Long] = None,
/** Time when the stage completed or when the stage was cancelled. */
completionTime: Option[Long] = None,
/** If the stage failed, the reason why. */
failureReason: Option[String] = None) {

/** Provided for API parity with `StageInfo.attemptNumber()`; returns `attemptId`. */
def attemptNumber(): Int = attemptId
}

object StageInfoStub {
  /**
   * Builds a [[StageInfoStub]] by copying the relevant fields from a Spark
   * `StageInfo` instance.
   * @param stageInfo the Spark stage info to copy from
   * @return a new stub holding the copied field values
   */
  def fromStageInfo(stageInfo: StageInfo): StageInfoStub = {
    StageInfoStub(
      stageId = stageInfo.stageId,
      attemptId = stageInfo.attemptNumber(),
      name = stageInfo.name,
      numTasks = stageInfo.numTasks,
      details = stageInfo.details,
      submissionTime = stageInfo.submissionTime,
      completionTime = stageInfo.completionTime,
      failureReason = stageInfo.failureReason)
  }
}