From 1da78d6d3dd90a04cececa41f48131d2a0f9afc7 Mon Sep 17 00:00:00 2001 From: Jacob Salway Date: Fri, 18 Oct 2024 03:25:20 +1100 Subject: [PATCH 01/14] Add Quick Start guide to README (#2259) Signed-off-by: Jacob Salway --- README.md | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c9493e2615..fc5386278f 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,33 @@ # Kubeflow Spark Operator -[![Integration Test](https://github.com/kubeflow/spark-operator/actions/workflows/integration.yaml/badge.svg)](https://github.com/kubeflow/spark-operator/actions/workflows/integration.yaml)[![Go Report Card](https://goreportcard.com/badge/github.com/kubeflow/spark-operator)](https://goreportcard.com/report/github.com/kubeflow/spark-operator) +[![Integration Test](https://github.com/kubeflow/spark-operator/actions/workflows/integration.yaml/badge.svg)](https://github.com/kubeflow/spark-operator/actions/workflows/integration.yaml) +[![Go Report Card](https://goreportcard.com/badge/github.com/kubeflow/spark-operator)](https://goreportcard.com/report/github.com/kubeflow/spark-operator) ## What is Spark Operator? The Kubernetes Operator for Apache Spark aims to make specifying and running [Spark](https://github.com/apache/spark) applications as easy and idiomatic as running other workloads on Kubernetes. It uses [Kubernetes custom resources](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/) for specifying, running, and surfacing status of Spark applications. +## Quick Start + +For a more detailed guide, please refer to the [Getting Started guide](https://www.kubeflow.org/docs/components/spark-operator/getting-started/). + +```bash +# Add the Helm repository +helm repo add spark-operator https://kubeflow.github.io/spark-operator +helm repo update + +# Install the operator into the spark-operator namespace and wait for deployments to be ready +helm install spark-operator spark-operator/spark-operator \ + --namespace spark-operator --create-namespace --wait + +# Create an example application in the default namespace +kubectl apply -f https://raw.githubusercontent.com/kubeflow/spark-operator/refs/heads/master/examples/spark-pi.yaml + +# Get the status of the application +kubectl get sparkapp spark-pi +``` + ## Overview For a complete reference of the custom resource definitions, please refer to the [API Definition](docs/api-docs.md). For details on its design, please refer to the [Architecture](https://www.kubeflow.org/docs/components/spark-operator/overview/#architecture). It requires Spark 2.3 and above that supports Kubernetes as a native scheduler backend. From ff2b26b6693cf5fad7b1a3e6ac583cbd3bb7ac30 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 17 Oct 2024 16:54:21 +0000 Subject: [PATCH 02/14] Bump github.com/aws/aws-sdk-go-v2/service/s3 from 1.63.3 to 1.65.3 (#2249) Bumps [github.com/aws/aws-sdk-go-v2/service/s3](https://github.com/aws/aws-sdk-go-v2) from 1.63.3 to 1.65.3. - [Release notes](https://github.com/aws/aws-sdk-go-v2/releases) - [Commits](https://github.com/aws/aws-sdk-go-v2/compare/service/s3/v1.63.3...service/s3/v1.65.3) --- updated-dependencies: - dependency-name: github.com/aws/aws-sdk-go-v2/service/s3 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 10 +++++----- go.sum | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/go.mod b/go.mod index cf39252322..c3133ce81e 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( cloud.google.com/go/storage v1.44.0 github.com/aws/aws-sdk-go-v2 v1.32.2 github.com/aws/aws-sdk-go-v2/config v1.27.43 - github.com/aws/aws-sdk-go-v2/service/s3 v1.63.3 + github.com/aws/aws-sdk-go-v2/service/s3 v1.65.3 github.com/golang/glog v1.2.2 github.com/google/uuid v1.6.0 github.com/olekukonko/tablewriter v0.0.5 @@ -56,18 +56,18 @@ require ( github.com/Microsoft/hcsshim v0.12.4 // indirect github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/aws/aws-sdk-go v1.55.5 // indirect - github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.5 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.6 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.17.41 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.17 // indirect github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.21 // indirect github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 // indirect - github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.18 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.21 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.0 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.20 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.2 // indirect github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.2 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.18 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.2 // indirect github.com/aws/aws-sdk-go-v2/service/sso v1.24.2 // indirect github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.2 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.32.2 // indirect diff --git a/go.sum b/go.sum index 0bc23ad614..ef603e94a3 100644 --- a/go.sum +++ b/go.sum @@ -68,8 +68,8 @@ github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= github.com/aws/aws-sdk-go-v2 v1.32.2 h1:AkNLZEyYMLnx/Q/mSKkcMqwNFXMAvFto9bNsHqcTduI= github.com/aws/aws-sdk-go-v2 v1.32.2/go.mod h1:2SK5n0a2karNTv5tbP1SjsX0uhttou00v/HpXKM1ZUo= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.5 h1:xDAuZTn4IMm8o1LnBZvmrL8JA1io4o3YWNXgohbf20g= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.5/go.mod h1:wYSv6iDS621sEFLfKvpPE2ugjTuGlAG7iROg0hLOkfc= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.6 h1:pT3hpW0cOHRJx8Y0DfJUEQuqPild8jRGmSFmBgvydr0= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.6/go.mod h1:j/I2++U0xX+cr44QjHay4Cvxj6FUbnxrgmqN3H1jTZA= github.com/aws/aws-sdk-go-v2/config v1.27.43 h1:p33fDDihFC390dhhuv8nOmX419wjOSDQRb+USt20RrU= github.com/aws/aws-sdk-go-v2/config v1.27.43/go.mod h1:pYhbtvg1siOOg8h5an77rXle9tVG8T+BWLWAo7cOukc= github.com/aws/aws-sdk-go-v2/credentials v1.17.41 h1:7gXo+Axmp+R4Z+AK8YFQO0ZV3L0gizGINCOWxSLY9W8= @@ -84,18 +84,18 @@ github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21 h1:6jZVETqmYCadGFvrYE github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21/go.mod h1:1SR0GbLlnN3QUmYaflZNiH1ql+1qrSiB2vwcJ+4UM60= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 h1:VaRN3TlFdd6KxX1x3ILT5ynH6HvKgqdiXoTxAF4HQcQ= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1/go.mod h1:FbtygfRFze9usAadmnGJNc8KsP346kEe+y2/oyhGAGc= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.18 h1:OWYvKL53l1rbsUmW7bQyJVsYU/Ii3bbAAQIIFNbM0Tk= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.18/go.mod h1:CUx0G1v3wG6l01tUB+j7Y8kclA8NSqK4ef0YG79a4cg= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.21 h1:7edmS3VOBDhK00b/MwGtGglCm7hhwNYnjJs/PgFdMQE= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.21/go.mod h1:Q9o5h4HoIWG8XfzxqiuK/CGUbepCJ8uTlaE3bAbxytQ= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.0 h1:TToQNkvGguu209puTojY/ozlqy2d/SFNcoLIqTFi42g= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.0/go.mod h1:0jp+ltwkf+SwG2fm/PKo8t4y8pJSgOCO4D8Lz3k0aHQ= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.20 h1:rTWjG6AvWekO2B1LHeM3ktU7MqyX9rzWQ7hgzneZW7E= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.20/go.mod h1:RGW2DDpVc8hu6Y6yG8G5CHVmVOAn1oV8rNKOHRJyswg= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.2 h1:4FMHqLfk0efmTqhXVRL5xYRqlEBNBiRI7N6w4jsEdd4= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.2/go.mod h1:LWoqeWlK9OZeJxsROW2RqrSPvQHKTpp69r/iDjwsSaw= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.2 h1:s7NA1SOw8q/5c0wr8477yOPp0z+uBaXBnLE0XYb0POA= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.2/go.mod h1:fnjjWyAW/Pj5HYOxl9LJqWtEwS7W2qgcRLWP+uWbss0= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.18 h1:eb+tFOIl9ZsUe2259/BKPeniKuz4/02zZFH/i4Nf8Rg= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.18/go.mod h1:GVCC2IJNJTmdlyEsSmofEy7EfJncP7DNnXDzRjJ5Keg= -github.com/aws/aws-sdk-go-v2/service/s3 v1.63.3 h1:3zt8qqznMuAZWDTDpcwv9Xr11M/lVj2FsRR7oYBt0OA= -github.com/aws/aws-sdk-go-v2/service/s3 v1.63.3/go.mod h1:NLTqRLe3pUNu3nTEHI6XlHLKYmc8fbHUdMxAB6+s41Q= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.2 h1:t7iUP9+4wdc5lt3E41huP+GvQZJD38WLsgVp4iOtAjg= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.2/go.mod h1:/niFCtmuQNxqx9v8WAPq5qh7EH25U4BF6tjoyq9bObM= +github.com/aws/aws-sdk-go-v2/service/s3 v1.65.3 h1:xxHGZ+wUgZNACQmxtdvP5tgzfsxGS3vPpTP5Hy3iToE= +github.com/aws/aws-sdk-go-v2/service/s3 v1.65.3/go.mod h1:cB6oAuus7YXRZhWCc1wIwPywwZ1XwweNp2TVAEGYeB8= github.com/aws/aws-sdk-go-v2/service/sso v1.24.2 h1:bSYXVyUzoTHoKalBmwaZxs97HU9DWWI3ehHSAMa7xOk= github.com/aws/aws-sdk-go-v2/service/sso v1.24.2/go.mod h1:skMqY7JElusiOUjMJMOv1jJsP7YUg7DrhgqZZWuzu1U= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.2 h1:AhmO1fHINP9vFYUE0LHzCWg/LfUWUF+zFPEcY9QXb7o= From 1491550391f224224b621fa64fc2f3e05a4cbe2a Mon Sep 17 00:00:00 2001 From: Roberto Devesa <15369573+Roberdvs@users.noreply.github.com> Date: Fri, 18 Oct 2024 02:54:20 +0200 Subject: [PATCH 03/14] Make sure enable-ui-service flag is set to false when controller.uiService.enable is set to false (#2261) Signed-off-by: Roberto Devesa <15369573+Roberdvs@users.noreply.github.com> --- .../spark-operator-chart/templates/controller/deployment.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/charts/spark-operator-chart/templates/controller/deployment.yaml b/charts/spark-operator-chart/templates/controller/deployment.yaml index 8c292d7ad1..1efd126568 100644 --- a/charts/spark-operator-chart/templates/controller/deployment.yaml +++ b/charts/spark-operator-chart/templates/controller/deployment.yaml @@ -64,9 +64,7 @@ spec: {{- end }} {{- end }} - --controller-threads={{ .Values.controller.workers }} - {{- with .Values.controller.uiService.enable }} - - --enable-ui-service=true - {{- end }} + - --enable-ui-service={{ .Values.controller.uiService.enable }} {{- if .Values.controller.uiIngress.enable }} {{- with .Values.controller.uiIngress.urlFormat }} - --ingress-url-format={{ . }} From 5ff8dcf3508859bbf6ceed83758f0eb685c8c9be Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 18 Oct 2024 12:06:20 +0100 Subject: [PATCH 04/14] `omitempty` corrections (#2255) * Still working on tests Signed-off-by: Thomas Newton * Maybe progress Signed-off-by: Thomas Newton * First working validation Signed-off-by: Thomas Newton * Lots of cleanup needed but it actually reproduced Signed-off-by: Thomas Newton * Working but ugly get schema from CRD Signed-off-by: Thomas Newton * Satisfactory test Signed-off-by: Thomas Newton * Add missing omitempty for optional values Signed-off-by: Thomas Newton * Remove omitempty on required fields Signed-off-by: Thomas Newton * Run update-crd Signed-off-by: Thomas Newton * Remove temp schema config Signed-off-by: Thomas Newton * Tidy Signed-off-by: Thomas Newton * go import Signed-off-by: Thomas Newton * Cover more test cases Signed-off-by: Thomas Newton * Add tests that spec and metadata are required Signed-off-by: Thomas Newton * Add tests against error content Signed-off-by: Thomas Newton * `go mod tidy` Signed-off-by: Thomas Newton * Fix lint Signed-off-by: Thomas Newton * Remove test - hopefully we can add a better test as a follow up Signed-off-by: Thomas Newton * Make `mainApplicationFile` required Signed-off-by: Thomas Newton * Regenerated api-docs Signed-off-by: Thomas Newton * Add `MainApplicationFile` in tests Signed-off-by: Thomas Newton --------- Signed-off-by: Thomas Newton --- api/v1beta2/scheduledsparkapplication_types.go | 4 ++-- api/v1beta2/sparkapplication_types.go | 13 ++++++------- ...perator.k8s.io_scheduledsparkapplications.yaml | 4 ++++ .../sparkoperator.k8s.io_sparkapplications.yaml | 4 ++++ ...perator.k8s.io_scheduledsparkapplications.yaml | 4 ++++ .../sparkoperator.k8s.io_sparkapplications.yaml | 4 ++++ docs/api-docs.md | 2 -- .../scheduledsparkapplication/controller_test.go | 2 ++ .../sparkapplication/controller_test.go | 15 +++++++++++++++ 9 files changed, 41 insertions(+), 11 deletions(-) diff --git a/api/v1beta2/scheduledsparkapplication_types.go b/api/v1beta2/scheduledsparkapplication_types.go index 486a890a1a..8530d10ee2 100644 --- a/api/v1beta2/scheduledsparkapplication_types.go +++ b/api/v1beta2/scheduledsparkapplication_types.go @@ -88,9 +88,9 @@ type ScheduledSparkApplicationStatus struct { // ScheduledSparkApplication is the Schema for the scheduledsparkapplications API. type ScheduledSparkApplication struct { metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata,omitempty"` + metav1.ObjectMeta `json:"metadata"` - Spec ScheduledSparkApplicationSpec `json:"spec,omitempty"` + Spec ScheduledSparkApplicationSpec `json:"spec"` Status ScheduledSparkApplicationStatus `json:"status,omitempty"` } diff --git a/api/v1beta2/sparkapplication_types.go b/api/v1beta2/sparkapplication_types.go index 9f3f2fb2fb..71a810cbf2 100644 --- a/api/v1beta2/sparkapplication_types.go +++ b/api/v1beta2/sparkapplication_types.go @@ -62,7 +62,6 @@ type SparkApplicationSpec struct { // +optional MainClass *string `json:"mainClass,omitempty"` // MainFile is the path to a bundled JAR, Python, or R file of the application. - // +optional MainApplicationFile *string `json:"mainApplicationFile"` // Arguments is a list of arguments to be passed to the application. // +optional @@ -187,9 +186,9 @@ type SparkApplicationStatus struct { // SparkApplication is the Schema for the sparkapplications API type SparkApplication struct { metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata,omitempty"` + metav1.ObjectMeta `json:"metadata"` - Spec SparkApplicationSpec `json:"spec,omitempty"` + Spec SparkApplicationSpec `json:"spec"` Status SparkApplicationStatus `json:"status,omitempty"` } @@ -280,15 +279,15 @@ type SparkUIConfiguration struct { // ServicePort allows configuring the port at service level that might be different from the targetPort. // TargetPort should be the same as the one defined in spark.ui.port // +optional - ServicePort *int32 `json:"servicePort"` + ServicePort *int32 `json:"servicePort,omitempty"` // ServicePortName allows configuring the name of the service port. // This may be useful for sidecar proxies like Envoy injected by Istio which require specific ports names to treat traffic as proper HTTP. // Defaults to spark-driver-ui-port. // +optional - ServicePortName *string `json:"servicePortName"` + ServicePortName *string `json:"servicePortName,omitempty"` // ServiceType allows configuring the type of the service. Defaults to ClusterIP. // +optional - ServiceType *corev1.ServiceType `json:"serviceType"` + ServiceType *corev1.ServiceType `json:"serviceType,omitempty"` // ServiceAnnotations is a map of key,value pairs of annotations that might be added to the service object. // +optional ServiceAnnotations map[string]string `json:"serviceAnnotations,omitempty"` @@ -312,7 +311,7 @@ type DriverIngressConfiguration struct { ServicePortName *string `json:"servicePortName"` // ServiceType allows configuring the type of the service. Defaults to ClusterIP. // +optional - ServiceType *corev1.ServiceType `json:"serviceType"` + ServiceType *corev1.ServiceType `json:"serviceType,omitempty"` // ServiceAnnotations is a map of key,value pairs of annotations that might be added to the service object. // +optional ServiceAnnotations map[string]string `json:"serviceAnnotations,omitempty"` diff --git a/charts/spark-operator-chart/crds/sparkoperator.k8s.io_scheduledsparkapplications.yaml b/charts/spark-operator-chart/crds/sparkoperator.k8s.io_scheduledsparkapplications.yaml index 963e8b9078..0ce3ee05e1 100644 --- a/charts/spark-operator-chart/crds/sparkoperator.k8s.io_scheduledsparkapplications.yaml +++ b/charts/spark-operator-chart/crds/sparkoperator.k8s.io_scheduledsparkapplications.yaml @@ -11573,6 +11573,7 @@ spec: required: - driver - executor + - mainApplicationFile - sparkVersion - type type: object @@ -11621,6 +11622,9 @@ spec: application. type: string type: object + required: + - metadata + - spec type: object served: true storage: true diff --git a/charts/spark-operator-chart/crds/sparkoperator.k8s.io_sparkapplications.yaml b/charts/spark-operator-chart/crds/sparkoperator.k8s.io_sparkapplications.yaml index bd23fcd022..70034a0f9c 100644 --- a/charts/spark-operator-chart/crds/sparkoperator.k8s.io_sparkapplications.yaml +++ b/charts/spark-operator-chart/crds/sparkoperator.k8s.io_sparkapplications.yaml @@ -11475,6 +11475,7 @@ spec: required: - driver - executor + - mainApplicationFile - sparkVersion - type type: object @@ -11555,6 +11556,9 @@ spec: required: - driverInfo type: object + required: + - metadata + - spec type: object served: true storage: true diff --git a/config/crd/bases/sparkoperator.k8s.io_scheduledsparkapplications.yaml b/config/crd/bases/sparkoperator.k8s.io_scheduledsparkapplications.yaml index 963e8b9078..0ce3ee05e1 100644 --- a/config/crd/bases/sparkoperator.k8s.io_scheduledsparkapplications.yaml +++ b/config/crd/bases/sparkoperator.k8s.io_scheduledsparkapplications.yaml @@ -11573,6 +11573,7 @@ spec: required: - driver - executor + - mainApplicationFile - sparkVersion - type type: object @@ -11621,6 +11622,9 @@ spec: application. type: string type: object + required: + - metadata + - spec type: object served: true storage: true diff --git a/config/crd/bases/sparkoperator.k8s.io_sparkapplications.yaml b/config/crd/bases/sparkoperator.k8s.io_sparkapplications.yaml index bd23fcd022..70034a0f9c 100644 --- a/config/crd/bases/sparkoperator.k8s.io_sparkapplications.yaml +++ b/config/crd/bases/sparkoperator.k8s.io_sparkapplications.yaml @@ -11475,6 +11475,7 @@ spec: required: - driver - executor + - mainApplicationFile - sparkVersion - type type: object @@ -11555,6 +11556,9 @@ spec: required: - driverInfo type: object + required: + - metadata + - spec type: object served: true storage: true diff --git a/docs/api-docs.md b/docs/api-docs.md index d3fe46904d..531c53f9ab 100644 --- a/docs/api-docs.md +++ b/docs/api-docs.md @@ -1913,7 +1913,6 @@ string -(Optional)

MainFile is the path to a bundled JAR, Python, or R file of the application.

@@ -2355,7 +2354,6 @@ string -(Optional)

MainFile is the path to a bundled JAR, Python, or R file of the application.

diff --git a/internal/controller/scheduledsparkapplication/controller_test.go b/internal/controller/scheduledsparkapplication/controller_test.go index 6cb1f525a9..e2d2802826 100644 --- a/internal/controller/scheduledsparkapplication/controller_test.go +++ b/internal/controller/scheduledsparkapplication/controller_test.go @@ -29,6 +29,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/kubeflow/spark-operator/api/v1beta2" + "github.com/kubeflow/spark-operator/pkg/util" ) var _ = Describe("ScheduledSparkApplication Controller", func() { @@ -61,6 +62,7 @@ var _ = Describe("ScheduledSparkApplication Controller", func() { RestartPolicy: v1beta2.RestartPolicy{ Type: v1beta2.RestartPolicyNever, }, + MainApplicationFile: util.StringPtr("local:///dummy.jar"), }, }, // TODO(user): Specify other spec details if needed. diff --git a/internal/controller/sparkapplication/controller_test.go b/internal/controller/sparkapplication/controller_test.go index 30d1fd0071..66afe0ca4b 100644 --- a/internal/controller/sparkapplication/controller_test.go +++ b/internal/controller/sparkapplication/controller_test.go @@ -95,6 +95,9 @@ var _ = Describe("SparkApplication Controller", func() { common.LabelSparkAppName: app.Name, }, }, + Spec: v1beta2.SparkApplicationSpec{ + MainApplicationFile: util.StringPtr("local:///dummy.jar"), + }, } v1beta2.SetSparkApplicationDefaults(app) Expect(k8sClient.Create(ctx, app)).To(Succeed()) @@ -146,6 +149,9 @@ var _ = Describe("SparkApplication Controller", func() { Name: appName, Namespace: appNamespace, }, + Spec: v1beta2.SparkApplicationSpec{ + MainApplicationFile: util.StringPtr("local:///dummy.jar"), + }, } v1beta2.SetSparkApplicationDefaults(app) Expect(k8sClient.Create(ctx, app)).To(Succeed()) @@ -202,6 +208,9 @@ var _ = Describe("SparkApplication Controller", func() { Name: appName, Namespace: appNamespace, }, + Spec: v1beta2.SparkApplicationSpec{ + MainApplicationFile: util.StringPtr("local:///dummy.jar"), + }, } v1beta2.SetSparkApplicationDefaults(app) Expect(k8sClient.Create(ctx, app)).To(Succeed()) @@ -253,6 +262,9 @@ var _ = Describe("SparkApplication Controller", func() { Name: appName, Namespace: appNamespace, }, + Spec: v1beta2.SparkApplicationSpec{ + MainApplicationFile: util.StringPtr("local:///dummy.jar"), + }, } v1beta2.SetSparkApplicationDefaults(app) Expect(k8sClient.Create(ctx, app)).To(Succeed()) @@ -309,6 +321,9 @@ var _ = Describe("SparkApplication Controller", func() { Name: appName, Namespace: appNamespace, }, + Spec: v1beta2.SparkApplicationSpec{ + MainApplicationFile: util.StringPtr("local:///dummy.jar"), + }, } v1beta2.SetSparkApplicationDefaults(app) Expect(k8sClient.Create(ctx, app)).To(Succeed()) From 1509b341d4e6210f58518339e6b675c9cae640b5 Mon Sep 17 00:00:00 2001 From: Jacob Salway Date: Fri, 18 Oct 2024 22:33:20 +1100 Subject: [PATCH 05/14] Add release badge to README (#2263) Signed-off-by: Jacob Salway --- README.md | 1 + codecov.yaml | 10 ---------- 2 files changed, 1 insertion(+), 10 deletions(-) delete mode 100644 codecov.yaml diff --git a/README.md b/README.md index fc5386278f..84cc8dd2de 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![Integration Test](https://github.com/kubeflow/spark-operator/actions/workflows/integration.yaml/badge.svg)](https://github.com/kubeflow/spark-operator/actions/workflows/integration.yaml) [![Go Report Card](https://goreportcard.com/badge/github.com/kubeflow/spark-operator)](https://goreportcard.com/report/github.com/kubeflow/spark-operator) +[![GitHub release](https://img.shields.io/github/v/release/kubeflow/spark-operator)](https://github.com/kubeflow/spark-operator/releases) ## What is Spark Operator? diff --git a/codecov.yaml b/codecov.yaml deleted file mode 100644 index 4e7d7af670..0000000000 --- a/codecov.yaml +++ /dev/null @@ -1,10 +0,0 @@ -coverage: - status: - project: - default: - threshold: 0.1% - patch: - default: - target: 60% -ignore: - - "**/*_generated.*" From d0f62854b37a51715c280205d8a86d5a20b0fd64 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Oct 2024 02:04:28 +0000 Subject: [PATCH 06/14] Bump helm.sh/helm/v3 from 3.16.1 to 3.16.2 (#2275) Bumps [helm.sh/helm/v3](https://github.com/helm/helm) from 3.16.1 to 3.16.2. - [Release notes](https://github.com/helm/helm/releases) - [Commits](https://github.com/helm/helm/compare/v3.16.1...v3.16.2) --- updated-dependencies: - dependency-name: helm.sh/helm/v3 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 16 ++++++++-------- go.sum | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index c3133ce81e..0c11ff3266 100644 --- a/go.mod +++ b/go.mod @@ -21,10 +21,10 @@ require ( gocloud.dev v0.40.0 golang.org/x/net v0.30.0 golang.org/x/time v0.7.0 - helm.sh/helm/v3 v3.16.1 - k8s.io/api v0.31.0 - k8s.io/apiextensions-apiserver v0.31.0 - k8s.io/apimachinery v0.31.0 + helm.sh/helm/v3 v3.16.2 + k8s.io/api v0.31.1 + k8s.io/apiextensions-apiserver v0.31.1 + k8s.io/apimachinery v0.31.1 k8s.io/client-go v1.5.2 k8s.io/kubernetes v1.30.2 k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 @@ -218,12 +218,12 @@ require ( gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiserver v0.31.0 // indirect - k8s.io/cli-runtime v0.31.0 // indirect - k8s.io/component-base v0.31.0 // indirect + k8s.io/apiserver v0.31.1 // indirect + k8s.io/cli-runtime v0.31.1 // indirect + k8s.io/component-base v0.31.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20240709000822-3c01b740850f // indirect - k8s.io/kubectl v0.31.0 // indirect + k8s.io/kubectl v0.31.1 // indirect oras.land/oras-go v1.2.5 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/kustomize/api v0.17.2 // indirect diff --git a/go.sum b/go.sum index ef603e94a3..5d4e760920 100644 --- a/go.sum +++ b/go.sum @@ -705,8 +705,8 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= gotest.tools/v3 v3.4.0/go.mod h1:CtbdzLSsqVhDgMtKsx03ird5YTGB3ar27v0u/yKBW5g= -helm.sh/helm/v3 v3.16.1 h1:cER6tI/8PgUAsaJaQCVBUg3VI9KN4oVaZJgY60RIc0c= -helm.sh/helm/v3 v3.16.1/go.mod h1:r+xBHHP20qJeEqtvBXMf7W35QDJnzY/eiEBzt+TfHps= +helm.sh/helm/v3 v3.16.2 h1:Y9v7ry+ubQmi+cb5zw1Llx8OKHU9Hk9NQ/+P+LGBe2o= +helm.sh/helm/v3 v3.16.2/go.mod h1:SyTXgKBjNqi2NPsHCW5dDAsHqvGIu0kdNYNH9gQaw70= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= k8s.io/api v0.29.3 h1:2ORfZ7+bGC3YJqGpV0KSDDEVf8hdGQ6A03/50vj8pmw= From 2a9b278318f355a0d1ef2485c19d41659e53ea2b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Oct 2024 02:43:28 +0000 Subject: [PATCH 07/14] Bump github.com/prometheus/client_golang from 1.20.4 to 1.20.5 (#2274) Bumps [github.com/prometheus/client_golang](https://github.com/prometheus/client_golang) from 1.20.4 to 1.20.5. - [Release notes](https://github.com/prometheus/client_golang/releases) - [Changelog](https://github.com/prometheus/client_golang/blob/main/CHANGELOG.md) - [Commits](https://github.com/prometheus/client_golang/compare/v1.20.4...v1.20.5) --- updated-dependencies: - dependency-name: github.com/prometheus/client_golang dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 0c11ff3266..5c4adb92b6 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/olekukonko/tablewriter v0.0.5 github.com/onsi/ginkgo/v2 v2.20.2 github.com/onsi/gomega v1.34.2 - github.com/prometheus/client_golang v1.20.4 + github.com/prometheus/client_golang v1.20.5 github.com/robfig/cron/v3 v3.0.1 github.com/spf13/cobra v1.8.1 github.com/spf13/viper v1.19.0 diff --git a/go.sum b/go.sum index 5d4e760920..febd031aa6 100644 --- a/go.sum +++ b/go.sum @@ -420,8 +420,8 @@ github.com/poy/onpar v1.1.2/go.mod h1:6X8FLNoxyr9kkmnlqpK6LSoiOtrO6MICtWwEuWkLjz github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.1.0/go.mod h1:I1FGZT9+L76gKKOs5djB6ezCbFQP1xR9D75/vuwEF3g= -github.com/prometheus/client_golang v1.20.4 h1:Tgh3Yr67PaOv/uTqloMsCEdeuFTatm5zIq5+qNN23vI= -github.com/prometheus/client_golang v1.20.4/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= +github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= From f56ba30d5c36feeaaba5c89ddd48a3f663060f0d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Oct 2024 03:03:29 +0000 Subject: [PATCH 08/14] Bump cloud.google.com/go/storage from 1.44.0 to 1.45.0 (#2273) Bumps [cloud.google.com/go/storage](https://github.com/googleapis/google-cloud-go) from 1.44.0 to 1.45.0. - [Release notes](https://github.com/googleapis/google-cloud-go/releases) - [Changelog](https://github.com/googleapis/google-cloud-go/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-cloud-go/compare/pubsub/v1.44.0...spanner/v1.45.0) --- updated-dependencies: - dependency-name: cloud.google.com/go/storage dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 5c4adb92b6..e590e1ba49 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/kubeflow/spark-operator go 1.23.1 require ( - cloud.google.com/go/storage v1.44.0 + cloud.google.com/go/storage v1.45.0 github.com/aws/aws-sdk-go-v2 v1.32.2 github.com/aws/aws-sdk-go-v2/config v1.27.43 github.com/aws/aws-sdk-go-v2/service/s3 v1.65.3 diff --git a/go.sum b/go.sum index febd031aa6..cd2390f07b 100644 --- a/go.sum +++ b/go.sum @@ -17,8 +17,8 @@ cloud.google.com/go/longrunning v0.6.1 h1:lOLTFxYpr8hcRtcwWir5ITh1PAKUD/sG2lKrTS cloud.google.com/go/longrunning v0.6.1/go.mod h1:nHISoOZpBcmlwbJmiVk5oDRz0qG/ZxPynEGs1iZ79s0= cloud.google.com/go/monitoring v1.21.0 h1:EMc0tB+d3lUewT2NzKC/hr8cSR9WsUieVywzIHetGro= cloud.google.com/go/monitoring v1.21.0/go.mod h1:tuJ+KNDdJbetSsbSGTqnaBvbauS5kr3Q/koy3Up6r+4= -cloud.google.com/go/storage v1.44.0 h1:abBzXf4UJKMmQ04xxJf9dYM/fNl24KHoTuBjyJDX2AI= -cloud.google.com/go/storage v1.44.0/go.mod h1:wpPblkIuMP5jCB/E48Pz9zIo2S/zD8g+ITmxKkPCITE= +cloud.google.com/go/storage v1.45.0 h1:5av0QcIVj77t+44mV4gffFC/LscFRUhto6UBMB5SimM= +cloud.google.com/go/storage v1.45.0/go.mod h1:wpPblkIuMP5jCB/E48Pz9zIo2S/zD8g+ITmxKkPCITE= cloud.google.com/go/trace v1.11.0 h1:UHX6cOJm45Zw/KIbqHe4kII8PupLt/V5tscZUkeiJVI= cloud.google.com/go/trace v1.11.0/go.mod h1:Aiemdi52635dBR7o3zuc9lLjXo3BwGaChEjCa3tJNmM= dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s= From 9f83e2a87ae0fa5840726319c8111588bc0831f6 Mon Sep 17 00:00:00 2001 From: Jacob Salway Date: Tue, 22 Oct 2024 17:19:29 +1100 Subject: [PATCH 09/14] Run e2e tests with Kubernetes version matrix (#2266) Signed-off-by: Jacob Salway --- .github/workflows/integration.yaml | 5 ++++- Makefile | 8 +++++++- charts/spark-operator-chart/ci/kind-config.yaml | 2 -- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 532cd98d67..e428fe0775 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -209,6 +209,9 @@ jobs: e2e-test: runs-on: ubuntu-latest + strategy: + matrix: + k8s_version: [v1.28.13, v1.29.8, v1.30.4, v1.31.1] steps: - name: Checkout source code uses: actions/checkout@v4 @@ -221,7 +224,7 @@ jobs: go-version-file: go.mod - name: Create a Kind cluster - run: make kind-create-cluster + run: make kind-create-cluster KIND_K8S_VERSION=${{ matrix.k8s_version }} - name: Build and load image to Kind cluster run: | diff --git a/Makefile b/Makefile index 901dbfc5ea..3283f2a4ab 100644 --- a/Makefile +++ b/Makefile @@ -52,6 +52,7 @@ LOCALBIN ?= $(shell pwd)/bin KUSTOMIZE_VERSION ?= v5.4.1 CONTROLLER_TOOLS_VERSION ?= v0.15.0 KIND_VERSION ?= v0.23.0 +KIND_K8S_VERSION ?= v1.29.3 ENVTEST_VERSION ?= release-0.18 # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. ENVTEST_K8S_VERSION ?= 1.29.3 @@ -249,7 +250,12 @@ endif .PHONY: kind-create-cluster kind-create-cluster: kind ## Create a kind cluster for integration tests. if ! $(KIND) get clusters 2>/dev/null | grep -q "^$(KIND_CLUSTER_NAME)$$"; then \ - $(KIND) create cluster --name $(KIND_CLUSTER_NAME) --config $(KIND_CONFIG_FILE) --kubeconfig $(KIND_KUBE_CONFIG) --wait=1m; \ + $(KIND) create cluster \ + --name $(KIND_CLUSTER_NAME) \ + --config $(KIND_CONFIG_FILE) \ + --image kindest/node:$(KIND_K8S_VERSION) \ + --kubeconfig $(KIND_KUBE_CONFIG) \ + --wait=1m; \ fi .PHONY: kind-load-image diff --git a/charts/spark-operator-chart/ci/kind-config.yaml b/charts/spark-operator-chart/ci/kind-config.yaml index 4e8cae8d94..cebee6d5b9 100644 --- a/charts/spark-operator-chart/ci/kind-config.yaml +++ b/charts/spark-operator-chart/ci/kind-config.yaml @@ -2,6 +2,4 @@ kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 nodes: - role: control-plane - image: kindest/node:v1.29.2 - role: worker - image: kindest/node:v1.29.2 From 641007cf08643662c5f5e573153776da9f9a30c2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Oct 2024 14:00:29 +0000 Subject: [PATCH 10/14] Bump aquasecurity/trivy-action from 0.27.0 to 0.28.0 (#2270) Bumps [aquasecurity/trivy-action](https://github.com/aquasecurity/trivy-action) from 0.27.0 to 0.28.0. - [Release notes](https://github.com/aquasecurity/trivy-action/releases) - [Commits](https://github.com/aquasecurity/trivy-action/compare/0.27.0...0.28.0) --- updated-dependencies: - dependency-name: aquasecurity/trivy-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/trivy-image-scanning.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/trivy-image-scanning.yaml b/.github/workflows/trivy-image-scanning.yaml index a7bd2a958f..e0ba85c8b9 100644 --- a/.github/workflows/trivy-image-scanning.yaml +++ b/.github/workflows/trivy-image-scanning.yaml @@ -15,7 +15,7 @@ jobs: run: make print-IMAGE >> $GITHUB_ENV - name: trivy scan for github security tab - uses: aquasecurity/trivy-action@0.27.0 + uses: aquasecurity/trivy-action@0.28.0 with: image-ref: '${{ env.IMAGE }}' format: 'sarif' From 345d611810e9a8e87d92281948211f44d97d68d2 Mon Sep 17 00:00:00 2001 From: Jacob Salway Date: Wed, 23 Oct 2024 20:41:29 +1100 Subject: [PATCH 11/14] Allow --ingress-class-name to be specified in chart (#2278) Signed-off-by: Jacob Salway --- charts/spark-operator-chart/README.md | 1 + .../templates/controller/deployment.yaml | 3 +++ .../tests/controller/deployment_test.yaml | 13 +++++++++++++ charts/spark-operator-chart/values.yaml | 2 ++ 4 files changed, 19 insertions(+) diff --git a/charts/spark-operator-chart/README.md b/charts/spark-operator-chart/README.md index 9c945d33b4..1df28cdc7f 100644 --- a/charts/spark-operator-chart/README.md +++ b/charts/spark-operator-chart/README.md @@ -90,6 +90,7 @@ See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command docum | controller.uiService.enable | bool | `true` | Specifies whether to create service for Spark web UI. | | controller.uiIngress.enable | bool | `false` | Specifies whether to create ingress for Spark web UI. `controller.uiService.enable` must be `true` to enable ingress. | | controller.uiIngress.urlFormat | string | `""` | Ingress URL format. Required if `controller.uiIngress.enable` is true. | +| controller.uiIngress.ingressClassName | string | `""` | Optionally set the ingressClassName. | | controller.batchScheduler.enable | bool | `false` | Specifies whether to enable batch scheduler for spark jobs scheduling. If enabled, users can specify batch scheduler name in spark application. | | controller.batchScheduler.kubeSchedulerNames | list | `[]` | Specifies a list of kube-scheduler names for scheduling Spark pods. | | controller.batchScheduler.default | string | `""` | Default batch scheduler to be used if not specified by the user. If specified, this value must be either "volcano" or "yunikorn". Specifying any other value will cause the controller to error on startup. | diff --git a/charts/spark-operator-chart/templates/controller/deployment.yaml b/charts/spark-operator-chart/templates/controller/deployment.yaml index 1efd126568..2a1fd5f19e 100644 --- a/charts/spark-operator-chart/templates/controller/deployment.yaml +++ b/charts/spark-operator-chart/templates/controller/deployment.yaml @@ -69,6 +69,9 @@ spec: {{- with .Values.controller.uiIngress.urlFormat }} - --ingress-url-format={{ . }} {{- end }} + {{- with .Values.controller.uiIngress.ingressClassName }} + - --ingress-class-name={{ . }} + {{- end }} {{- end }} {{- if .Values.controller.batchScheduler.enable }} - --enable-batch-scheduler=true diff --git a/charts/spark-operator-chart/tests/controller/deployment_test.yaml b/charts/spark-operator-chart/tests/controller/deployment_test.yaml index c2513dd32f..67c475da40 100644 --- a/charts/spark-operator-chart/tests/controller/deployment_test.yaml +++ b/charts/spark-operator-chart/tests/controller/deployment_test.yaml @@ -171,6 +171,19 @@ tests: path: spec.template.spec.containers[?(@.name=="spark-operator-controller")].args content: --ingress-url-format={{$appName}}.example.com/{{$appNamespace}}/{{$appName}} + - it: Should contain `--ingress-class-name` arg if `controller.uiIngress.enable` is set to `true` and `controller.uiIngress.ingressClassName` is set + set: + controller: + uiService: + enable: true + uiIngress: + enable: true + ingressClassName: nginx + asserts: + - contains: + path: spec.template.spec.containers[?(@.name=="spark-operator-controller")].args + content: --ingress-class-name=nginx + - it: Should contain `--enable-batch-scheduler` arg if `controller.batchScheduler.enable` is `true` set: controller: diff --git a/charts/spark-operator-chart/values.yaml b/charts/spark-operator-chart/values.yaml index 75cf5cac88..672d622528 100644 --- a/charts/spark-operator-chart/values.yaml +++ b/charts/spark-operator-chart/values.yaml @@ -65,6 +65,8 @@ controller: # -- Ingress URL format. # Required if `controller.uiIngress.enable` is true. urlFormat: "" + # -- Optionally set the ingressClassName. + ingressClassName: "" batchScheduler: # -- Specifies whether to enable batch scheduler for spark jobs scheduling. From 117d5f05ef543a2b75758cd8e7bf8e4b4b48202d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 23 Oct 2024 11:46:30 +0000 Subject: [PATCH 12/14] Bump github.com/aws/aws-sdk-go-v2/service/s3 from 1.65.3 to 1.66.0 (#2271) Bumps [github.com/aws/aws-sdk-go-v2/service/s3](https://github.com/aws/aws-sdk-go-v2) from 1.65.3 to 1.66.0. - [Release notes](https://github.com/aws/aws-sdk-go-v2/releases) - [Commits](https://github.com/aws/aws-sdk-go-v2/compare/service/s3/v1.65.3...service/s3/v1.66.0) --- updated-dependencies: - dependency-name: github.com/aws/aws-sdk-go-v2/service/s3 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index e590e1ba49..52d71b7546 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( cloud.google.com/go/storage v1.45.0 github.com/aws/aws-sdk-go-v2 v1.32.2 github.com/aws/aws-sdk-go-v2/config v1.27.43 - github.com/aws/aws-sdk-go-v2/service/s3 v1.65.3 + github.com/aws/aws-sdk-go-v2/service/s3 v1.66.0 github.com/golang/glog v1.2.2 github.com/google/uuid v1.6.0 github.com/olekukonko/tablewriter v0.0.5 diff --git a/go.sum b/go.sum index cd2390f07b..d92fd45350 100644 --- a/go.sum +++ b/go.sum @@ -94,8 +94,8 @@ github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.2 h1:s7NA1SOw8 github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.2/go.mod h1:fnjjWyAW/Pj5HYOxl9LJqWtEwS7W2qgcRLWP+uWbss0= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.2 h1:t7iUP9+4wdc5lt3E41huP+GvQZJD38WLsgVp4iOtAjg= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.2/go.mod h1:/niFCtmuQNxqx9v8WAPq5qh7EH25U4BF6tjoyq9bObM= -github.com/aws/aws-sdk-go-v2/service/s3 v1.65.3 h1:xxHGZ+wUgZNACQmxtdvP5tgzfsxGS3vPpTP5Hy3iToE= -github.com/aws/aws-sdk-go-v2/service/s3 v1.65.3/go.mod h1:cB6oAuus7YXRZhWCc1wIwPywwZ1XwweNp2TVAEGYeB8= +github.com/aws/aws-sdk-go-v2/service/s3 v1.66.0 h1:xA6XhTF7PE89BCNHJbQi8VvPzcgMtmGC5dr8S8N7lHk= +github.com/aws/aws-sdk-go-v2/service/s3 v1.66.0/go.mod h1:cB6oAuus7YXRZhWCc1wIwPywwZ1XwweNp2TVAEGYeB8= github.com/aws/aws-sdk-go-v2/service/sso v1.24.2 h1:bSYXVyUzoTHoKalBmwaZxs97HU9DWWI3ehHSAMa7xOk= github.com/aws/aws-sdk-go-v2/service/sso v1.24.2/go.mod h1:skMqY7JElusiOUjMJMOv1jJsP7YUg7DrhgqZZWuzu1U= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.2 h1:AhmO1fHINP9vFYUE0LHzCWg/LfUWUF+zFPEcY9QXb7o= From d130b08fa5c1ce40f0dbbe6e059b21fd9e898b90 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 23 Oct 2024 12:23:30 +0000 Subject: [PATCH 13/14] Bump github.com/aws/aws-sdk-go-v2/config from 1.27.43 to 1.28.0 (#2272) Bumps [github.com/aws/aws-sdk-go-v2/config](https://github.com/aws/aws-sdk-go-v2) from 1.27.43 to 1.28.0. - [Release notes](https://github.com/aws/aws-sdk-go-v2/releases) - [Commits](https://github.com/aws/aws-sdk-go-v2/compare/config/v1.27.43...v1.28.0) --- updated-dependencies: - dependency-name: github.com/aws/aws-sdk-go-v2/config dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 52d71b7546..d5e600b9cb 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.23.1 require ( cloud.google.com/go/storage v1.45.0 github.com/aws/aws-sdk-go-v2 v1.32.2 - github.com/aws/aws-sdk-go-v2/config v1.27.43 + github.com/aws/aws-sdk-go-v2/config v1.28.0 github.com/aws/aws-sdk-go-v2/service/s3 v1.66.0 github.com/golang/glog v1.2.2 github.com/google/uuid v1.6.0 diff --git a/go.sum b/go.sum index d92fd45350..496567c078 100644 --- a/go.sum +++ b/go.sum @@ -70,8 +70,8 @@ github.com/aws/aws-sdk-go-v2 v1.32.2 h1:AkNLZEyYMLnx/Q/mSKkcMqwNFXMAvFto9bNsHqcT github.com/aws/aws-sdk-go-v2 v1.32.2/go.mod h1:2SK5n0a2karNTv5tbP1SjsX0uhttou00v/HpXKM1ZUo= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.6 h1:pT3hpW0cOHRJx8Y0DfJUEQuqPild8jRGmSFmBgvydr0= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.6/go.mod h1:j/I2++U0xX+cr44QjHay4Cvxj6FUbnxrgmqN3H1jTZA= -github.com/aws/aws-sdk-go-v2/config v1.27.43 h1:p33fDDihFC390dhhuv8nOmX419wjOSDQRb+USt20RrU= -github.com/aws/aws-sdk-go-v2/config v1.27.43/go.mod h1:pYhbtvg1siOOg8h5an77rXle9tVG8T+BWLWAo7cOukc= +github.com/aws/aws-sdk-go-v2/config v1.28.0 h1:FosVYWcqEtWNxHn8gB/Vs6jOlNwSoyOCA/g/sxyySOQ= +github.com/aws/aws-sdk-go-v2/config v1.28.0/go.mod h1:pYhbtvg1siOOg8h5an77rXle9tVG8T+BWLWAo7cOukc= github.com/aws/aws-sdk-go-v2/credentials v1.17.41 h1:7gXo+Axmp+R4Z+AK8YFQO0ZV3L0gizGINCOWxSLY9W8= github.com/aws/aws-sdk-go-v2/credentials v1.17.41/go.mod h1:u4Eb8d3394YLubphT4jLEwN1rLNq2wFOlT6OuxFwPzU= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.17 h1:TMH3f/SCAWdNtXXVPPu5D6wrr4G5hI1rAxbcocKfC7Q= From 735c7fc9e5b647394b58ac9933d0f6812a067117 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 23 Oct 2024 14:13:30 +0100 Subject: [PATCH 14/14] Fix retries (#2241) * Attempt to requeue after correct period Signed-off-by: Thomas Newton * Syntactically correct Signed-off-by: Thomas Newton * I think correct requeueing Signed-off-by: Thomas Newton * Same treatment for the other retries Signed-off-by: Thomas Newton * Tidy Signed-off-by: Thomas Newton * Requeue after deleting resources Signed-off-by: Thomas Newton * Try to fix submission status updates Signed-off-by: Thomas Newton * Tidy Signed-off-by: Thomas Newton * Correct usage of submitSparkApplication Signed-off-by: Thomas Newton * Fix error logging Signed-off-by: Thomas Newton * Bring back ExecutionAttempts increment that I forgot about Signed-off-by: Thomas Newton * Log after reconcile complete Signed-off-by: Thomas Newton * Fix setting submission ID Signed-off-by: Thomas Newton * Tidy logging Signed-off-by: Thomas Newton * Tidy Signed-off-by: Thomas Newton * Tidy Signed-off-by: Thomas Newton * Update comment Signed-off-by: Thomas Newton * Start a new test Signed-off-by: Thomas Newton * Working Fails submission and retries until retries are exhausted test Signed-off-by: Thomas Newton * Add Application fails and retries until retries are exhausted Signed-off-by: Thomas Newton * Tidy Signed-off-by: Thomas Newton * Comments Signed-off-by: Thomas Newton * Tidy Signed-off-by: Thomas Newton * Move fail configs out of the examples directory Signed-off-by: Thomas Newton * Fix lint Signed-off-by: Thomas Newton * Move TimeUntilNextRetryDue to `pkg/util/sparkapplication.go` Signed-off-by: Thomas Newton * Update internal/controller/sparkapplication/controller.go Co-authored-by: Yi Chen Signed-off-by: Thomas Newton * Update test/e2e/sparkapplication_test.go Co-authored-by: Yi Chen Signed-off-by: Thomas Newton * camelCase Signed-off-by: Thomas Newton * make fo-fmt Signed-off-by: Thomas Newton * PR comments Signed-off-by: Thomas Newton --------- Signed-off-by: Thomas Newton Co-authored-by: Yi Chen --- .../controller/sparkapplication/controller.go | 108 ++++++++-------- .../controller/sparkapplication/submission.go | 12 +- pkg/util/sparkapplication.go | 21 ++++ test/e2e/bad_examples/fail-application.yaml | 44 +++++++ test/e2e/bad_examples/fail-submission.yaml | 44 +++++++ test/e2e/sparkapplication_test.go | 117 ++++++++++++++++++ test/e2e/suit_test.go | 23 ++++ 7 files changed, 307 insertions(+), 62 deletions(-) create mode 100644 test/e2e/bad_examples/fail-application.yaml create mode 100644 test/e2e/bad_examples/fail-submission.yaml diff --git a/internal/controller/sparkapplication/controller.go b/internal/controller/sparkapplication/controller.go index 3b96a310ae..7a707df001 100644 --- a/internal/controller/sparkapplication/controller.go +++ b/internal/controller/sparkapplication/controller.go @@ -172,6 +172,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return ctrl.Result{Requeue: true}, err } logger.Info("Reconciling SparkApplication", "name", app.Name, "namespace", app.Namespace, "state", app.Status.AppState.State) + defer logger.Info("Finished reconciling SparkApplication", "name", app.Name, "namespace", app.Namespace) // Check if the spark application is being deleted if !app.DeletionTimestamp.IsZero() { @@ -259,17 +260,7 @@ func (r *Reconciler) reconcileNewSparkApplication(ctx context.Context, req ctrl. } app := old.DeepCopy() - if err := r.submitSparkApplication(app); err != nil { - logger.Error(err, "Failed to submit SparkApplication", "name", app.Name, "namespace", app.Namespace) - app.Status = v1beta2.SparkApplicationStatus{ - AppState: v1beta2.ApplicationState{ - State: v1beta2.ApplicationStateFailedSubmission, - ErrorMessage: err.Error(), - }, - SubmissionAttempts: app.Status.SubmissionAttempts + 1, - LastSubmissionAttemptTime: metav1.Now(), - } - } + _ = r.submitSparkApplication(app) if err := r.updateSparkApplicationStatus(ctx, app); err != nil { return err } @@ -315,6 +306,9 @@ func (r *Reconciler) reconcileSubmittedSparkApplication(ctx context.Context, req func (r *Reconciler) reconcileFailedSubmissionSparkApplication(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { key := req.NamespacedName + + var result ctrl.Result + retryErr := retry.RetryOnConflict( retry.DefaultRetry, func() error { @@ -328,15 +322,22 @@ func (r *Reconciler) reconcileFailedSubmissionSparkApplication(ctx context.Conte app := old.DeepCopy() if util.ShouldRetry(app) { - if isNextRetryDue(app) { + timeUntilNextRetryDue, err := util.TimeUntilNextRetryDue(app) + if err != nil { + return err + } + if timeUntilNextRetryDue <= 0 { if r.validateSparkResourceDeletion(ctx, app) { _ = r.submitSparkApplication(app) } else { if err := r.deleteSparkResources(ctx, app); err != nil { logger.Error(err, "failed to delete resources associated with SparkApplication", "name", app.Name, "namespace", app.Namespace) } - return err + return fmt.Errorf("resources associated with SparkApplication name: %s namespace: %s, needed to be deleted", app.Name, app.Namespace) } + } else { + // If we're waiting before retrying then reconcile will not modify anything, so we need to requeue. + result.RequeueAfter = timeUntilNextRetryDue } } else { app.Status.AppState.State = v1beta2.ApplicationStateFailed @@ -352,9 +353,9 @@ func (r *Reconciler) reconcileFailedSubmissionSparkApplication(ctx context.Conte ) if retryErr != nil { logger.Error(retryErr, "Failed to reconcile SparkApplication", "name", key.Name, "namespace", key.Namespace) - return ctrl.Result{}, retryErr + return result, retryErr } - return ctrl.Result{}, nil + return result, nil } func (r *Reconciler) reconcileRunningSparkApplication(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -408,9 +409,7 @@ func (r *Reconciler) reconcilePendingRerunSparkApplication(ctx context.Context, logger.Info("Successfully deleted resources associated with SparkApplication", "name", app.Name, "namespace", app.Namespace, "state", app.Status.AppState.State) r.recordSparkApplicationEvent(app) r.resetSparkApplicationStatus(app) - if err = r.submitSparkApplication(app); err != nil { - logger.Error(err, "Failed to run spark-submit", "name", app.Name, "namespace", app.Namespace, "state", app.Status.AppState.State) - } + _ = r.submitSparkApplication(app) } if err := r.updateSparkApplicationStatus(ctx, app); err != nil { return err @@ -497,6 +496,9 @@ func (r *Reconciler) reconcileSucceedingSparkApplication(ctx context.Context, re func (r *Reconciler) reconcileFailingSparkApplication(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { key := req.NamespacedName + + var result ctrl.Result + retryErr := retry.RetryOnConflict( retry.DefaultRetry, func() error { @@ -510,12 +512,19 @@ func (r *Reconciler) reconcileFailingSparkApplication(ctx context.Context, req c app := old.DeepCopy() if util.ShouldRetry(app) { - if isNextRetryDue(app) { + timeUntilNextRetryDue, err := util.TimeUntilNextRetryDue(app) + if err != nil { + return err + } + if timeUntilNextRetryDue <= 0 { if err := r.deleteSparkResources(ctx, app); err != nil { logger.Error(err, "failed to delete spark resources", "name", app.Name, "namespace", app.Namespace) return err } app.Status.AppState.State = v1beta2.ApplicationStatePendingRerun + } else { + // If we're waiting before retrying then reconcile will not modify anything, so we need to requeue. + result.RequeueAfter = timeUntilNextRetryDue } } else { app.Status.AppState.State = v1beta2.ApplicationStateFailed @@ -528,9 +537,9 @@ func (r *Reconciler) reconcileFailingSparkApplication(ctx context.Context, req c ) if retryErr != nil { logger.Error(retryErr, "Failed to reconcile SparkApplication", "name", key.Name, "namespace", key.Namespace) - return ctrl.Result{}, retryErr + return result, retryErr } - return ctrl.Result{}, nil + return result, nil } func (r *Reconciler) reconcileCompletedSparkApplication(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -632,8 +641,28 @@ func (r *Reconciler) getSparkApplication(key types.NamespacedName) (*v1beta2.Spa } // submitSparkApplication creates a new submission for the given SparkApplication and submits it using spark-submit. -func (r *Reconciler) submitSparkApplication(app *v1beta2.SparkApplication) error { +func (r *Reconciler) submitSparkApplication(app *v1beta2.SparkApplication) (submitErr error) { logger.Info("Submitting SparkApplication", "name", app.Name, "namespace", app.Namespace, "state", app.Status.AppState.State) + // SubmissionID must be set before creating any resources to ensure all the resources are labeled. + app.Status.SubmissionID = uuid.New().String() + app.Status.LastSubmissionAttemptTime = metav1.Now() + app.Status.SubmissionAttempts = app.Status.SubmissionAttempts + 1 + + defer func() { + if submitErr == nil { + app.Status.AppState = v1beta2.ApplicationState{ + State: v1beta2.ApplicationStateSubmitted, + } + app.Status.ExecutionAttempts = app.Status.ExecutionAttempts + 1 + } else { + logger.Info("Failed to submit SparkApplication", "name", app.Name, "namespace", app.Namespace, "state", app.Status.AppState.State, "error", submitErr) + app.Status.AppState = v1beta2.ApplicationState{ + State: v1beta2.ApplicationStateFailedSubmission, + ErrorMessage: submitErr.Error(), + } + } + r.recordSparkApplicationEvent(app) + }() if util.PrometheusMonitoringEnabled(app) { logger.Info("Configure Prometheus monitoring for SparkApplication", "name", app.Name, "namespace", app.Namespace) @@ -709,7 +738,6 @@ func (r *Reconciler) submitSparkApplication(app *v1beta2.SparkApplication) error driverPodName := util.GetDriverPodName(app) app.Status.DriverInfo.PodName = driverPodName - app.Status.SubmissionID = uuid.New().String() sparkSubmitArgs, err := buildSparkSubmitArgs(app) if err != nil { return fmt.Errorf("failed to build spark-submit arguments: %v", err) @@ -717,44 +745,12 @@ func (r *Reconciler) submitSparkApplication(app *v1beta2.SparkApplication) error // Try submitting the application by running spark-submit. logger.Info("Running spark-submit for SparkApplication", "name", app.Name, "namespace", app.Namespace, "arguments", sparkSubmitArgs) - submitted, err := runSparkSubmit(newSubmission(sparkSubmitArgs, app)) - if err != nil { - r.recordSparkApplicationEvent(app) + if err := runSparkSubmit(newSubmission(sparkSubmitArgs, app)); err != nil { return fmt.Errorf("failed to run spark-submit: %v", err) } - if !submitted { - // The application may not have been submitted even if err == nil, e.g., when some - // state update caused an attempt to re-submit the application, in which case no - // error gets returned from runSparkSubmit. If this is the case, we simply return. - return nil - } - - app.Status.AppState = v1beta2.ApplicationState{ - State: v1beta2.ApplicationStateSubmitted, - } - app.Status.SubmissionAttempts = app.Status.SubmissionAttempts + 1 - app.Status.ExecutionAttempts = app.Status.ExecutionAttempts + 1 - app.Status.LastSubmissionAttemptTime = metav1.Now() - r.recordSparkApplicationEvent(app) return nil } -// Helper func to determine if the next retry the SparkApplication is due now. -func isNextRetryDue(app *v1beta2.SparkApplication) bool { - retryInterval := app.Spec.RestartPolicy.OnFailureRetryInterval - attemptsDone := app.Status.SubmissionAttempts - lastEventTime := app.Status.LastSubmissionAttemptTime - if retryInterval == nil || lastEventTime.IsZero() || attemptsDone <= 0 { - return false - } - - // Retry if we have waited at-least equal to attempts*RetryInterval since we do a linear back-off. - interval := time.Duration(*retryInterval) * time.Second * time.Duration(attemptsDone) - currentTime := time.Now() - logger.Info(fmt.Sprintf("currentTime is %v, interval is %v", currentTime, interval)) - return currentTime.After(lastEventTime.Add(interval)) -} - // updateDriverState finds the driver pod of the application // and updates the driver state based on the current phase of the pod. func (r *Reconciler) updateDriverState(_ context.Context, app *v1beta2.SparkApplication) error { diff --git a/internal/controller/sparkapplication/submission.go b/internal/controller/sparkapplication/submission.go index ef9485aa28..66e4a0be83 100644 --- a/internal/controller/sparkapplication/submission.go +++ b/internal/controller/sparkapplication/submission.go @@ -43,10 +43,10 @@ func newSubmission(args []string, app *v1beta2.SparkApplication) *submission { } } -func runSparkSubmit(submission *submission) (bool, error) { +func runSparkSubmit(submission *submission) error { sparkHome, present := os.LookupEnv(common.EnvSparkHome) if !present { - return false, fmt.Errorf("env %s is not specified", common.EnvSparkHome) + return fmt.Errorf("env %s is not specified", common.EnvSparkHome) } command := filepath.Join(sparkHome, "bin", "spark-submit") cmd := exec.Command(command, submission.args...) @@ -58,14 +58,14 @@ func runSparkSubmit(submission *submission) (bool, error) { } // The driver pod of the application already exists. if strings.Contains(errorMsg, common.ErrorCodePodAlreadyExists) { - return false, fmt.Errorf("driver pod already exist") + return fmt.Errorf("driver pod already exist") } if errorMsg != "" { - return false, fmt.Errorf("failed to run spark-submit: %s", errorMsg) + return fmt.Errorf("failed to run spark-submit: %s", errorMsg) } - return false, fmt.Errorf("failed to run spark-submit: %v", err) + return fmt.Errorf("failed to run spark-submit: %v", err) } - return true, nil + return nil } // buildSparkSubmitArgs builds the arguments for spark-submit. diff --git a/pkg/util/sparkapplication.go b/pkg/util/sparkapplication.go index 29b8dab819..a0aadd93f0 100644 --- a/pkg/util/sparkapplication.go +++ b/pkg/util/sparkapplication.go @@ -104,6 +104,27 @@ func ShouldRetry(app *v1beta2.SparkApplication) bool { return false } +func TimeUntilNextRetryDue(app *v1beta2.SparkApplication) (time.Duration, error) { + var retryInterval *int64 + switch app.Status.AppState.State { + case v1beta2.ApplicationStateFailedSubmission: + retryInterval = app.Spec.RestartPolicy.OnSubmissionFailureRetryInterval + case v1beta2.ApplicationStateFailing: + retryInterval = app.Spec.RestartPolicy.OnFailureRetryInterval + } + + attemptsDone := app.Status.SubmissionAttempts + lastAttemptTime := app.Status.LastSubmissionAttemptTime + if retryInterval == nil || lastAttemptTime.IsZero() || attemptsDone <= 0 { + return -1, fmt.Errorf("invalid retry interval (%v), last attempt time (%v) or attemptsDone (%v)", retryInterval, lastAttemptTime, attemptsDone) + } + + // Retry wait time is attempts*RetryInterval to do a linear backoff. + interval := time.Duration(*retryInterval) * time.Second * time.Duration(attemptsDone) + currentTime := time.Now() + return interval - currentTime.Sub(lastAttemptTime.Time), nil +} + func GetLocalVolumes(app *v1beta2.SparkApplication) map[string]corev1.Volume { volumes := make(map[string]corev1.Volume) for _, volume := range app.Spec.Volumes { diff --git a/test/e2e/bad_examples/fail-application.yaml b/test/e2e/bad_examples/fail-application.yaml new file mode 100644 index 0000000000..05991fd72d --- /dev/null +++ b/test/e2e/bad_examples/fail-application.yaml @@ -0,0 +1,44 @@ +# +# Copyright 2024 The Kubeflow authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: sparkoperator.k8s.io/v1beta2 +kind: SparkApplication +metadata: + name: fail-submission + namespace: default +spec: + type: Scala + mode: cluster + image: spark:3.5.2 + imagePullPolicy: IfNotPresent + mainClass: non-existent + mainApplicationFile: local:///non-existent.jar + sparkVersion: 3.5.2 + restartPolicy: + type: OnFailure + onFailureRetries: 3 + onFailureRetryInterval: 1 + driver: + labels: + version: 3.5.2 + cores: 1 + memory: 512m + serviceAccount: spark-operator-spark + executor: + labels: + version: 3.5.2 + instances: 1 + cores: 1 + memory: 512m diff --git a/test/e2e/bad_examples/fail-submission.yaml b/test/e2e/bad_examples/fail-submission.yaml new file mode 100644 index 0000000000..827fca7418 --- /dev/null +++ b/test/e2e/bad_examples/fail-submission.yaml @@ -0,0 +1,44 @@ +# +# Copyright 2024 The Kubeflow authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: sparkoperator.k8s.io/v1beta2 +kind: SparkApplication +metadata: + name: fail-submission + namespace: default +spec: + type: Scala + mode: cluster + image: spark:3.5.2 + imagePullPolicy: IfNotPresent + mainClass: dummy + mainApplicationFile: local:///dummy.jar + sparkVersion: 3.5.2 + restartPolicy: + type: OnFailure + onSubmissionFailureRetries: 3 + onSubmissionFailureRetryInterval: 1 + driver: + labels: + version: 3.5.2 + cores: 1 + memory: 512m + serviceAccount: non-existent # This is the important part that causes submission to fail. + executor: + labels: + version: 3.5.2 + instances: 1 + cores: 1 + memory: 512m diff --git a/test/e2e/sparkapplication_test.go b/test/e2e/sparkapplication_test.go index 113326cea7..825129df07 100644 --- a/test/e2e/sparkapplication_test.go +++ b/test/e2e/sparkapplication_test.go @@ -27,6 +27,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -236,6 +237,122 @@ var _ = Describe("Example SparkApplication", func() { }) }) + Context("fail-submission", func() { + ctx := context.Background() + path := filepath.Join("bad_examples", "fail-submission.yaml") + app := &v1beta2.SparkApplication{} + + BeforeEach(func() { + By("Parsing SparkApplication from file") + file, err := os.Open(path) + Expect(err).NotTo(HaveOccurred()) + Expect(file).NotTo(BeNil()) + + decoder := yaml.NewYAMLOrJSONDecoder(file, 100) + Expect(decoder).NotTo(BeNil()) + Expect(decoder.Decode(app)).NotTo(HaveOccurred()) + + By("Creating SparkApplication") + Expect(k8sClient.Create(ctx, app)).To(Succeed()) + }) + + AfterEach(func() { + key := types.NamespacedName{Namespace: app.Namespace, Name: app.Name} + Expect(k8sClient.Get(ctx, key, app)).To(Succeed()) + + By("Deleting SparkApplication") + Expect(k8sClient.Delete(ctx, app)).To(Succeed()) + }) + + It("Fails submission and retries until retries are exhausted", func() { + By("Waiting for SparkApplication to terminate") + key := types.NamespacedName{Namespace: app.Namespace, Name: app.Name} + apps, polling_err := collectSparkApplicationsUntilTermination(ctx, key) + Expect(polling_err).To(HaveOccurred()) + + By("Should eventually fail") + finalApp := apps[len(apps)-1] + Expect(finalApp.Status.AppState.State).To(Equal(v1beta2.ApplicationStateFailed)) + Expect(finalApp.Status.AppState.ErrorMessage).To(ContainSubstring("failed to run spark-submit")) + Expect(finalApp.Status.SubmissionAttempts).To(Equal(*app.Spec.RestartPolicy.OnSubmissionFailureRetries + 1)) + + By("Only valid statuses appear in other apps") + validStatuses := []v1beta2.ApplicationStateType{ + v1beta2.ApplicationStateNew, + v1beta2.ApplicationStateFailedSubmission, + } + for _, app := range apps[:len(apps)-1] { + Expect(validStatuses).To(ContainElement(app.Status.AppState.State)) + } + + By("Checking driver does not exist") + driverPodName := util.GetDriverPodName(app) + driverPodKey := types.NamespacedName{Namespace: app.Namespace, Name: driverPodName} + err := k8sClient.Get(ctx, driverPodKey, &corev1.Pod{}) + Expect(errors.IsNotFound(err)).To(BeTrue()) + }) + }) + + Context("application-fails", func() { + ctx := context.Background() + path := filepath.Join("bad_examples", "fail-application.yaml") + app := &v1beta2.SparkApplication{} + + BeforeEach(func() { + By("Parsing SparkApplication from file") + file, err := os.Open(path) + Expect(err).NotTo(HaveOccurred()) + Expect(file).NotTo(BeNil()) + + decoder := yaml.NewYAMLOrJSONDecoder(file, 100) + Expect(decoder).NotTo(BeNil()) + Expect(decoder.Decode(app)).NotTo(HaveOccurred()) + + By("Creating SparkApplication") + Expect(k8sClient.Create(ctx, app)).To(Succeed()) + }) + + AfterEach(func() { + key := types.NamespacedName{Namespace: app.Namespace, Name: app.Name} + Expect(k8sClient.Get(ctx, key, app)).To(Succeed()) + + By("Deleting SparkApplication") + Expect(k8sClient.Delete(ctx, app)).To(Succeed()) + }) + + It("Application fails and retries until retries are exhausted", func() { + By("Waiting for SparkApplication to terminate") + key := types.NamespacedName{Namespace: app.Namespace, Name: app.Name} + apps, polling_err := collectSparkApplicationsUntilTermination(ctx, key) + Expect(polling_err).To(HaveOccurred()) + + By("Should eventually fail") + final_app := apps[len(apps)-1] + Expect(final_app.Status.AppState.State).To(Equal(v1beta2.ApplicationStateFailed)) + Expect(final_app.Status.AppState.ErrorMessage).To(ContainSubstring("driver container failed")) + Expect(final_app.Status.ExecutionAttempts).To(Equal(*app.Spec.RestartPolicy.OnFailureRetries + 1)) + + By("Only valid statuses appear in other apps") + validStatuses := []v1beta2.ApplicationStateType{ + v1beta2.ApplicationStateNew, + v1beta2.ApplicationStateSubmitted, + v1beta2.ApplicationStateRunning, + v1beta2.ApplicationStateFailing, + v1beta2.ApplicationStatePendingRerun, + } + for _, app := range apps[:len(apps)-1] { + Expect(validStatuses).To(ContainElement(app.Status.AppState.State)) + } + + By("Checking out driver logs") + driverPodName := util.GetDriverPodName(app) + bytes, err := clientset.CoreV1().Pods(app.Namespace).GetLogs(driverPodName, &corev1.PodLogOptions{}).Do(ctx).Raw() + Expect(err).NotTo(HaveOccurred()) + Expect(bytes).NotTo(BeEmpty()) + Expect(strings.Contains(string(bytes), "NoSuchFileException")).To(BeTrue()) + }) + }) + Context("spark-pi-python", func() { ctx := context.Background() path := filepath.Join("..", "..", "examples", "spark-pi-python.yaml") diff --git a/test/e2e/suit_test.go b/test/e2e/suit_test.go index 92167082a7..e9c534d942 100644 --- a/test/e2e/suit_test.go +++ b/test/e2e/suit_test.go @@ -271,3 +271,26 @@ func waitForSparkApplicationCompleted(ctx context.Context, key types.NamespacedN }) return err } + +func collectSparkApplicationsUntilTermination(ctx context.Context, key types.NamespacedName) ([]v1beta2.SparkApplication, error) { + cancelCtx, cancelFunc := context.WithTimeout(ctx, WaitTimeout) + defer cancelFunc() + + apps := []v1beta2.SparkApplication{} + + err := wait.PollUntilContextCancel(cancelCtx, PollInterval, true, func(ctx context.Context) (bool, error) { + app := v1beta2.SparkApplication{} + if err := k8sClient.Get(ctx, key, &app); err != nil { + return false, err + } + apps = append(apps, app) + switch app.Status.AppState.State { + case v1beta2.ApplicationStateFailed: + return true, errors.New(app.Status.AppState.ErrorMessage) + case v1beta2.ApplicationStateCompleted: + return true, nil + } + return false, nil + }) + return apps, err +}