diff --git a/.chloggen/fix-operator-install-operations.yaml b/.chloggen/fix-operator-install-operations.yaml new file mode 100644 index 0000000000..5840357671 --- /dev/null +++ b/.chloggen/fix-operator-install-operations.yaml @@ -0,0 +1,12 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix +# The name of the component, or a single word describing the area of concern, (e.g. agent, clusterReceiver, gateway, operator, chart, other) +component: operator +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Additional fixes for issues where sometimes certificates and Instrumentation opentelemetry.io/v1alpha1 are installed too early +# One or more tracking issues related to the change +issues: [1586] +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: diff --git a/examples/enable-operator-and-auto-instrumentation/rendered_manifests/operator/certmanager.yaml b/examples/enable-operator-and-auto-instrumentation/rendered_manifests/operator/certmanager.yaml index 46d53d3a0b..b609dbb24e 100644 --- a/examples/enable-operator-and-auto-instrumentation/rendered_manifests/operator/certmanager.yaml +++ b/examples/enable-operator-and-auto-instrumentation/rendered_manifests/operator/certmanager.yaml @@ -5,7 +5,7 @@ kind: Certificate metadata: annotations: helm.sh/hook: post-install,post-upgrade - helm.sh/hook-weight: "1" + helm.sh/hook-weight: "3" labels: helm.sh/chart: operator-0.71.2 app.kubernetes.io/name: operator @@ -34,7 +34,7 @@ kind: Issuer metadata: annotations: helm.sh/hook: post-install,post-upgrade - helm.sh/hook-weight: "1" + helm.sh/hook-weight: "2" labels: helm.sh/chart: operator-0.71.2 app.kubernetes.io/name: operator diff --git a/examples/enable-operator-and-auto-instrumentation/rendered_manifests/operator/job-startupapicheck.yaml b/examples/enable-operator-and-auto-instrumentation/rendered_manifests/operator/job-startupapicheck.yaml new file mode 100644 index 0000000000..d32f6963dd --- /dev/null +++ b/examples/enable-operator-and-auto-instrumentation/rendered_manifests/operator/job-startupapicheck.yaml @@ -0,0 +1,73 @@ +--- +# Source: splunk-otel-collector/templates/operator/job-startupapicheck.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: default-splunk-otel-collector-operator-startupapicheck + namespace: default + labels: + app.kubernetes.io/name: splunk-otel-collector + helm.sh/chart: splunk-otel-collector-0.113.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: default + app.kubernetes.io/version: "0.113.0" + app: splunk-otel-collector + component: otel-operator + chart: splunk-otel-collector-0.113.0 + release: default + heritage: Helm + app.kubernetes.io/component: otel-operator + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "4" +spec: + template: + spec: + containers: + - name: startupapicheck + image: registry.access.redhat.com/ubi9/ubi:latest + env: + - name: MANAGER_METRICS_SERVICE_CLUSTERIP + value: "default-splunk-otel-collector-operator" + - name: MANAGER_METRICS_SERVICE_PORT + value: "8443" + - name: WEBHOOK_SERVICE_CLUSTERIP + value: "default-splunk-otel-collector-operator-webhook" + - name: WEBHOOK_SERVICE_PORT + value: "443" + command: + - sh + - -c + - | + operator_service_checked=false + operator_webhook_service_checked=false + + for i in $(seq 1 300); do + # Checks for the Operator and Operator Webhook service availability using curl + # The services are ready when curl receives an HTTP 400 error response. + + if [ "$operator_service_checked" = false ]; then + curl_output_operator=$(curl -s -o /dev/null -w "%{http_code}" "$MANAGER_METRICS_SERVICE_CLUSTERIP:$MANAGER_METRICS_SERVICE_PORT") + if [ "$curl_output_operator" = "400" ]; then + operator_service_checked=true + fi + fi + + if [ "$operator_webhook_service_checked" = false ]; then + curl_output_webhook=$(curl -s -o /dev/null -w "%{http_code}" "$WEBHOOK_SERVICE_CLUSTERIP:$WEBHOOK_SERVICE_PORT") + if [ "$curl_output_webhook" = "400" ]; then + operator_webhook_service_checked=true + fi + fi + + echo "Attempt $i: Operator Service=${operator_service_checked}, Operator Webhook Service=${operator_webhook_service_checked}" + sleep 1 + + if [ "$operator_service_checked" = true ] && [ "$operator_webhook_service_checked" = true ]; then + echo "All checks passed." + exit 0 + fi + done + echo "Timeout reached. Not all checks completed successfully." + exit 1 + restartPolicy: Never diff --git a/functional_tests/functional_test.go b/functional_tests/functional_test.go index bca0fb2b9d..5904235558 100644 --- a/functional_tests/functional_test.go +++ b/functional_tests/functional_test.go @@ -9,6 +9,8 @@ import ( "bytes" "context" "fmt" + "k8s.io/cli-runtime/pkg/genericclioptions" + "k8s.io/client-go/discovery" "os" "path/filepath" "regexp" @@ -94,6 +96,20 @@ type sinks struct { tracesConsumer *consumertest.TracesSink } +type cacheInvalidatingRESTClientGetter struct { + genericclioptions.RESTClientGetter + log func(string, ...interface{}) +} + +func (c *cacheInvalidatingRESTClientGetter) ToDiscoveryClient() (discovery.CachedDiscoveryInterface, error) { + discoveryClient, err := c.RESTClientGetter.ToDiscoveryClient() + if err == nil { + c.log("Clearing discovery cache") + discoveryClient.Invalidate() + } + return discoveryClient, err +} + func setupOnce(t *testing.T) *sinks { setupRun.Do(func() { // create an API server @@ -184,24 +200,31 @@ func deployChartsAndApps(t *testing.T) { err = yaml.Unmarshal(buf.Bytes(), &values) require.NoError(t, err) + // When using the Helm sdk, there can be cache invalidation issues we can get around with this, + // Helm cli users don't face this issue. Deploying CRD instances via Helm templates + // and CR instances via a post-install hook will fail without this. + // Related issue: https://github.com/helm/helm/issues/6452 + // Draft of upstream fix: https://github.com/jvoravong/helm/pull/1/files actionConfig := new(action.Configuration) - if err := actionConfig.Init(kube.GetConfig(testKubeConfig, "", "default"), "default", os.Getenv("HELM_DRIVER"), func(format string, v ...interface{}) { + wrappedGetter := &cacheInvalidatingRESTClientGetter{ + RESTClientGetter: kube.GetConfig(testKubeConfig, "", "default"), + log: func(format string, args ...interface{}) { + fmt.Printf(format+"\n", args...) + }, + } + + if err := actionConfig.Init(wrappedGetter, "default", os.Getenv("HELM_DRIVER"), func(format string, v ...interface{}) { t.Logf(format+"\n", v...) }); err != nil { require.NoError(t, err) } + install := action.NewInstall(actionConfig) install.Namespace = "default" install.ReleaseName = "sock" + install.Timeout = 5 * time.Minute + _, err = install.Run(chart, values) - if err != nil { - t.Logf("error reported during helm install: %v\n", err) - retryUpgrade := action.NewUpgrade(actionConfig) - retryUpgrade.Namespace = "default" - retryUpgrade.Install = true - _, err = retryUpgrade.Run("sock", chart, values) - require.NoError(t, err) - } waitForAllDeploymentsToStart(t, client) diff --git a/helm-charts/splunk-otel-collector/templates/operator/_helpers.tpl b/helm-charts/splunk-otel-collector/templates/operator/_helpers.tpl index 0e14c1bb11..8eef11c9eb 100644 --- a/helm-charts/splunk-otel-collector/templates/operator/_helpers.tpl +++ b/helm-charts/splunk-otel-collector/templates/operator/_helpers.tpl @@ -1,3 +1,8 @@ +{{/*Create the startupapicheck job image name.*/}} +{{- define "splunk-otel-collector.operator.instrumentation-job.image" -}} +{{- printf "%s:%s" .Values.instrumentation.jobs.startupapicheck.repository .Values.instrumentation.jobs.startupapicheck.tag | trimSuffix ":" -}} +{{- end -}} + {{/* Helper to ensure the correct usage of the Splunk OpenTelemetry Collector Operator. - Checks for a valid endpoint for exporting telemetry data. diff --git a/helm-charts/splunk-otel-collector/templates/operator/job-startupapicheck.yaml b/helm-charts/splunk-otel-collector/templates/operator/job-startupapicheck.yaml new file mode 100644 index 0000000000..d335c073f8 --- /dev/null +++ b/helm-charts/splunk-otel-collector/templates/operator/job-startupapicheck.yaml @@ -0,0 +1,69 @@ +{{- if and .Values.operator.enabled .Values.instrumentation.jobs.startupapicheck.enabled }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ template "splunk-otel-collector.fullname" . }}-operator-startupapicheck + namespace: {{ template "splunk-otel-collector.namespace" . }} + labels: + {{- include "splunk-otel-collector.commonLabels" . | nindent 4 }} + app: {{ template "splunk-otel-collector.name" . }} + component: otel-operator + chart: {{ template "splunk-otel-collector.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + app.kubernetes.io/component: otel-operator + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "4" +spec: + template: + spec: + containers: + - name: startupapicheck + image: {{ template "splunk-otel-collector.operator.instrumentation-job.image" . }} + env: + - name: MANAGER_METRICS_SERVICE_CLUSTERIP + value: "{{ template "splunk-otel-collector.fullname" . }}-operator" + - name: MANAGER_METRICS_SERVICE_PORT + value: "8443" + - name: WEBHOOK_SERVICE_CLUSTERIP + value: "{{ template "splunk-otel-collector.fullname" . }}-operator-webhook" + - name: WEBHOOK_SERVICE_PORT + value: "443" + command: + - sh + - -c + - | + operator_service_checked=false + operator_webhook_service_checked=false + + for i in $(seq 1 300); do + # Checks for the Operator and Operator Webhook service availability using curl + # The services are ready when curl receives an HTTP 400 error response. + + if [ "$operator_service_checked" = false ]; then + curl_output_operator=$(curl -s -o /dev/null -w "%{http_code}" "$MANAGER_METRICS_SERVICE_CLUSTERIP:$MANAGER_METRICS_SERVICE_PORT") + if [ "$curl_output_operator" = "400" ]; then + operator_service_checked=true + fi + fi + + if [ "$operator_webhook_service_checked" = false ]; then + curl_output_webhook=$(curl -s -o /dev/null -w "%{http_code}" "$WEBHOOK_SERVICE_CLUSTERIP:$WEBHOOK_SERVICE_PORT") + if [ "$curl_output_webhook" = "400" ]; then + operator_webhook_service_checked=true + fi + fi + + echo "Attempt $i: Operator Service=${operator_service_checked}, Operator Webhook Service=${operator_webhook_service_checked}" + sleep 1 + + if [ "$operator_service_checked" = true ] && [ "$operator_webhook_service_checked" = true ]; then + echo "All checks passed." + exit 0 + fi + done + echo "Timeout reached. Not all checks completed successfully." + exit 1 + restartPolicy: Never +{{ end }} diff --git a/helm-charts/splunk-otel-collector/values.schema.json b/helm-charts/splunk-otel-collector/values.schema.json index 6472c986a7..e6a3ebdafb 100644 --- a/helm-charts/splunk-otel-collector/values.schema.json +++ b/helm-charts/splunk-otel-collector/values.schema.json @@ -1681,6 +1681,27 @@ } }, "required": ["repository", "tag"] + }, + "jobs": { + "type": "object", + "additionalProperties": false, + "properties": { + "startupapicheck": { + "type": "object", + "additionalProperties": false, + "properties": { + "enabled": { + "type": "boolean" + }, + "repository": { + "type": "string" + }, + "tag": { + "type": "string" + } + } + } + } } } }, diff --git a/helm-charts/splunk-otel-collector/values.yaml b/helm-charts/splunk-otel-collector/values.yaml index 1b3580e96d..9232b0368e 100644 --- a/helm-charts/splunk-otel-collector/values.yaml +++ b/helm-charts/splunk-otel-collector/values.yaml @@ -975,7 +975,6 @@ image: # The policy that specifies when the user wants the Universal Base images to be pulled pullPolicy: IfNotPresent - ################################################################################ # Extra system configuration ################################################################################ @@ -1178,13 +1177,14 @@ operator: enabled: false admissionWebhooks: certManager: - # Annotate the certificate and issuer to ensure they are created after the cert-manager CRDs have been installed. - certificateAnnotations: - "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "1" + # Annotate the issuer and certificate to ensure they are created after the cert-manager CRDs + # have been installed and cert-manager is ready. issuerAnnotations: "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "1" + "helm.sh/hook-weight": "2" + certificateAnnotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "3" # Collector deployment via the operator is not supported at this time. # The collector image repository is specified here to meet operator subchart constraints. manager: @@ -1271,6 +1271,13 @@ instrumentation: # - name: NGINX_ENV_VAR # value: nginx_value # Auto-instrumentation Libraries (End) + # Jobs related to installing, uninstalling, or managing Operator Instrumentation. + jobs: + # Enables waiting for the operator to be available and ready before deploying instrumentation as a hook post-install step. + startupapicheck: + enabled: true + repository: registry.access.redhat.com/ubi9/ubi + tag: latest # The cert-manager is a CNCF application deployed as a subchart and used for supporting operators that require TLS certificates. # Full list of Helm value configurations: https://artifacthub.io/packages/helm/cert-manager/cert-manager?modal=values