Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add validation code to check replicas for quorum loss #102

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
13 changes: 13 additions & 0 deletions src/go/k8s/api/redpanda/v1alpha1/redpanda_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,19 @@ func RedpandaReady(rp *Redpanda) *Redpanda {
return rp
}

// RedpandaStalled registers a failure in reconciliation of a given HelmRelease.
func RedpandaStalled(rp *Redpanda, msg string) *Redpanda {
newCondition := metav1.Condition{
Type: meta.StalledCondition,
Status: metav1.ConditionTrue,
Reason: "RedpandaClusterStalled",
Message: msg,
}
apimeta.SetStatusCondition(rp.GetConditions(), newCondition)
rp.Status.LastAppliedRevision = rp.Status.LastAttemptedRevision
return rp
}

// RedpandaNotReady registers a failed reconciliation of the given Redpanda.
func RedpandaNotReady(rp *Redpanda, reason, message string) *Redpanda {
newCondition := metav1.Condition{
Expand Down
95 changes: 95 additions & 0 deletions src/go/k8s/internal/controller/redpanda/redpanda_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@ import (
"context"
"crypto/sha256"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"maps"
"reflect"
"strconv"
"time"

helmv2beta1 "github.com/fluxcd/helm-controller/api/v2beta1"
Expand Down Expand Up @@ -60,6 +62,11 @@ const (

revisionPath = "/revision"
componentLabelValue = "redpanda-statefulset"

minimumTopicReplicas = 3

// these constants can be removed after versions older that v22.3.1 are no longer supported
defaultTopicReplicationKey = "default_topic_replications"
)

var errWaitForReleaseDeletion = errors.New("wait for helm release deletion")
Expand Down Expand Up @@ -610,6 +617,15 @@ func (r *RedpandaReconciler) reconcileHelmRelease(ctx context.Context, rp *v1alp
return rp, hr, fmt.Errorf("failed to get HelmRelease '%s/%s': %w", rp.Namespace, rp.Status.HelmRelease, err)
}

// We have retrieved an existing HelmRelease here, if it did not exist, it would have been created above
// so this is a good place, to validate the HelmRelease before updating.
errValidating := validateHelmRelease(rp, hr)
if errValidating != nil {
v1alpha1.RedpandaStalled(rp, "HelmRelease validation failed")
r.event(rp, rp.Status.LastAttemptedRevision, v1alpha1.EventSeverityError, errValidating.Error())
return rp, hr, fmt.Errorf("validating HelmRelease error: '%s/%s': %w", rp.Namespace, rp.Status.HelmRelease, errValidating)
}

// Check if we need to update here
hrTemplate, errTemplated := r.createHelmReleaseFromTemplate(ctx, rp)
if errTemplated != nil {
Expand Down Expand Up @@ -886,3 +902,82 @@ func disableConsoleReconciliation(console *vectorzied_v1alpha1.Console) {
}
console.Annotations[managedAnnotationKey] = NotManaged
}

func validateHelmRelease(rp *v1alpha1.Redpanda, hr *helmv2beta2.HelmRelease) error {
errs := make([]error, 0)

errReplicaCount := validateHelmReleaseReplicaCount(rp, hr)
if errReplicaCount != nil {
errs = append(errs, errReplicaCount)
}

return errors.Join(errs...)
}

func validateHelmReleaseReplicaCount(rp *v1alpha1.Redpanda, hr *helmv2beta2.HelmRelease) error {
// First validate if we are scaling down too fast
clusterSpec := &v1alpha1.RedpandaClusterSpec{}
err := json.Unmarshal(hr.Spec.Values.Raw, clusterSpec)
alejandroEsc marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
// nolint:goerr113 // error is not wrapping existing error
return fmt.Errorf("could not unmarshal values data to validate helmrelease")
}

currentReplicas := ptr.Deref(clusterSpec.Statefulset.Replicas, 0)
if currentReplicas == 0 {
// current replicas is 0, no longer validating.
return nil
}

// Calculate min number of nodes to (floored) to keep quorum
// Note slowly successful decommissioning will change this value,
// so as long as we do not lose quorum we should be able to scale
// in a controlled manner
minForQuorum := (currentReplicas + 1) / 2

requestedReplicas := ptr.Deref(rp.Spec.ClusterSpec.Statefulset.Replicas, 0)

if requestedReplicas < minForQuorum {
// nolint:goerr113 // error is not wrapping existing error
return fmt.Errorf("requested replicas of %d is less than %d neeed to maintain quorum", requestedReplicas, minForQuorum)
}

// If quorum may be preserved, then find out about topic replication and ensure we do not go below default
specConfigs := clusterSpec.Config
doCheckDefMinTopicReplicas := true
// nolint:nestif // complexity is ok
if clusterSpec.Config != nil {
clusterInfo := specConfigs.Cluster
if clusterInfo != nil {
clusterMap := make(map[string]string)
ClusterMapPtr := &clusterMap
errUnmar := json.Unmarshal(clusterInfo.Raw, ClusterMapPtr)
if errUnmar != nil {
return fmt.Errorf("cannot unmarshal cluster config data %w", errUnmar)
}

minReplicationFactor, ok := clusterMap[defaultTopicReplicationKey]
if ok {
doCheckDefMinTopicReplicas = false
minReplicationFactorInt, errConvert := strconv.Atoi(minReplicationFactor)
if errConvert != nil {
return fmt.Errorf("cannot unmarshal cluster config data %w", errConvert)
}

if requestedReplicas < minReplicationFactorInt {
// nolint:goerr113 // error is not wrapping existing error
return fmt.Errorf("requested replicas of %d is less than replication factor %d", requestedReplicas, minReplicationFactorInt)
}
}
}
}

if doCheckDefMinTopicReplicas {
if requestedReplicas < minimumTopicReplicas {
// nolint:goerr113 // error is not wrapping existing error
return fmt.Errorf("requested replicas of %d is less than replication factor %d", requestedReplicas, minimumTopicReplicas)
}
}

return nil
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
---
apiVersion: cluster.redpanda.com/v1alpha1
kind: Redpanda
metadata:
finalizers:
- operator.redpanda.com/finalizer
name: redpanda
spec:
chartRef:
chartVersion: "5.7.35"
clusterSpec:
statefulset:
replicas: 5
status:
conditions:
- message: Redpanda reconciliation succeeded
reason: RedpandaClusterDeployed
status: "True"
type: Ready
helmRelease: redpanda
helmReleaseReady: true
helmRepository: redpanda-repository
helmRepositoryReady: true
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: redpanda
spec:
replicas: 5
status:
availableReplicas: 5
currentReplicas: 5
readyReplicas: 5
replicas: 5
updatedReplicas: 5
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
collectors:
- command: ../../../hack/get-redpanda-info.sh redpanda ../../_e2e_artifacts_v2
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
apiVersion: cluster.redpanda.com/v1alpha1
kind: Redpanda
metadata:
name: redpanda
spec:
chartRef:
chartVersion: "5.7.35"
clusterSpec:
statefulset:
replicas: 5
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
---
apiVersion: cluster.redpanda.com/v1alpha1
kind: Redpanda
metadata:
finalizers:
- operator.redpanda.com/finalizer
name: redpanda
spec:
chartRef:
chartVersion: "5.7.35"
clusterSpec:
statefulset:
replicas: 2
status:
conditions:
- message: Reconciliation in progress
reason: Progressing
status: Unknown
type: Ready
- message: HelmRelease validation failed
reason: RedpandaClusterStalled
status: "True"
type: Stalled
helmRelease: redpanda
helmReleaseReady: true
helmRepository: redpanda-repository
helmRepositoryReady: true
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: redpanda
spec:
replicas: 5
status:
availableReplicas: 5
currentReplicas: 5
readyReplicas: 5
replicas: 5
updatedReplicas: 5
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
collectors:
- command: ../../../hack/get-redpanda-info.sh redpanda ../../_e2e_artifacts_v2

Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
apiVersion: cluster.redpanda.com/v1alpha1
kind: Redpanda
metadata:
name: redpanda
spec:
chartRef:
chartVersion: "5.7.35"
clusterSpec:
statefulset:
replicas: 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
apiVersion: cluster.redpanda.com/v1alpha1
kind: Redpanda
metadata:
finalizers:
- operator.redpanda.com/finalizer
name: redpanda
spec:
chartRef:
chartVersion: "5.7.35"
clusterSpec:
statefulset:
replicas: 4
status:
conditions:
- message: Redpanda reconciliation succeeded
reason: RedpandaClusterDeployed
status: "True"
type: Ready
helmRelease: redpanda
helmReleaseReady: true
helmRepository: redpanda-repository
helmRepositoryReady: true
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: redpanda
spec:
replicas: 4
status:
availableReplicas: 4
currentReplicas: 4
readyReplicas: 4
replicas: 4
updatedReplicas: 4
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
collectors:
- command: ../../../hack/get-redpanda-info.sh redpanda ../../_e2e_artifacts_v2

Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
apiVersion: cluster.redpanda.com/v1alpha1
kind: Redpanda
metadata:
name: redpanda
spec:
chartRef:
chartVersion: "5.7.35"
clusterSpec:
statefulset:
replicas: 4
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
apiVersion: cluster.redpanda.com/v1alpha1
kind: Redpanda
metadata:
finalizers:
- operator.redpanda.com/finalizer
name: redpanda
spec:
chartRef:
chartVersion: "5.7.35"
clusterSpec:
statefulset:
replicas: 3
status:
conditions:
- message: Redpanda reconciliation succeeded
reason: RedpandaClusterDeployed
status: "True"
type: Ready
helmRelease: redpanda
helmReleaseReady: true
helmRepository: redpanda-repository
helmRepositoryReady: true
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: redpanda
spec:
replicas: 3
status:
availableReplicas: 3
currentReplicas: 3
readyReplicas: 3
replicas: 3
updatedReplicas: 3
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
collectors:
- command: ../../../hack/get-redpanda-info.sh redpanda ../../_e2e_artifacts_v2

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
apiVersion: cluster.redpanda.com/v1alpha1
kind: Redpanda
metadata:
name: redpanda
spec:
chartRef:
chartVersion: "5.7.35"
clusterSpec:
statefulset:
replicas: 3
# TODO: Default replication factor is 3.
# If any Topics have less than this, then the cluster is considered unhealthy
# So we must check the replication factor the topics of a cluster and also stop
# Scaling in those cases
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
collectors:
- command: ../../../hack/get-redpanda-info.sh redpanda ../../_e2e_artifacts_v2
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestStep
delete:
- apiVersion: cluster.redpanda.com/v1alpha1
kind: Redpanda
- apiVersion: batch/v1
kind: Job
- apiVersion: v1
kind: PersistentVolumeClaim
- apiVersion: v1
kind: Pod
labels:
app.kubernetes.io/name: redpanda
- apiVersion: v1
kind: Service
labels:
app.kubernetes.io/name: redpanda
Loading
Loading