Skip to content

Commit

Permalink
Merge pull request RamenDR#399 from raghavendra-talur/rtalur-backport…
Browse files Browse the repository at this point in the history
…-1646

Backport of PR RamenDR#1646 drpc: set ProgressionWaitOnUserToCleanUp along with placement update
  • Loading branch information
openshift-merge-bot[bot] authored Nov 15, 2024
2 parents 28a80de + 57f8ebe commit 4db50c6
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 38 deletions.
68 changes: 52 additions & 16 deletions internal/controller/drplacementcontrol.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ func (d *DRPCInstance) RunFailover() (bool, error) {
return !done, nil
}

return d.ensureActionCompleted(failoverCluster)
return d.ensureFailoverActionCompleted(failoverCluster)
} else if yes, err := d.mwExistsAndPlacementUpdated(failoverCluster); yes || err != nil {
// We have to wait for the VRG to appear on the failoverCluster or
// in case of an error, try again later
Expand Down Expand Up @@ -863,7 +863,7 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation,
metav1.ConditionTrue, string(d.instance.Status.Phase), "Completed")

return d.ensureActionCompleted(preferredCluster)
return d.ensureRelocateActionCompleted(preferredCluster)
}

d.setStatusInitiating()
Expand Down Expand Up @@ -896,6 +896,29 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
return d.relocate(preferredCluster, preferredClusterNamespace, rmn.Relocating)
}

// ensureRelocateActionCompleted marks the DRPC progression as CleaningUp and
// then delegates to ensureActionCompleted to finish the relocate on
// srcCluster. Unlike the failover path (see ensureFailoverActionCompleted),
// relocate always reports CleaningUp here regardless of whether the workload
// is a discovered app — the discovered-app wait-on-user progression for
// relocate is set earlier, before the placement is cleared.
func (d *DRPCInstance) ensureRelocateActionCompleted(srcCluster string) (bool, error) {
	d.setProgression(rmn.ProgressionCleaningUp)

	return d.ensureActionCompleted(srcCluster)
}

// ensureFailoverActionCompleted sets the post-failover progression and then
// delegates to ensureActionCompleted for srcCluster.
//
// After a failover the workload must be cleaned up on the cluster we failed
// over from. For ACM-managed apps that cleanup happens automatically once the
// placement points at the target cluster, so the progression is CleaningUp.
// Discovered apps (indicated by a non-empty Spec.ProtectedNamespaces list)
// must be removed by the user, so the progression is WaitOnUserToCleanUp
// instead.
func (d *DRPCInstance) ensureFailoverActionCompleted(srcCluster string) (bool, error) {
	discoveredApp := d.instance.Spec.ProtectedNamespaces != nil &&
		len(*d.instance.Spec.ProtectedNamespaces) > 0

	progression := rmn.ProgressionCleaningUp
	if discoveredApp {
		progression = rmn.ProgressionWaitOnUserToCleanUp
	}

	d.setProgression(progression)

	return d.ensureActionCompleted(srcCluster)
}

func (d *DRPCInstance) ensureActionCompleted(srcCluster string) (bool, error) {
const done = true

Expand All @@ -909,8 +932,6 @@ func (d *DRPCInstance) ensureActionCompleted(srcCluster string) (bool, error) {
return !done, err
}

d.setProgression(rmn.ProgressionCleaningUp)

// Cleanup and setup VolSync if enabled
err = d.ensureCleanupAndVolSyncReplicationSetup(srcCluster)
if err != nil {
Expand Down Expand Up @@ -939,6 +960,7 @@ func (d *DRPCInstance) ensureCleanupAndVolSyncReplicationSetup(srcCluster string
return fmt.Errorf("waiting for RDSpec count on cluster %s to go to zero. VRG OK? %v", srcCluster, ok)
}

// Ensure cleanup waits for the VRG to be secondary on the clusters other than srcCluster
err = d.EnsureCleanup(srcCluster)
if err != nil {
return err
Expand All @@ -954,6 +976,7 @@ func (d *DRPCInstance) ensureCleanupAndVolSyncReplicationSetup(srcCluster string
return nil
}

// nolint: cyclop
func (d *DRPCInstance) quiesceAndRunFinalSync(homeCluster string) (bool, error) {
const done = true

Expand All @@ -974,8 +997,19 @@ func (d *DRPCInstance) quiesceAndRunFinalSync(homeCluster string) (bool, error)
addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation,
d.getConditionStatusForTypeAvailable(), string(d.instance.Status.Phase), "Starting quiescing for relocation")

// clear current user PlacementRule's decision
d.setProgression(rmn.ProgressionClearingPlacement)
// We are going to clear the placement, this is when ACM will start
// deleting the workloads from the current cluster. In case of
// discovered apps, we have to let the user know that they need to
// clean up the apps from the current cluster. So set the progression
// to wait on user to clean up. For non-discovered apps, we can set the
// progression to clearing placement.
if d.instance.Spec.ProtectedNamespaces != nil &&
len(*d.instance.Spec.ProtectedNamespaces) > 0 {
d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
} else {
// clear current user PlacementRule's decision
d.setProgression(rmn.ProgressionClearingPlacement)
}

err := d.clearUserPlacementRuleStatus()
if err != nil {
Expand All @@ -990,7 +1024,12 @@ func (d *DRPCInstance) quiesceAndRunFinalSync(homeCluster string) (bool, error)
}

if !result {
d.setProgression(rmn.ProgressionRunningFinalSync)
if d.instance.Spec.ProtectedNamespaces != nil &&
len(*d.instance.Spec.ProtectedNamespaces) > 0 {
d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
} else {
d.setProgression(rmn.ProgressionRunningFinalSync)
}

return !done, nil
}
Expand Down Expand Up @@ -1235,7 +1274,12 @@ func (d *DRPCInstance) setupRelocation(preferredCluster string) error {
// complete in one shot, then coming back to this loop will reset the preferredCluster to secondary again.
clusterToSkip := preferredCluster
if !d.ensureVRGIsSecondaryEverywhere(clusterToSkip) {
d.setProgression(rmn.ProgressionEnsuringVolumesAreSecondary)
if d.instance.Spec.ProtectedNamespaces != nil &&
len(*d.instance.Spec.ProtectedNamespaces) > 0 {
d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
} else {
d.setProgression(rmn.ProgressionEnsuringVolumesAreSecondary)
}
// During relocation, both clusters should be up and both must be secondaries before we proceed.
if !d.moveVRGToSecondaryEverywhere() {
return fmt.Errorf("failed to move VRG to secondary everywhere")
Expand Down Expand Up @@ -2048,10 +2092,6 @@ func (d *DRPCInstance) ensureVRGManifestWorkOnClusterDeleted(clusterName string)

d.log.Info("Request not complete yet", "cluster", clusterName)

if d.instance.Spec.ProtectedNamespaces != nil && len(*d.instance.Spec.ProtectedNamespaces) > 0 {
d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
}

// IF we get here, either the VRG has not transitioned to secondary (yet) or delete didn't succeed. In either cases,
// we need to make sure that the VRG object is deleted. IOW, we still have to wait
return !done, nil
Expand All @@ -2067,10 +2107,6 @@ func (d *DRPCInstance) ensureVRGIsSecondaryEverywhere(clusterToSkip string) bool
continue
}

if d.instance.Spec.ProtectedNamespaces != nil && len(*d.instance.Spec.ProtectedNamespaces) > 0 {
d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
}

if !d.ensureVRGIsSecondaryOnCluster(clusterName) {
d.log.Info("Still waiting for VRG to transition to secondary", "cluster", clusterName)

Expand Down
12 changes: 12 additions & 0 deletions internal/controller/volsync/vshandler.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ type VSHandler struct {
destinationCopyMethod volsyncv1alpha1.CopyMethodType
volumeSnapshotClassList *snapv1.VolumeSnapshotClassList
vrgInAdminNamespace bool
workloadStatus string
}

func NewVSHandler(ctx context.Context, client client.Client, log logr.Logger, owner metav1.Object,
Expand All @@ -98,6 +99,10 @@ func NewVSHandler(ctx context.Context, client client.Client, log logr.Logger, ow
return vsHandler
}

// GetWorkloadStatus returns the workload status last recorded by this
// handler. It is set to "active" when a protected PVC is found to be mounted
// or otherwise in use (see validatePVCBeforeRS and PrecreateDestPVCIfEnabled);
// otherwise it is the zero value "" — presumably meaning "not known to be
// active", TODO confirm callers treat empty as inactive.
// NOTE(review): Go convention would name this WorkloadStatus (no Get prefix),
// but renaming would break existing callers.
func (v *VSHandler) GetWorkloadStatus() string {
	return v.workloadStatus
}

// returns replication destination only if create/update is successful and the RD is considered available.
// Callers should assume getting a nil replication destination back means they should retry/requeue.
//
Expand Down Expand Up @@ -361,6 +366,8 @@ func (v *VSHandler) validatePVCBeforeRS(rsSpec ramendrv1alpha1.VolSyncReplicatio
}

if pvcIsMounted {
v.workloadStatus = "active"

return false, nil
}

Expand Down Expand Up @@ -1634,6 +1641,8 @@ func (v *VSHandler) IsRDDataProtected(pvcName, pvcNamespace string) (bool, error
func (v *VSHandler) PrecreateDestPVCIfEnabled(rdSpec ramendrv1alpha1.VolSyncReplicationDestinationSpec,
) (*string, error) {
if !v.IsCopyMethodDirect() {
// TODO:
// We need to check the workload status even in other cases.
v.log.Info("Using default copyMethod of Snapshot")

return nil, nil // use default copyMethod
Expand All @@ -1655,6 +1664,9 @@ func (v *VSHandler) PrecreateDestPVCIfEnabled(rdSpec ramendrv1alpha1.VolSyncRepl
// on this cluster). That race condition will be ignored. That would be a user error to deploy the
// same app in the same namespace and on the destination cluster...
if inUse {
// Even if one pvc is in use, mark the workload status as active
v.workloadStatus = "active"

return nil, fmt.Errorf("pvc %v is mounted by others. Checking later",
util.ProtectedPVCNamespacedName(rdSpec.ProtectedPVC))
}
Expand Down
19 changes: 1 addition & 18 deletions internal/controller/volumereplicationgroup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1236,31 +1236,14 @@ func (v *VRGInstance) updateVRGStatus(result ctrl.Result) ctrl.Result {
// on the cluster
func (v *VRGInstance) updateStatusState() {
dataReadyCondition := findCondition(v.instance.Status.Conditions, VRGConditionTypeDataReady)
if dataReadyCondition == nil {
// VRG is exclusively using volsync
if v.instance.Spec.ReplicationState == ramendrv1alpha1.Secondary &&
len(v.instance.Spec.VolSync.RDSpec) > 0 {
v.instance.Status.State = ramendrv1alpha1.SecondaryState

return
}

v.log.Info("Failed to find the DataReady condition in status")

v.instance.Status.State = ramendrv1alpha1.UnknownState

return
}

StatusState := getStatusStateFromSpecState(v.instance.Spec.ReplicationState)

if dataReadyCondition.Status != metav1.ConditionTrue ||
dataReadyCondition.ObservedGeneration != v.instance.Generation {
v.instance.Status.State = ramendrv1alpha1.UnknownState

return
}

StatusState := getStatusStateFromSpecState(v.instance.Spec.ReplicationState)
if v.instance.Spec.ReplicationState == ramendrv1alpha1.Primary {
v.instance.Status.State = StatusState

Expand Down
19 changes: 15 additions & 4 deletions internal/controller/vrg_volsync.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,8 @@ func (v *VRGInstance) reconcileVolSyncAsSecondary() bool {
// If we are secondary, and RDSpec is not set, then we don't want to have any PVC
// flagged as a VolSync PVC.
if v.instance.Spec.VolSync.RDSpec == nil {
// This might be a case where we lose the RDSpec temporarily,
// so we don't know if workload status is truly inactive.
idx := 0

for _, protectedPVC := range v.instance.Status.ProtectedPVCs {
Expand All @@ -250,6 +252,7 @@ func (v *VRGInstance) reconcileRDSpecForDeletionOrReplication() bool {
requeue := false
rdinCGs := []ramendrv1alpha1.VolSyncReplicationDestinationSpec{}

// TODO: Set the workload status in CG code path later
for _, rdSpec := range v.instance.Spec.VolSync.RDSpec {
cg, ok := rdSpec.ProtectedPVC.Labels[ConsistencyGroupLabel]
if ok && util.IsCGEnabled(v.instance.Annotations) {
Expand Down Expand Up @@ -347,16 +350,24 @@ func (v *VRGInstance) aggregateVolSyncDataReadyCondition() *metav1.Condition {
Message: "All VolSync PVCs are ready",
}

if v.instance.Spec.ReplicationState == ramendrv1alpha1.Secondary {
if len(v.volSyncPVCs) == 0 {
dataReadyCondition.Reason = VRGConditionReasonUnused
dataReadyCondition.Message = "Volsync based PVC protection does not report DataReady condition as Secondary"
dataReadyCondition.Message = "No PVCs are protected using Volsync scheme"

return dataReadyCondition
}

if len(v.volSyncPVCs) == 0 {
if v.instance.Spec.ReplicationState == ramendrv1alpha1.Secondary {
if v.volSyncHandler.GetWorkloadStatus() == "active" {
dataReadyCondition.Status = metav1.ConditionFalse
dataReadyCondition.Reason = VRGConditionReasonProgressing
dataReadyCondition.Message = "Some of the VolSync PVCs are active"

return dataReadyCondition
}

dataReadyCondition.Reason = VRGConditionReasonUnused
dataReadyCondition.Message = "No PVCs are protected using Volsync scheme"
dataReadyCondition.Message = "Volsync based PVC protection does not report DataReady condition as Secondary"

return dataReadyCondition
}
Expand Down

0 comments on commit 4db50c6

Please sign in to comment.