From 57f8ebebed575966e9e76164a696a6edb28d3d56 Mon Sep 17 00:00:00 2001
From: Raghavendra Talur
Date: Sun, 10 Nov 2024 18:31:58 -0500
Subject: [PATCH] drpc: set ProgressionWaitOnUserToCleanUp along with placement update

For discovered apps, we want the user to perform the cleanup of the
workload. We should advertise the progression to them at the same time
as we ask OCM/ACM to perform the cleanup.

Signed-off-by: Raghavendra Talur
(cherry picked from commit a079ffdf459599f532404b0237e4f92bf46b2197)
---
 internal/controller/drplacementcontrol.go    | 68 ++++++++++++++-----
 internal/controller/volsync/vshandler.go     | 12 ++++
 .../volumereplicationgroup_controller.go     | 19 +-----
 internal/controller/vrg_volsync.go           | 19 ++++--
 4 files changed, 80 insertions(+), 38 deletions(-)

diff --git a/internal/controller/drplacementcontrol.go b/internal/controller/drplacementcontrol.go
index 5c9c57715..a7d4bbe85 100644
--- a/internal/controller/drplacementcontrol.go
+++ b/internal/controller/drplacementcontrol.go
@@ -373,7 +373,7 @@ func (d *DRPCInstance) RunFailover() (bool, error) {
 			return !done, nil
 		}
 
-		return d.ensureActionCompleted(failoverCluster)
+		return d.ensureFailoverActionCompleted(failoverCluster)
 	} else if yes, err := d.mwExistsAndPlacementUpdated(failoverCluster); yes || err != nil {
 		// We have to wait for the VRG to appear on the failoverCluster or
 		// in case of an error, try again later
@@ -863,7 +863,7 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
 		addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation,
 			metav1.ConditionTrue, string(d.instance.Status.Phase), "Completed")
 
-		return d.ensureActionCompleted(preferredCluster)
+		return d.ensureRelocateActionCompleted(preferredCluster)
 	}
 
 	d.setStatusInitiating()
@@ -896,6 +896,29 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
 	return d.relocate(preferredCluster, preferredClusterNamespace, rmn.Relocating)
 }
 
+func (d *DRPCInstance) ensureRelocateActionCompleted(srcCluster string) (bool, error) {
+	d.setProgression(rmn.ProgressionCleaningUp)
+
+	return d.ensureActionCompleted(srcCluster)
+}
+
+func (d *DRPCInstance) ensureFailoverActionCompleted(srcCluster string) (bool, error) {
+	// This is the time to cleanup the workload from the preferredCluster.
+	// For managed apps, it will be done automatically by ACM, when we update
+	// the placement to the targetCluster. For discovered apps, we have to let
+	// the user know that they need to clean up the apps.
+	// So set the progression to wait on user to clean up.
+	// If not discovered apps, then we can set the progression to cleaning up.
+	if d.instance.Spec.ProtectedNamespaces != nil &&
+		len(*d.instance.Spec.ProtectedNamespaces) > 0 {
+		d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
+	} else {
+		d.setProgression(rmn.ProgressionCleaningUp)
+	}
+
+	return d.ensureActionCompleted(srcCluster)
+}
+
 func (d *DRPCInstance) ensureActionCompleted(srcCluster string) (bool, error) {
 	const done = true
 
@@ -909,8 +932,6 @@ func (d *DRPCInstance) ensureActionCompleted(srcCluster string) (bool, error) {
 		return !done, err
 	}
 
-	d.setProgression(rmn.ProgressionCleaningUp)
-
 	// Cleanup and setup VolSync if enabled
 	err = d.ensureCleanupAndVolSyncReplicationSetup(srcCluster)
 	if err != nil {
@@ -939,6 +960,7 @@ func (d *DRPCInstance) ensureCleanupAndVolSyncReplicationSetup(srcCluster string
 		return fmt.Errorf("waiting for RDSpec count on cluster %s to go to zero. VRG OK? %v", srcCluster, ok)
 	}
 
+	// Ensure cleanup waits for the VRG to be secondary on the clusters other than srcCluster
 	err = d.EnsureCleanup(srcCluster)
 	if err != nil {
 		return err
@@ -954,6 +976,7 @@ func (d *DRPCInstance) ensureCleanupAndVolSyncReplicationSetup(srcCluster string
 	return nil
 }
 
+// nolint: cyclop
 func (d *DRPCInstance) quiesceAndRunFinalSync(homeCluster string) (bool, error) {
 	const done = true
 
@@ -974,8 +997,19 @@ func (d *DRPCInstance) quiesceAndRunFinalSync(homeCluster string) (bool, error)
 		addOrUpdateCondition(&d.instance.Status.Conditions, rmn.ConditionAvailable, d.instance.Generation,
 			d.getConditionStatusForTypeAvailable(), string(d.instance.Status.Phase), "Starting quiescing for relocation")
 
-		// clear current user PlacementRule's decision
-		d.setProgression(rmn.ProgressionClearingPlacement)
+		// We are going to clear the placement, this is when ACM will start
+		// deleting the workloads from the current cluster. In case of
+		// discovered apps, we have to let the user know that they need to
+		// clean up the apps from the current cluster. So set the progression
+		// to wait on user to clean up. For non-discovered apps, we can set the
+		// progression to clearing placement.
+		if d.instance.Spec.ProtectedNamespaces != nil &&
+			len(*d.instance.Spec.ProtectedNamespaces) > 0 {
+			d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
+		} else {
+			// clear current user PlacementRule's decision
+			d.setProgression(rmn.ProgressionClearingPlacement)
+		}
 
 		err := d.clearUserPlacementRuleStatus()
 		if err != nil {
@@ -990,7 +1024,12 @@ func (d *DRPCInstance) quiesceAndRunFinalSync(homeCluster string) (bool, error)
 	}
 
 	if !result {
-		d.setProgression(rmn.ProgressionRunningFinalSync)
+		if d.instance.Spec.ProtectedNamespaces != nil &&
+			len(*d.instance.Spec.ProtectedNamespaces) > 0 {
+			d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
+		} else {
+			d.setProgression(rmn.ProgressionRunningFinalSync)
+		}
 
 		return !done, nil
 	}
@@ -1235,7 +1274,12 @@ func (d *DRPCInstance) setupRelocation(preferredCluster string) error {
 	// complete in one shot, then coming back to this loop will reset the preferredCluster to secondary again.
 	clusterToSkip := preferredCluster
 	if !d.ensureVRGIsSecondaryEverywhere(clusterToSkip) {
-		d.setProgression(rmn.ProgressionEnsuringVolumesAreSecondary)
+		if d.instance.Spec.ProtectedNamespaces != nil &&
+			len(*d.instance.Spec.ProtectedNamespaces) > 0 {
+			d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
+		} else {
+			d.setProgression(rmn.ProgressionEnsuringVolumesAreSecondary)
+		}
 		// During relocation, both clusters should be up and both must be secondaries before we proceed.
 		if !d.moveVRGToSecondaryEverywhere() {
 			return fmt.Errorf("failed to move VRG to secondary everywhere")
@@ -2048,10 +2092,6 @@ func (d *DRPCInstance) ensureVRGManifestWorkOnClusterDeleted(clusterName string)
 	d.log.Info("Request not complete yet", "cluster", clusterName)
 
-	if d.instance.Spec.ProtectedNamespaces != nil && len(*d.instance.Spec.ProtectedNamespaces) > 0 {
-		d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
-	}
-
 	// IF we get here, either the VRG has not transitioned to secondary (yet) or delete didn't succeed. In either cases,
 	// we need to make sure that the VRG object is deleted. IOW, we still have to wait
 	return !done, nil
 }
@@ -2067,10 +2107,6 @@ func (d *DRPCInstance) ensureVRGIsSecondaryEverywhere(clusterToSkip string) bool
 			continue
 		}
 
-		if d.instance.Spec.ProtectedNamespaces != nil && len(*d.instance.Spec.ProtectedNamespaces) > 0 {
-			d.setProgression(rmn.ProgressionWaitOnUserToCleanUp)
-		}
-
 		if !d.ensureVRGIsSecondaryOnCluster(clusterName) {
 			d.log.Info("Still waiting for VRG to transition to secondary", "cluster", clusterName)
 
diff --git a/internal/controller/volsync/vshandler.go b/internal/controller/volsync/vshandler.go
index 6457c9355..8db798618 100644
--- a/internal/controller/volsync/vshandler.go
+++ b/internal/controller/volsync/vshandler.go
@@ -73,6 +73,7 @@ type VSHandler struct {
 	destinationCopyMethod   volsyncv1alpha1.CopyMethodType
 	volumeSnapshotClassList *snapv1.VolumeSnapshotClassList
 	vrgInAdminNamespace     bool
+	workloadStatus          string
 }
 
 func NewVSHandler(ctx context.Context, client client.Client, log logr.Logger, owner metav1.Object,
@@ -98,6 +99,10 @@ func NewVSHandler(ctx context.Context, client client.Client, log logr.Logger, ow
 	return vsHandler
 }
 
+func (v *VSHandler) GetWorkloadStatus() string {
+	return v.workloadStatus
+}
+
 // returns replication destination only if create/update is successful and the RD is considered available.
 // Callers should assume getting a nil replication destination back means they should retry/requeue.
 //
@@ -361,6 +366,8 @@ func (v *VSHandler) validatePVCBeforeRS(rsSpec ramendrv1alpha1.VolSyncReplicatio
 	}
 
 	if pvcIsMounted {
+		v.workloadStatus = "active"
+
 		return false, nil
 	}
 
@@ -1634,6 +1641,8 @@ func (v *VSHandler) IsRDDataProtected(pvcName, pvcNamespace string) (bool, error
 func (v *VSHandler) PrecreateDestPVCIfEnabled(rdSpec ramendrv1alpha1.VolSyncReplicationDestinationSpec,
 ) (*string, error) {
 	if !v.IsCopyMethodDirect() {
+		// TODO:
+		// We need to check the workload status even in other cases.
 		v.log.Info("Using default copyMethod of Snapshot")
 
 		return nil, nil // use default copyMethod
@@ -1655,6 +1664,9 @@ func (v *VSHandler) PrecreateDestPVCIfEnabled(rdSpec ramendrv1alpha1.VolSyncRepl
 	// on this cluster). That race condition will be ignored. That would be a user error to deploy the
 	// same app in the same namespace and on the destination cluster...
 	if inUse {
+		// Even if one pvc is in use, mark the workload status as active
+		v.workloadStatus = "active"
+
 		return nil, fmt.Errorf("pvc %v is mounted by others. Checking later",
 			util.ProtectedPVCNamespacedName(rdSpec.ProtectedPVC))
 	}
diff --git a/internal/controller/volumereplicationgroup_controller.go b/internal/controller/volumereplicationgroup_controller.go
index fc48ce2a5..d064e529d 100644
--- a/internal/controller/volumereplicationgroup_controller.go
+++ b/internal/controller/volumereplicationgroup_controller.go
@@ -1234,24 +1234,6 @@ func (v *VRGInstance) updateVRGStatus(result ctrl.Result) ctrl.Result {
 // on the cluster
 func (v *VRGInstance) updateStatusState() {
 	dataReadyCondition := findCondition(v.instance.Status.Conditions, VRGConditionTypeDataReady)
-	if dataReadyCondition == nil {
-		// VRG is exclusively using volsync
-		if v.instance.Spec.ReplicationState == ramendrv1alpha1.Secondary &&
-			len(v.instance.Spec.VolSync.RDSpec) > 0 {
-			v.instance.Status.State = ramendrv1alpha1.SecondaryState
-
-			return
-		}
-
-		v.log.Info("Failed to find the DataReady condition in status")
-
-		v.instance.Status.State = ramendrv1alpha1.UnknownState
-
-		return
-	}
-
-	StatusState := getStatusStateFromSpecState(v.instance.Spec.ReplicationState)
-
 	if dataReadyCondition.Status != metav1.ConditionTrue ||
 		dataReadyCondition.ObservedGeneration != v.instance.Generation {
 		v.instance.Status.State = ramendrv1alpha1.UnknownState
@@ -1259,6 +1241,7 @@ func (v *VRGInstance) updateStatusState() {
 		return
 	}
 
+	StatusState := getStatusStateFromSpecState(v.instance.Spec.ReplicationState)
 	if v.instance.Spec.ReplicationState == ramendrv1alpha1.Primary {
 		v.instance.Status.State = StatusState
 
diff --git a/internal/controller/vrg_volsync.go b/internal/controller/vrg_volsync.go
index d2059262f..6e0853120 100644
--- a/internal/controller/vrg_volsync.go
+++ b/internal/controller/vrg_volsync.go
@@ -225,6 +225,8 @@ func (v *VRGInstance) reconcileVolSyncAsSecondary() bool {
 	// If we are secondary, and RDSpec is not set, then we don't want to have any PVC
 	// flagged as a VolSync PVC.
 	if v.instance.Spec.VolSync.RDSpec == nil {
+		// This might be a case where we lose the RDSpec temporarily,
+		// so we don't know if workload status is truly inactive.
 		idx := 0
 
 		for _, protectedPVC := range v.instance.Status.ProtectedPVCs {
@@ -250,6 +252,7 @@ func (v *VRGInstance) reconcileRDSpecForDeletionOrReplication() bool {
 	requeue := false
 	rdinCGs := []ramendrv1alpha1.VolSyncReplicationDestinationSpec{}
 
+	// TODO: Set the workload status in CG code path later
 	for _, rdSpec := range v.instance.Spec.VolSync.RDSpec {
 		cg, ok := rdSpec.ProtectedPVC.Labels[ConsistencyGroupLabel]
 		if ok && util.IsCGEnabled(v.instance.Annotations) {
@@ -347,16 +350,24 @@ func (v *VRGInstance) aggregateVolSyncDataReadyCondition() *metav1.Condition {
 		Message:            "All VolSync PVCs are ready",
 	}
 
-	if v.instance.Spec.ReplicationState == ramendrv1alpha1.Secondary {
+	if len(v.volSyncPVCs) == 0 {
 		dataReadyCondition.Reason = VRGConditionReasonUnused
-		dataReadyCondition.Message = "Volsync based PVC protection does not report DataReady condition as Secondary"
+		dataReadyCondition.Message = "No PVCs are protected using Volsync scheme"
 
 		return dataReadyCondition
 	}
 
-	if len(v.volSyncPVCs) == 0 {
+	if v.instance.Spec.ReplicationState == ramendrv1alpha1.Secondary {
+		if v.volSyncHandler.GetWorkloadStatus() == "active" {
+			dataReadyCondition.Status = metav1.ConditionFalse
+			dataReadyCondition.Reason = VRGConditionReasonProgressing
+			dataReadyCondition.Message = "Some of the VolSync PVCs are active"
+
+			return dataReadyCondition
+		}
+
 		dataReadyCondition.Reason = VRGConditionReasonUnused
-		dataReadyCondition.Message = "No PVCs are protected using Volsync scheme"
+		dataReadyCondition.Message = "Volsync based PVC protection does not report DataReady condition as Secondary"
 
 		return dataReadyCondition
 	}