Skip to content

Commit

Permalink
[Stretch cluster] test device replacement in stretch cluster (red-hat…
Browse files Browse the repository at this point in the history
…-storage#9548)

Signed-off-by: Mahesh Shetty <[email protected]>
  • Loading branch information
mashetty330 authored Feb 10, 2025
1 parent 17eb221 commit afa1627
Show file tree
Hide file tree
Showing 4 changed files with 201 additions and 1 deletion.
17 changes: 17 additions & 0 deletions ocs_ci/ocs/resources/pv.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,23 @@ def get_pv_status(pv_obj):
return pv_obj.get("status").get("phase")


def get_pv_in_status(storage_class, status="Bound"):
    """
    Collect the PVs of a given storageclass that are currently in a given status.

    Args:
        storage_class (str): name of the storage class to filter PVs by
        status (str): desired PV phase to match (default: "Bound")

    Returns:
        list: PV objects whose status phase equals ``status``

    """
    matching_pvs = []
    for pv_obj in get_pv_objs_in_sc(storage_class):
        if get_pv_status(pv_obj) == status:
            matching_pvs.append(pv_obj)
    return matching_pvs


def get_pv_name(pv_obj):
"""
Get the name of the pv object
Expand Down
35 changes: 35 additions & 0 deletions tests/functional/disaster-recovery/sc_arbiter/test_add_capacity.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from ocs_ci.framework.pytest_customization.marks import (
turquoise_squad,
stretchcluster_required,
tier1,
)
from ocs_ci.helpers.cnv_helpers import cal_md5sum_vm
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources import storage_cluster
from ocs_ci.ocs.resources.pod import (
Expand All @@ -18,6 +20,7 @@
logger = logging.getLogger(__name__)


@tier1
@turquoise_squad
@stretchcluster_required
class TestAddCapacityStretchCluster:
Expand Down Expand Up @@ -85,6 +88,8 @@ def test_cluster_expansion(
setup_logwriter_rbd_workload_factory,
logreader_workload_factory,
iterations,
setup_cnv,
cnv_workload,
):
"""
Test cluster exapnsion and health when add capacity is performed
Expand All @@ -107,6 +112,13 @@ def test_cluster_expansion(
)
logger.info("All the workloads pods are successfully up and running")

# setup vm and write some data to the VM instance
vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)
vm_obj.run_ssh_cmd(
command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400"
)
md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")

start_time = datetime.now(timezone.utc)

sc_obj.get_logfile_map(label=constants.LOGWRITER_CEPHFS_LABEL)
Expand All @@ -123,6 +135,29 @@ def test_cluster_expansion(
sc_obj.post_failure_checks(start_time, end_time, wait_for_read_completion=False)
logger.info("Successfully verified with post failure checks for the workloads")

# check vm data written after the failure for integrity
md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
assert (
md5sum_before == md5sum_after
), "Data integrity of the file inside VM is not maintained during the add capacity"
logger.info(
"Data integrity of the file inside VM is maintained during the add capacity"
)

# check if new data can be created
vm_obj.run_ssh_cmd(
command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
)
logger.info("Successfully created new data inside VM")

# check if the data can be copied back to local machine
vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
logger.info("VM data is successfully copied back to local machine")

# stop the VM
vm_obj.stop()
logger.info("Stoped the VM successfully")

sc_obj.cephfs_logreader_job.delete()
logger.info(sc_obj.cephfs_logreader_pods)
for pod in sc_obj.cephfs_logreader_pods:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import logging
from datetime import datetime, timezone

from ocs_ci.framework.pytest_customization.marks import (
stretchcluster_required,
turquoise_squad,
polarion_id,
tier1,
)
from ocs_ci.helpers.cnv_helpers import cal_md5sum_vm
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_in_statuses

from ocs_ci.ocs.osd_operations import osd_device_replacement
from ocs_ci.ocs.resources.stretchcluster import StretchCluster

logger = logging.getLogger(__name__)


@tier1
@stretchcluster_required
@turquoise_squad
class TestDeviceReplacementInStretchCluster:
    """
    Verify that an OSD device replacement on a stretch cluster does not cause
    data loss or data corruption for CephFS/RBD logwriter workloads and a CNV VM.
    """

    @polarion_id("OCS-5047")
    def test_device_replacement(
        self,
        nodes,
        setup_logwriter_cephfs_workload_factory,
        setup_logwriter_rbd_workload_factory,
        logreader_workload_factory,
        cnv_workload,
        setup_cnv,
    ):
        """
        Test device replacement in stretch cluster while logwriter workload
        for both CephFS and RBD is running

        Steps:
            1) Run logwriter/reader workload for both CephFS and RBD volumes
            2) Perform device replacement procedure
            3) Verify no data loss
            4) Verify no data corruption

        """

        sc_obj = StretchCluster()

        # setup logwriter workloads in the background
        (
            sc_obj.cephfs_logwriter_dep,
            sc_obj.cephfs_logreader_job,
        ) = setup_logwriter_cephfs_workload_factory(read_duration=0)

        sc_obj.get_logwriter_reader_pods(label=constants.LOGWRITER_CEPHFS_LABEL)
        sc_obj.get_logwriter_reader_pods(label=constants.LOGREADER_CEPHFS_LABEL)
        # RBD logwriter runs with 2 replicas (RWO volumes), unlike the CephFS one
        sc_obj.get_logwriter_reader_pods(
            label=constants.LOGWRITER_RBD_LABEL, exp_num_replicas=2
        )
        logger.info("All the workloads pods are successfully up and running")

        # setup vm and write some data to the VM instance
        vm_obj = cnv_workload(volume_interface=constants.VM_VOLUME_PVC)
        vm_obj.run_ssh_cmd(
            command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400"
        )
        # checksum taken before the disruption; compared again afterwards
        md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")

        # mark the window during which the disruption happens, so the
        # post-failure checks can scope their log verification to it
        start_time = datetime.now(timezone.utc)

        sc_obj.get_logfile_map(label=constants.LOGWRITER_CEPHFS_LABEL)
        sc_obj.get_logfile_map(label=constants.LOGWRITER_RBD_LABEL)

        # run device replacement procedure
        logger.info("Running device replacement procedure now")
        osd_device_replacement(nodes)

        # check IO for any failures
        end_time = datetime.now(timezone.utc)
        sc_obj.post_failure_checks(start_time, end_time, wait_for_read_completion=False)
        logger.info("Successfully verified with post failure checks for the workloads")

        # check vm data written after the failure for integrity
        md5sum_after = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")
        assert (
            md5sum_before == md5sum_after
        ), "Data integrity of the file inside VM is not maintained during the device replacement"
        logger.info(
            "Data integrity of the file inside VM is maintained during the device replacement"
        )

        # check if new data can be created
        vm_obj.run_ssh_cmd(
            command="dd if=/dev/zero of=/file_2.txt bs=1024 count=103600"
        )
        logger.info("Successfully created new data inside VM")

        # check if the data can be copied back to local machine
        vm_obj.scp_from_vm(local_path="/tmp", vm_src_path="/file_1.txt")
        logger.info("VM data is successfully copied back to local machine")

        # stop the VM
        vm_obj.stop()
        logger.info("Stopped the VM successfully")

        # tear down the old CephFS logreader pods before data-loss verification
        sc_obj.cephfs_logreader_job.delete()
        logger.info(sc_obj.cephfs_logreader_pods)
        for pod in sc_obj.cephfs_logreader_pods:
            pod.wait_for_pod_delete(timeout=120)
        logger.info("All old CephFS logreader pods are deleted")

        # check for any data loss
        assert sc_obj.check_for_data_loss(
            constants.LOGWRITER_CEPHFS_LABEL
        ), "[CephFS] Data is lost"
        logger.info("[CephFS] No data loss is seen")
        assert sc_obj.check_for_data_loss(
            constants.LOGWRITER_RBD_LABEL
        ), "[RBD] Data is lost"
        logger.info("[RBD] No data loss is seen")

        # check for data corruption: re-read the written logs with a fresh
        # logreader job and let it validate the contents
        logreader_workload_factory(
            pvc=sc_obj.get_workload_pvc_obj(constants.LOGWRITER_CEPHFS_LABEL)[0],
            logreader_path=constants.LOGWRITER_CEPHFS_READER,
            duration=5,
        )
        sc_obj.get_logwriter_reader_pods(constants.LOGREADER_CEPHFS_LABEL)

        wait_for_pods_to_be_in_statuses(
            expected_statuses=constants.STATUS_COMPLETED,
            pod_names=[pod.name for pod in sc_obj.cephfs_logreader_pods],
            timeout=900,
            namespace=constants.STRETCH_CLUSTER_NAMESPACE,
        )
        logger.info("[CephFS] Logreader job pods have reached 'Completed' state!")

        assert sc_obj.check_for_data_corruption(
            label=constants.LOGREADER_CEPHFS_LABEL
        ), "Data is corrupted for cephFS workloads"
        logger.info("No data corruption is seen in CephFS workloads")

        assert sc_obj.check_for_data_corruption(
            label=constants.LOGWRITER_RBD_LABEL
        ), "Data is corrupted for RBD workloads"
        logger.info("No data corruption is seen in RBD workloads")
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
polarion_id,
stretchcluster_required,
turquoise_squad,
tier2,
)
from ocs_ci.helpers.cnv_helpers import cal_md5sum_vm
from ocs_ci.helpers.helpers import modify_deployment_replica_count
Expand Down Expand Up @@ -147,7 +148,7 @@ def setup_cnv_workload(request, cnv_workload_class, setup_cnv):
logger.info("Setting up CNV workload and creating some data")
vm_obj = cnv_workload_class(
volume_interface=constants.VM_VOLUME_PVC, namespace=CNV_WORKLOAD_NAMESPACE
)[0]
)
vm_obj.run_ssh_cmd(command="dd if=/dev/zero of=/file_1.txt bs=1024 count=102400")
md5sum_before = cal_md5sum_vm(vm_obj, file_path="/file_1.txt")

Expand Down Expand Up @@ -181,6 +182,7 @@ def finalizer():
request.addfinalizer(finalizer)


@tier2
@turquoise_squad
@stretchcluster_required
@pytest.mark.usefixtures("setup_cnv_workload")
Expand Down

0 comments on commit afa1627

Please sign in to comment.