Skip to content

Commit

Permalink
Merge pull request red-hat-storage#11228 from sagihirshfeld/fix-ep-restart-method-racecondition
Browse files Browse the repository at this point in the history

Fix `MCG::reset_endpoint_pods` race condition - use rollout restart instead of deleting pods by name
  • Loading branch information
petr-balogh authored Jan 29, 2025
2 parents c301be6 + 03ff989 commit 7148326
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 20 deletions.
30 changes: 19 additions & 11 deletions ocs_ci/ocs/resources/mcg.py
Original file line number Diff line number Diff line change
Expand Up @@ -1003,7 +1003,7 @@ def reset_core_pod(self):

self.core_pod.delete(wait=True)
wait_for_pods_by_label_count(
label=constants.NOOBAA_CORE_POD_LABEL, exptected_count=1
label=constants.NOOBAA_CORE_POD_LABEL, expected_count=1
)
self.core_pod = Pod(
**get_pods_having_label(constants.NOOBAA_CORE_POD_LABEL, self.namespace)[0]
Expand All @@ -1018,18 +1018,26 @@ def reset_endpoint_pods(self):

from ocs_ci.ocs.resources.pod import wait_for_pods_by_label_count

endpoint_pods = [
Pod(**pod_data)
for pod_data in get_pods_having_label(
constants.NOOBAA_ENDPOINT_POD_LABEL, self.namespace
)
]
for pod in endpoint_pods:
pod.delete(wait=True)

nb_ep_dep_obj = OCP(
kind="deployment",
namespace=self.namespace,
resource_name=constants.NOOBAA_ENDPOINT_DEPLOYMENT,
)
nb_ep_dep_obj.exec_oc_cmd(
f"rollout restart deployment/{constants.NOOBAA_ENDPOINT_DEPLOYMENT}"
)
# Wait for the rollout to complete
nb_ep_dep_obj.exec_oc_cmd(
(
f"rollout status deployment/{constants.NOOBAA_ENDPOINT_DEPLOYMENT}"
" --timeout=120s"
),
out_yaml_format=False,
)
expected_pod_count = nb_ep_dep_obj.get().get("spec").get("replicas")
wait_for_pods_by_label_count(
label=constants.NOOBAA_ENDPOINT_POD_LABEL,
exptected_count=len(endpoint_pods),
expected_count=expected_pod_count,
)

endpoint_pods = [
Expand Down
8 changes: 4 additions & 4 deletions ocs_ci/ocs/resources/pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -2577,7 +2577,7 @@ def wait_for_pods_to_be_running(

def wait_for_pods_by_label_count(
label,
exptected_count,
expected_count,
namespace=config.ENV_DATA["cluster_namespace"],
timeout=200,
sleep=10,
Expand All @@ -2588,7 +2588,7 @@ def wait_for_pods_by_label_count(
Args:
label (str): The resource selector to search with
exptected_count (int): The expected number of pods with the given selector
expected_count (int): The expected number of pods with the given selector
namespace (str): the namespace of the pods
timeout (int): time to wait for pods to be running
sleep (int): Time in seconds to sleep between attempts
Expand All @@ -2604,8 +2604,8 @@ def wait_for_pods_by_label_count(
namespace=namespace,
):
# Check if the expected number of pods with the given selector is met
if pods_count == exptected_count:
logger.info(f"Found {exptected_count} pods with selector {label}")
if pods_count == expected_count:
logger.info(f"Found {expected_count} pods with selector {label}")
return True
except TimeoutExpiredError:
logger.warning(
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6051,7 +6051,7 @@ def nsfs_bucket_factory_implementation(nsfs_obj):
# and for the obsolete noobaa-endpoint pods to be terminated
wait_for_pods_by_label_count(
label=constants.NOOBAA_ENDPOINT_POD_LABEL,
exptected_count=original_endpoint_pods_count,
expected_count=original_endpoint_pods_count,
)

# Apply the necessary permissions on the filesystem
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def test_single_mon_failures(self):
logger.info(f"recovering mon {mon_dep} now...")
modify_deployment_replica_count(mon_dep, 1)
wait_for_pods_by_label_count(
label=constants.MON_APP_LABEL, exptected_count=5, timeout=300
label=constants.MON_APP_LABEL, expected_count=5, timeout=300
)

@polarion_id("OCS-5060")
Expand Down Expand Up @@ -257,7 +257,7 @@ def test_both_mon_failure(self):
expected_mon_count -= 1
mon_deps.append(mon_dep)
wait_for_pods_by_label_count(
label=constants.MON_APP_LABEL, exptected_count=expected_mon_count
label=constants.MON_APP_LABEL, expected_count=expected_mon_count
)

time.sleep(600)
Expand All @@ -267,7 +267,7 @@ def test_both_mon_failure(self):
logger.info(f"Recovering mon by scaling up the mon deployment {mon_dep}")
modify_deployment_replica_count(mon_dep, 1)
wait_for_pods_by_label_count(
label=constants.MON_APP_LABEL, exptected_count=5, timeout=300
label=constants.MON_APP_LABEL, expected_count=5, timeout=300
)

@polarion_id("OCS-5061")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_nb_endpoint_topology_spread_constraints(setup_nb_endpoint):
), f"Topology spread constraints are not set as expected: \n {topology_spread_constraint}"
logger.info("Topology spread constraints are set correctly")

wait_for_pods_by_label_count(constants.NOOBAA_ENDPOINT_POD_LABEL, exptected_count=2)
wait_for_pods_by_label_count(constants.NOOBAA_ENDPOINT_POD_LABEL, expected_count=2)

nb_endpoint_pods = [
Pod(**pod)
Expand Down

0 comments on commit 7148326

Please sign in to comment.