From 816d09d94fbc29cf3c948f0cc0cabead4333d69d Mon Sep 17 00:00:00 2001
From: Pavel Tisnovsky
Date: Wed, 11 Dec 2024 09:00:09 +0100
Subject: [PATCH] Waiting for pod in e2e tests

---
 tests/e2e/test_api.py            |  10 +-
 tests/e2e/utils/cluster.py       |  62 ++++++++-
 tests/e2e/utils/metrics.py       |   7 +
 tests/e2e/utils/ols_installer.py | 221 ++++++++++++++++---
 4 files changed, 184 insertions(+), 116 deletions(-)

diff --git a/tests/e2e/test_api.py b/tests/e2e/test_api.py
index d59b9c1a..1d08ed80 100644
--- a/tests/e2e/test_api.py
+++ b/tests/e2e/test_api.py
@@ -33,7 +33,6 @@
     read_conversation_history_count,
     retrieve_connection,
 )
-from tests.e2e.utils.retry import retry_until_timeout_or_success
 from tests.e2e.utils.wait_for_ols import wait_for_ols
 from tests.scripts.must_gather import must_gather
 
@@ -202,14 +201,7 @@ def test_forbidden_user():
 def test_transcripts_storing_cluster():
     """Test if the transcripts are stored properly."""
     transcripts_path = OLS_USER_DATA_PATH + "/transcripts"
-    r = retry_until_timeout_or_success(
-        120,
-        5,
-        lambda: len(cluster_utils.get_pod_by_prefix(fail_not_found=False)) == 1,
-    )
-    if not r:
-        print("Timed out waiting for new OLS pod to be ready")
-        return
+    cluster_utils.wait_for_running_pod()
     pod_name = cluster_utils.get_pod_by_prefix()[0]
     # disable collector script to avoid interference with the test
     cluster_utils.create_file(pod_name, OLS_COLLECTOR_DISABLING_FILE, "")
diff --git a/tests/e2e/utils/cluster.py b/tests/e2e/utils/cluster.py
index 86f377cf..5543eb6a 100644
--- a/tests/e2e/utils/cluster.py
+++ b/tests/e2e/utils/cluster.py
@@ -3,6 +3,10 @@
 import json
 import subprocess
 
+from tests.e2e.utils.retry import retry_until_timeout_or_success
+
+OC_COMMAND_RETRY_COUNT = 120
+
 
 def run_oc(
     args: list[str], input=None, ignore_existing_resource=False  # noqa: A002
@@ -25,6 +29,7 @@ def run_oc(
             f"Error running oc command {args}: {e}, stdout: {e.output}, stderr: {e.stderr}"
         )
         raise
+    return subprocess.CompletedProcess("", 0)
 
 
 def run_oc_and_store_stdout(
@@ -171,9 +176,7 @@ def get_pod_by_prefix(
     pods = []
     try:
         result = get_pods(namespace)
-        for pod in result:
-            if prefix in pod:
-                pods.append(pod)  # noqa: PERF401
+        pods = [pod for pod in result if prefix in pod]
         if fail_not_found and not pods:
             assert False, f"No OLS api server pod found in list pods: {result}"
         return pods
@@ -295,3 +298,56 @@ def remove_file(pod_name: str, path: str) -> None:
         run_oc(["exec", pod_name, "--", "rm", path])
     except subprocess.CalledProcessError as e:
         raise Exception("Error removing file") from e
+
+
+def wait_for_running_pod(
+    name: str = "lightspeed-app-server-", namespace: str = "openshift-lightspeed"
+):
+    """Wait for the selected pod to be in running state."""
+    # wait for a new pod to show up in the Pending phase
+    retry_until_timeout_or_success(
+        5,
+        3,
+        lambda: len(
+            run_oc(
+                [
+                    "get",
+                    "pods",
+                    "--field-selector=status.phase=Pending",
+                    "-n",
+                    namespace,
+                ]
+            ).stdout
+        )
+        == 1,
+    )
+    # then wait for the pod to reach the Running phase
+    retry_until_timeout_or_success(
+        OC_COMMAND_RETRY_COUNT,
+        6,
+        lambda: name
+        in run_oc(
+            [
+                "get",
+                "pods",
+                "--field-selector=status.phase=Running",
+                "-n",
+                namespace,
+            ]
+        ).stdout,
+    )
+
+    # wait for the new OLS app pod to be created and running:
+    # there should be exactly one; if we see more than one, it may be an old
+    # pod, and we need to wait for it to go away before progressing so we
+    # don't try to interact with it.
+    r = retry_until_timeout_or_success(
+        OC_COMMAND_RETRY_COUNT,
+        5,
+        lambda: len(
+            get_pod_by_prefix(prefix=name, namespace=namespace, fail_not_found=False)
+        )
+        == 1,
+    )
+    if not r:
+        raise Exception("Timed out waiting for new OLS pod to be ready")
diff --git a/tests/e2e/utils/metrics.py b/tests/e2e/utils/metrics.py
index f72e4088..490acb85 100644
--- a/tests/e2e/utils/metrics.py
+++ b/tests/e2e/utils/metrics.py
@@ -166,6 +166,9 @@ def __init__(self, client, endpoint, status_code=requests.codes.ok):
         self.client = client
         self.endpoint = endpoint
         self.status_code = status_code
+        # to be updated when the code enters the "with" block
+        self.old_counter = None
+        self.old_duration = None
 
     def __enter__(self):
         """Retrieve old counter value before calling REST API."""
@@ -224,6 +227,10 @@ def __init__(
         # expect change in number of received tokens
         self.expect_received_change = expect_received_change
 
+        # to be updated when the code enters the "with" block
+        self.old_counter_token_sent_total = None
+        self.old_counter_token_received_total = None
+
     def __enter__(self):
         """Retrieve old counter values before calling LLM."""
         if self.skip_check:
diff --git a/tests/e2e/utils/ols_installer.py b/tests/e2e/utils/ols_installer.py
index 7808c90a..88b4cc50 100644
--- a/tests/e2e/utils/ols_installer.py
+++ b/tests/e2e/utils/ols_installer.py
@@ -13,11 +13,116 @@
 OC_COMMAND_RETRY_DELAY = 5
 
 
-def install_ols() -> tuple[str, str, str]:
+def create_and_config_sas() -> tuple[str, str]:
+    """Create and provide access to service accounts for testing.
+
+    Returns:
+        tuple containing token and metrics token.
+    """
+    cluster_utils.run_oc(
+        ["project", "openshift-lightspeed"], ignore_existing_resource=True
+    )
+    cluster_utils.create_user("test-user", ignore_existing_resource=True)
+    cluster_utils.create_user("metrics-test-user", ignore_existing_resource=True)
+    token = cluster_utils.get_token_for("test-user")
+    metrics_token = cluster_utils.get_token_for("metrics-test-user")
+    print("created test service account users")
+
+    # grant the test service accounts permission to query ols and retrieve metrics
+    cluster_utils.grant_sa_user_access("test-user", "lightspeed-operator-query-access")
+    cluster_utils.grant_sa_user_access(
+        "metrics-test-user", "lightspeed-operator-ols-metrics-reader"
+    )
+    print("test service account permissions granted")
+    return token, metrics_token
+
+
+def update_ols_config() -> None:
+    """Update the olsconfig configmap with log and collector config for e2e tests.
+
+    Returns:
+        Nothing.
+ """ + # modify olsconfig configmap + configmap_yaml = cluster_utils.run_oc(["get", "cm/olsconfig", "-o", "yaml"]).stdout + configmap = yaml.safe_load(configmap_yaml) + olsconfig = yaml.safe_load(configmap["data"]["olsconfig.yaml"]) + + # one of our libs logs a secrets in debug mode which causes the pod + # logs beying redacted/removed completely - we need log at info level + olsconfig["ols_config"]["logging_config"]["lib_log_level"] = "INFO" + + # add collector config for e2e tests + olsconfig["user_data_collector_config"] = { + "data_storage": "/app-root/ols-user-data", + "log_level": "debug", + "collection_interval": 10, + "run_without_initial_wait": True, + "ingress_env": "stage", + "cp_offline_token": os.getenv("CP_OFFLINE_TOKEN", ""), + } + configmap["data"]["olsconfig.yaml"] = yaml.dump(olsconfig) + updated_configmap = yaml.dump(configmap) + + cluster_utils.run_oc(["delete", "configmap", "olsconfig"]) + cluster_utils.run_oc(["apply", "-f", "-"], input=updated_configmap) + + +def replace_ols_image(ols_image: str) -> None: + """Replace the existing ols image with a new one. + + Args: + ols_image (str): the new ols image to be added to the server pod. + + Returns: + Nothing. + """ + print(f"Updating deployment to use OLS image {ols_image}") + + # Ensure the operator controller manager pod is gone before touching anything else + retry_until_timeout_or_success( + OC_COMMAND_RETRY_COUNT, + OC_COMMAND_RETRY_DELAY, + lambda: not cluster_utils.get_pod_by_prefix( + "lightspeed-operator-controller-manager", fail_not_found=False + ), + ) + + # scale down the ols api server so we can ensure no pods + # are still running the unsubstituted image + cluster_utils.run_oc( + [ + "scale", + "deployment/lightspeed-app-server", + "--replicas", + "0", + ] + ) + + # wait for the old ols api pod to go away due to deployment being scaled down + retry_until_timeout_or_success( + OC_COMMAND_RETRY_COUNT, + OC_COMMAND_RETRY_DELAY, + lambda: not cluster_utils.get_pod_by_prefix(fail_not_found=False), + ) + + # update the OLS deployment to use the new image from CI/OLS_IMAGE env var + patch = f"""[{{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value":"{ols_image}"}}]""" # noqa: E501 + cluster_utils.run_oc( + ["patch", "deployment/lightspeed-app-server", "--type", "json", "-p", patch] + ) + + patch = f"""[{{"op": "replace", "path": "/spec/template/spec/containers/1/image", "value":"{ols_image}"}}]""" # noqa: E501 + cluster_utils.run_oc( + ["patch", "deployment/lightspeed-app-server", "--type", "json", "-p", patch] + ) + + +def install_ols() -> tuple[str, str, str]: # pylint: disable=R0915 """Install OLS onto an OCP cluster using the OLS operator.""" print("Setting up for on cluster test execution") - - if not os.getenv("KONFLUX_OLS_SERVICE_IMAGE"): + is_konflux = os.getenv("KONFLUX_BOOL") + if not is_konflux: # setup the lightspeed namespace cluster_utils.run_oc( ["create", "ns", "openshift-lightspeed"], ignore_existing_resource=True @@ -68,11 +173,7 @@ def install_ols() -> tuple[str, str, str]: ) raise - cluster_utils.create_user("test-user", ignore_existing_resource=True) - cluster_utils.create_user("metrics-test-user", ignore_existing_resource=True) - token = cluster_utils.get_token_for("test-user") - metrics_token = cluster_utils.get_token_for("metrics-test-user") - print("created test service account users") + token, metrics_token = create_and_config_sas() # wait for the operator to install # time.sleep(3) # not sure if it is needed but it fails sometimes @@ -94,13 +195,6 @@ def 
         return None
     print("Operator installed successfully")
 
-    # grant the test service accounts permission to query ols and retrieve metrics
-    cluster_utils.grant_sa_user_access("test-user", "lightspeed-operator-query-access")
-    cluster_utils.grant_sa_user_access(
-        "metrics-test-user", "lightspeed-operator-ols-metrics-reader"
-    )
-    print("test service account permissions granted")
-
     provider = os.getenv("PROVIDER")
 
     # create the llm api key secret ols will mount
@@ -200,9 +294,7 @@ def install_ols() -> tuple[str, str, str]:
     )
 
     # get the name of the OLS image from CI so we can substitute it in
-    ols_image = os.getenv("OLS_IMAGE", "")
-
-    print(f"Updating deployment to use OLS image {ols_image}")
+    new_ols_image = os.getenv("OLS_IMAGE", "")
 
     # scale down the operator controller manager to avoid it interfering with the tests
     cluster_utils.run_oc(
@@ -214,17 +306,12 @@ def install_ols() -> tuple[str, str, str]:
         ]
     )
 
-    # Ensure the operator controller manager pod is gone before touching anything else
-    retry_until_timeout_or_success(
-        OC_COMMAND_RETRY_COUNT,
-        OC_COMMAND_RETRY_DELAY,
-        lambda: not cluster_utils.get_pod_by_prefix(
-            "lightspeed-operator-controller-manager", fail_not_found=False
-        ),
-    )
+    if new_ols_image != "":
+        replace_ols_image(new_ols_image)
 
-    # scale down the ols api server so we can ensure no pods
-    # are still running the unsubstituted image
+    # Scale down the server pod. If the image was replaced, this is a no-op;
+    # otherwise it enables the config modification and the subsequent
+    # scale-up.
     cluster_utils.run_oc(
         [
             "scale",
             "deployment/lightspeed-app-server",
             "--replicas",
             "0",
         ]
     )
-
-    # wait for the old ols api pod to go away due to deployment being scaled down
-    retry_until_timeout_or_success(
-        OC_COMMAND_RETRY_COUNT,
-        OC_COMMAND_RETRY_DELAY,
-        lambda: not cluster_utils.get_pod_by_prefix(fail_not_found=False),
-    )
-
-    # update the OLS deployment to use the new image from CI/OLS_IMAGE env var
-    patch = f"""[{{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value":"{ols_image}"}}]"""  # noqa: E501
-    cluster_utils.run_oc(
-        ["patch", "deployment/lightspeed-app-server", "--type", "json", "-p", patch]
-    )
-
-    patch = f"""[{{"op": "replace", "path": "/spec/template/spec/containers/1/image", "value":"{ols_image}"}}]"""  # noqa: E501
-    cluster_utils.run_oc(
-        ["patch", "deployment/lightspeed-app-server", "--type", "json", "-p", patch]
-    )
-
-    # modify olsconfig configmap
-    configmap_yaml = cluster_utils.run_oc(["get", "cm/olsconfig", "-o", "yaml"]).stdout
-    configmap = yaml.safe_load(configmap_yaml)
-    olsconfig = yaml.safe_load(configmap["data"]["rcsconfig.yaml"])
-
-    # one of our libs logs a secrets in debug mode which causes the pod
-    # logs beying redacted/removed completely - we need log at info level
-    olsconfig["ols_config"]["logging_config"]["lib_log_level"] = "INFO"
-
-    # add collector config for e2e tests
-    olsconfig["user_data_collector_config"] = {
-        "data_storage": "/app-root/ols-user-data",
-        "log_level": "debug",
-        "collection_interval": 10,
-        "run_without_initial_wait": True,
-        "ingress_env": "stage",
-        "cp_offline_token": os.getenv("CP_OFFLINE_TOKEN", ""),
-    }
-    configmap["data"]["rcsconfig.yaml"] = yaml.dump(olsconfig)
-    updated_configmap = yaml.dump(configmap)
-
-    cluster_utils.run_oc(["delete", "configmap", "olsconfig"])
-    cluster_utils.run_oc(["apply", "-f", "-"], input=updated_configmap)
-
+    update_ols_config()
     # scale the ols app server up
     cluster_utils.run_oc(
         [
             "scale",
             "deployment/lightspeed-app-server",
             "--replicas",
             "1",
         ]
@@ -287,39 +332,7 @@ def install_ols() -> tuple[str, str, str]:
     )
     print("Deployment updated, waiting for new pod to be ready")
     # Wait for the pod to start being created and then wait for it to start running.
-    r = retry_until_timeout_or_success(
-        OC_COMMAND_RETRY_COUNT,
-        2,
-        lambda: len(
-            cluster_utils.run_oc(
-                ["get", "pods", "--field-selector=status.phase=Pending"]
-            ).stdout
-        )
-        == 1,
-    )
-    r = retry_until_timeout_or_success(
-        OC_COMMAND_RETRY_COUNT,
-        5,
-        lambda: len(
-            cluster_utils.run_oc(
-                ["get", "pods", "--field-selector=status.phase=Pending"]
-            ).stdout
-        )
-        == 0,
-    )
-
-    # wait for new ols app pod to be created+running
-    # there should be exactly one, if we see more than one it may be an old pod
-    # and we need to wait for it to go away before progressing so we don't try to
-    # interact with it.
-    r = retry_until_timeout_or_success(
-        OC_COMMAND_RETRY_COUNT,
-        5,
-        lambda: len(cluster_utils.get_pod_by_prefix(fail_not_found=False)) == 1,
-    )
-    if not r:
-        print("Timed out waiting for new OLS pod to be ready")
-        return None
+    cluster_utils.wait_for_running_pod()
 
     print("-" * 50)
     print("OLS pod seems to be ready")
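
For context: this patch leans on retry_until_timeout_or_success from
tests/e2e/utils/retry.py, whose implementation is not part of the diff.
Judging only from the call sites above (a retry count, a delay in seconds,
and a zero-argument predicate), it behaves roughly like the sketch below;
the parameter names and exact signature are inferred from usage, not copied
from that module:

    import time
    from typing import Callable

    def retry_until_timeout_or_success(
        attempts: int, delay: float, predicate: Callable[[], bool]
    ) -> bool:
        """Poll `predicate` up to `attempts` times, sleeping `delay` seconds
        between calls; return True on the first success, False on timeout."""
        for _ in range(attempts):
            if predicate():
                return True
            time.sleep(delay)
        return False

Under that reading, OC_COMMAND_RETRY_COUNT = 120 with a 5-second delay gives
the new OLS pod up to roughly 10 minutes to reach the expected state before
wait_for_running_pod raises.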