From 816d09d94fbc29cf3c948f0cc0cabead4333d69d Mon Sep 17 00:00:00 2001
From: Pavel Tisnovsky
Date: Wed, 11 Dec 2024 09:00:09 +0100
Subject: [PATCH] Waiting for pod in e2e tests

---
 tests/e2e/test_api.py            |  10 +-
 tests/e2e/utils/cluster.py       |  62 ++++++++-
 tests/e2e/utils/metrics.py       |   7 +
 tests/e2e/utils/ols_installer.py | 221 ++++++++++++++++---
 4 files changed, 184 insertions(+), 116 deletions(-)

diff --git a/tests/e2e/test_api.py b/tests/e2e/test_api.py
index d59b9c1a..1d08ed80 100644
--- a/tests/e2e/test_api.py
+++ b/tests/e2e/test_api.py
@@ -33,7 +33,6 @@
     read_conversation_history_count,
     retrieve_connection,
 )
-from tests.e2e.utils.retry import retry_until_timeout_or_success
 from tests.e2e.utils.wait_for_ols import wait_for_ols
 from tests.scripts.must_gather import must_gather
 
@@ -202,14 +201,7 @@ def test_forbidden_user():
 def test_transcripts_storing_cluster():
     """Test if the transcripts are stored properly."""
     transcripts_path = OLS_USER_DATA_PATH + "/transcripts"
-    r = retry_until_timeout_or_success(
-        120,
-        5,
-        lambda: len(cluster_utils.get_pod_by_prefix(fail_not_found=False)) == 1,
-    )
-    if not r:
-        print("Timed out waiting for new OLS pod to be ready")
-        return
+    cluster_utils.wait_for_running_pod()
     pod_name = cluster_utils.get_pod_by_prefix()[0]
     # disable collector script to avoid interference with the test
     cluster_utils.create_file(pod_name, OLS_COLLECTOR_DISABLING_FILE, "")
diff --git a/tests/e2e/utils/cluster.py b/tests/e2e/utils/cluster.py
index 86f377cf..5543eb6a 100644
--- a/tests/e2e/utils/cluster.py
+++ b/tests/e2e/utils/cluster.py
@@ -3,6 +3,10 @@
 import json
 import subprocess
 
+from tests.e2e.utils.retry import retry_until_timeout_or_success
+
+OC_COMMAND_RETRY_COUNT = 120
+
 
 def run_oc(
     args: list[str], input=None, ignore_existing_resource=False  # noqa: A002
@@ -25,6 +29,7 @@ def run_oc(
             f"Error running oc command {args}: {e}, stdout: {e.output}, stderr: {e.stderr}"
         )
         raise
+    return subprocess.CompletedProcess("", 0)
 
 
 def run_oc_and_store_stdout(
@@ -171,9 +176,7 @@ def get_pod_by_prefix(
     pods = []
     try:
         result = get_pods(namespace)
-        for pod in result:
-            if prefix in pod:
-                pods.append(pod)  # noqa: PERF401
+        pods = [pod for pod in result if prefix in pod]
         if fail_not_found and not pods:
             assert False, f"No OLS api server pod found in list pods: {result}"
         return pods
@@ -295,3 +298,56 @@ def remove_file(pod_name: str, path: str) -> None:
         run_oc(["exec", pod_name, "--", "rm", path])
     except subprocess.CalledProcessError as e:
         raise Exception("Error removing file") from e
+
+
+def wait_for_running_pod(
+    name: str = "lightspeed-app-server-", namespace: str = "openshift-lightspeed"
+):
+    """Wait for the selected pod to be in running state."""
+    # wait for a new pod to show up in the Pending phase
+    retry_until_timeout_or_success(
+        5,
+        3,
+        lambda: len(
+            run_oc(
+                [
+                    "get",
+                    "pods",
+                    "--field-selector=status.phase=Pending",
+                    "-n",
+                    namespace,
+                ]
+            ).stdout
+        )
+        == 1,
+    )
+    # then wait for the pod to reach the Running phase
+    retry_until_timeout_or_success(
+        OC_COMMAND_RETRY_COUNT,
+        6,
+        lambda: name
+        in run_oc(
+            [
+                "get",
+                "pods",
+                "--field-selector=status.phase=Running",
+                "-n",
+                namespace,
+            ]
+        ).stdout,
+    )
+
+    # wait for the new OLS app pod to be created and running:
+    # there should be exactly one; if we see more than one, it may be an old
+    # pod, and we need to wait for it to go away before progressing so we
+    # don't try to interact with it.
+    r = retry_until_timeout_or_success(
+        OC_COMMAND_RETRY_COUNT,
+        5,
+        lambda: len(
+            get_pod_by_prefix(prefix=name, namespace=namespace, fail_not_found=False)
+        )
+        == 1,
+    )
+    if not r:
+        raise Exception("Timed out waiting for new OLS pod to be ready")
diff --git a/tests/e2e/utils/metrics.py b/tests/e2e/utils/metrics.py
index f72e4088..490acb85 100644
--- a/tests/e2e/utils/metrics.py
+++ b/tests/e2e/utils/metrics.py
@@ -166,6 +166,9 @@ def __init__(self, client, endpoint, status_code=requests.codes.ok):
         self.client = client
         self.endpoint = endpoint
         self.status_code = status_code
+        # to be updated when the code enters the "with" block
+        self.old_counter = None
+        self.old_duration = None
 
     def __enter__(self):
         """Retrieve old counter value before calling REST API."""
@@ -224,6 +227,10 @@ def __init__(
         # expect change in number of received tokens
         self.expect_received_change = expect_received_change
 
+        # to be updated when the code enters the "with" block
+        self.old_counter_token_sent_total = None
+        self.old_counter_token_received_total = None
+
     def __enter__(self):
         """Retrieve old counter values before calling LLM."""
         if self.skip_check:
diff --git a/tests/e2e/utils/ols_installer.py b/tests/e2e/utils/ols_installer.py
index 7808c90a..88b4cc50 100644
--- a/tests/e2e/utils/ols_installer.py
+++ b/tests/e2e/utils/ols_installer.py
@@ -13,11 +13,116 @@
 OC_COMMAND_RETRY_DELAY = 5
 
 
-def install_ols() -> tuple[str, str, str]:
+def create_and_config_sas() -> tuple[str, str]:
+    """Create and provide access to service accounts for testing.
+
+    Returns:
+        tuple containing token and metrics token.
+    """
+    cluster_utils.run_oc(
+        ["project", "openshift-lightspeed"], ignore_existing_resource=True
+    )
+    cluster_utils.create_user("test-user", ignore_existing_resource=True)
+    cluster_utils.create_user("metrics-test-user", ignore_existing_resource=True)
+    token = cluster_utils.get_token_for("test-user")
+    metrics_token = cluster_utils.get_token_for("metrics-test-user")
+    print("created test service account users")
+
+    # grant the test service accounts permission to query ols and retrieve metrics
+    cluster_utils.grant_sa_user_access("test-user", "lightspeed-operator-query-access")
+    cluster_utils.grant_sa_user_access(
+        "metrics-test-user", "lightspeed-operator-ols-metrics-reader"
+    )
+    print("test service account permissions granted")
+    return token, metrics_token
+
+
+def update_ols_config() -> None:
+    """Update the olsconfig configmap with log and collector config for e2e tests.
+
+    Returns:
+        Nothing.
+ """ + # modify olsconfig configmap + configmap_yaml = cluster_utils.run_oc(["get", "cm/olsconfig", "-o", "yaml"]).stdout + configmap = yaml.safe_load(configmap_yaml) + olsconfig = yaml.safe_load(configmap["data"]["olsconfig.yaml"]) + + # one of our libs logs a secrets in debug mode which causes the pod + # logs beying redacted/removed completely - we need log at info level + olsconfig["ols_config"]["logging_config"]["lib_log_level"] = "INFO" + + # add collector config for e2e tests + olsconfig["user_data_collector_config"] = { + "data_storage": "/app-root/ols-user-data", + "log_level": "debug", + "collection_interval": 10, + "run_without_initial_wait": True, + "ingress_env": "stage", + "cp_offline_token": os.getenv("CP_OFFLINE_TOKEN", ""), + } + configmap["data"]["olsconfig.yaml"] = yaml.dump(olsconfig) + updated_configmap = yaml.dump(configmap) + + cluster_utils.run_oc(["delete", "configmap", "olsconfig"]) + cluster_utils.run_oc(["apply", "-f", "-"], input=updated_configmap) + + +def replace_ols_image(ols_image: str) -> None: + """Replace the existing ols image with a new one. + + Args: + ols_image (str): the new ols image to be added to the server pod. + + Returns: + Nothing. + """ + print(f"Updating deployment to use OLS image {ols_image}") + + # Ensure the operator controller manager pod is gone before touching anything else + retry_until_timeout_or_success( + OC_COMMAND_RETRY_COUNT, + OC_COMMAND_RETRY_DELAY, + lambda: not cluster_utils.get_pod_by_prefix( + "lightspeed-operator-controller-manager", fail_not_found=False + ), + ) + + # scale down the ols api server so we can ensure no pods + # are still running the unsubstituted image + cluster_utils.run_oc( + [ + "scale", + "deployment/lightspeed-app-server", + "--replicas", + "0", + ] + ) + + # wait for the old ols api pod to go away due to deployment being scaled down + retry_until_timeout_or_success( + OC_COMMAND_RETRY_COUNT, + OC_COMMAND_RETRY_DELAY, + lambda: not cluster_utils.get_pod_by_prefix(fail_not_found=False), + ) + + # update the OLS deployment to use the new image from CI/OLS_IMAGE env var + patch = f"""[{{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value":"{ols_image}"}}]""" # noqa: E501 + cluster_utils.run_oc( + ["patch", "deployment/lightspeed-app-server", "--type", "json", "-p", patch] + ) + + patch = f"""[{{"op": "replace", "path": "/spec/template/spec/containers/1/image", "value":"{ols_image}"}}]""" # noqa: E501 + cluster_utils.run_oc( + ["patch", "deployment/lightspeed-app-server", "--type", "json", "-p", patch] + ) + + +def install_ols() -> tuple[str, str, str]: # pylint: disable=R0915 """Install OLS onto an OCP cluster using the OLS operator.""" print("Setting up for on cluster test execution") - - if not os.getenv("KONFLUX_OLS_SERVICE_IMAGE"): + is_konflux = os.getenv("KONFLUX_BOOL") + if not is_konflux: # setup the lightspeed namespace cluster_utils.run_oc( ["create", "ns", "openshift-lightspeed"], ignore_existing_resource=True @@ -68,11 +173,7 @@ def install_ols() -> tuple[str, str, str]: ) raise - cluster_utils.create_user("test-user", ignore_existing_resource=True) - cluster_utils.create_user("metrics-test-user", ignore_existing_resource=True) - token = cluster_utils.get_token_for("test-user") - metrics_token = cluster_utils.get_token_for("metrics-test-user") - print("created test service account users") + token, metrics_token = create_and_config_sas() # wait for the operator to install # time.sleep(3) # not sure if it is needed but it fails sometimes @@ -94,13 +195,6 @@ def 
         return None
     print("Operator installed successfully")
 
-    # grant the test service accounts permission to query ols and retrieve metrics
-    cluster_utils.grant_sa_user_access("test-user", "lightspeed-operator-query-access")
-    cluster_utils.grant_sa_user_access(
-        "metrics-test-user", "lightspeed-operator-ols-metrics-reader"
-    )
-    print("test service account permissions granted")
-
     provider = os.getenv("PROVIDER")
 
     # create the llm api key secret ols will mount
@@ -200,9 +294,7 @@ def install_ols() -> tuple[str, str, str]:
     )
 
     # get the name of the OLS image from CI so we can substitute it in
-    ols_image = os.getenv("OLS_IMAGE", "")
-
-    print(f"Updating deployment to use OLS image {ols_image}")
+    new_ols_image = os.getenv("OLS_IMAGE", "")
 
     # scale down the operator controller manager to avoid it interfering with the tests
     cluster_utils.run_oc(
@@ -214,17 +306,12 @@ def install_ols() -> tuple[str, str, str]:
         ]
     )
 
-    # Ensure the operator controller manager pod is gone before touching anything else
-    retry_until_timeout_or_success(
-        OC_COMMAND_RETRY_COUNT,
-        OC_COMMAND_RETRY_DELAY,
-        lambda: not cluster_utils.get_pod_by_prefix(
-            "lightspeed-operator-controller-manager", fail_not_found=False
-        ),
-    )
+    if new_ols_image != "":
+        replace_ols_image(new_ols_image)
 
-    # scale down the ols api server so we can ensure no pods
-    # are still running the unsubstituted image
+    # Scale down the server pod. If the image was replaced, this is a no-op;
+    # otherwise it enables the config modification and the subsequent
+    # scale-up.
     cluster_utils.run_oc(
         [
             "scale",
             "deployment/lightspeed-app-server",
             "--replicas",
             "0",
         ]
     )
-
-    # wait for the old ols api pod to go away due to deployment being scaled down
-    retry_until_timeout_or_success(
-        OC_COMMAND_RETRY_COUNT,
-        OC_COMMAND_RETRY_DELAY,
-        lambda: not cluster_utils.get_pod_by_prefix(fail_not_found=False),
-    )
-
-    # update the OLS deployment to use the new image from CI/OLS_IMAGE env var
-    patch = f"""[{{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value":"{ols_image}"}}]"""  # noqa: E501
-    cluster_utils.run_oc(
-        ["patch", "deployment/lightspeed-app-server", "--type", "json", "-p", patch]
-    )
-
-    patch = f"""[{{"op": "replace", "path": "/spec/template/spec/containers/1/image", "value":"{ols_image}"}}]"""  # noqa: E501
-    cluster_utils.run_oc(
-        ["patch", "deployment/lightspeed-app-server", "--type", "json", "-p", patch]
-    )
-
-    # modify olsconfig configmap
-    configmap_yaml = cluster_utils.run_oc(["get", "cm/olsconfig", "-o", "yaml"]).stdout
-    configmap = yaml.safe_load(configmap_yaml)
-    olsconfig = yaml.safe_load(configmap["data"]["rcsconfig.yaml"])
-
-    # one of our libs logs a secrets in debug mode which causes the pod
-    # logs beying redacted/removed completely - we need log at info level
-    olsconfig["ols_config"]["logging_config"]["lib_log_level"] = "INFO"
-
-    # add collector config for e2e tests
-    olsconfig["user_data_collector_config"] = {
-        "data_storage": "/app-root/ols-user-data",
-        "log_level": "debug",
-        "collection_interval": 10,
-        "run_without_initial_wait": True,
-        "ingress_env": "stage",
-        "cp_offline_token": os.getenv("CP_OFFLINE_TOKEN", ""),
-    }
-    configmap["data"]["rcsconfig.yaml"] = yaml.dump(olsconfig)
-    updated_configmap = yaml.dump(configmap)
-
-    cluster_utils.run_oc(["delete", "configmap", "olsconfig"])
-    cluster_utils.run_oc(["apply", "-f", "-"], input=updated_configmap)
-
+    update_ols_config()
     # scale the ols app server up
     cluster_utils.run_oc(
         [
             "scale",
             "deployment/lightspeed-app-server",
             "--replicas",
             "1",
         ]
@@ -287,39 +332,7 @@ def install_ols() -> tuple[str, str, str]:
     )
     print("Deployment updated, waiting for new pod to be ready")
     # Wait for the pod to start being created and then wait for it to start running.
-    r = retry_until_timeout_or_success(
-        OC_COMMAND_RETRY_COUNT,
-        2,
-        lambda: len(
-            cluster_utils.run_oc(
-                ["get", "pods", "--field-selector=status.phase=Pending"]
-            ).stdout
-        )
-        == 1,
-    )
-    r = retry_until_timeout_or_success(
-        OC_COMMAND_RETRY_COUNT,
-        5,
-        lambda: len(
-            cluster_utils.run_oc(
-                ["get", "pods", "--field-selector=status.phase=Pending"]
-            ).stdout
-        )
-        == 0,
-    )
-
-    # wait for new ols app pod to be created+running
-    # there should be exactly one, if we see more than one it may be an old pod
-    # and we need to wait for it to go away before progressing so we don't try to
-    # interact with it.
-    r = retry_until_timeout_or_success(
-        OC_COMMAND_RETRY_COUNT,
-        5,
-        lambda: len(cluster_utils.get_pod_by_prefix(fail_not_found=False)) == 1,
-    )
-    if not r:
-        print("Timed out waiting for new OLS pod to be ready")
-        return None
+    cluster_utils.wait_for_running_pod()
 
     print("-" * 50)
     print("OLS pod seems to be ready")
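
For context: this patch leans on retry_until_timeout_or_success from
tests/e2e/utils/retry.py, whose implementation is not part of the diff.
Judging only from the call sites above (a retry count, a delay in seconds,
and a zero-argument predicate), it behaves roughly like the sketch below;
the parameter names and exact signature are inferred from usage, not copied
from that module:

    import time
    from typing import Callable

    def retry_until_timeout_or_success(
        attempts: int, delay: float, predicate: Callable[[], bool]
    ) -> bool:
        """Poll `predicate` up to `attempts` times, sleeping `delay` seconds
        between calls; return True on the first success, False on timeout."""
        for _ in range(attempts):
            if predicate():
                return True
            time.sleep(delay)
        return False

Under that reading, OC_COMMAND_RETRY_COUNT = 120 with a 5-second delay gives
the new OLS pod up to roughly 10 minutes to reach the expected state before
wait_for_running_pod raises.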