Merge pull request #97 from tisnik/answer-evalutation-fixes
Answer evaluation fixes
tisnik authored Nov 11, 2024
2 parents 6c607e0 + f7bed2b commit 62ee217
Showing 7 changed files with 163 additions and 30 deletions.
5 changes: 3 additions & 2 deletions scripts/evaluation/eval_data/question_answer_pair.json
@@ -91,8 +91,9 @@
]
},
"azure_openai+gpt-4o-mini+with_rag": {
"cutoff_score": 0.3,
"cutoff_score": 0.35,
"text": [
"OpenShift Virtualization is an add-on to the Red Hat OpenShift Container Platform that enables you to run and manage virtual machine (VM) workloads alongside container workloads. It integrates virtualization capabilities into the OpenShift environment, allowing users to create and manage both Linux and Windows VMs as native Kubernetes objects.\n\nKey features of OpenShift Virtualization include:\n\n- Creating and managing VMs directly within the OpenShift platform.\n- Running pod (container) and VM workloads together in a single cluster.\n- Connecting to VMs through various consoles and command-line tools.\n- Importing and cloning existing virtual machines.\n- Managing network interfaces and storage disks attached to VMs.\n- Live migrating VMs between nodes for better resource management.\n\nThe enhanced web console provides a graphical interface for managing these virtualized resources alongside containerized applications, making it easier for users to handle diverse workloads within a unified platform.",
"OpenShift Virtualization is a feature of Red Hat OpenShift Container Platform that allows you to bring traditional virtual machines (VMs) into the Kubernetes environment and run them alongside containers. In OpenShift Virtualization, VMs are treated as native Kubernetes objects and can be managed using the OpenShift web console or command line interface. This feature enables organizations to consolidate their containerized and virtualized workloads onto a single platform, providing greater flexibility and efficiency in managing their applications."
]
},
@@ -802,4 +803,4 @@
"how do i reformat a laptop hard drive"
]
}
}
}
43 changes: 24 additions & 19 deletions scripts/evaluation/response_evaluation.py
@@ -44,7 +44,7 @@ def __init__(self, eval_args, api_client):
self._load_config_and_rag() # Set global config
self._input_dir, self._result_dir = self._set_directories()

- self._scorer = ResponseScore()
+ self._scorer = ResponseScore(self._args.eval_metrics)

# Load data
with open(os.path.join(self._input_dir, DEFAULT_QNA_FILE)) as qna_f:
@@ -71,7 +71,7 @@ def _load_config_and_rag(self):
if len(set(self._args.eval_modes) - {"ols"}) > 0:
# load config separately
# Use OLS config file to set provider/model related config. Ex: credential/url
cfg_file = os.environ.get("RCS_CONFIG_FILE", DEFAULT_CONFIG_FILE)
cfg_file = os.environ.get("OLS_CONFIG_FILE", DEFAULT_CONFIG_FILE)
config.reload_from_yaml_file(cfg_file)

if "ols_rag" in self._args.eval_modes:
@@ -101,7 +101,7 @@ def _load_qna_pool_parquet(self):
columns={"ID": "query_id", "Question": "question", "Answer": "answer"}
)
qna_pool_df["query_id"] = "qna" + qna_pool_df["query_id"].astype(str)
qna_pool_df["query_source"].append("doc")
qna_pool_df["query_source"] = "doc"
qna_pool_df["consistency_cutoff"] = EVAL_THRESHOLD
qna_pool_df["in_use"] = True
return qna_pool_df
@@ -131,9 +131,9 @@ def _restructure_qna_pool_json(self, provider_model_id):
qna_pool_dict["question"].append(question)
qna_pool_dict["answer"].append(answer)
qna_pool_dict["query_source"].append("transcript")
qna_pool_dict["doc_source"].append(None)
qna_pool_dict["doc_title"].append(None)
qna_pool_dict["doc_page"].append(None)
qna_pool_dict["doc_source"].append("NA")
qna_pool_dict["doc_title"].append("NA")
qna_pool_dict["doc_page"].append("NA")
qna_pool_dict["consistency_cutoff"].append(consistency_cutoff)
qna_pool_dict["in_use"].append(in_use)

@@ -236,21 +236,27 @@ def _get_model_response(self, qna_pool_df, provider_model_id, eval_mode):
def _get_evaluation_score(self, qna_pool_df):
"""Get response evaluation score."""
print("Getting evaluation scores...")
- qna_pool_df[
- [
- "cos_score",
- "euc_score",
- "len_score",
- "rougeL_precision",
- "rougeL_recall",
- "rougeL_f1",
- ]
- ] = qna_pool_df.progress_apply(
- lambda row: self._scorer.calculate_scores(row.answer, row.response),
+ # Default scores
+ score_cols = [
+ "cos_score",
+ "euc_score",
+ "len_score",
+ "rougeL_precision",
+ "rougeL_recall",
+ "rougeL_f1",
+ "answer_relevancy",
+ # Supporting data
+ "answer_valid_flag",
+ "generated_questions",
+ ]
+ qna_pool_df[score_cols] = qna_pool_df.progress_apply(
+ lambda row: self._scorer.calculate_scores(
+ row.question, row.answer, row.response
+ ),
axis=1,
result_type="expand",
)
- return qna_pool_df
+ return qna_pool_df.dropna(axis=1, how="all")

def _get_response_with_score(self):
"""Get responses with scores."""
@@ -280,7 +286,6 @@ def _condense_eval_df(result_df):
"doc_source",
"doc_title",
"doc_page",
"consistency_cutoff",
],
columns=["eval_mode", "provider_model_id"],
).swaplevel(0, axis=1)
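The restructured _get_evaluation_score above leans on two pandas behaviours that are easy to miss in the diff: result_type="expand" spreads the tuple returned by calculate_scores into one column per score name, and dropna(axis=1, how="all") drops the relevancy columns when that metric was not requested. Below is a minimal, self-contained sketch of the same pattern; nothing in it is code from this PR, and fake_scores, its formulas, and the sample DataFrame are invented for illustration.

```python
import pandas as pd

def fake_scores(answer: str, response: str):
    """Stand-in for ResponseScore.calculate_scores (formulas invented)."""
    cos_score = 1.0 if answer == response else 0.5
    len_score = abs(len(answer) - len(response)) / max(len(answer), len(response))
    answer_relevancy = None  # metric disabled -> column stays all-NA
    return cos_score, len_score, answer_relevancy

df = pd.DataFrame(
    {
        "answer": ["A pod runs one or more containers."],
        "response": ["Pods are the smallest deployable units and hold containers."],
    }
)

# Same pattern as _get_evaluation_score: expand the returned tuple into columns...
df[["cos_score", "len_score", "answer_relevancy"]] = df.apply(
    lambda row: fake_scores(row.answer, row.response),
    axis=1,
    result_type="expand",
)

# ...then drop the columns that were never filled in.
print(df.dropna(axis=1, how="all"))
```

Because the scorer returns None for disabled metrics, the corresponding columns are entirely NA and disappear from the condensed result.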
16 changes: 11 additions & 5 deletions scripts/evaluation/utils/constants.py
@@ -4,12 +4,14 @@
INSCOPE_MODELS = {
"bam+ibm/granite-13b-chat-v2": ("bam", "ibm/granite-13b-chat-v2"),
"watsonx+ibm/granite-13b-chat-v2": ("watsonx", "ibm/granite-13b-chat-v2"),
"openai+gpt-3.5-turbo": ("openai", "gpt-3.5-turbo"),
"watsonx+ibm/granite-3-2b-instruct": ("watsonx", "ibm/granite-3-2b-instruct"),
"watsonx+ibm/granite-3-8b-instruct": ("watsonx", "ibm/granite-3-2b-instruct"),
"openai+gpt-4o-mini": ("openai", "gpt-4o-mini"),
"azure_openai+gpt-3.5-turbo": ("azure_openai", "gpt-3.5-turbo"),
"azure_openai+gpt-3.5-turbo-4k": ("azure_openai", "gpt-3.5-turbo"),
"azure_openai+gpt-3.5-turbo-16k": ("azure_openai", "gpt-3.5-turbo"),
"openai+gpt-4o": ("openai", "gpt-4o"),
"azure_openai+gpt-4o-mini": ("azure_openai", "gpt-4o-mini"),
"azure_openai+gpt-4o": ("azure_openai", "gpt-4o"),
"ollama+llama3.1:latest": ("ollama", "llama3.1:latest"),
"ollama+mistral": ("ollama", "mistral"),
}

SCORE_DESCRIPTION = {
@@ -19,6 +21,7 @@
"rougeL_precision": "RougeL Precision Score",
"rougeL_recall": "RougeL Recall Score",
"rougeL_f1": "RougeL F1 Score",
"answer_relevancy": "Answer relevancy score against query",
}

EVAL_MODES = {
@@ -36,7 +39,7 @@
"""

DEFAULT_QNA_FILE = "question_answer_pair.json"
DEFAULT_CONFIG_FILE = "rcsconfig.yaml"
DEFAULT_CONFIG_FILE = "olsconfig.yaml"

DEFAULT_INPUT_DIR = "eval_data"
DEFAULT_RESULT_DIR = "eval_result"
@@ -48,3 +51,6 @@

# Cut-off similarity score used for response evaluation.
EVAL_THRESHOLD = 0.3 # low score is better

+ # Number of related questions to be generated.
+ N_QUESTIONS = 2
24 changes: 24 additions & 0 deletions scripts/evaluation/utils/prompts.py
@@ -0,0 +1,24 @@
# ruff: noqa: E501
"""Prompt templates/constants."""

# Below is inspired by both ragas & langchain internal/example prompts.
ANSWER_RELEVANCY_PROMPT = """
You are a helpful assistant. Your task is to analyze an answer and come up with questions based on it.
Given the following answer, delimited by three backticks, please generate {num_questions} questions.
Each question should be concise, based explicitly on information present in the answer, and should ask about one thing at a time.
Give Valid as 1 if the answer is valid and 0 if it is invalid. An invalid answer is one that is evasive, vague, or ambiguous; for example, "I don't know" or "I'm not sure" are noncommittal answers.
When formulating a question, don't include text like "from the provided context", "as described in the document", "according to the given document" or anything similar. Also don't add a sequence number to the question.
Use the JSON format below for your response. Do not add any additional text apart from the JSON output.
{{
Question: [
QUESTION 1,
QUESTION 2,
],
Valid: 0 or 1
}}
```
{answer}
```
"""
77 changes: 77 additions & 0 deletions scripts/evaluation/utils/relevancy_score.py
@@ -0,0 +1,77 @@
"""Relevancy score calculation."""

from statistics import mean
from time import sleep

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from scipy.spatial.distance import cosine

from ols import config

from .constants import MAX_RETRY_ATTEMPTS, N_QUESTIONS, TIME_TO_BREATH
from .models import VANILLA_MODEL
from .prompts import ANSWER_RELEVANCY_PROMPT


class AnswerRelevancyScore:
"""Calculate response/answer relevancy score."""

def __init__(
self, embedding_model, judge_provider="ollama", judge_model="llama3.1:latest"
):
"""Initialize."""
self._embedding_model = embedding_model
self._judge_llm = self._judge_llm_init(judge_provider, judge_model)

@staticmethod
def _judge_llm_init(judge_provider, judge_model):
"""Load judge LLM."""
# Provider/model should be in config yaml
provider_config = config.config.llm_providers.providers[judge_provider]
llm = VANILLA_MODEL[provider_config.type](judge_model, provider_config).load()

prompt = PromptTemplate.from_template(ANSWER_RELEVANCY_PROMPT)
return prompt | llm | JsonOutputParser()

def get_score(
self,
question,
response,
retry_attemps=MAX_RETRY_ATTEMPTS,
time_to_breath=TIME_TO_BREATH,
):
"""Calculate relevancy score."""
# Generate relevant questions.
for retry_counter in range(retry_attemps):
try:
out = self._judge_llm.invoke(
{"answer": response, "num_questions": N_QUESTIONS}
)
break
except Exception:
if retry_counter == retry_attemps - 1:
out = None  # Continue without a result
# raise
sleep(time_to_breath)

if out:
valid_flag = out["Valid"]
gen_questions = out["Question"]
score = 0
if valid_flag == 1:
org_vec = self._embedding_model.get_text_embedding(question)
score = mean(
[
1
- cosine(
org_vec,
self._embedding_model.get_text_embedding(gen_question),
)
for gen_question in gen_questions
]
)

return score, valid_flag, "\n".join(gen_questions)

return None, None, None
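The scoring logic in get_score boils down to: embed the original user question, embed each question the judge generated from the response, and average their cosine similarities (1 minus the cosine distance). A toy illustration with made-up 3-dimensional embeddings, not code from this PR:

```python
from statistics import mean

from scipy.spatial.distance import cosine

org_vec = [0.9, 0.1, 0.0]                      # embedding of the user's question
gen_vecs = [[0.8, 0.2, 0.0], [0.1, 0.9, 0.0]]  # embeddings of judge-generated questions

# Mean cosine similarity, mirroring get_score: 1 - cosine distance per generated question.
score = mean(1 - cosine(org_vec, gen_vec) for gen_vec in gen_vecs)
print(round(score, 3))
```

A response that sticks to what was asked yields generated questions close to the original one, so the mean similarity (and therefore the relevancy score) is high; an off-topic or noncommittal response drives it down, and if the judge marks the answer invalid the score is set to 0.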
4 changes: 3 additions & 1 deletion scripts/evaluation/utils/response.py
@@ -39,7 +39,9 @@ def get_model_response(query, provider, model, mode, api_client=None):
override_params = {
GenericLLMParameters.MAX_TOKENS_FOR_RESPONSE: max_resp_tokens
}
- llm = MODEL_OLS_PARAM[provider](model, provider_config, override_params).load()
+ llm = MODEL_OLS_PARAM[provider_config.type](
+ model, provider_config, override_params
+ ).load()
if mode == "ols_prompt":
prompt, prompt_input = GeneratePrompt(query, [], []).generate_prompt(model)
if mode == "ols_rag":
24 changes: 21 additions & 3 deletions scripts/evaluation/utils/score.py
@@ -8,14 +8,21 @@
class ResponseScore:
"""Calculate response score."""

- def __init__(self):
+ def __init__(self, metrics):
"""Initialize."""
self._embedding_model = HuggingFaceEmbedding(
"sentence-transformers/all-mpnet-base-v2"
)
self._rouge_scorer = RougeScorer(["rougeL"], use_stemmer=True)

- def calculate_scores(self, answer, response):
+ self._relevancy_scorer = None
+ if "answer_relevancy" in metrics:
+ # Importing here to avoid setting up judge LLM in config, if not required.
+ from .relevancy_score import AnswerRelevancyScore
+
+ self._relevancy_scorer = AnswerRelevancyScore(self._embedding_model)
+
+ def calculate_scores(self, query, answer, response):
"""Calculate different similarity scores for two strings."""
res_vec = self._embedding_model.get_text_embedding(response)
ans_vec = self._embedding_model.get_text_embedding(answer)
@@ -30,11 +37,18 @@ def calculate_scores(self, answer, response):
# text based scores
rouge_score = self._rouge_scorer.score(target=answer, prediction=response)

+ relevancy_score = answer_valid_flag = generated_questions = None
+ if self._relevancy_scorer:
+ relevancy_score, answer_valid_flag, generated_questions = (
+ self._relevancy_scorer.get_score(query, response)
+ )
+
print(
f"cos_score: {cos_score}, "
f"euc_score: {euc_score}, "
f"len_score: {len_score}, "
f"rouge_score: {rouge_score}"
f"rouge_score: {rouge_score}, "
f"relevancy_score: {relevancy_score}"
)
return (
cos_score,
@@ -43,4 +57,8 @@ def calculate_scores(self, answer, response):
rouge_score["rougeL"].precision,
rouge_score["rougeL"].recall,
rouge_score["rougeL"].fmeasure,
+ relevancy_score,
+ # Return additional information
+ answer_valid_flag,
+ generated_questions,
)
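Taken together, the changes to ResponseScore alter its public surface: metrics are now chosen at construction time and calculate_scores needs the original query as well as the reference answer and the model response. A possible usage sketch, assuming the evaluation dependencies are installed, the embedding model can be downloaded, and (for answer_relevancy) a judge provider/model is available in the loaded OLS config; none of this snippet appears in the PR.

```python
# Assumes this runs from scripts/evaluation/ with olsconfig.yaml already loaded
# by the surrounding evaluation script (needed for the answer_relevancy judge).
from utils.score import ResponseScore

scorer = ResponseScore(metrics=["answer_relevancy"])

(
    cos_score,
    euc_score,
    len_score,
    rouge_precision,
    rouge_recall,
    rouge_f1,
    answer_relevancy,
    answer_valid_flag,
    generated_questions,
) = scorer.calculate_scores(
    query="What is OpenShift Virtualization?",
    answer="It lets you run virtual machines alongside containers on OpenShift.",
    response="OpenShift Virtualization runs VMs next to container workloads.",
)

print(answer_relevancy, answer_valid_flag)
```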
