Relevancy score: refactoring
tisnik committed Nov 11, 2024
1 parent 148acd4 commit f7bed2b
Showing 6 changed files with 160 additions and 28 deletions.
43 changes: 24 additions & 19 deletions scripts/evaluation/response_evaluation.py
@@ -44,7 +44,7 @@ def __init__(self, eval_args, api_client):
self._load_config_and_rag() # Set global config
self._input_dir, self._result_dir = self._set_directories()

self._scorer = ResponseScore()
self._scorer = ResponseScore(self._args.eval_metrics)

# Load data
with open(os.path.join(self._input_dir, DEFAULT_QNA_FILE)) as qna_f:
@@ -71,7 +71,7 @@ def _load_config_and_rag(self):
if len(set(self._args.eval_modes) - {"ols"}) > 0:
# load config separately
# Use OLS config file to set provider/model related config. Ex: credential/url
cfg_file = os.environ.get("RCS_CONFIG_FILE", DEFAULT_CONFIG_FILE)
cfg_file = os.environ.get("OLS_CONFIG_FILE", DEFAULT_CONFIG_FILE)
config.reload_from_yaml_file(cfg_file)

if "ols_rag" in self._args.eval_modes:
@@ -101,7 +101,7 @@ def _load_qna_pool_parquet(self):
columns={"ID": "query_id", "Question": "question", "Answer": "answer"}
)
qna_pool_df["query_id"] = "qna" + qna_pool_df["query_id"].astype(str)
qna_pool_df["query_source"].append("doc")
qna_pool_df["query_source"] = "doc"
qna_pool_df["consistency_cutoff"] = EVAL_THRESHOLD
qna_pool_df["in_use"] = True
return qna_pool_df
@@ -131,9 +131,9 @@ def _restructure_qna_pool_json(self, provider_model_id):
qna_pool_dict["question"].append(question)
qna_pool_dict["answer"].append(answer)
qna_pool_dict["query_source"].append("transcript")
qna_pool_dict["doc_source"].append(None)
qna_pool_dict["doc_title"].append(None)
qna_pool_dict["doc_page"].append(None)
qna_pool_dict["doc_source"].append("NA")
qna_pool_dict["doc_title"].append("NA")
qna_pool_dict["doc_page"].append("NA")
qna_pool_dict["consistency_cutoff"].append(consistency_cutoff)
qna_pool_dict["in_use"].append(in_use)

@@ -236,21 +236,27 @@ def _get_model_response(self, qna_pool_df, provider_model_id, eval_mode):
def _get_evaluation_score(self, qna_pool_df):
"""Get response evaluation score."""
print("Getting evaluation scores...")
qna_pool_df[
[
"cos_score",
"euc_score",
"len_score",
"rougeL_precision",
"rougeL_recall",
"rougeL_f1",
]
] = qna_pool_df.progress_apply(
lambda row: self._scorer.calculate_scores(row.answer, row.response),
# Default scores
score_cols = [
"cos_score",
"euc_score",
"len_score",
"rougeL_precision",
"rougeL_recall",
"rougeL_f1",
"answer_relevancy",
# Supporting data
"answer_valid_flag",
"generated_questions",
]
qna_pool_df[score_cols] = qna_pool_df.progress_apply(
lambda row: self._scorer.calculate_scores(
row.question, row.answer, row.response
),
axis=1,
result_type="expand",
)
return qna_pool_df
return qna_pool_df.dropna(axis=1, how="all")

def _get_response_with_score(self):
"""Get responses with scores."""
@@ -280,7 +286,6 @@ def _condense_eval_df(result_df):
"doc_source",
"doc_title",
"doc_page",
"consistency_cutoff",
],
columns=["eval_mode", "provider_model_id"],
).swaplevel(0, axis=1)
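For reference, a minimal standalone sketch (not part of this commit) of the pandas pattern `_get_evaluation_score` now relies on: each per-row tuple is expanded into the named score columns, and metric columns that were never computed end up all-empty and are removed by `dropna(axis=1, how="all")`. The column names and scoring function below are placeholders.

```python
import pandas as pd

df = pd.DataFrame({"answer": ["a1", "a2"], "response": ["r1", "r2"]})

def fake_scores(row):
    # Stand-in for ResponseScore.calculate_scores; the last value is None
    # when an optional metric such as answer_relevancy is not enabled.
    return 0.9, 0.1, None

# Each tuple element becomes one of the listed columns.
df[["cos_score", "euc_score", "answer_relevancy"]] = df.apply(
    fake_scores, axis=1, result_type="expand"
)

# Columns that are entirely empty are dropped, mirroring the new return value.
df = df.dropna(axis=1, how="all")
print(df.columns.tolist())  # ['answer', 'response', 'cos_score', 'euc_score']
```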
16 changes: 11 additions & 5 deletions scripts/evaluation/utils/constants.py
@@ -4,12 +4,14 @@
INSCOPE_MODELS = {
"bam+ibm/granite-13b-chat-v2": ("bam", "ibm/granite-13b-chat-v2"),
"watsonx+ibm/granite-13b-chat-v2": ("watsonx", "ibm/granite-13b-chat-v2"),
"openai+gpt-3.5-turbo": ("openai", "gpt-3.5-turbo"),
"watsonx+ibm/granite-3-2b-instruct": ("watsonx", "ibm/granite-3-2b-instruct"),
"watsonx+ibm/granite-3-8b-instruct": ("watsonx", "ibm/granite-3-2b-instruct"),
"openai+gpt-4o-mini": ("openai", "gpt-4o-mini"),
"azure_openai+gpt-3.5-turbo": ("azure_openai", "gpt-3.5-turbo"),
"azure_openai+gpt-3.5-turbo-4k": ("azure_openai", "gpt-3.5-turbo"),
"azure_openai+gpt-3.5-turbo-16k": ("azure_openai", "gpt-3.5-turbo"),
"openai+gpt-4o": ("openai", "gpt-4o"),
"azure_openai+gpt-4o-mini": ("azure_openai", "gpt-4o-mini"),
"azure_openai+gpt-4o": ("azure_openai", "gpt-4o"),
"ollama+llama3.1:latest": ("ollama", "llama3.1:latest"),
"ollama+mistral": ("ollama", "mistral"),
}

SCORE_DESCRIPTION = {
Expand All @@ -19,6 +21,7 @@
"rougeL_precision": "RougeL Precision Score",
"rougeL_recall": "RougeL Recall Score",
"rougeL_f1": "RougeL F1 Score",
"answer_relevancy": "Answer relevancy score against query",
}

EVAL_MODES = {
Expand All @@ -36,7 +39,7 @@
"""

DEFAULT_QNA_FILE = "question_answer_pair.json"
DEFAULT_CONFIG_FILE = "rcsconfig.yaml"
DEFAULT_CONFIG_FILE = "olsconfig.yaml"

DEFAULT_INPUT_DIR = "eval_data"
DEFAULT_RESULT_DIR = "eval_result"
@@ -48,3 +51,6 @@

# Cut-off similarity score used for response evaluation.
EVAL_THRESHOLD = 0.3 # low score is better

# Number of related questions to be generated.
N_QUESTIONS = 2
24 changes: 24 additions & 0 deletions scripts/evaluation/utils/prompts.py
@@ -0,0 +1,24 @@
# ruff: noqa: E501
"""Prompt templates/constants."""

# Below is inspired by both ragas & langchain internal/example prompts.
ANSWER_RELEVANCY_PROMPT = """
You are a helpful assistant. Your task is to analyze the answer and come up with questions based on it.
Given the following answer, delimited by three backticks, please generate {num_questions} questions.
Each question should be concise, based explicitly on the information present in the answer, and ask about one thing at a time.
Give Valid as 1 if the answer is valid and 0 if the answer is invalid. An invalid answer is one that is evasive, vague, or ambiguous. For example, "I don't know" or "I'm not sure" are noncommittal answers.
When formulating a question, don't include text like "from the provided context", "as described in the document", "according to the given document" or anything similar. Also don't add a sequence number to the question.
Use the JSON format below for your response. Do not add any additional text apart from the JSON output.
{{
Question: [
QUESTION 1,
QUESTION 2,
],
Valid: 0 or 1
}}
```
{answer}
```
"""
77 changes: 77 additions & 0 deletions scripts/evaluation/utils/relevancy_score.py
@@ -0,0 +1,77 @@
"""Relevancy score calculation."""

from statistics import mean
from time import sleep

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from scipy.spatial.distance import cosine

from ols import config

from .constants import MAX_RETRY_ATTEMPTS, N_QUESTIONS, TIME_TO_BREATH
from .models import VANILLA_MODEL
from .prompts import ANSWER_RELEVANCY_PROMPT


class AnswerRelevancyScore:
"""Calculate response/answer relevancy score."""

def __init__(
self, embedding_model, judge_provider="ollama", judge_model="llama3.1:latest"
):
"""Initialize."""
self._embedding_model = embedding_model
self._judge_llm = self._judge_llm_init(judge_provider, judge_model)

@staticmethod
def _judge_llm_init(judge_provider, judge_model):
"""Load judge LLM."""
# Provider/model should be in config yaml
provider_config = config.config.llm_providers.providers[judge_provider]
llm = VANILLA_MODEL[provider_config.type](judge_model, provider_config).load()

prompt = PromptTemplate.from_template(ANSWER_RELEVANCY_PROMPT)
return prompt | llm | JsonOutputParser()

def get_score(
self,
question,
response,
retry_attemps=MAX_RETRY_ATTEMPTS,
time_to_breath=TIME_TO_BREATH,
):
"""Calculate relevancy score."""
# Generate relevant questions.
for retry_counter in range(retry_attemps):
try:
out = self._judge_llm.invoke(
{"answer": response, "num_questions": N_QUESTIONS}
)
break
except Exception:
if retry_counter == retry_attemps - 1:
out = None  # Continue without a result
# raise
sleep(time_to_breath)

if out:
valid_flag = out["Valid"]
gen_questions = out["Question"]
score = 0
if valid_flag == 1:
org_vec = self._embedding_model.get_text_embedding(question)
score = mean(
[
1
- cosine(
org_vec,
self._embedding_model.get_text_embedding(gen_question),
)
for gen_question in gen_questions
]
)

return score, valid_flag, "\n".join(gen_questions)

return None, None, None
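The scoring math in isolation (not part of the commit): the relevancy score is the mean cosine similarity between the embedding of the original question and the embeddings of the questions generated from the response. The vectors below are made-up placeholders for `get_text_embedding` output.

```python
from statistics import mean

from scipy.spatial.distance import cosine

org_vec = [0.9, 0.1, 0.0]  # embedding of the original question (placeholder)
gen_vecs = [
    [0.8, 0.2, 0.0],  # embeddings of the generated questions (placeholders)
    [0.1, 0.9, 0.0],
]

# cosine() is a distance, so 1 - cosine() is the similarity averaged above.
score = mean(1 - cosine(org_vec, gen_vec) for gen_vec in gen_vecs)
print(round(score, 3))
```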
4 changes: 3 additions & 1 deletion scripts/evaluation/utils/response.py
@@ -39,7 +39,9 @@ def get_model_response(query, provider, model, mode, api_client=None):
override_params = {
GenericLLMParameters.MAX_TOKENS_FOR_RESPONSE: max_resp_tokens
}
llm = MODEL_OLS_PARAM[provider](model, provider_config, override_params).load()
llm = MODEL_OLS_PARAM[provider_config.type](
model, provider_config, override_params
).load()
if mode == "ols_prompt":
prompt, prompt_input = GeneratePrompt(query, [], []).generate_prompt(model)
if mode == "ols_rag":
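For context (not part of the commit), a toy illustration of why the lookup switched from the provider name to `provider_config.type`: provider names in the config file are arbitrary labels, while the loader table is keyed by provider type, so a custom name would miss under the old lookup. The classes and names below are placeholders, not the repository's actual `MODEL_OLS_PARAM` contents.

```python
class OpenAILoader: ...
class AzureOpenAILoader: ...

# Hypothetical loader table keyed by provider *type*.
MODEL_LOADERS = {"openai": OpenAILoader, "azure_openai": AzureOpenAILoader}

provider_name = "my_openai"  # arbitrary label chosen in the config file
provider_type = "openai"     # fixed type string of that provider

assert provider_name not in MODEL_LOADERS  # old-style lookup by name would fail
loader_cls = MODEL_LOADERS[provider_type]  # lookup by type always resolves
```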
24 changes: 21 additions & 3 deletions scripts/evaluation/utils/score.py
@@ -8,14 +8,21 @@
class ResponseScore:
"""Calculate response score."""

def __init__(self):
def __init__(self, metrics):
"""Initialize."""
self._embedding_model = HuggingFaceEmbedding(
"sentence-transformers/all-mpnet-base-v2"
)
self._rouge_scorer = RougeScorer(["rougeL"], use_stemmer=True)

def calculate_scores(self, answer, response):
self._relevancy_scorer = None
if "answer_relevancy" in metrics:
# Import here so a judge LLM only has to be configured when this metric is enabled.
from .relevancy_score import AnswerRelevancyScore

self._relevancy_scorer = AnswerRelevancyScore(self._embedding_model)

def calculate_scores(self, query, answer, response):
"""Calculate different similarity scores for two strings."""
res_vec = self._embedding_model.get_text_embedding(response)
ans_vec = self._embedding_model.get_text_embedding(answer)
@@ -30,11 +37,18 @@ def calculate_scores(self, answer, response):
# text based scores
rouge_score = self._rouge_scorer.score(target=answer, prediction=response)

relevancy_score = answer_valid_flag = generated_questions = None
if self._relevancy_scorer:
relevancy_score, answer_valid_flag, generated_questions = (
self._relevancy_scorer.get_score(query, response)
)

print(
f"cos_score: {cos_score}, "
f"euc_score: {euc_score}, "
f"len_score: {len_score}, "
f"rouge_score: {rouge_score}"
f"rouge_score: {rouge_score}, "
f"relevancy_score: {relevancy_score}"
)
return (
cos_score,
@@ -43,4 +57,8 @@
rouge_score["rougeL"].precision,
rouge_score["rougeL"].recall,
rouge_score["rougeL"].fmeasure,
relevancy_score,
# Return additional information
answer_valid_flag,
generated_questions,
)

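A minimal usage sketch (not part of the commit), assuming the OLS config has already been loaded the way `response_evaluation.py` does and that it defines the judge provider/model needed for `answer_relevancy`; the import path and example strings are illustrative only.

```python
from utils.score import ResponseScore  # import path assumed for illustration

scorer = ResponseScore(metrics=["answer_relevancy"])
(
    cos_score, euc_score, len_score,
    rougeL_precision, rougeL_recall, rougeL_f1,
    relevancy_score, answer_valid_flag, generated_questions,
) = scorer.calculate_scores(
    query="How do I scale a deployment?",
    answer="Use `oc scale deployment/<name> --replicas=N`.",
    response="Run `oc scale` and set the desired replica count.",
)
```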