Merge pull request #97 from tisnik/answer-evalutation-fixes
Answer evaluation fixes
tisnik authored Nov 11, 2024
2 parents 6c607e0 + f7bed2b commit 62ee217
Showing 7 changed files with 163 additions and 30 deletions.
5 changes: 3 additions & 2 deletions scripts/evaluation/eval_data/question_answer_pair.json
@@ -91,8 +91,9 @@
]
},
"azure_openai+gpt-4o-mini+with_rag": {
"cutoff_score": 0.3,
"cutoff_score": 0.35,
"text": [
"OpenShift Virtualization is an add-on to the Red Hat OpenShift Container Platform that enables you to run and manage virtual machine (VM) workloads alongside container workloads. It integrates virtualization capabilities into the OpenShift environment, allowing users to create and manage both Linux and Windows VMs as native Kubernetes objects.\n\nKey features of OpenShift Virtualization include:\n\n- Creating and managing VMs directly within the OpenShift platform.\n- Running pod (container) and VM workloads together in a single cluster.\n- Connecting to VMs through various consoles and command-line tools.\n- Importing and cloning existing virtual machines.\n- Managing network interfaces and storage disks attached to VMs.\n- Live migrating VMs between nodes for better resource management.\n\nThe enhanced web console provides a graphical interface for managing these virtualized resources alongside containerized applications, making it easier for users to handle diverse workloads within a unified platform.",
"OpenShift Virtualization is a feature of Red Hat OpenShift Container Platform that allows you to bring traditional virtual machines (VMs) into the Kubernetes environment and run them alongside containers. In OpenShift Virtualization, VMs are treated as native Kubernetes objects and can be managed using the OpenShift web console or command line interface. This feature enables organizations to consolidate their containerized and virtualized workloads onto a single platform, providing greater flexibility and efficiency in managing their applications."
]
},
@@ -802,4 +803,4 @@
"how do i reformat a laptop hard drive"
]
}
}
}
43 changes: 24 additions & 19 deletions scripts/evaluation/response_evaluation.py
@@ -44,7 +44,7 @@ def __init__(self, eval_args, api_client):
self._load_config_and_rag() # Set global config
self._input_dir, self._result_dir = self._set_directories()

- self._scorer = ResponseScore()
+ self._scorer = ResponseScore(self._args.eval_metrics)

# Load data
with open(os.path.join(self._input_dir, DEFAULT_QNA_FILE)) as qna_f:
@@ -71,7 +71,7 @@ def _load_config_and_rag(self):
if len(set(self._args.eval_modes) - {"ols"}) > 0:
# load config separately
# Use OLS config file to set provider/model related config. Ex: credential/url
cfg_file = os.environ.get("RCS_CONFIG_FILE", DEFAULT_CONFIG_FILE)
cfg_file = os.environ.get("OLS_CONFIG_FILE", DEFAULT_CONFIG_FILE)
config.reload_from_yaml_file(cfg_file)

if "ols_rag" in self._args.eval_modes:
@@ -101,7 +101,7 @@ def _load_qna_pool_parquet(self):
columns={"ID": "query_id", "Question": "question", "Answer": "answer"}
)
qna_pool_df["query_id"] = "qna" + qna_pool_df["query_id"].astype(str)
qna_pool_df["query_source"].append("doc")
qna_pool_df["query_source"] = "doc"
qna_pool_df["consistency_cutoff"] = EVAL_THRESHOLD
qna_pool_df["in_use"] = True
return qna_pool_df
@@ -131,9 +131,9 @@ def _restructure_qna_pool_json(self, provider_model_id):
qna_pool_dict["question"].append(question)
qna_pool_dict["answer"].append(answer)
qna_pool_dict["query_source"].append("transcript")
qna_pool_dict["doc_source"].append(None)
qna_pool_dict["doc_title"].append(None)
qna_pool_dict["doc_page"].append(None)
qna_pool_dict["doc_source"].append("NA")
qna_pool_dict["doc_title"].append("NA")
qna_pool_dict["doc_page"].append("NA")
qna_pool_dict["consistency_cutoff"].append(consistency_cutoff)
qna_pool_dict["in_use"].append(in_use)

@@ -236,21 +236,27 @@ def _get_model_response(self, qna_pool_df, provider_model_id, eval_mode):
def _get_evaluation_score(self, qna_pool_df):
"""Get response evaluation score."""
print("Getting evaluation scores...")
- qna_pool_df[
- [
- "cos_score",
- "euc_score",
- "len_score",
- "rougeL_precision",
- "rougeL_recall",
- "rougeL_f1",
- ]
- ] = qna_pool_df.progress_apply(
- lambda row: self._scorer.calculate_scores(row.answer, row.response),
+ # Default scores
+ score_cols = [
+ "cos_score",
+ "euc_score",
+ "len_score",
+ "rougeL_precision",
+ "rougeL_recall",
+ "rougeL_f1",
+ "answer_relevancy",
+ # Supporting data
+ "answer_valid_flag",
+ "generated_questions",
+ ]
+ qna_pool_df[score_cols] = qna_pool_df.progress_apply(
+ lambda row: self._scorer.calculate_scores(
+ row.question, row.answer, row.response
+ ),
axis=1,
result_type="expand",
)
- return qna_pool_df
+ return qna_pool_df.dropna(axis=1, how="all")

def _get_response_with_score(self):
"""Get responses with scores."""
@@ -280,7 +286,6 @@ def _condense_eval_df(result_df):
"doc_source",
"doc_title",
"doc_page",
"consistency_cutoff",
],
columns=["eval_mode", "provider_model_id"],
).swaplevel(0, axis=1)
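The restructured _get_evaluation_score above leans on two pandas behaviours that are easy to miss in the diff: result_type="expand" spreads the tuple returned by calculate_scores into one column per score name, and dropna(axis=1, how="all") drops the relevancy columns when that metric was not requested. Below is a minimal, self-contained sketch of the same pattern; nothing in it is code from this PR, and fake_scores, its formulas, and the sample DataFrame are invented for illustration.

```python
import pandas as pd

def fake_scores(answer: str, response: str):
    """Stand-in for ResponseScore.calculate_scores (formulas invented)."""
    cos_score = 1.0 if answer == response else 0.5
    len_score = abs(len(answer) - len(response)) / max(len(answer), len(response))
    answer_relevancy = None  # metric disabled -> column stays all-NA
    return cos_score, len_score, answer_relevancy

df = pd.DataFrame(
    {
        "answer": ["A pod runs one or more containers."],
        "response": ["Pods are the smallest deployable units and hold containers."],
    }
)

# Same pattern as _get_evaluation_score: expand the returned tuple into columns...
df[["cos_score", "len_score", "answer_relevancy"]] = df.apply(
    lambda row: fake_scores(row.answer, row.response),
    axis=1,
    result_type="expand",
)

# ...then drop the columns that were never filled in.
print(df.dropna(axis=1, how="all"))
```

Because the scorer returns None for disabled metrics, the corresponding columns are entirely NA and disappear from the condensed result.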
16 changes: 11 additions & 5 deletions scripts/evaluation/utils/constants.py
@@ -4,12 +4,14 @@
INSCOPE_MODELS = {
"bam+ibm/granite-13b-chat-v2": ("bam", "ibm/granite-13b-chat-v2"),
"watsonx+ibm/granite-13b-chat-v2": ("watsonx", "ibm/granite-13b-chat-v2"),
"openai+gpt-3.5-turbo": ("openai", "gpt-3.5-turbo"),
"watsonx+ibm/granite-3-2b-instruct": ("watsonx", "ibm/granite-3-2b-instruct"),
"watsonx+ibm/granite-3-8b-instruct": ("watsonx", "ibm/granite-3-2b-instruct"),
"openai+gpt-4o-mini": ("openai", "gpt-4o-mini"),
"azure_openai+gpt-3.5-turbo": ("azure_openai", "gpt-3.5-turbo"),
"azure_openai+gpt-3.5-turbo-4k": ("azure_openai", "gpt-3.5-turbo"),
"azure_openai+gpt-3.5-turbo-16k": ("azure_openai", "gpt-3.5-turbo"),
"openai+gpt-4o": ("openai", "gpt-4o"),
"azure_openai+gpt-4o-mini": ("azure_openai", "gpt-4o-mini"),
"azure_openai+gpt-4o": ("azure_openai", "gpt-4o"),
"ollama+llama3.1:latest": ("ollama", "llama3.1:latest"),
"ollama+mistral": ("ollama", "mistral"),
}

SCORE_DESCRIPTION = {
@@ -19,6 +21,7 @@
"rougeL_precision": "RougeL Precision Score",
"rougeL_recall": "RougeL Recall Score",
"rougeL_f1": "RougeL F1 Score",
"answer_relevancy": "Answer relevancy score against query",
}

EVAL_MODES = {
@@ -36,7 +39,7 @@
"""

DEFAULT_QNA_FILE = "question_answer_pair.json"
DEFAULT_CONFIG_FILE = "rcsconfig.yaml"
DEFAULT_CONFIG_FILE = "olsconfig.yaml"

DEFAULT_INPUT_DIR = "eval_data"
DEFAULT_RESULT_DIR = "eval_result"
@@ -48,3 +51,6 @@

# Cut-off similarity score used for response evaluation.
EVAL_THRESHOLD = 0.3 # low score is better

+ # Number of related questions to be generated.
+ N_QUESTIONS = 2
24 changes: 24 additions & 0 deletions scripts/evaluation/utils/prompts.py
@@ -0,0 +1,24 @@
# ruff: noqa: E501
"""Prompt templates/constants."""

# Below is inspired by both ragas & langchain internal/example prompts.
ANSWER_RELEVANCY_PROMPT = """
You are a helpful assistant. Your task is to analyze an answer and come up with questions based on it.
Given the following answer, delimited by three backticks, please generate {num_questions} questions.
Each question should be concise, based explicitly on information present in the answer, and should ask about one thing at a time.
Give Valid as 1 if the answer is valid and 0 if it is invalid. An invalid answer is one that is evasive, vague, or ambiguous; for example, "I don't know" or "I'm not sure" are noncommittal answers.
When formulating a question, don't include text like "from the provided context", "as described in the document", "according to the given document" or anything similar. Also don't add a sequence number to the question.
Use the JSON format below for your response. Do not add any additional text apart from the JSON output.
{{
Question: [
QUESTION 1,
QUESTION 2,
],
Valid: 0 or 1
}}
```
{answer}
```
"""
77 changes: 77 additions & 0 deletions scripts/evaluation/utils/relevancy_score.py
@@ -0,0 +1,77 @@
"""Relevancy score calculation."""

from statistics import mean
from time import sleep

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from scipy.spatial.distance import cosine

from ols import config

from .constants import MAX_RETRY_ATTEMPTS, N_QUESTIONS, TIME_TO_BREATH
from .models import VANILLA_MODEL
from .prompts import ANSWER_RELEVANCY_PROMPT


class AnswerRelevancyScore:
"""Calculate response/answer relevancy score."""

def __init__(
self, embedding_model, judge_provider="ollama", judge_model="llama3.1:latest"
):
"""Initialize."""
self._embedding_model = embedding_model
self._judge_llm = self._judge_llm_init(judge_provider, judge_model)

@staticmethod
def _judge_llm_init(judge_provider, judge_model):
"""Load judge LLM."""
# Provider/model should be in config yaml
provider_config = config.config.llm_providers.providers[judge_provider]
llm = VANILLA_MODEL[provider_config.type](judge_model, provider_config).load()

prompt = PromptTemplate.from_template(ANSWER_RELEVANCY_PROMPT)
return prompt | llm | JsonOutputParser()

def get_score(
self,
question,
response,
retry_attemps=MAX_RETRY_ATTEMPTS,
time_to_breath=TIME_TO_BREATH,
):
"""Calculate relevancy score."""
# Generate relevant questions.
for retry_counter in range(retry_attemps):
try:
out = self._judge_llm.invoke(
{"answer": response, "num_questions": N_QUESTIONS}
)
break
except Exception:
if retry_counter == retry_attemps - 1:
out = None  # Continue without a result
# raise
sleep(time_to_breath)

if out:
valid_flag = out["Valid"]
gen_questions = out["Question"]
score = 0
if valid_flag == 1:
org_vec = self._embedding_model.get_text_embedding(question)
score = mean(
[
1
- cosine(
org_vec,
self._embedding_model.get_text_embedding(gen_question),
)
for gen_question in gen_questions
]
)

return score, valid_flag, "\n".join(gen_questions)

return None, None, None
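The scoring logic in get_score boils down to: embed the original user question, embed each question the judge generated from the response, and average their cosine similarities (1 minus the cosine distance). A toy illustration with made-up 3-dimensional embeddings, not code from this PR:

```python
from statistics import mean

from scipy.spatial.distance import cosine

org_vec = [0.9, 0.1, 0.0]                      # embedding of the user's question
gen_vecs = [[0.8, 0.2, 0.0], [0.1, 0.9, 0.0]]  # embeddings of judge-generated questions

# Mean cosine similarity, mirroring get_score: 1 - cosine distance per generated question.
score = mean(1 - cosine(org_vec, gen_vec) for gen_vec in gen_vecs)
print(round(score, 3))
```

A response that sticks to what was asked yields generated questions close to the original one, so the mean similarity (and therefore the relevancy score) is high; an off-topic or noncommittal response drives it down, and if the judge marks the answer invalid the score is set to 0.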
4 changes: 3 additions & 1 deletion scripts/evaluation/utils/response.py
@@ -39,7 +39,9 @@ def get_model_response(query, provider, model, mode, api_client=None):
override_params = {
GenericLLMParameters.MAX_TOKENS_FOR_RESPONSE: max_resp_tokens
}
- llm = MODEL_OLS_PARAM[provider](model, provider_config, override_params).load()
+ llm = MODEL_OLS_PARAM[provider_config.type](
+ model, provider_config, override_params
+ ).load()
if mode == "ols_prompt":
prompt, prompt_input = GeneratePrompt(query, [], []).generate_prompt(model)
if mode == "ols_rag":
24 changes: 21 additions & 3 deletions scripts/evaluation/utils/score.py
@@ -8,14 +8,21 @@
class ResponseScore:
"""Calculate response score."""

- def __init__(self):
+ def __init__(self, metrics):
"""Initialize."""
self._embedding_model = HuggingFaceEmbedding(
"sentence-transformers/all-mpnet-base-v2"
)
self._rouge_scorer = RougeScorer(["rougeL"], use_stemmer=True)

- def calculate_scores(self, answer, response):
+ self._relevancy_scorer = None
+ if "answer_relevancy" in metrics:
+ # Importing here to avoid setting up judge LLM in config, if not required.
+ from .relevancy_score import AnswerRelevancyScore
+
+ self._relevancy_scorer = AnswerRelevancyScore(self._embedding_model)
+
+ def calculate_scores(self, query, answer, response):
"""Calculate different similarity scores for two strings."""
res_vec = self._embedding_model.get_text_embedding(response)
ans_vec = self._embedding_model.get_text_embedding(answer)
@@ -30,11 +37,18 @@ def calculate_scores(self, answer, response):
# text based scores
rouge_score = self._rouge_scorer.score(target=answer, prediction=response)

+ relevancy_score = answer_valid_flag = generated_questions = None
+ if self._relevancy_scorer:
+ relevancy_score, answer_valid_flag, generated_questions = (
+ self._relevancy_scorer.get_score(query, response)
+ )
+
print(
f"cos_score: {cos_score}, "
f"euc_score: {euc_score}, "
f"len_score: {len_score}, "
f"rouge_score: {rouge_score}"
f"rouge_score: {rouge_score}, "
f"relevancy_score: {relevancy_score}"
)
return (
cos_score,
@@ -43,4 +57,8 @@ def calculate_scores(self, answer, response):
rouge_score["rougeL"].precision,
rouge_score["rougeL"].recall,
rouge_score["rougeL"].fmeasure,
+ relevancy_score,
+ # Return additional information
+ answer_valid_flag,
+ generated_questions,
)
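Taken together, the changes to ResponseScore alter its public surface: metrics are now chosen at construction time and calculate_scores needs the original query as well as the reference answer and the model response. A possible usage sketch, assuming the evaluation dependencies are installed, the embedding model can be downloaded, and (for answer_relevancy) a judge provider/model is available in the loaded OLS config; none of this snippet appears in the PR.

```python
# Assumes this runs from scripts/evaluation/ with olsconfig.yaml already loaded
# by the surrounding evaluation script (needed for the answer_relevancy judge).
from utils.score import ResponseScore

scorer = ResponseScore(metrics=["answer_relevancy"])

(
    cos_score,
    euc_score,
    len_score,
    rouge_precision,
    rouge_recall,
    rouge_f1,
    answer_relevancy,
    answer_valid_flag,
    generated_questions,
) = scorer.calculate_scores(
    query="What is OpenShift Virtualization?",
    answer="It lets you run virtual machines alongside containers on OpenShift.",
    response="OpenShift Virtualization runs VMs next to container workloads.",
)

print(answer_relevancy, answer_valid_flag)
```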
