Relevancy score: refactoring
tisnik committed Nov 11, 2024
1 parent 148acd4 commit f7bed2b
Showing 6 changed files with 160 additions and 28 deletions.
43 changes: 24 additions & 19 deletions scripts/evaluation/response_evaluation.py
@@ -44,7 +44,7 @@ def __init__(self, eval_args, api_client):
self._load_config_and_rag() # Set global config
self._input_dir, self._result_dir = self._set_directories()

self._scorer = ResponseScore()
self._scorer = ResponseScore(self._args.eval_metrics)

# Load data
with open(os.path.join(self._input_dir, DEFAULT_QNA_FILE)) as qna_f:
@@ -71,7 +71,7 @@ def _load_config_and_rag(self):
if len(set(self._args.eval_modes) - {"ols"}) > 0:
# load config separately
# Use OLS config file to set provider/model related config. Ex: credential/url
cfg_file = os.environ.get("RCS_CONFIG_FILE", DEFAULT_CONFIG_FILE)
cfg_file = os.environ.get("OLS_CONFIG_FILE", DEFAULT_CONFIG_FILE)
config.reload_from_yaml_file(cfg_file)

if "ols_rag" in self._args.eval_modes:
@@ -101,7 +101,7 @@ def _load_qna_pool_parquet(self):
columns={"ID": "query_id", "Question": "question", "Answer": "answer"}
)
qna_pool_df["query_id"] = "qna" + qna_pool_df["query_id"].astype(str)
qna_pool_df["query_source"].append("doc")
qna_pool_df["query_source"] = "doc"
qna_pool_df["consistency_cutoff"] = EVAL_THRESHOLD
qna_pool_df["in_use"] = True
return qna_pool_df
@@ -131,9 +131,9 @@ def _restructure_qna_pool_json(self, provider_model_id):
qna_pool_dict["question"].append(question)
qna_pool_dict["answer"].append(answer)
qna_pool_dict["query_source"].append("transcript")
qna_pool_dict["doc_source"].append(None)
qna_pool_dict["doc_title"].append(None)
qna_pool_dict["doc_page"].append(None)
qna_pool_dict["doc_source"].append("NA")
qna_pool_dict["doc_title"].append("NA")
qna_pool_dict["doc_page"].append("NA")
qna_pool_dict["consistency_cutoff"].append(consistency_cutoff)
qna_pool_dict["in_use"].append(in_use)

@@ -236,21 +236,27 @@ def _get_model_response(self, qna_pool_df, provider_model_id, eval_mode):
def _get_evaluation_score(self, qna_pool_df):
"""Get response evaluation score."""
print("Getting evaluation scores...")
qna_pool_df[
[
"cos_score",
"euc_score",
"len_score",
"rougeL_precision",
"rougeL_recall",
"rougeL_f1",
]
] = qna_pool_df.progress_apply(
lambda row: self._scorer.calculate_scores(row.answer, row.response),
# Default scores
score_cols = [
"cos_score",
"euc_score",
"len_score",
"rougeL_precision",
"rougeL_recall",
"rougeL_f1",
"answer_relevancy",
# Supporting data
"answer_valid_flag",
"generated_questions",
]
qna_pool_df[score_cols] = qna_pool_df.progress_apply(
lambda row: self._scorer.calculate_scores(
row.question, row.answer, row.response
),
axis=1,
result_type="expand",
)
return qna_pool_df
return qna_pool_df.dropna(axis=1, how="all")

def _get_response_with_score(self):
"""Get responses with scores."""
@@ -280,7 +286,6 @@ def _condense_eval_df(result_df):
"doc_source",
"doc_title",
"doc_page",
"consistency_cutoff",
],
columns=["eval_mode", "provider_model_id"],
).swaplevel(0, axis=1)
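For reference, a minimal standalone sketch (not part of this commit) of the pandas pattern `_get_evaluation_score` now relies on: each per-row tuple is expanded into the named score columns, and metric columns that were never computed end up all-empty and are removed by `dropna(axis=1, how="all")`. The column names and scoring function below are placeholders.

```python
import pandas as pd

df = pd.DataFrame({"answer": ["a1", "a2"], "response": ["r1", "r2"]})

def fake_scores(row):
    # Stand-in for ResponseScore.calculate_scores; the last value is None
    # when an optional metric such as answer_relevancy is not enabled.
    return 0.9, 0.1, None

# Each tuple element becomes one of the listed columns.
df[["cos_score", "euc_score", "answer_relevancy"]] = df.apply(
    fake_scores, axis=1, result_type="expand"
)

# Columns that are entirely empty are dropped, mirroring the new return value.
df = df.dropna(axis=1, how="all")
print(df.columns.tolist())  # ['answer', 'response', 'cos_score', 'euc_score']
```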
16 changes: 11 additions & 5 deletions scripts/evaluation/utils/constants.py
@@ -4,12 +4,14 @@
INSCOPE_MODELS = {
"bam+ibm/granite-13b-chat-v2": ("bam", "ibm/granite-13b-chat-v2"),
"watsonx+ibm/granite-13b-chat-v2": ("watsonx", "ibm/granite-13b-chat-v2"),
"openai+gpt-3.5-turbo": ("openai", "gpt-3.5-turbo"),
"watsonx+ibm/granite-3-2b-instruct": ("watsonx", "ibm/granite-3-2b-instruct"),
"watsonx+ibm/granite-3-8b-instruct": ("watsonx", "ibm/granite-3-2b-instruct"),
"openai+gpt-4o-mini": ("openai", "gpt-4o-mini"),
"azure_openai+gpt-3.5-turbo": ("azure_openai", "gpt-3.5-turbo"),
"azure_openai+gpt-3.5-turbo-4k": ("azure_openai", "gpt-3.5-turbo"),
"azure_openai+gpt-3.5-turbo-16k": ("azure_openai", "gpt-3.5-turbo"),
"openai+gpt-4o": ("openai", "gpt-4o"),
"azure_openai+gpt-4o-mini": ("azure_openai", "gpt-4o-mini"),
"azure_openai+gpt-4o": ("azure_openai", "gpt-4o"),
"ollama+llama3.1:latest": ("ollama", "llama3.1:latest"),
"ollama+mistral": ("ollama", "mistral"),
}

SCORE_DESCRIPTION = {
Expand All @@ -19,6 +21,7 @@
"rougeL_precision": "RougeL Precision Score",
"rougeL_recall": "RougeL Recall Score",
"rougeL_f1": "RougeL F1 Score",
"answer_relevancy": "Answer relevancy score against query",
}

EVAL_MODES = {
Expand All @@ -36,7 +39,7 @@
"""

DEFAULT_QNA_FILE = "question_answer_pair.json"
DEFAULT_CONFIG_FILE = "rcsconfig.yaml"
DEFAULT_CONFIG_FILE = "olsconfig.yaml"

DEFAULT_INPUT_DIR = "eval_data"
DEFAULT_RESULT_DIR = "eval_result"
@@ -48,3 +51,6 @@

# Cut-off similarity score used for response evaluation.
EVAL_THRESHOLD = 0.3 # low score is better

# Number of related questions to be generated.
N_QUESTIONS = 2
24 changes: 24 additions & 0 deletions scripts/evaluation/utils/prompts.py
@@ -0,0 +1,24 @@
# ruff: noqa: E501
"""Prompt templates/constants."""

# Below is inspired by both ragas & langchain internal/example prompts.
ANSWER_RELEVANCY_PROMPT = """
You are a helpful assistant. Your task is to analyze the answer and come up with questions based on it.
Given the following answer, delimited by three backticks, please generate {num_questions} questions.
Each question should be concise, based explicitly on the information present in the answer, and ask about one thing at a time.
Give Valid as 1 if the answer is valid and 0 if the answer is invalid. An invalid answer is one that is evasive, vague, or ambiguous. For example, "I don't know" or "I'm not sure" are noncommittal answers.
When formulating a question, don't include text like "from the provided context", "as described in the document", "according to the given document" or anything similar. Also don't add a sequence number to the question.
Use the JSON format below for your response. Do not add any additional text apart from the JSON output.
{{
Question: [
QUESTION 1,
QUESTION 2,
],
Valid: 0 or 1
}}
```
{answer}
```
"""
77 changes: 77 additions & 0 deletions scripts/evaluation/utils/relevancy_score.py
@@ -0,0 +1,77 @@
"""Relevancy score calculation."""

from statistics import mean
from time import sleep

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from scipy.spatial.distance import cosine

from ols import config

from .constants import MAX_RETRY_ATTEMPTS, N_QUESTIONS, TIME_TO_BREATH
from .models import VANILLA_MODEL
from .prompts import ANSWER_RELEVANCY_PROMPT


class AnswerRelevancyScore:
"""Calculate response/answer relevancy score."""

def __init__(
self, embedding_model, judge_provider="ollama", judge_model="llama3.1:latest"
):
"""Initialize."""
self._embedding_model = embedding_model
self._judge_llm = self._judge_llm_init(judge_provider, judge_model)

@staticmethod
def _judge_llm_init(judge_provider, judge_model):
"""Load judge LLM."""
# Provider/model should be in config yaml
provider_config = config.config.llm_providers.providers[judge_provider]
llm = VANILLA_MODEL[provider_config.type](judge_model, provider_config).load()

prompt = PromptTemplate.from_template(ANSWER_RELEVANCY_PROMPT)
return prompt | llm | JsonOutputParser()

def get_score(
self,
question,
response,
retry_attemps=MAX_RETRY_ATTEMPTS,
time_to_breath=TIME_TO_BREATH,
):
"""Calculate relevancy score."""
# Generate relevant questions.
for retry_counter in range(retry_attemps):
try:
out = self._judge_llm.invoke(
{"answer": response, "num_questions": N_QUESTIONS}
)
break
except Exception:
if retry_counter == retry_attemps - 1:
out = None  # Continue without a result
# raise
sleep(time_to_breath)

if out:
valid_flag = out["Valid"]
gen_questions = out["Question"]
score = 0
if valid_flag == 1:
org_vec = self._embedding_model.get_text_embedding(question)
score = mean(
[
1
- cosine(
org_vec,
self._embedding_model.get_text_embedding(gen_question),
)
for gen_question in gen_questions
]
)

return score, valid_flag, "\n".join(gen_questions)

return None, None, None
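The scoring math in isolation (not part of the commit): the relevancy score is the mean cosine similarity between the embedding of the original question and the embeddings of the questions generated from the response. The vectors below are made-up placeholders for `get_text_embedding` output.

```python
from statistics import mean

from scipy.spatial.distance import cosine

org_vec = [0.9, 0.1, 0.0]  # embedding of the original question (placeholder)
gen_vecs = [
    [0.8, 0.2, 0.0],  # embeddings of the generated questions (placeholders)
    [0.1, 0.9, 0.0],
]

# cosine() is a distance, so 1 - cosine() is the similarity averaged above.
score = mean(1 - cosine(org_vec, gen_vec) for gen_vec in gen_vecs)
print(round(score, 3))
```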
4 changes: 3 additions & 1 deletion scripts/evaluation/utils/response.py
@@ -39,7 +39,9 @@ def get_model_response(query, provider, model, mode, api_client=None):
override_params = {
GenericLLMParameters.MAX_TOKENS_FOR_RESPONSE: max_resp_tokens
}
llm = MODEL_OLS_PARAM[provider](model, provider_config, override_params).load()
llm = MODEL_OLS_PARAM[provider_config.type](
model, provider_config, override_params
).load()
if mode == "ols_prompt":
prompt, prompt_input = GeneratePrompt(query, [], []).generate_prompt(model)
if mode == "ols_rag":
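For context (not part of the commit), a toy illustration of why the lookup switched from the provider name to `provider_config.type`: provider names in the config file are arbitrary labels, while the loader table is keyed by provider type, so a custom name would miss under the old lookup. The classes and names below are placeholders, not the repository's actual `MODEL_OLS_PARAM` contents.

```python
class OpenAILoader: ...
class AzureOpenAILoader: ...

# Hypothetical loader table keyed by provider *type*.
MODEL_LOADERS = {"openai": OpenAILoader, "azure_openai": AzureOpenAILoader}

provider_name = "my_openai"  # arbitrary label chosen in the config file
provider_type = "openai"     # fixed type string of that provider

assert provider_name not in MODEL_LOADERS  # old-style lookup by name would fail
loader_cls = MODEL_LOADERS[provider_type]  # lookup by type always resolves
```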
24 changes: 21 additions & 3 deletions scripts/evaluation/utils/score.py
@@ -8,14 +8,21 @@
class ResponseScore:
"""Calculate response score."""

def __init__(self):
def __init__(self, metrics):
"""Initialize."""
self._embedding_model = HuggingFaceEmbedding(
"sentence-transformers/all-mpnet-base-v2"
)
self._rouge_scorer = RougeScorer(["rougeL"], use_stemmer=True)

def calculate_scores(self, answer, response):
self._relevancy_scorer = None
if "answer_relevancy" in metrics:
# Import here so a judge LLM only has to be configured when this metric is enabled.
from .relevancy_score import AnswerRelevancyScore

self._relevancy_scorer = AnswerRelevancyScore(self._embedding_model)

def calculate_scores(self, query, answer, response):
"""Calculate different similarity scores for two strings."""
res_vec = self._embedding_model.get_text_embedding(response)
ans_vec = self._embedding_model.get_text_embedding(answer)
@@ -30,11 +37,18 @@ def calculate_scores(self, answer, response):
# text based scores
rouge_score = self._rouge_scorer.score(target=answer, prediction=response)

relevancy_score = answer_valid_flag = generated_questions = None
if self._relevancy_scorer:
relevancy_score, answer_valid_flag, generated_questions = (
self._relevancy_scorer.get_score(query, response)
)

print(
f"cos_score: {cos_score}, "
f"euc_score: {euc_score}, "
f"len_score: {len_score}, "
f"rouge_score: {rouge_score}"
f"rouge_score: {rouge_score}, "
f"relevancy_score: {relevancy_score}"
)
return (
cos_score,
@@ -43,4 +57,8 @@
rouge_score["rougeL"].precision,
rouge_score["rougeL"].recall,
rouge_score["rougeL"].fmeasure,
relevancy_score,
# Return additional information
answer_valid_flag,
generated_questions,
)

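A minimal usage sketch (not part of the commit), assuming the OLS config has already been loaded the way `response_evaluation.py` does and that it defines the judge provider/model needed for `answer_relevancy`; the import path and example strings are illustrative only.

```python
from utils.score import ResponseScore  # import path assumed for illustration

scorer = ResponseScore(metrics=["answer_relevancy"])
(
    cos_score, euc_score, len_score,
    rougeL_precision, rougeL_recall, rougeL_f1,
    relevancy_score, answer_valid_flag, generated_questions,
) = scorer.calculate_scores(
    query="How do I scale a deployment?",
    answer="Use `oc scale deployment/<name> --replicas=N`.",
    response="Run `oc scale` and set the desired replica count.",
)
```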