Merge pull request #97 from tisnik/answer-evalutation-fixes
Answer evaluation fixes
Showing 7 changed files with 163 additions and 30 deletions.
@@ -0,0 +1,24 @@
# ruff: noqa: E501
"""Prompt templates/constants."""


# Below is inspired by both ragas & langchain internal/example prompts.
ANSWER_RELEVANCY_PROMPT = """
You are a helpful assistant. Your task is to analyze the answer and come up with questions from the given answer.
Given the following answer delimited by three backticks, please generate {num_questions} questions.
A question should be concise and based explicitly on the information present in the answer. It should ask about one thing at a time.
Give Valid as 1 if the answer is valid and 0 if the answer is invalid. An invalid answer is one that is evasive, vague, or ambiguous. For example, "I don't know" or "I'm not sure" are noncommittal answers.
When formulating a question, don't include text like "from the provided context", "as described in the document", "according to the given document" or anything similar. Also don't add a sequence number to the question.
Use the JSON format below for your response. Do not add any additional text apart from the JSON output.
{{
Question: [
QUESTION 1,
QUESTION 2,
],
Valid: 0 or 1
}}
```
{answer}
```
"""
@@ -0,0 +1,77 @@
"""Relevancy score calculation."""

from statistics import mean
from time import sleep

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from scipy.spatial.distance import cosine

from ols import config

from .constants import MAX_RETRY_ATTEMPTS, N_QUESTIONS, TIME_TO_BREATH
from .models import VANILLA_MODEL
from .prompts import ANSWER_RELEVANCY_PROMPT


class AnswerRelevancyScore:
    """Calculate response/answer relevancy score."""

    def __init__(
        self, embedding_model, judge_provider="ollama", judge_model="llama3.1:latest"
    ):
        """Initialize."""
        self._embedding_model = embedding_model
        self._judge_llm = self._judge_llm_init(judge_provider, judge_model)

    @staticmethod
    def _judge_llm_init(judge_provider, judge_model):
        """Load judge LLM."""
        # Provider/model must be present in the config YAML.
        provider_config = config.config.llm_providers.providers[judge_provider]
        llm = VANILLA_MODEL[provider_config.type](judge_model, provider_config).load()

        prompt = PromptTemplate.from_template(ANSWER_RELEVANCY_PROMPT)
        return prompt | llm | JsonOutputParser()

    def get_score(
        self,
        question,
        response,
        retry_attempts=MAX_RETRY_ATTEMPTS,
        time_to_breath=TIME_TO_BREATH,
    ):
        """Calculate relevancy score."""
        # Ask the judge LLM to generate questions from the answer, with retries.
        out = None
        for retry_counter in range(retry_attempts):
            try:
                out = self._judge_llm.invoke(
                    {"answer": response, "num_questions": N_QUESTIONS}
                )
                break
            except Exception:
                if retry_counter == retry_attempts - 1:
                    break  # Out of retries; continue without a result.
                sleep(time_to_breath)  # Back off before the next attempt.

        if out:
            valid_flag = out["Valid"]
            gen_questions = out["Question"]
            score = 0
            if valid_flag == 1:
                # scipy's cosine() is a distance, so 1 - cosine() is the cosine
                # similarity; the score is the mean similarity between the
                # original question and each judge-generated question.
                org_vec = self._embedding_model.get_text_embedding(question)
                score = mean(
                    [
                        1
                        - cosine(
                            org_vec,
                            self._embedding_model.get_text_embedding(gen_question),
                        )
                        for gen_question in gen_questions
                    ]
                )

            return score, valid_flag, "\n".join(gen_questions)

        return None, None, None
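A hypothetical usage sketch follows. The embedding model and the config entries are assumptions: any object exposing a `get_text_embedding(str)` method works (llama-index's `HuggingFaceEmbedding` is used here purely as an example), and the `ollama` provider with `llama3.1:latest` must exist in the loaded OLS config:

```python
# Hypothetical driver for AnswerRelevancyScore; assumes the OLS config is
# already loaded and the judge provider/model are configured.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding  # assumption

embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
scorer = AnswerRelevancyScore(
    embedding_model,
    judge_provider="ollama",
    judge_model="llama3.1:latest",
)

score, valid, questions = scorer.get_score(
    question="What is OpenShift?",
    response="OpenShift is Red Hat's Kubernetes application platform.",
)
# `score` is the mean cosine similarity between the original question and the
# judge-generated questions, `valid` is the judge's 0/1 flag, and `questions`
# is the generated questions joined with newlines; all three are None when
# every judge call failed.
print(score, valid, questions, sep="\n")
```

The intuition behind the metric: if the answer is relevant, questions regenerated from it should embed close to the original question, so the mean cosine similarity approaches 1; an off-topic answer yields dissimilar questions and a score near 0.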