diff --git a/app/__init__.py b/app/__init__.py index 5f4040f..a266343 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -2,10 +2,12 @@ from app.config.elasticsearch_config import create_index_if_not_exists from app.models.sentence_transformer import get_sentence_transformer from app.config.settings import settings +from app.services.reranking import RerankingService embedding_model = get_sentence_transformer() es_client = create_index_if_not_exists(settings.elasticsearch.index_name) +reranker_service = RerankingService() def create_app(): diff --git a/app/config/settings.py b/app/config/settings.py index dfa0684..28b9a41 100644 --- a/app/config/settings.py +++ b/app/config/settings.py @@ -47,4 +47,4 @@ def __init__(self): # Logging setup logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -logging.getLogger("elastic_transport").setLevel(logging.WARNING) \ No newline at end of file +logging.getLogger("elastic_transport").setLevel(logging.WARNING) diff --git a/app/data_models/__init__.py b/app/data_models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/data_models/search_result.py b/app/data_models/search_result.py new file mode 100644 index 0000000..1db20d8 --- /dev/null +++ b/app/data_models/search_result.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass + + +@dataclass +class SearchResult: + score: float + content: str + metadata: dict diff --git a/app/evaluation/retrieval/retrieval_metrics.py b/app/evaluation/retrieval/retrieval_metrics.py index d40c000..5e097b3 100644 --- a/app/evaluation/retrieval/retrieval_metrics.py +++ b/app/evaluation/retrieval/retrieval_metrics.py @@ -13,7 +13,8 @@ def evaluate_resources_summaries_retrieval( qa_references: list[dict], search_text_boost: int = 1, search_embedding_boost: int = 1, - k: int = 5 + k: int = 5, + rerank_top_k: int = 0, ) -> dict: # Initialize counters and sums for metrics total_questions = 0 @@ -29,8 +30,7 @@ def evaluate_resources_summaries_retrieval( reference_resource_id = response["custom_id"] content = response["response"]["body"]["choices"][0]["message"]["content"] - questions_and_answers = json.loads( - content)["questions_and_answers"] + questions_and_answers = json.loads(content)["questions_and_answers"] if len(questions_and_answers) > 0: # Sample one random question per resource_id to evaluate @@ -42,13 +42,15 @@ def evaluate_resources_summaries_retrieval( total_questions += 1 # Query question - search_results = search_query(question, - embedding_model, - es_client, - k=k, - text_boost=search_text_boost, - embedding_boost=search_embedding_boost) - + search_results = search_query( + question, + embedding_model, + es_client, + k=k, + text_boost=search_text_boost, + embedding_boost=search_embedding_boost, + rerank_top_k=rerank_top_k, + ) # Evaluate if any returned chunk belongs to the correct resource_id found = False rank = 0 @@ -59,7 +61,7 @@ def evaluate_resources_summaries_retrieval( if search_results != {"detail": "Not Found"}: for i, result in enumerate(search_results): - if result["metadata"]["resource_id"] == reference_resource_id: + if result.metadata["resource_id"] == reference_resource_id: if not found: total_contexts_found += 1 rank = i + 1 @@ -67,11 +69,10 @@ def evaluate_resources_summaries_retrieval( found = True retrieved_relevant_chunks += 1 elif search_results == {"detail": "Not Found"}: - search_results = {} + search_results = [] # Calculate precision and recall for this specific question - precision = retrieved_relevant_chunks / \ - len(search_results) 
if len(search_results) > 0 else 0 + precision = retrieved_relevant_chunks / len(search_results) if len(search_results) > 0 else 0 recall = retrieved_relevant_chunks / relevant_chunks if relevant_chunks > 0 else 0 precision_sum += precision @@ -81,16 +82,11 @@ def evaluate_resources_summaries_retrieval( position_sum += rank # Calculate final metrics - retrieval_accuracy = round( - total_contexts_found / total_questions, 3) if total_questions > 0 else 0 - average_position = round( - position_sum / total_contexts_found, 3) if total_contexts_found > 0 else 0 - mrr = round(reciprocal_rank_sum / total_questions, - 3) if total_questions > 0 else 0 - average_precision = round( - precision_sum / total_questions, 3) if total_questions > 0 else 0 - average_recall = round(recall_sum / total_questions, - 3) if total_questions > 0 else 0 + retrieval_accuracy = round(total_contexts_found / total_questions, 3) if total_questions > 0 else 0 + average_position = round(position_sum / total_contexts_found, 3) if total_contexts_found > 0 else 0 + mrr = round(reciprocal_rank_sum / total_questions, 3) if total_questions > 0 else 0 + average_precision = round(precision_sum / total_questions, 3) if total_questions > 0 else 0 + average_recall = round(recall_sum / total_questions, 3) if total_questions > 0 else 0 return { # The percentage of questions for which the system successfully retrieved at least one relevant chunk. diff --git a/app/routes/database_endpoints.py b/app/routes/database_endpoints.py index 1180819..e0eddc1 100644 --- a/app/routes/database_endpoints.py +++ b/app/routes/database_endpoints.py @@ -23,8 +23,7 @@ async def bulk_load(file: UploadFile = File(...), text_key: str = Form(...)): json_data = csv_to_dict(data) else: - raise HTTPException( - status_code=400, detail="Unsupported file format. Only JSON and CSV are supported.") + raise HTTPException(status_code=400, detail="Unsupported file format. 
Only JSON and CSV are supported.") try: helpers.bulk( @@ -43,33 +42,25 @@ async def bulk_load(file: UploadFile = File(...), text_key: str = Form(...)): @router.delete("/delete_all_documents") async def delete_all_documents(index_name: str): try: - es_client.delete_by_query(index=index_name, body={ - "query": {"match_all": {}}}) + es_client.delete_by_query(index=index_name, body={"query": {"match_all": {}}}) logger.info(f"All documents deleted from index '{index_name}'") return {"status": "success", "message": f"All documents deleted from index '{index_name}'"} except Exception as e: logger.error(f"Failed to delete documents: {str(e)}") - raise HTTPException( - status_code=500, detail=f"Failed to delete documents: {str(e)}") + raise HTTPException(status_code=500, detail=f"Failed to delete documents: {str(e)}") @router.get("/get_all_documents") -async def get_all_documents(index_name: str = settings.elasticsearch.index_name, - size: int = 2000): +async def get_all_documents(index_name: str = settings.elasticsearch.index_name, size: int = 2000): try: - documents = fetch_all_documents( - index_name=index_name, - es_client=es_client, - size=size) + documents = fetch_all_documents(index_name=index_name, es_client=es_client, size=size) return documents except Exception as e: logger.error(f"Error retrieving documents: {str(e)}") - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Error retrieving documents: {str(e)}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Error retrieving documents: {str(e)}") @router.get("/search") async def search_documents(query: str, k: int = 5, text_boost: float = 0.25, embedding_boost: float = 4.0): - results = search_query(query, embedding_model, es_client, k=k, - text_boost=text_boost, embedding_boost=embedding_boost) + results = search_query(query, embedding_model, es_client, k=k, text_boost=text_boost, embedding_boost=embedding_boost) return results diff --git a/app/routes/evaluation_endpoints.py b/app/routes/evaluation_endpoints.py index 9298cc2..f8731fe 100644 --- a/app/routes/evaluation_endpoints.py +++ b/app/routes/evaluation_endpoints.py @@ -15,44 +15,40 @@ @router.post("/evaluate_retrieval") -async def evaluate_retrieval(file: UploadFile = File(...), - index_name: str = Form( - settings.elasticsearch.index_name), - size: int = Form(2000), - search_text_boost: float = Form(1), - search_embedding_boost: float = Form(1), - k: int = Form(5), - urls_in_resources: bool = Form(None), - questions_with_ids_and_dates: str = Form(None), - chunk_size: int = Form(None), - chunk_overlap: int = Form(None), - clearml_track_experiment: bool = Form(False), - clearml_experiment_name: str = Form("Retrieval evaluation"), - clearml_project_name: str = Form("Fasten")): +async def evaluate_retrieval( + file: UploadFile = File(...), + index_name: str = Form(settings.elasticsearch.index_name), + size: int = Form(2000), + search_text_boost: float = Form(1), + search_embedding_boost: float = Form(1), + k: int = Form(5), + rerank_top_k: int = Form(0), + urls_in_resources: bool = Form(None), + questions_with_ids_and_dates: str = Form(None), + chunk_size: int = Form(None), + chunk_overlap: int = Form(None), + clearml_track_experiment: bool = Form(False), + clearml_experiment_name: str = Form("Retrieval evaluation"), + clearml_project_name: str = Form("Fasten"), +): # Read and process reference questions and answers in JSONL try: qa_references = [] file_data = await file.read() - for line in 
file_data.decode('utf-8').splitlines(): + for line in file_data.decode("utf-8").splitlines(): qa_references.append(json.loads(line)) except json.JSONDecodeError: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON format.") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON format.") # Count total chunks by resource in database try: - documents = fetch_all_documents( - index_name=index_name, - es_client=es_client, - size=size) - id, counts = np.unique([resource["metadata"]["resource_id"] - for resource in documents], return_counts=True) + documents = fetch_all_documents(index_name=index_name, es_client=es_client, size=size) + id, counts = np.unique([resource["metadata"]["resource_id"] for resource in documents], return_counts=True) resources_counts = dict(zip(id, counts)) except Exception as e: logger.error(f"Error retrieving documents: {str(e)}") - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Error retrieving documents: {str(e)}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Error retrieving documents: {str(e)}") # Evaluate retrieval try: if clearml_track_experiment: @@ -60,14 +56,14 @@ async def evaluate_retrieval(file: UploadFile = File(...), "search_text_boost": search_text_boost, "search_embedding_boost": search_embedding_boost, "k": k, + "rerank_top_k": rerank_top_k, "urls_in_resources": urls_in_resources, "questions_with_ids_and_dates": questions_with_ids_and_dates, "chunk_size": chunk_size, - "chunk_overlap": chunk_overlap + "chunk_overlap": chunk_overlap, } unique_task_name = f"{clearml_experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" - task = Task.init(project_name=clearml_project_name, - task_name=unique_task_name) + task = Task.init(project_name=clearml_project_name, task_name=unique_task_name) task.connect(params) retrieval_metrics = evaluate_resources_summaries_retrieval( @@ -77,7 +73,9 @@ async def evaluate_retrieval(file: UploadFile = File(...), qa_references=qa_references, search_text_boost=search_text_boost, search_embedding_boost=search_embedding_boost, - k=k) + k=k, + rerank_top_k=rerank_top_k, + ) # Upload metrics and close task if task: @@ -90,5 +88,6 @@ async def evaluate_retrieval(file: UploadFile = File(...), except Exception as e: logger.error(f"Error during retrieval evaluation: {str(e)}") - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Error during retrieval evaluation: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Error during retrieval evaluation: {str(e)}" + ) diff --git a/app/routes/llm_endpoints.py b/app/routes/llm_endpoints.py index e728b86..6e18d67 100644 --- a/app/routes/llm_endpoints.py +++ b/app/routes/llm_endpoints.py @@ -19,8 +19,9 @@ async def answer_query( query: str, k: int = 5, params=None, stream: bool = False, text_boost: float = 0.25, embedding_boost: float = 4.0 ): - results = search_query(query, embedding_model, es_client, k=k, - text_boost=text_boost, embedding_boost=embedding_boost) + results = search_query( + query, embedding_model, es_client, k=k, text_boost=text_boost, embedding_boost=embedding_boost, rerank_top_k=0 + ) if not results: concatenated_content = "There is no context" else: @@ -41,21 +42,17 @@ async def summarize( resources = await file.read() resources = json.loads(resources) except json.JSONDecodeError: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid 
JSON format.") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON format.") # Process file and summarize resources try: limit = 1 if limit <= 0 else limit if limit: - resources_processed = process_resources( - data=resources, remove_urls=remove_urls)[:limit] + resources_processed = process_resources(data=resources, remove_urls=remove_urls)[:limit] else: - resources_processed = process_resources( - data=resources, remove_urls=remove_urls) + resources_processed = process_resources(data=resources, remove_urls=remove_urls) resources_summarized = summarize_resources(resources_processed, stream) except Exception as e: - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Error during processing: {str(e)}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Error during processing: {str(e)}") # Save resources try: helpers.bulk( diff --git a/app/services/conversation.py b/app/services/conversation.py index cd912b9..7e8aded 100644 --- a/app/services/conversation.py +++ b/app/services/conversation.py @@ -1,19 +1,21 @@ import time +from typing import List from fastapi.responses import StreamingResponse from app.services.llama_client import llm_client from app.config.settings import logger +from app.data_models.search_result import SearchResult -def process_search_output(search_results): +def process_search_output(search_results: List[SearchResult]): logger.info("Processing search results") processed_contents = [] resources_id = [] for result in search_results: - content = result["content"] - resource_id = result["metadata"]["resource_id"] + content = result.content + resource_id = result.metadata["resource_id"] processed_contents.append(content.replace("\\", "")) resources_id.append(resource_id) diff --git a/app/services/reranking.py b/app/services/reranking.py new file mode 100644 index 0000000..91f2cf8 --- /dev/null +++ b/app/services/reranking.py @@ -0,0 +1,22 @@ +from FlagEmbedding import FlagReranker +from typing import List, Tuple + +from app.data_models.search_result import SearchResult + + +class RerankingService: + def __init__(self, model_name="BAAI/bge-reranker-v2-m3"): + self.reranker = FlagReranker(model_name, use_fp16=True) + + def rerank(self, query: str, documents: List[SearchResult]) -> List[Tuple[SearchResult, float]]: + """Computes a score for each document in the list of documents and returns a ranked list of documents. 
+ + :param str query: user query used for ranking + :param List[SearchResult] documents: Documents to be ranked + :return List[Tuple[SearchResult, float]]: list of (document, score) tuples, sorted by score in descending order + """ + scores = self.reranker.compute_score([[query, doc.content] for doc in documents], normalize=True) + print(scores) + + ranked_docs = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True) + return ranked_docs diff --git a/app/services/search_documents.py b/app/services/search_documents.py index 12c1b09..70f11ea 100644 --- a/app/services/search_documents.py +++ b/app/services/search_documents.py @@ -1,4 +1,8 @@ +from typing import List + +from app import reranker_service from app.config.settings import settings +from app.data_models.search_result import SearchResult def search_query( @@ -9,11 +13,11 @@ def search_query( k=5, text_boost=0.25, embedding_boost=4.0, -): - query_embedding = embedding_model.encode(query_text, - show_progress_bar=False).tolist() + rerank_top_k=0, +) -> List[SearchResult]: + query_embedding = embedding_model.encode(query_text, show_progress_bar=False).tolist() query_body = { - "size": k, + "size": max(k, rerank_top_k), "query": { "bool": { "should": [ @@ -41,32 +45,26 @@ } response = es_client.search(index=index_name, body=query_body) results = response["hits"]["hits"] - return [ - { - "score": result["_score"], - "content": str(result["_source"]["content"]), - "metadata": result["_source"].get("metadata", {}), - } + search_results = [ + SearchResult( + score=result["_score"], + content=str(result["_source"]["content"]), + metadata=result["_source"].get("metadata", {}), + ) for result in results ] + if rerank_top_k > 0: + search_results = [result for result, score in reranker_service.rerank(query_text, search_results)[:k]] + return search_results -def fetch_all_documents(es_client, - index_name=settings.elasticsearch.index_name, - size: int = 2000): - query_body = {"query": - { - "match_all": {} - }, - "_source": ["content", "metadata"], - "size": size - } +def fetch_all_documents(es_client, index_name=settings.elasticsearch.index_name, size: int = 2000): + query_body = {"query": {"match_all": {}}, "_source": ["content", "metadata"], "size": size} response = es_client.search(index=index_name, body=query_body) results = response["hits"]["hits"] return [ - {"id": result["_id"], "content": result["_source"].get( - "content", ""), "metadata": result["_source"].get("metadata", {})} + {"id": result["_id"], "content": result["_source"].get("content", ""), "metadata": result["_source"].get("metadata", {})} for result in results ] diff --git a/evaluation/core/evaluators/generation/correctness.py b/evaluation/core/evaluators/generation/correctness.py index 2d9ad90..5ac9aca 100644 --- a/evaluation/core/evaluators/generation/correctness.py +++ b/evaluation/core/evaluators/generation/correctness.py @@ -88,25 +88,24 @@ def run_correctness_eval(self, query_str: str, reference_answer: str, generated_ """ try: user_prompt = CORRECTNESS_USER_TMPL.format( - query=query_str, - reference_answer=reference_answer, - generated_answer=generated_answer) + query=query_str, reference_answer=reference_answer, generated_answer=generated_answer + ) system_prompt = CORRECTNESS_SYS_TMPL - open_ai_response = get_chat_completion(self.openai_api_key, - user_prompt, - system_prompt, - ANSWER_JSON_SCHEMA, - model=self.model, - max_tokens=self.max_tokens) - json_answer = json.loads(open_ai_response.get("choices")[ - 0].get("message").get("content")) + open_ai_response =
get_chat_completion( + self.openai_api_key, user_prompt, system_prompt, ANSWER_JSON_SCHEMA, model=self.model, max_tokens=self.max_tokens + ) + json_answer = json.loads(open_ai_response.get("choices")[0].get("message").get("content")) score = json_answer["score"] reasoning = json_answer["reasoning"] - return {"score": score, "reasoning": reasoning, "passing": score >= self.threshold, } + return { + "score": score, + "reasoning": reasoning, + "passing": score >= self.threshold, + } except json.JSONDecodeError as e: logging.error(f"Failed to decode JSON response: {e}") @@ -120,14 +119,15 @@ def run_correctness_eval(self, query_str: str, reference_answer: str, generated_ logging.error(f"An error occurred: {e}") return {"score": None, "passing": None, "reasoning": "An unexpected error occurred"} - def run_batch_evaluation(self, - df: pd.DataFrame, - output_file: str, - query_column: str, - reference_answer_column: str, - generated_answer_column: str, - resource_id_column: str - ): + def run_batch_evaluation( + self, + df: pd.DataFrame, + output_file: str, + query_column: str, + reference_answer_column: str, + generated_answer_column: str, + resource_id_column: str, + ): """ Runs correctness evaluation on a batch of queries, reference answers, and generated answers. Saves results incrementally to avoid data loss in case of failure. @@ -143,9 +143,8 @@ def run_batch_evaluation(self, # Determine if the file already exists file_exists = os.path.isfile(output_file) - with open(output_file, mode='a', newline='') as file: - writer = csv.DictWriter( - file, fieldnames=[resource_id_column, 'score', 'reasoning', 'passing']) + with open(output_file, mode="a", newline="") as file: + writer = csv.DictWriter(file, fieldnames=[resource_id_column, "score", "reasoning", "passing"]) # Write header only if the file does not exist if not file_exists: @@ -154,9 +153,8 @@ def run_batch_evaluation(self, try: for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing correctness"): result = self.run_correctness_eval( - row[query_column], - row[reference_answer_column], - row[generated_answer_column]) + row[query_column], row[reference_answer_column], row[generated_answer_column] + ) result[resource_id_column] = row[resource_id_column] # Write the result to the CSV file writer.writerow(result) @@ -171,7 +169,6 @@ def run_batch_evaluation(self, # Load the results back into a DataFrame and concatenate with the original results_df = pd.read_csv(output_file) - correctnes_mean_score = round(results_df["score"].sum( - ) / (len(results_df) * 5), 2) + correctnes_mean_score = round(results_df["score"].sum() / (len(results_df) * 5), 2) return correctnes_mean_score diff --git a/evaluation/core/evaluators/generation/faithfullness.py b/evaluation/core/evaluators/generation/faithfullness.py index 88daf2c..9f93407 100644 --- a/evaluation/core/evaluators/generation/faithfullness.py +++ b/evaluation/core/evaluators/generation/faithfullness.py @@ -90,24 +90,18 @@ def run_faithfulness_eval(self, generated_answer: str, contexts: str): - dict, containing evaluations on relevancy, accuracy, conciseness and pertinence, and reasoning. 
""" try: - user_prompt = FAITHFULLNESS_USER_TMPL.format(generated_answer=generated_answer, - contexts=contexts) + user_prompt = FAITHFULLNESS_USER_TMPL.format(generated_answer=generated_answer, contexts=contexts) system_prompt = FAITHFULLNESS_SYS_TMPL - open_ai_response = get_chat_completion(self.openai_api_key, - user_prompt, - system_prompt, - ANSWER_JSON_SCHEMA, - model=self.model, - max_tokens=self.max_tokens) + open_ai_response = get_chat_completion( + self.openai_api_key, user_prompt, system_prompt, ANSWER_JSON_SCHEMA, model=self.model, max_tokens=self.max_tokens + ) - json_answer = json.loads(open_ai_response.get("choices")[ - 0].get("message").get("content")) + json_answer = json.loads(open_ai_response.get("choices")[0].get("message").get("content")) relevancy = 1 if json_answer["relevancy"] == "YES" else 0 accuracy = 1 if json_answer["accuracy"] == "YES" else 0 - conciseness_and_pertinence = 1 if json_answer[ - "conciseness_and_pertinence"] == "YES" else 0 + conciseness_and_pertinence = 1 if json_answer["conciseness_and_pertinence"] == "YES" else 0 reasoning = json_answer["reasoning"] return { @@ -123,18 +117,25 @@ def run_faithfulness_eval(self, generated_answer: str, contexts: str): except KeyError as e: logging.error(f"Missing key in JSON response: {e}") - return {"relevancy": None, "accuracy": None, "conciseness_and_pertinence": None, "reasoning": "Incomplete JSON response"} + return { + "relevancy": None, + "accuracy": None, + "conciseness_and_pertinence": None, + "reasoning": "Incomplete JSON response", + } except Exception as e: logging.error(f"An error occurred: {e}") - return {"relevancy": None, "accuracy": None, "conciseness_and_pertinence": None, "reasoning": "An unexpected error occurred"} - - def run_batch_evaluation(self, - df: pd.DataFrame, - output_file: str, - generated_answer_column: str, - contexts_column: str, - resource_id_column: str): + return { + "relevancy": None, + "accuracy": None, + "conciseness_and_pertinence": None, + "reasoning": "An unexpected error occurred", + } + + def run_batch_evaluation( + self, df: pd.DataFrame, output_file: str, generated_answer_column: str, contexts_column: str, resource_id_column: str + ): """ Runs faithfulness evaluation on a batch of generated answers and contexts. Saves results incrementally to avoid data loss in case of failure. 
@@ -149,9 +150,10 @@ def run_batch_evaluation(self, # Determine if the file already exists file_exists = os.path.isfile(output_file) - with open(output_file, mode='a', newline='') as file: - writer = csv.DictWriter(file, fieldnames=[ - resource_id_column, 'relevancy', 'accuracy', 'conciseness_and_pertinence', 'reasoning']) + with open(output_file, mode="a", newline="") as file: + writer = csv.DictWriter( + file, fieldnames=[resource_id_column, "relevancy", "accuracy", "conciseness_and_pertinence", "reasoning"] + ) # Write header only if the file does not exist if not file_exists: @@ -159,9 +161,7 @@ def run_batch_evaluation(self, try: for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing faithfulness"): - result = self.run_faithfulness_eval( - row[generated_answer_column], - row[contexts_column]) + result = self.run_faithfulness_eval(row[generated_answer_column], row[contexts_column]) result[resource_id_column] = row[resource_id_column] # Write the result to the CSV file writer.writerow(result) @@ -177,11 +177,8 @@ def run_batch_evaluation(self, results_df = pd.read_csv(output_file) total_questions = len(results_df) - faithfulness_relevancy = round(results_df["relevancy"].sum( - ) / total_questions, 2) - faithfulness_accuracy = round( - results_df["accuracy"].sum() / total_questions, 2) - faithfulness_conciseness_and_pertinence = round(results_df["conciseness_and_pertinence"].sum( - ) / total_questions, 2) + faithfulness_relevancy = round(results_df["relevancy"].sum() / total_questions, 2) + faithfulness_accuracy = round(results_df["accuracy"].sum() / total_questions, 2) + faithfulness_conciseness_and_pertinence = round(results_df["conciseness_and_pertinence"].sum() / total_questions, 2) return faithfulness_relevancy, faithfulness_accuracy, faithfulness_conciseness_and_pertinence diff --git a/evaluation/core/openai/openai.py b/evaluation/core/openai/openai.py index 1452dd7..e305831 100644 --- a/evaluation/core/openai/openai.py +++ b/evaluation/core/openai/openai.py @@ -25,20 +25,17 @@ def get_chat_completion( - dict, the response from the OpenAI API. 
""" - headers = {"Content-Type": "application/json", - "Authorization": f"Bearer {openai_api_key}"} + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {openai_api_key}"} payload = { "model": model, "messages": [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}], "response_format": answer_json_schema, - "max_tokens": max_tokens + "max_tokens": max_tokens, } try: - response = requests.post(endpoint_url, - headers=headers, - json=payload) + response = requests.post(endpoint_url, headers=headers, json=payload) response.raise_for_status() return response.json() except requests.exceptions.HTTPError as http_err: @@ -49,42 +46,39 @@ def get_chat_completion( return None -def get_total_tokens_from_string(string: str, - encoding_name: str = "o200k_base") -> int: +def get_total_tokens_from_string(string: str, encoding_name: str = "o200k_base") -> int: """Returns the number of tokens in a text string.""" encoding = tiktoken.get_encoding(encoding_name) return len(encoding.encode(string)) -def calculate_total_tokens(df, - query_column, - generated_answer_column, - contexts_column, - reference_answer_column, - system_template: str, - user_template: str, - template_type: str, - encoding_name: str) -> int: +def calculate_total_tokens( + df, + query_column, + generated_answer_column, + contexts_column, + reference_answer_column, + system_template: str, + user_template: str, + template_type: str, + encoding_name: str, +) -> int: """Calculates the total number of tokens required for the batch based on the type of evaluation.""" total_tokens = 0 for _, row in df.iterrows(): - if template_type == 'correctness': + if template_type == "correctness": user_prompt = user_template.format( query=row[query_column], reference_answer=row[reference_answer_column], - generated_answer=row[generated_answer_column] - ) - elif template_type == 'faithfulness': - user_prompt = user_template.format( generated_answer=row[generated_answer_column], - contexts=row[contexts_column] ) + elif template_type == "faithfulness": + user_prompt = user_template.format(generated_answer=row[generated_answer_column], contexts=row[contexts_column]) system_prompt = system_template complete_prompt = system_prompt + user_prompt - total_tokens += get_total_tokens_from_string( - complete_prompt, encoding_name=encoding_name) + total_tokens += get_total_tokens_from_string(complete_prompt, encoding_name=encoding_name) return total_tokens @@ -111,13 +105,11 @@ def calculate_api_costs( """ # Calculate input token costs - input_cost = round(total_input_tokens * - (cost_per_million_input_tokens / 1_000_000), 3) + input_cost = round(total_input_tokens * (cost_per_million_input_tokens / 1_000_000), 3) # Calculate total output tokens and their costs total_output_tokens = total_openai_requests * tokens_per_response - output_cost = round(total_output_tokens * - (cost_per_million_output_tokens / 1_000_000), 3) + output_cost = round(total_output_tokens * (cost_per_million_output_tokens / 1_000_000), 3) # Calculate the total cost total_cost = round(input_cost + output_cost, 2) @@ -127,5 +119,5 @@ def calculate_api_costs( "input_cost": input_cost, "output_cost": output_cost, "total_input_tokens": total_input_tokens, - "total_output_tokens": total_output_tokens + "total_output_tokens": total_output_tokens, } diff --git a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/estimate_costs.py b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/estimate_costs.py index 
0b3335f..dfa7880 100644 --- a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/estimate_costs.py +++ b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/estimate_costs.py @@ -13,14 +13,11 @@ with open(INPUT_PATH, "r") as config_file: config = json.load(config_file) -DATA_DIR = os.path.abspath(os.path.join( - os.path.dirname(__file__), "..", "..", "..", "data")) +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "data")) # Files -REFERENCE_ANSWERS_FILE = os.path.join( - DATA_DIR, "openai_outputs", config["REFERENCE_ANSWERS_FILE"]) -GENERATED_ANSWERS_FILE = os.path.join( - DATA_DIR, "rag_generation", config["GENERATED_ANSWERS_FILE"]) +REFERENCE_ANSWERS_FILE = os.path.join(DATA_DIR, "openai_outputs", config["REFERENCE_ANSWERS_FILE"]) +GENERATED_ANSWERS_FILE = os.path.join(DATA_DIR, "rag_generation", config["GENERATED_ANSWERS_FILE"]) # Interest columns QUERY_COLUMN = config.get("QUERY_COLUMN") @@ -30,27 +27,27 @@ # Output costs OUTPUT_COSTS = os.path.join(os.path.dirname(__file__), "data", "output_estimate_costs.txt") -COST_PER_MILLION_INPUT_TOKENS= config.get("COST_PER_MILLION_INPUT_TOKENS") -COST_PER_MILLION_OUTPUT_TOKENS= config.get("COST_PER_MILLION_OUTPUT_TOKENS") +COST_PER_MILLION_INPUT_TOKENS = config.get("COST_PER_MILLION_INPUT_TOKENS") +COST_PER_MILLION_OUTPUT_TOKENS = config.get("COST_PER_MILLION_OUTPUT_TOKENS") def main(): # Read data - generated_answers= pd.read_csv(GENERATED_ANSWERS_FILE) - reference_answers= pd.read_csv(REFERENCE_ANSWERS_FILE) + generated_answers = pd.read_csv(GENERATED_ANSWERS_FILE) + reference_answers = pd.read_csv(REFERENCE_ANSWERS_FILE) generated_answers[REFERENCE_ANSWER_COLUMN] = reference_answers[REFERENCE_ANSWER_COLUMN] # Calculate total tokens - total_tokens_input_correctness= calculate_total_tokens( - df = generated_answers, + total_tokens_input_correctness = calculate_total_tokens( + df=generated_answers, query_column=QUERY_COLUMN, generated_answer_column=GENERATED_ANSWER_COLUMN, contexts_column=CONTEXTS_COLUMN, reference_answer_column=REFERENCE_ANSWER_COLUMN, - system_template = CORRECTNESS_SYS_TMPL, - user_template = CORRECTNESS_USER_TMPL, - template_type = 'correctness', - encoding_name = 'o200k_base' + system_template=CORRECTNESS_SYS_TMPL, + user_template=CORRECTNESS_USER_TMPL, + template_type="correctness", + encoding_name="o200k_base", ) total_tokens_input_faithfulness = calculate_total_tokens( df=generated_answers, @@ -60,8 +57,8 @@ def main(): reference_answer_column=REFERENCE_ANSWER_COLUMN, system_template=FAITHFULLNESS_SYS_TMPL, user_template=FAITHFULLNESS_USER_TMPL, - template_type='faithfulness', - encoding_name='o200k_base' + template_type="faithfulness", + encoding_name="o200k_base", ) # Calculate costs @@ -83,31 +80,23 @@ def main(): ) # Save total costs - with open(OUTPUT_COSTS, 'w') as f: + with open(OUTPUT_COSTS, "w") as f: f.write("Total Estimated Costs:\n") - f.write( - f"${cost_estimate_correctness['total_cost'] + cost_estimate_faithfulness['total_cost']}\n\n\n") + f.write(f"${cost_estimate_correctness['total_cost'] + cost_estimate_faithfulness['total_cost']}\n\n\n") f.write("Correctness Evaluation Costs:\n") - f.write( - f"Estimated Total Costs: ${cost_estimate_correctness['total_cost']}\n") + f.write(f"Estimated Total Costs: ${cost_estimate_correctness['total_cost']}\n") f.write(f"Input Costs: ${cost_estimate_correctness['input_cost']}\n") - f.write( - f"Output Costs: ${cost_estimate_correctness['output_cost']}\n") - f.write( - f"Total Input Tokens: 
{cost_estimate_correctness['total_input_tokens']}\n") - f.write( - f"Total Output Tokens: {cost_estimate_correctness['total_output_tokens']}\n\n") + f.write(f"Output Costs: ${cost_estimate_correctness['output_cost']}\n") + f.write(f"Total Input Tokens: {cost_estimate_correctness['total_input_tokens']}\n") + f.write(f"Total Output Tokens: {cost_estimate_correctness['total_output_tokens']}\n\n") f.write("\nFaithfulness Evaluation Costs:\n") - f.write( - f"Estimated Total Costs: ${cost_estimate_faithfulness['total_cost']}\n") + f.write(f"Estimated Total Costs: ${cost_estimate_faithfulness['total_cost']}\n") f.write(f"Input Costs: ${cost_estimate_faithfulness['input_cost']}\n") - f.write( - f"Output Costs: ${cost_estimate_faithfulness['output_cost']}\n") - f.write( - f"Total Input Tokens: {cost_estimate_faithfulness['total_input_tokens']}\n") - f.write( - f"Total Output Tokens: {cost_estimate_faithfulness['total_output_tokens']}\n") + f.write(f"Output Costs: ${cost_estimate_faithfulness['output_cost']}\n") + f.write(f"Total Input Tokens: {cost_estimate_faithfulness['total_input_tokens']}\n") + f.write(f"Total Output Tokens: {cost_estimate_faithfulness['total_output_tokens']}\n") + if __name__ == "__main__": main() diff --git a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/generate_responses.py b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/generate_responses.py index 5c4e9fd..8243678 100644 --- a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/generate_responses.py +++ b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/generate_responses.py @@ -9,13 +9,10 @@ with open(os.path.join(os.path.dirname(__file__), "data", "input_generate_responses.json"), "r") as config_file: config = json.load(config_file) -DATA_DIR = os.path.abspath(os.path.join( - os.path.dirname(__file__), "..", "..", "data")) +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "data")) JSONL_FILE = os.path.join(DATA_DIR, "openai_outputs", config["JSONL_FILE"]) -JSONL_OUTPUT_CSV = os.path.join( - DATA_DIR, "openai_outputs", config["JSONL_FILE"].replace(".jsonl", ".csv")) -RAG_OUTPUT_CSV = os.path.join( - DATA_DIR, "rag_generation", config["RAG_OUTPUT_CSV"]) +JSONL_OUTPUT_CSV = os.path.join(DATA_DIR, "openai_outputs", config["JSONL_FILE"].replace(".jsonl", ".csv")) +RAG_OUTPUT_CSV = os.path.join(DATA_DIR, "rag_generation", config["RAG_OUTPUT_CSV"]) SERVER_URL = config["SERVER_URL"] CORES = config["CORES"] diff --git a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/get_metrics.py b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/get_metrics.py index bb3c59b..f56180f 100644 --- a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/get_metrics.py +++ b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/get_metrics.py @@ -14,31 +14,24 @@ load_dotenv() # Load config -INPUT_PATH = os.path.join(os.path.dirname( - __file__), "data", "input_get_metrics.json") +INPUT_PATH = os.path.join(os.path.dirname(__file__), "data", "input_get_metrics.json") with open(INPUT_PATH, "r") as config_file: config = json.load(config_file) -DATA_DIR = os.path.abspath(os.path.join( - os.path.dirname(__file__), "..", "..", "..", "data")) +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "data")) # Files -REFERENCE_ANSWERS_FILE = os.path.join( - DATA_DIR, "openai_outputs", config["REFERENCE_ANSWERS_FILE"]) -GENERATED_ANSWERS_FILE = 
os.path.join( - DATA_DIR, "rag_generation", config["GENERATED_ANSWERS_FILE"]) +REFERENCE_ANSWERS_FILE = os.path.join(DATA_DIR, "openai_outputs", config["REFERENCE_ANSWERS_FILE"]) +GENERATED_ANSWERS_FILE = os.path.join(DATA_DIR, "rag_generation", config["GENERATED_ANSWERS_FILE"]) # Openai API Key and model to use OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") LLM_MODEL = config.get("LLM_MODEL") # Output files -CORRECTNESS_RESULTS_CSV = os.path.join( - DATA_DIR, "openai_outputs", config["CORRECTNESS_RESULTS_CSV"]) -FAITHFULNESS_RESULTS_CSV = os.path.join( - DATA_DIR, "openai_outputs", config["FAITHFULNESS_RESULTS_CSV"]) -RESULT_METRICS_TXT = os.path.join(os.path.dirname( - __file__), "data", "output_generation_metrics.txt") +CORRECTNESS_RESULTS_CSV = os.path.join(DATA_DIR, "openai_outputs", config["CORRECTNESS_RESULTS_CSV"]) +FAITHFULNESS_RESULTS_CSV = os.path.join(DATA_DIR, "openai_outputs", config["FAITHFULNESS_RESULTS_CSV"]) +RESULT_METRICS_TXT = os.path.join(os.path.dirname(__file__), "data", "output_generation_metrics.txt") # Columns of interest QUERY_COLUMN = config["QUERY_COLUMN"] @@ -68,13 +61,8 @@ def main(): generated_answers[RESOURCE_ID_COLUMN] = reference_answers[RESOURCE_ID_COLUMN] # Instantiate evaluators - correctness_evaluator = CorrectnessEvaluator(OPENAI_API_KEY, - LLM_MODEL, - threshold=4.0, - max_tokens=300) - faithfulness_evaluator = FaithfulnessEvaluator(OPENAI_API_KEY, - LLM_MODEL, - max_tokens=300) + correctness_evaluator = CorrectnessEvaluator(OPENAI_API_KEY, LLM_MODEL, threshold=4.0, max_tokens=300) + faithfulness_evaluator = FaithfulnessEvaluator(OPENAI_API_KEY, LLM_MODEL, max_tokens=300) # Run batch evaluations correctnes_mean_score = correctness_evaluator.run_batch_evaluation( generated_answers, @@ -82,31 +70,28 @@ def main(): QUERY_COLUMN, REFERENCE_ANSWER_COLUMN, GENERATED_ANSWER_COLUMN, - RESOURCE_ID_COLUMN + RESOURCE_ID_COLUMN, ) - faithfulness_relevancy, \ - faithfulness_accuracy, \ - faithfulness_conciseness_and_pertinence = faithfulness_evaluator.run_batch_evaluation( - generated_answers, - FAITHFULNESS_RESULTS_CSV, - GENERATED_ANSWER_COLUMN, - CONTEXTS_COLUMN, - RESOURCE_ID_COLUMN + faithfulness_relevancy, faithfulness_accuracy, faithfulness_conciseness_and_pertinence = ( + faithfulness_evaluator.run_batch_evaluation( + generated_answers, FAITHFULNESS_RESULTS_CSV, GENERATED_ANSWER_COLUMN, CONTEXTS_COLUMN, RESOURCE_ID_COLUMN ) + ) - metrics = {"Correctness mean score": correctnes_mean_score, - "Faithfulness relevancy": faithfulness_relevancy, - "Faithfulness accuracy": faithfulness_accuracy, - "Faithfulness conciseness and pertinence": faithfulness_conciseness_and_pertinence} + metrics = { + "Correctness mean score": correctnes_mean_score, + "Faithfulness relevancy": faithfulness_relevancy, + "Faithfulness accuracy": faithfulness_accuracy, + "Faithfulness conciseness and pertinence": faithfulness_conciseness_and_pertinence, + } # Save metrics to txt - with open(RESULT_METRICS_TXT, 'w') as f: + with open(RESULT_METRICS_TXT, "w") as f: f.write(f"Correctness score: {correctnes_mean_score}\n\n") f.write(f"Faithfulness relevancy score: {faithfulness_relevancy}\n") f.write(f"Faithfulness accuracy score: {faithfulness_accuracy}\n") - f.write( - f"Faithfulness conciseness_and_pertinence score: {faithfulness_conciseness_and_pertinence}\n") + f.write(f"Faithfulness conciseness_and_pertinence score: {faithfulness_conciseness_and_pertinence}\n") # Upload metrics and artifacts to ClearML if task: @@ -120,8 +105,7 @@ def main(): if UPLOAD_ARTIFACTS: for artifact_name, 
file_path in artifact_files.items(): - task.upload_artifact(name=artifact_name, - artifact_object=file_path) + task.upload_artifact(name=artifact_name, artifact_object=file_path) task.close()
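For reference, a minimal sketch of how the new reranking path can be exercised on its own, assuming the BAAI/bge-reranker-v2-m3 weights can be downloaded on first use and the app package's startup dependencies (Elasticsearch, the sentence-transformer model) are reachable, since importing from app runs app/__init__.py. The query, document contents, and resource IDs below are hypothetical.

from app.data_models.search_result import SearchResult
from app.services.reranking import RerankingService

# Candidates shaped like the hybrid-search output of search_query(), before reranking.
candidates = [
    SearchResult(score=1.2, content="Patient was prescribed lisinopril for hypertension.", metadata={"resource_id": "res-1"}),
    SearchResult(score=1.5, content="Follow-up appointment scheduled for next Tuesday.", metadata={"resource_id": "res-2"}),
]

reranker = RerankingService()  # loads the cross-encoder reranker with use_fp16=True
ranked = reranker.rerank("Which medication treats the patient's hypertension?", candidates)
for doc, score in ranked:  # highest relevance score first
    print(doc.metadata["resource_id"], round(score, 3))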
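And a toy walk-through of the per-question arithmetic in evaluate_resources_summaries_retrieval, with purely illustrative numbers, to accompany the metric changes above.

# Hypothetical outcome for one sampled question with k = 5 results returned.
relevant_chunks = 4              # chunks indexed under the reference resource_id
retrieved_relevant_chunks = 2    # returned results whose metadata["resource_id"] matches it
rank = 2                         # 1-based position of the first matching result

precision = retrieved_relevant_chunks / 5             # 0.4, denominator is len(search_results)
recall = retrieved_relevant_chunks / relevant_chunks  # 0.5
reciprocal_rank = 1 / rank                            # 0.5, averaged over all questions to give MRR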