diff --git a/app/__init__.py b/app/__init__.py index 5f4040f..a266343 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -2,10 +2,12 @@ from app.config.elasticsearch_config import create_index_if_not_exists from app.models.sentence_transformer import get_sentence_transformer from app.config.settings import settings +from app.services.reranking import RerankingService embedding_model = get_sentence_transformer() es_client = create_index_if_not_exists(settings.elasticsearch.index_name) +reranker_service = RerankingService() def create_app(): diff --git a/app/config/settings.py b/app/config/settings.py index dfa0684..28b9a41 100644 --- a/app/config/settings.py +++ b/app/config/settings.py @@ -47,4 +47,4 @@ def __init__(self): # Logging setup logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -logging.getLogger("elastic_transport").setLevel(logging.WARNING) \ No newline at end of file +logging.getLogger("elastic_transport").setLevel(logging.WARNING) diff --git a/app/data_models/__init__.py b/app/data_models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/data_models/search_result.py b/app/data_models/search_result.py new file mode 100644 index 0000000..1db20d8 --- /dev/null +++ b/app/data_models/search_result.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass + + +@dataclass +class SearchResult: + score: float + content: str + metadata: dict diff --git a/app/evaluation/retrieval/retrieval_metrics.py b/app/evaluation/retrieval/retrieval_metrics.py index d40c000..5e097b3 100644 --- a/app/evaluation/retrieval/retrieval_metrics.py +++ b/app/evaluation/retrieval/retrieval_metrics.py @@ -13,7 +13,8 @@ def evaluate_resources_summaries_retrieval( qa_references: list[dict], search_text_boost: int = 1, search_embedding_boost: int = 1, - k: int = 5 + k: int = 5, + rerank_top_k: int = 0, ) -> dict: # Initialize counters and sums for metrics total_questions = 0 @@ -29,8 +30,7 @@ def evaluate_resources_summaries_retrieval( reference_resource_id = response["custom_id"] content = response["response"]["body"]["choices"][0]["message"]["content"] - questions_and_answers = json.loads( - content)["questions_and_answers"] + questions_and_answers = json.loads(content)["questions_and_answers"] if len(questions_and_answers) > 0: # Sample one random question per resource_id to evaluate @@ -42,13 +42,15 @@ def evaluate_resources_summaries_retrieval( total_questions += 1 # Query question - search_results = search_query(question, - embedding_model, - es_client, - k=k, - text_boost=search_text_boost, - embedding_boost=search_embedding_boost) - + search_results = search_query( + question, + embedding_model, + es_client, + k=k, + text_boost=search_text_boost, + embedding_boost=search_embedding_boost, + rerank_top_k=rerank_top_k, + ) # Evaluate if any returned chunk belongs to the correct resource_id found = False rank = 0 @@ -59,7 +61,7 @@ def evaluate_resources_summaries_retrieval( if search_results != {"detail": "Not Found"}: for i, result in enumerate(search_results): - if result["metadata"]["resource_id"] == reference_resource_id: + if result.metadata["resource_id"] == reference_resource_id: if not found: total_contexts_found += 1 rank = i + 1 @@ -67,11 +69,10 @@ def evaluate_resources_summaries_retrieval( found = True retrieved_relevant_chunks += 1 elif search_results == {"detail": "Not Found"}: - search_results = {} + search_results = [] # Calculate precision and recall for this specific question - precision = retrieved_relevant_chunks / \ - len(search_results) 
if len(search_results) > 0 else 0 + precision = retrieved_relevant_chunks / len(search_results) if len(search_results) > 0 else 0 recall = retrieved_relevant_chunks / relevant_chunks if relevant_chunks > 0 else 0 precision_sum += precision @@ -81,16 +82,11 @@ def evaluate_resources_summaries_retrieval( position_sum += rank # Calculate final metrics - retrieval_accuracy = round( - total_contexts_found / total_questions, 3) if total_questions > 0 else 0 - average_position = round( - position_sum / total_contexts_found, 3) if total_contexts_found > 0 else 0 - mrr = round(reciprocal_rank_sum / total_questions, - 3) if total_questions > 0 else 0 - average_precision = round( - precision_sum / total_questions, 3) if total_questions > 0 else 0 - average_recall = round(recall_sum / total_questions, - 3) if total_questions > 0 else 0 + retrieval_accuracy = round(total_contexts_found / total_questions, 3) if total_questions > 0 else 0 + average_position = round(position_sum / total_contexts_found, 3) if total_contexts_found > 0 else 0 + mrr = round(reciprocal_rank_sum / total_questions, 3) if total_questions > 0 else 0 + average_precision = round(precision_sum / total_questions, 3) if total_questions > 0 else 0 + average_recall = round(recall_sum / total_questions, 3) if total_questions > 0 else 0 return { # The percentage of questions for which the system successfully retrieved at least one relevant chunk. diff --git a/app/routes/database_endpoints.py b/app/routes/database_endpoints.py index 1180819..e0eddc1 100644 --- a/app/routes/database_endpoints.py +++ b/app/routes/database_endpoints.py @@ -23,8 +23,7 @@ async def bulk_load(file: UploadFile = File(...), text_key: str = Form(...)): json_data = csv_to_dict(data) else: - raise HTTPException( - status_code=400, detail="Unsupported file format. Only JSON and CSV are supported.") + raise HTTPException(status_code=400, detail="Unsupported file format. 
Only JSON and CSV are supported.") try: helpers.bulk( @@ -43,33 +42,25 @@ async def bulk_load(file: UploadFile = File(...), text_key: str = Form(...)): @router.delete("/delete_all_documents") async def delete_all_documents(index_name: str): try: - es_client.delete_by_query(index=index_name, body={ - "query": {"match_all": {}}}) + es_client.delete_by_query(index=index_name, body={"query": {"match_all": {}}}) logger.info(f"All documents deleted from index '{index_name}'") return {"status": "success", "message": f"All documents deleted from index '{index_name}'"} except Exception as e: logger.error(f"Failed to delete documents: {str(e)}") - raise HTTPException( - status_code=500, detail=f"Failed to delete documents: {str(e)}") + raise HTTPException(status_code=500, detail=f"Failed to delete documents: {str(e)}") @router.get("/get_all_documents") -async def get_all_documents(index_name: str = settings.elasticsearch.index_name, - size: int = 2000): +async def get_all_documents(index_name: str = settings.elasticsearch.index_name, size: int = 2000): try: - documents = fetch_all_documents( - index_name=index_name, - es_client=es_client, - size=size) + documents = fetch_all_documents(index_name=index_name, es_client=es_client, size=size) return documents except Exception as e: logger.error(f"Error retrieving documents: {str(e)}") - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Error retrieving documents: {str(e)}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Error retrieving documents: {str(e)}") @router.get("/search") async def search_documents(query: str, k: int = 5, text_boost: float = 0.25, embedding_boost: float = 4.0): - results = search_query(query, embedding_model, es_client, k=k, - text_boost=text_boost, embedding_boost=embedding_boost) + results = search_query(query, embedding_model, es_client, k=k, text_boost=text_boost, embedding_boost=embedding_boost) return results diff --git a/app/routes/evaluation_endpoints.py b/app/routes/evaluation_endpoints.py index 9298cc2..f8731fe 100644 --- a/app/routes/evaluation_endpoints.py +++ b/app/routes/evaluation_endpoints.py @@ -15,44 +15,40 @@ @router.post("/evaluate_retrieval") -async def evaluate_retrieval(file: UploadFile = File(...), - index_name: str = Form( - settings.elasticsearch.index_name), - size: int = Form(2000), - search_text_boost: float = Form(1), - search_embedding_boost: float = Form(1), - k: int = Form(5), - urls_in_resources: bool = Form(None), - questions_with_ids_and_dates: str = Form(None), - chunk_size: int = Form(None), - chunk_overlap: int = Form(None), - clearml_track_experiment: bool = Form(False), - clearml_experiment_name: str = Form("Retrieval evaluation"), - clearml_project_name: str = Form("Fasten")): +async def evaluate_retrieval( + file: UploadFile = File(...), + index_name: str = Form(settings.elasticsearch.index_name), + size: int = Form(2000), + search_text_boost: float = Form(1), + search_embedding_boost: float = Form(1), + k: int = Form(5), + rerank_top_k: int = Form(0), + urls_in_resources: bool = Form(None), + questions_with_ids_and_dates: str = Form(None), + chunk_size: int = Form(None), + chunk_overlap: int = Form(None), + clearml_track_experiment: bool = Form(False), + clearml_experiment_name: str = Form("Retrieval evaluation"), + clearml_project_name: str = Form("Fasten"), +): # Read and process reference questions and answers in JSONL try: qa_references = [] file_data = await file.read() - for line in 
file_data.decode('utf-8').splitlines(): + for line in file_data.decode("utf-8").splitlines(): qa_references.append(json.loads(line)) except json.JSONDecodeError: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON format.") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON format.") # Count total chunks by resource in database try: - documents = fetch_all_documents( - index_name=index_name, - es_client=es_client, - size=size) - id, counts = np.unique([resource["metadata"]["resource_id"] - for resource in documents], return_counts=True) + documents = fetch_all_documents(index_name=index_name, es_client=es_client, size=size) + id, counts = np.unique([resource["metadata"]["resource_id"] for resource in documents], return_counts=True) resources_counts = dict(zip(id, counts)) except Exception as e: logger.error(f"Error retrieving documents: {str(e)}") - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Error retrieving documents: {str(e)}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Error retrieving documents: {str(e)}") # Evaluate retrieval try: if clearml_track_experiment: @@ -60,14 +56,14 @@ async def evaluate_retrieval(file: UploadFile = File(...), "search_text_boost": search_text_boost, "search_embedding_boost": search_embedding_boost, "k": k, + "rerank_top_k": rerank_top_k, "urls_in_resources": urls_in_resources, "questions_with_ids_and_dates": questions_with_ids_and_dates, "chunk_size": chunk_size, - "chunk_overlap": chunk_overlap + "chunk_overlap": chunk_overlap, } unique_task_name = f"{clearml_experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" - task = Task.init(project_name=clearml_project_name, - task_name=unique_task_name) + task = Task.init(project_name=clearml_project_name, task_name=unique_task_name) task.connect(params) retrieval_metrics = evaluate_resources_summaries_retrieval( @@ -77,7 +73,9 @@ async def evaluate_retrieval(file: UploadFile = File(...), qa_references=qa_references, search_text_boost=search_text_boost, search_embedding_boost=search_embedding_boost, - k=k) + k=k, + rerank_top_k=rerank_top_k, + ) # Upload metrics and close task if task: @@ -90,5 +88,6 @@ async def evaluate_retrieval(file: UploadFile = File(...), except Exception as e: logger.error(f"Error during retrieval evaluation: {str(e)}") - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Error during retrieval evaluation: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Error during retrieval evaluation: {str(e)}" + ) diff --git a/app/routes/llm_endpoints.py b/app/routes/llm_endpoints.py index e728b86..6e18d67 100644 --- a/app/routes/llm_endpoints.py +++ b/app/routes/llm_endpoints.py @@ -19,8 +19,9 @@ async def answer_query( query: str, k: int = 5, params=None, stream: bool = False, text_boost: float = 0.25, embedding_boost: float = 4.0 ): - results = search_query(query, embedding_model, es_client, k=k, - text_boost=text_boost, embedding_boost=embedding_boost) + results = search_query( + query, embedding_model, es_client, k=k, text_boost=text_boost, embedding_boost=embedding_boost, rerank_top_k=0 + ) if not results: concatenated_content = "There is no context" else: @@ -41,21 +42,17 @@ async def summarize( resources = await file.read() resources = json.loads(resources) except json.JSONDecodeError: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid 
JSON format.") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON format.") # Process file and summarize resources try: limit = 1 if limit <= 0 else limit if limit: - resources_processed = process_resources( - data=resources, remove_urls=remove_urls)[:limit] + resources_processed = process_resources(data=resources, remove_urls=remove_urls)[:limit] else: - resources_processed = process_resources( - data=resources, remove_urls=remove_urls) + resources_processed = process_resources(data=resources, remove_urls=remove_urls) resources_summarized = summarize_resources(resources_processed, stream) except Exception as e: - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Error during processing: {str(e)}") + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Error during processing: {str(e)}") # Save resources try: helpers.bulk( diff --git a/app/services/conversation.py b/app/services/conversation.py index cd912b9..7e8aded 100644 --- a/app/services/conversation.py +++ b/app/services/conversation.py @@ -1,19 +1,21 @@ import time +from typing import List from fastapi.responses import StreamingResponse from app.services.llama_client import llm_client from app.config.settings import logger +from app.data_models.search_result import SearchResult -def process_search_output(search_results): +def process_search_output(search_results: List[SearchResult]): logger.info("Processing search results") processed_contents = [] resources_id = [] for result in search_results: - content = result["content"] - resource_id = result["metadata"]["resource_id"] + content = result.content + resource_id = result.metadata["resource_id"] processed_contents.append(content.replace("\\", "")) resources_id.append(resource_id) diff --git a/app/services/reranking.py b/app/services/reranking.py new file mode 100644 index 0000000..91f2cf8 --- /dev/null +++ b/app/services/reranking.py @@ -0,0 +1,22 @@ +from FlagEmbedding import FlagReranker +from typing import List, Tuple + +from app.data_models.search_result import SearchResult + + +class RerankingService: + def __init__(self, model_name="BAAI/bge-reranker-v2-m3"): + self.reranker = FlagReranker(model_name, use_fp16=True) + + def rerank(self, query: str, documents: List[SearchResult]) -> List[Tuple[SearchResult, float]]: + """Computes a score for each document in the list of documents and returns a ranked list of documents. 
+ + :param str query: user query used for ranking + :param List[SearchResult] documents: Documents to be ranked + :return List[Tuple[SearchResult, float]]: list of (document, score) tuples, sorted by score in descending order + """ + scores = self.reranker.compute_score([[query, doc.content] for doc in documents], normalize=True) + print(scores) + + ranked_docs = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True) + return ranked_docs diff --git a/app/services/search_documents.py b/app/services/search_documents.py index 12c1b09..70f11ea 100644 --- a/app/services/search_documents.py +++ b/app/services/search_documents.py @@ -1,4 +1,8 @@ +from typing import List + +from app import reranker_service from app.config.settings import settings +from app.data_models.search_result import SearchResult def search_query( @@ -9,11 +13,11 @@ def search_query( k=5, text_boost=0.25, embedding_boost=4.0, -): - query_embedding = embedding_model.encode(query_text, - show_progress_bar=False).tolist() + rerank_top_k=0, +) -> List[SearchResult]: + query_embedding = embedding_model.encode(query_text, show_progress_bar=False).tolist() query_body = { - "size": k, + "size": max(k, rerank_top_k), "query": { "bool": { "should": [ @@ -41,32 +45,26 @@ } response = es_client.search(index=index_name, body=query_body) results = response["hits"]["hits"] - return [ - { - "score": result["_score"], - "content": str(result["_source"]["content"]), - "metadata": result["_source"].get("metadata", {}), - } + search_results = [ + SearchResult( + score=result["_score"], + content=str(result["_source"]["content"]), + metadata=result["_source"].get("metadata", {}), + ) for result in results ] + if rerank_top_k > 0: + search_results = [result for result, score in reranker_service.rerank(query_text, search_results)[:k]] + return search_results -def fetch_all_documents(es_client, - index_name=settings.elasticsearch.index_name, - size: int = 2000): - query_body = {"query": - { - "match_all": {} - }, - "_source": ["content", "metadata"], - "size": size - } +def fetch_all_documents(es_client, index_name=settings.elasticsearch.index_name, size: int = 2000): + query_body = {"query": {"match_all": {}}, "_source": ["content", "metadata"], "size": size} response = es_client.search(index=index_name, body=query_body) results = response["hits"]["hits"] return [ - {"id": result["_id"], "content": result["_source"].get( - "content", ""), "metadata": result["_source"].get("metadata", {})} + {"id": result["_id"], "content": result["_source"].get("content", ""), "metadata": result["_source"].get("metadata", {})} for result in results ] diff --git a/evaluation/core/evaluators/generation/correctness.py b/evaluation/core/evaluators/generation/correctness.py index 2d9ad90..5ac9aca 100644 --- a/evaluation/core/evaluators/generation/correctness.py +++ b/evaluation/core/evaluators/generation/correctness.py @@ -88,25 +88,24 @@ def run_correctness_eval(self, query_str: str, reference_answer: str, generated_ """ try: user_prompt = CORRECTNESS_USER_TMPL.format( - query=query_str, - reference_answer=reference_answer, - generated_answer=generated_answer) + query=query_str, reference_answer=reference_answer, generated_answer=generated_answer + ) system_prompt = CORRECTNESS_SYS_TMPL - open_ai_response = get_chat_completion(self.openai_api_key, - user_prompt, - system_prompt, - ANSWER_JSON_SCHEMA, - model=self.model, - max_tokens=self.max_tokens) - json_answer = json.loads(open_ai_response.get("choices")[ - 0].get("message").get("content")) + open_ai_response =
get_chat_completion( + self.openai_api_key, user_prompt, system_prompt, ANSWER_JSON_SCHEMA, model=self.model, max_tokens=self.max_tokens + ) + json_answer = json.loads(open_ai_response.get("choices")[0].get("message").get("content")) score = json_answer["score"] reasoning = json_answer["reasoning"] - return {"score": score, "reasoning": reasoning, "passing": score >= self.threshold, } + return { + "score": score, + "reasoning": reasoning, + "passing": score >= self.threshold, + } except json.JSONDecodeError as e: logging.error(f"Failed to decode JSON response: {e}") @@ -120,14 +119,15 @@ def run_correctness_eval(self, query_str: str, reference_answer: str, generated_ logging.error(f"An error occurred: {e}") return {"score": None, "passing": None, "reasoning": "An unexpected error occurred"} - def run_batch_evaluation(self, - df: pd.DataFrame, - output_file: str, - query_column: str, - reference_answer_column: str, - generated_answer_column: str, - resource_id_column: str - ): + def run_batch_evaluation( + self, + df: pd.DataFrame, + output_file: str, + query_column: str, + reference_answer_column: str, + generated_answer_column: str, + resource_id_column: str, + ): """ Runs correctness evaluation on a batch of queries, reference answers, and generated answers. Saves results incrementally to avoid data loss in case of failure. @@ -143,9 +143,8 @@ def run_batch_evaluation(self, # Determine if the file already exists file_exists = os.path.isfile(output_file) - with open(output_file, mode='a', newline='') as file: - writer = csv.DictWriter( - file, fieldnames=[resource_id_column, 'score', 'reasoning', 'passing']) + with open(output_file, mode="a", newline="") as file: + writer = csv.DictWriter(file, fieldnames=[resource_id_column, "score", "reasoning", "passing"]) # Write header only if the file does not exist if not file_exists: @@ -154,9 +153,8 @@ def run_batch_evaluation(self, try: for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing correctness"): result = self.run_correctness_eval( - row[query_column], - row[reference_answer_column], - row[generated_answer_column]) + row[query_column], row[reference_answer_column], row[generated_answer_column] + ) result[resource_id_column] = row[resource_id_column] # Write the result to the CSV file writer.writerow(result) @@ -171,7 +169,6 @@ def run_batch_evaluation(self, # Load the results back into a DataFrame and concatenate with the original results_df = pd.read_csv(output_file) - correctnes_mean_score = round(results_df["score"].sum( - ) / (len(results_df) * 5), 2) + correctnes_mean_score = round(results_df["score"].sum() / (len(results_df) * 5), 2) return correctnes_mean_score diff --git a/evaluation/core/evaluators/generation/faithfullness.py b/evaluation/core/evaluators/generation/faithfullness.py index 88daf2c..9f93407 100644 --- a/evaluation/core/evaluators/generation/faithfullness.py +++ b/evaluation/core/evaluators/generation/faithfullness.py @@ -90,24 +90,18 @@ def run_faithfulness_eval(self, generated_answer: str, contexts: str): - dict, containing evaluations on relevancy, accuracy, conciseness and pertinence, and reasoning. 
""" try: - user_prompt = FAITHFULLNESS_USER_TMPL.format(generated_answer=generated_answer, - contexts=contexts) + user_prompt = FAITHFULLNESS_USER_TMPL.format(generated_answer=generated_answer, contexts=contexts) system_prompt = FAITHFULLNESS_SYS_TMPL - open_ai_response = get_chat_completion(self.openai_api_key, - user_prompt, - system_prompt, - ANSWER_JSON_SCHEMA, - model=self.model, - max_tokens=self.max_tokens) + open_ai_response = get_chat_completion( + self.openai_api_key, user_prompt, system_prompt, ANSWER_JSON_SCHEMA, model=self.model, max_tokens=self.max_tokens + ) - json_answer = json.loads(open_ai_response.get("choices")[ - 0].get("message").get("content")) + json_answer = json.loads(open_ai_response.get("choices")[0].get("message").get("content")) relevancy = 1 if json_answer["relevancy"] == "YES" else 0 accuracy = 1 if json_answer["accuracy"] == "YES" else 0 - conciseness_and_pertinence = 1 if json_answer[ - "conciseness_and_pertinence"] == "YES" else 0 + conciseness_and_pertinence = 1 if json_answer["conciseness_and_pertinence"] == "YES" else 0 reasoning = json_answer["reasoning"] return { @@ -123,18 +117,25 @@ def run_faithfulness_eval(self, generated_answer: str, contexts: str): except KeyError as e: logging.error(f"Missing key in JSON response: {e}") - return {"relevancy": None, "accuracy": None, "conciseness_and_pertinence": None, "reasoning": "Incomplete JSON response"} + return { + "relevancy": None, + "accuracy": None, + "conciseness_and_pertinence": None, + "reasoning": "Incomplete JSON response", + } except Exception as e: logging.error(f"An error occurred: {e}") - return {"relevancy": None, "accuracy": None, "conciseness_and_pertinence": None, "reasoning": "An unexpected error occurred"} - - def run_batch_evaluation(self, - df: pd.DataFrame, - output_file: str, - generated_answer_column: str, - contexts_column: str, - resource_id_column: str): + return { + "relevancy": None, + "accuracy": None, + "conciseness_and_pertinence": None, + "reasoning": "An unexpected error occurred", + } + + def run_batch_evaluation( + self, df: pd.DataFrame, output_file: str, generated_answer_column: str, contexts_column: str, resource_id_column: str + ): """ Runs faithfulness evaluation on a batch of generated answers and contexts. Saves results incrementally to avoid data loss in case of failure. 
@@ -149,9 +150,10 @@ def run_batch_evaluation(self, # Determine if the file already exists file_exists = os.path.isfile(output_file) - with open(output_file, mode='a', newline='') as file: - writer = csv.DictWriter(file, fieldnames=[ - resource_id_column, 'relevancy', 'accuracy', 'conciseness_and_pertinence', 'reasoning']) + with open(output_file, mode="a", newline="") as file: + writer = csv.DictWriter( + file, fieldnames=[resource_id_column, "relevancy", "accuracy", "conciseness_and_pertinence", "reasoning"] + ) # Write header only if the file does not exist if not file_exists: @@ -159,9 +161,7 @@ def run_batch_evaluation(self, try: for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing faithfulness"): - result = self.run_faithfulness_eval( - row[generated_answer_column], - row[contexts_column]) + result = self.run_faithfulness_eval(row[generated_answer_column], row[contexts_column]) result[resource_id_column] = row[resource_id_column] # Write the result to the CSV file writer.writerow(result) @@ -177,11 +177,8 @@ def run_batch_evaluation(self, results_df = pd.read_csv(output_file) total_questions = len(results_df) - faithfulness_relevancy = round(results_df["relevancy"].sum( - ) / total_questions, 2) - faithfulness_accuracy = round( - results_df["accuracy"].sum() / total_questions, 2) - faithfulness_conciseness_and_pertinence = round(results_df["conciseness_and_pertinence"].sum( - ) / total_questions, 2) + faithfulness_relevancy = round(results_df["relevancy"].sum() / total_questions, 2) + faithfulness_accuracy = round(results_df["accuracy"].sum() / total_questions, 2) + faithfulness_conciseness_and_pertinence = round(results_df["conciseness_and_pertinence"].sum() / total_questions, 2) return faithfulness_relevancy, faithfulness_accuracy, faithfulness_conciseness_and_pertinence diff --git a/evaluation/core/openai/openai.py b/evaluation/core/openai/openai.py index 1452dd7..e305831 100644 --- a/evaluation/core/openai/openai.py +++ b/evaluation/core/openai/openai.py @@ -25,20 +25,17 @@ def get_chat_completion( - dict, the response from the OpenAI API. 
""" - headers = {"Content-Type": "application/json", - "Authorization": f"Bearer {openai_api_key}"} + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {openai_api_key}"} payload = { "model": model, "messages": [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}], "response_format": answer_json_schema, - "max_tokens": max_tokens + "max_tokens": max_tokens, } try: - response = requests.post(endpoint_url, - headers=headers, - json=payload) + response = requests.post(endpoint_url, headers=headers, json=payload) response.raise_for_status() return response.json() except requests.exceptions.HTTPError as http_err: @@ -49,42 +46,39 @@ def get_chat_completion( return None -def get_total_tokens_from_string(string: str, - encoding_name: str = "o200k_base") -> int: +def get_total_tokens_from_string(string: str, encoding_name: str = "o200k_base") -> int: """Returns the number of tokens in a text string.""" encoding = tiktoken.get_encoding(encoding_name) return len(encoding.encode(string)) -def calculate_total_tokens(df, - query_column, - generated_answer_column, - contexts_column, - reference_answer_column, - system_template: str, - user_template: str, - template_type: str, - encoding_name: str) -> int: +def calculate_total_tokens( + df, + query_column, + generated_answer_column, + contexts_column, + reference_answer_column, + system_template: str, + user_template: str, + template_type: str, + encoding_name: str, +) -> int: """Calculates the total number of tokens required for the batch based on the type of evaluation.""" total_tokens = 0 for _, row in df.iterrows(): - if template_type == 'correctness': + if template_type == "correctness": user_prompt = user_template.format( query=row[query_column], reference_answer=row[reference_answer_column], - generated_answer=row[generated_answer_column] - ) - elif template_type == 'faithfulness': - user_prompt = user_template.format( generated_answer=row[generated_answer_column], - contexts=row[contexts_column] ) + elif template_type == "faithfulness": + user_prompt = user_template.format(generated_answer=row[generated_answer_column], contexts=row[contexts_column]) system_prompt = system_template complete_prompt = system_prompt + user_prompt - total_tokens += get_total_tokens_from_string( - complete_prompt, encoding_name=encoding_name) + total_tokens += get_total_tokens_from_string(complete_prompt, encoding_name=encoding_name) return total_tokens @@ -111,13 +105,11 @@ def calculate_api_costs( """ # Calculate input token costs - input_cost = round(total_input_tokens * - (cost_per_million_input_tokens / 1_000_000), 3) + input_cost = round(total_input_tokens * (cost_per_million_input_tokens / 1_000_000), 3) # Calculate total output tokens and their costs total_output_tokens = total_openai_requests * tokens_per_response - output_cost = round(total_output_tokens * - (cost_per_million_output_tokens / 1_000_000), 3) + output_cost = round(total_output_tokens * (cost_per_million_output_tokens / 1_000_000), 3) # Calculate the total cost total_cost = round(input_cost + output_cost, 2) @@ -127,5 +119,5 @@ def calculate_api_costs( "input_cost": input_cost, "output_cost": output_cost, "total_input_tokens": total_input_tokens, - "total_output_tokens": total_output_tokens + "total_output_tokens": total_output_tokens, } diff --git a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/estimate_costs.py b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/estimate_costs.py index 
0b3335f..dfa7880 100644 --- a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/estimate_costs.py +++ b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/estimate_costs.py @@ -13,14 +13,11 @@ with open(INPUT_PATH, "r") as config_file: config = json.load(config_file) -DATA_DIR = os.path.abspath(os.path.join( - os.path.dirname(__file__), "..", "..", "..", "data")) +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "data")) # Files -REFERENCE_ANSWERS_FILE = os.path.join( - DATA_DIR, "openai_outputs", config["REFERENCE_ANSWERS_FILE"]) -GENERATED_ANSWERS_FILE = os.path.join( - DATA_DIR, "rag_generation", config["GENERATED_ANSWERS_FILE"]) +REFERENCE_ANSWERS_FILE = os.path.join(DATA_DIR, "openai_outputs", config["REFERENCE_ANSWERS_FILE"]) +GENERATED_ANSWERS_FILE = os.path.join(DATA_DIR, "rag_generation", config["GENERATED_ANSWERS_FILE"]) # Interest columns QUERY_COLUMN = config.get("QUERY_COLUMN") @@ -30,27 +27,27 @@ # Output costs OUTPUT_COSTS = os.path.join(os.path.dirname(__file__), "data", "output_estimate_costs.txt") -COST_PER_MILLION_INPUT_TOKENS= config.get("COST_PER_MILLION_INPUT_TOKENS") -COST_PER_MILLION_OUTPUT_TOKENS= config.get("COST_PER_MILLION_OUTPUT_TOKENS") +COST_PER_MILLION_INPUT_TOKENS = config.get("COST_PER_MILLION_INPUT_TOKENS") +COST_PER_MILLION_OUTPUT_TOKENS = config.get("COST_PER_MILLION_OUTPUT_TOKENS") def main(): # Read data - generated_answers= pd.read_csv(GENERATED_ANSWERS_FILE) - reference_answers= pd.read_csv(REFERENCE_ANSWERS_FILE) + generated_answers = pd.read_csv(GENERATED_ANSWERS_FILE) + reference_answers = pd.read_csv(REFERENCE_ANSWERS_FILE) generated_answers[REFERENCE_ANSWER_COLUMN] = reference_answers[REFERENCE_ANSWER_COLUMN] # Calculate total tokens - total_tokens_input_correctness= calculate_total_tokens( - df = generated_answers, + total_tokens_input_correctness = calculate_total_tokens( + df=generated_answers, query_column=QUERY_COLUMN, generated_answer_column=GENERATED_ANSWER_COLUMN, contexts_column=CONTEXTS_COLUMN, reference_answer_column=REFERENCE_ANSWER_COLUMN, - system_template = CORRECTNESS_SYS_TMPL, - user_template = CORRECTNESS_USER_TMPL, - template_type = 'correctness', - encoding_name = 'o200k_base' + system_template=CORRECTNESS_SYS_TMPL, + user_template=CORRECTNESS_USER_TMPL, + template_type="correctness", + encoding_name="o200k_base", ) total_tokens_input_faithfulness = calculate_total_tokens( df=generated_answers, @@ -60,8 +57,8 @@ def main(): reference_answer_column=REFERENCE_ANSWER_COLUMN, system_template=FAITHFULLNESS_SYS_TMPL, user_template=FAITHFULLNESS_USER_TMPL, - template_type='faithfulness', - encoding_name='o200k_base' + template_type="faithfulness", + encoding_name="o200k_base", ) # Calculate costs @@ -83,31 +80,23 @@ def main(): ) # Save total costs - with open(OUTPUT_COSTS, 'w') as f: + with open(OUTPUT_COSTS, "w") as f: f.write("Total Estimated Costs:\n") - f.write( - f"${cost_estimate_correctness['total_cost'] + cost_estimate_faithfulness['total_cost']}\n\n\n") + f.write(f"${cost_estimate_correctness['total_cost'] + cost_estimate_faithfulness['total_cost']}\n\n\n") f.write("Correctness Evaluation Costs:\n") - f.write( - f"Estimated Total Costs: ${cost_estimate_correctness['total_cost']}\n") + f.write(f"Estimated Total Costs: ${cost_estimate_correctness['total_cost']}\n") f.write(f"Input Costs: ${cost_estimate_correctness['input_cost']}\n") - f.write( - f"Output Costs: ${cost_estimate_correctness['output_cost']}\n") - f.write( - f"Total Input Tokens: 
{cost_estimate_correctness['total_input_tokens']}\n") - f.write( - f"Total Output Tokens: {cost_estimate_correctness['total_output_tokens']}\n\n") + f.write(f"Output Costs: ${cost_estimate_correctness['output_cost']}\n") + f.write(f"Total Input Tokens: {cost_estimate_correctness['total_input_tokens']}\n") + f.write(f"Total Output Tokens: {cost_estimate_correctness['total_output_tokens']}\n\n") f.write("\nFaithfulness Evaluation Costs:\n") - f.write( - f"Estimated Total Costs: ${cost_estimate_faithfulness['total_cost']}\n") + f.write(f"Estimated Total Costs: ${cost_estimate_faithfulness['total_cost']}\n") f.write(f"Input Costs: ${cost_estimate_faithfulness['input_cost']}\n") - f.write( - f"Output Costs: ${cost_estimate_faithfulness['output_cost']}\n") - f.write( - f"Total Input Tokens: {cost_estimate_faithfulness['total_input_tokens']}\n") - f.write( - f"Total Output Tokens: {cost_estimate_faithfulness['total_output_tokens']}\n") + f.write(f"Output Costs: ${cost_estimate_faithfulness['output_cost']}\n") + f.write(f"Total Input Tokens: {cost_estimate_faithfulness['total_input_tokens']}\n") + f.write(f"Total Output Tokens: {cost_estimate_faithfulness['total_output_tokens']}\n") + if __name__ == "__main__": main() diff --git a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/generate_responses.py b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/generate_responses.py index 5c4e9fd..8243678 100644 --- a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/generate_responses.py +++ b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/generate_responses.py @@ -9,13 +9,10 @@ with open(os.path.join(os.path.dirname(__file__), "data", "input_generate_responses.json"), "r") as config_file: config = json.load(config_file) -DATA_DIR = os.path.abspath(os.path.join( - os.path.dirname(__file__), "..", "..", "data")) +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "data")) JSONL_FILE = os.path.join(DATA_DIR, "openai_outputs", config["JSONL_FILE"]) -JSONL_OUTPUT_CSV = os.path.join( - DATA_DIR, "openai_outputs", config["JSONL_FILE"].replace(".jsonl", ".csv")) -RAG_OUTPUT_CSV = os.path.join( - DATA_DIR, "rag_generation", config["RAG_OUTPUT_CSV"]) +JSONL_OUTPUT_CSV = os.path.join(DATA_DIR, "openai_outputs", config["JSONL_FILE"].replace(".jsonl", ".csv")) +RAG_OUTPUT_CSV = os.path.join(DATA_DIR, "rag_generation", config["RAG_OUTPUT_CSV"]) SERVER_URL = config["SERVER_URL"] CORES = config["CORES"] diff --git a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/get_metrics.py b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/get_metrics.py index bb3c59b..f56180f 100644 --- a/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/get_metrics.py +++ b/evaluation/evaluation_metrics/evaluate_generation/generation_metrics/get_metrics.py @@ -14,31 +14,24 @@ load_dotenv() # Load config -INPUT_PATH = os.path.join(os.path.dirname( - __file__), "data", "input_get_metrics.json") +INPUT_PATH = os.path.join(os.path.dirname(__file__), "data", "input_get_metrics.json") with open(INPUT_PATH, "r") as config_file: config = json.load(config_file) -DATA_DIR = os.path.abspath(os.path.join( - os.path.dirname(__file__), "..", "..", "..", "data")) +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "data")) # Files -REFERENCE_ANSWERS_FILE = os.path.join( - DATA_DIR, "openai_outputs", config["REFERENCE_ANSWERS_FILE"]) -GENERATED_ANSWERS_FILE = 
os.path.join( - DATA_DIR, "rag_generation", config["GENERATED_ANSWERS_FILE"]) +REFERENCE_ANSWERS_FILE = os.path.join(DATA_DIR, "openai_outputs", config["REFERENCE_ANSWERS_FILE"]) +GENERATED_ANSWERS_FILE = os.path.join(DATA_DIR, "rag_generation", config["GENERATED_ANSWERS_FILE"]) # Openai API Key and model to use OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") LLM_MODEL = config.get("LLM_MODEL") # Output files -CORRECTNESS_RESULTS_CSV = os.path.join( - DATA_DIR, "openai_outputs", config["CORRECTNESS_RESULTS_CSV"]) -FAITHFULNESS_RESULTS_CSV = os.path.join( - DATA_DIR, "openai_outputs", config["FAITHFULNESS_RESULTS_CSV"]) -RESULT_METRICS_TXT = os.path.join(os.path.dirname( - __file__), "data", "output_generation_metrics.txt") +CORRECTNESS_RESULTS_CSV = os.path.join(DATA_DIR, "openai_outputs", config["CORRECTNESS_RESULTS_CSV"]) +FAITHFULNESS_RESULTS_CSV = os.path.join(DATA_DIR, "openai_outputs", config["FAITHFULNESS_RESULTS_CSV"]) +RESULT_METRICS_TXT = os.path.join(os.path.dirname(__file__), "data", "output_generation_metrics.txt") # Columns of interest QUERY_COLUMN = config["QUERY_COLUMN"] @@ -68,13 +61,8 @@ def main(): generated_answers[RESOURCE_ID_COLUMN] = reference_answers[RESOURCE_ID_COLUMN] # Instantiate evaluators - correctness_evaluator = CorrectnessEvaluator(OPENAI_API_KEY, - LLM_MODEL, - threshold=4.0, - max_tokens=300) - faithfulness_evaluator = FaithfulnessEvaluator(OPENAI_API_KEY, - LLM_MODEL, - max_tokens=300) + correctness_evaluator = CorrectnessEvaluator(OPENAI_API_KEY, LLM_MODEL, threshold=4.0, max_tokens=300) + faithfulness_evaluator = FaithfulnessEvaluator(OPENAI_API_KEY, LLM_MODEL, max_tokens=300) # Run batch evaluations correctnes_mean_score = correctness_evaluator.run_batch_evaluation( generated_answers, @@ -82,31 +70,28 @@ def main(): QUERY_COLUMN, REFERENCE_ANSWER_COLUMN, GENERATED_ANSWER_COLUMN, - RESOURCE_ID_COLUMN + RESOURCE_ID_COLUMN, ) - faithfulness_relevancy, \ - faithfulness_accuracy, \ - faithfulness_conciseness_and_pertinence = faithfulness_evaluator.run_batch_evaluation( - generated_answers, - FAITHFULNESS_RESULTS_CSV, - GENERATED_ANSWER_COLUMN, - CONTEXTS_COLUMN, - RESOURCE_ID_COLUMN + faithfulness_relevancy, faithfulness_accuracy, faithfulness_conciseness_and_pertinence = ( + faithfulness_evaluator.run_batch_evaluation( + generated_answers, FAITHFULNESS_RESULTS_CSV, GENERATED_ANSWER_COLUMN, CONTEXTS_COLUMN, RESOURCE_ID_COLUMN ) + ) - metrics = {"Correctness mean score": correctnes_mean_score, - "Faithfulness relevancy": faithfulness_relevancy, - "Faithfulness accuracy": faithfulness_accuracy, - "Faithfulness conciseness and pertinence": faithfulness_conciseness_and_pertinence} + metrics = { + "Correctness mean score": correctnes_mean_score, + "Faithfulness relevancy": faithfulness_relevancy, + "Faithfulness accuracy": faithfulness_accuracy, + "Faithfulness conciseness and pertinence": faithfulness_conciseness_and_pertinence, + } # Save metrics to txt - with open(RESULT_METRICS_TXT, 'w') as f: + with open(RESULT_METRICS_TXT, "w") as f: f.write(f"Correctness score: {correctnes_mean_score}\n\n") f.write(f"Faithfulness relevancy score: {faithfulness_relevancy}\n") f.write(f"Faithfulness accuracy score: {faithfulness_accuracy}\n") - f.write( - f"Faithfulness conciseness_and_pertinence score: {faithfulness_conciseness_and_pertinence}\n") + f.write(f"Faithfulness conciseness_and_pertinence score: {faithfulness_conciseness_and_pertinence}\n") # Upload metrics and artifacts to ClearML if task: @@ -120,8 +105,7 @@ def main(): if UPLOAD_ARTIFACTS: for artifact_name, 
file_path in artifact_files.items(): - task.upload_artifact(name=artifact_name, - artifact_object=file_path) + task.upload_artifact(name=artifact_name, artifact_object=file_path) task.close()
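For reference, a minimal sketch of how the new reranking path can be exercised on its own, assuming the BAAI/bge-reranker-v2-m3 weights can be downloaded on first use and the app package's startup dependencies (Elasticsearch, the sentence-transformer model) are reachable, since importing from app runs app/__init__.py. The query, document contents, and resource IDs below are hypothetical.

from app.data_models.search_result import SearchResult
from app.services.reranking import RerankingService

# Candidates shaped like the hybrid-search output of search_query(), before reranking.
candidates = [
    SearchResult(score=1.2, content="Patient was prescribed lisinopril for hypertension.", metadata={"resource_id": "res-1"}),
    SearchResult(score=1.5, content="Follow-up appointment scheduled for next Tuesday.", metadata={"resource_id": "res-2"}),
]

reranker = RerankingService()  # loads the cross-encoder reranker with use_fp16=True
ranked = reranker.rerank("Which medication treats the patient's hypertension?", candidates)
for doc, score in ranked:  # highest relevance score first
    print(doc.metadata["resource_id"], round(score, 3))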
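And a toy walk-through of the per-question arithmetic in evaluate_resources_summaries_retrieval, with purely illustrative numbers, to accompany the metric changes above.

# Hypothetical outcome for one sampled question with k = 5 results returned.
relevant_chunks = 4              # chunks indexed under the reference resource_id
retrieved_relevant_chunks = 2    # returned results whose metadata["resource_id"] matches it
rank = 2                         # 1-based position of the first matching result

precision = retrieved_relevant_chunks / 5             # 0.4, denominator is len(search_results)
recall = retrieved_relevant_chunks / relevant_chunks  # 0.5
reciprocal_rank = 1 / rank                            # 0.5, averaged over all questions to give MRR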