feat: add micro example notebook
StijnGoossens committed Oct 13, 2023
1 parent 73792c6 commit 8e162dd
Showing 4 changed files with 375 additions and 20 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -4,15 +4,17 @@

Evaluates LLM-based applications.

**Check the `example.ipynb` notebook for an example of how to use this package.**

## To-do's
- [x] Convert EHBO notes into question-answer pairs, using OpenAI Function Calling.
- [x] Turn the question-answer pairs into a test set.
- [x] Build an LLM component to evaluate the given answers by comparing them with the reference answers.
- [x] Build an LLM 'app' that can answer the questions.
- [x] Evaluate the LLM app with the LLM evaluator.
- [x] Streamlit page to view the evaluation results.
- [ ] Combine the evaluation results into a single metric.
- [ ] Evaluate and compare different LLM apps (GPT-3.5, GPT-4, with RAG)
- [x] Combine the evaluation results into a single metric.
- [x] Evaluate and compare different LLM apps (GPT-3.5, GPT-4, with RAG)
- [ ] Streamlit page to visualize the comparison.
- [ ] Streamlit page to view, edit and add test cases.
- [ ] Integrate with MLflow for experiment tracking (?)
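
A minimal usage sketch of how these pieces fit together. The import paths, the `properties` list name, and the `Evaluator`/`evaluate()` signatures are assumptions inferred from `evaluator.py` and `eval_properties.py` below; `example.ipynb` remains the authoritative walkthrough.

```python
# Hypothetical wiring (editor's sketch, not part of this commit).
from llm_app_eval.eval_properties import properties  # assumed name of the EvalProperty list
from llm_app_eval.evaluator import Evaluator
from llm_app_eval.llm_app import BaseApp  # wraps GPT-3.5, GPT-4 or a RAG pipeline

test_set = load_test_cases()  # hypothetical helper returning a list of TestCase objects
llm_app = BaseApp()           # assumed to be constructible without arguments

evaluator = Evaluator(test_set=test_set, properties=properties)  # assumed constructor
df_agg = evaluator.evaluate(llm_app=llm_app)  # writes results.csv and results_agg.csv per run
print(df_agg)  # one row per LLM app configuration, one column per aggregated metric
```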
62 changes: 57 additions & 5 deletions src/llm_app_eval/eval_properties.py
@@ -1,15 +1,57 @@
from functools import lru_cache

import numpy as np
import openai
from evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase
from pydantic import BaseModel

PROPERTY_LLM = "gpt-3.5-turbo-0613"


@lru_cache
def get_embedding(text, model="text-embedding-ada-002"):
text = text.replace("\n", " ")
return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]

property_llm = "gpt-3.5-turbo-0613"

def cosine_similarity(a, b):
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
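

# --- Editor's illustration (not part of this commit): combining the two helpers above. ---
# Assumes openai < 1.0 (the legacy Embedding API that get_embedding calls) and that
# OPENAI_API_KEY is set. Paraphrases of the same instruction should score close to 1.0,
# unrelated text noticeably lower.
def _similarity_demo() -> float:
    emb_a = get_embedding("Press firmly on the wound to stop the bleeding.")
    emb_b = get_embedding("Apply firm pressure to the wound until the bleeding stops.")
    return cosine_similarity(emb_a, emb_b)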


def output_similarity(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
if test_case.reference_output and llm_app_result.answer:
app_output_emb = get_embedding(llm_app_result.answer)
reference_emb = get_embedding(test_case.reference_output.answer)
result = PropertyResult(
feedback="",
score=cosine_similarity(app_output_emb, reference_emb),
)
else:
result = None
return result


def output_verbosity(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
if test_case.reference_output and llm_app_result.answer:
result = PropertyResult(
feedback="", score=len(llm_app_result.answer) / len(test_case.reference_output.answer)
)
else:
result = None
return result
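

# --- Editor's note (not part of this commit): interpreting the two property scores above. ---
# output_similarity returns the embedding cosine similarity between the app answer and the
# reference answer (close to 1.0 for paraphrases). output_verbosity is a plain length ratio,
# for example:
#   reference = "Call 112 and start CPR immediately."
#   answer = "You should first call 112 and then you should immediately begin CPR."
#   len(answer) / len(reference)  # ~1.9, i.e. the answer is roughly twice as long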


class LlmPropertyResult(BaseModel):
feedback: str
pass_fail: bool


def evaluate_property_with_llm(
model: str, system_message: str, user_message: str
) -> PropertyResult:
return openai.ChatCompletion.create(
model=model,
response_model=PropertyResult,
response_model=LlmPropertyResult,
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": user_message},
@@ -20,7 +62,7 @@ def evaluate_property_with_llm(
def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
if test_case.reference_output and llm_app_result.answer:
result = evaluate_property_with_llm(
model=property_llm,
model=PROPERTY_LLM,
system_message="Evaluate the answer. The answer should be factually consistent with the reference answer. If not, explain why.",
user_message=f"Answer: {llm_app_result.answer}\nReference Answer: {test_case.reference_output.answer}",
)
@@ -32,7 +74,7 @@ def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
def improves_historical_answer(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
if test_case.test_input and test_case.historical_output and llm_app_result.answer:
result = evaluate_property_with_llm(
model=property_llm,
model=PROPERTY_LLM,
system_message="Evaluate the new answer. Is the new answer better than the old answer? Explain why.",
user_message=f"Question: {test_case.test_input.question}\nOld answer: {test_case.historical_output.answer}\nNew answer: {llm_app_result.answer}",
)
@@ -51,7 +93,7 @@ def takes_feedback_into_account(
and test_case.historical_feedback
):
result = evaluate_property_with_llm(
model=property_llm,
model=PROPERTY_LLM,
system_message="Evaluate the new answer. Does the new answer improve upon the old one by taking the feedback into account? Explain why.",
user_message=f"Question: {test_case.test_input.question}\nOld answer: {test_case.historical_output.answer}\nOld feedback: {test_case.historical_feedback}\nNew answer: {llm_app_result.answer}",
)
@@ -96,4 +138,14 @@ def length_within_bounds(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
description="The answer is max 20% longer than the reference answer.",
eval_func=length_within_bounds,
),
EvalProperty(
property_name="CosineSimilarity",
description="The answer is similar to the reference answer.",
eval_func=output_similarity,
),
EvalProperty(
property_name="Verbosity",
description="The answer is not too verbose.",
eval_func=output_verbosity,
),
]
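
One note on `evaluate_property_with_llm` above: `response_model` is not a parameter of the stock `openai.ChatCompletion.create` call in the pre-1.0 SDK, so the code presumably relies on the OpenAI client being patched, for example by the `instructor` library (an assumption; the patching is not shown in this diff). A minimal sketch under that assumption:

```python
# Editor's sketch (assumption): structured-output parsing via instructor with openai < 1.0.
import instructor
import openai
from pydantic import BaseModel

instructor.patch()  # monkey-patches openai.ChatCompletion.create to accept response_model


class LlmPropertyResult(BaseModel):
    feedback: str
    pass_fail: bool


result = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-0613",
    response_model=LlmPropertyResult,
    messages=[
        {"role": "system", "content": "Evaluate the answer. It should be factually consistent with the reference answer. If not, explain why."},
        {"role": "user", "content": "Answer: Press on the wound.\nReference Answer: Apply firm pressure to the wound."},
    ],
)
print(result.pass_fail, result.feedback)  # a parsed LlmPropertyResult instance
```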
61 changes: 48 additions & 13 deletions src/llm_app_eval/evaluator.py
@@ -1,8 +1,10 @@
import json
import os
import time
from datetime import datetime
from typing import Callable, Optional

import pandas as pd
from llm_app import BaseApp, InputFormat, OutputFormat
from pydantic import BaseModel
from tqdm import tqdm
@@ -23,15 +25,17 @@ class EvalProperty(BaseModel):


class PropertyResult(BaseModel):
feedback: str
pass_fail: bool
property_name: Optional[str] = None
feedback: Optional[str]
score: float


class TestCaseResult(BaseModel):
test_case_id: str
output: OutputFormat
property_results: list[PropertyResult]
property_results: dict[str, PropertyResult]
latency: float
cosine_similarity: Optional[float] = None
verbosity: Optional[float] = None


class Evaluator:
@@ -59,33 +63,64 @@ def evaluate(
os.makedirs(exp_dir, exist_ok=True)

# Loop over test cases
test_case_results = []
for test_case in tqdm(
self.test_set, desc="Evaluating test cases", unit="test case", total=len(self.test_set)
):
# Pass the test case to the LLM app
# Measure the time it takes to run the LLM app
start_time = time.time()
app_output = llm_app(app_input=test_case.test_input)
latency = time.time() - start_time
# Evaluate properties
property_results = []
property_results = {}
for prop in self.properties:
print(f"Evaluating property {prop.property_name}")
# print(f"Evaluating property {prop.property_name}")
r = prop.eval_func(test_case=test_case, llm_app_result=app_output)
# If the result is None, the property is not applicable to this test case, so skip it
if r:
# Store the property results per test case in a list
property_results.append(
PropertyResult(
property_name=prop.property_name,
feedback=r.feedback,
pass_fail=r.pass_fail,
)
property_results[prop.property_name] = PropertyResult(
feedback=r.feedback,
score=r.score if "score" in r.model_fields else float(r.pass_fail),
)
# Store results as JSON
tcr = TestCaseResult(
test_case_id=test_case.test_id, output=app_output, property_results=property_results
test_case_id=test_case.test_id,
output=app_output,
property_results=property_results,
latency=latency,
)
test_case_results.append(tcr)
tcr_json = tcr.model_dump_json()
with open(os.path.join(exp_dir, f"{tcr.test_case_id}.json"), "w") as f:
f.write(tcr_json)
# Save the Llm app config dict as JSON
with open(os.path.join(exp_dir, "llm_app.json"), "w") as f:
f.write(json.dumps(llm_app.cfg))

# Convert all test case results into a dataframe
df = pd.DataFrame([tcr.dict() for tcr in test_case_results])
# Split the `property_results` column into separate columns. The `property_results` column is a dict of dicts.
# Each top-level key is a property name. Each second-level key is a property result field (`feedback` and `score`).
# The column is split into a separate column for each combination of property name and result field,
# holding the corresponding `feedback` and `score` values.
df = df.join(pd.json_normalize(df["property_results"]))
# Drop the `property_results` column.
df = df.drop(columns=["property_results"])
# Save the dataframe as CSV
df.to_csv(os.path.join(exp_dir, "results.csv"), index=False)
# Aggregate the results by taking the mean over the test cases for the `latency` and all `score` columns.
agg_columns = ["latency"] + [col for col in df.columns if "score" in col]
df_agg = df[agg_columns].mean().reset_index()
df_agg.columns = ["metric", "value"]
# Pivot the metric column to get a column for each metric and a row for each LLM app.
df_agg = df_agg.pivot_table(index=None, columns="metric", values="value")
# Drop the `metric` index.
df_agg = df_agg.reset_index(drop=True)
# Add the llm app config dict as columns in front of the aggregated results.
df_agg = pd.concat([pd.json_normalize(llm_app.cfg), df_agg], axis=1)
# Save the aggregated results as CSV.
df_agg.to_csv(os.path.join(exp_dir, "results_agg.csv"), index=False)

return df_agg
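
To make the flattening and aggregation steps above concrete, here is a small standalone pandas sketch; the property names, scores, and latencies are made up for illustration.

```python
import pandas as pd

# Two fake test-case results in the shape the evaluator produces:
# property_results maps {property_name: {"feedback": ..., "score": ...}}.
df = pd.DataFrame(
    {
        "test_case_id": ["tc_1", "tc_2"],
        "latency": [1.2, 0.8],
        "property_results": [
            {"FactuallyConsistent": {"feedback": "", "score": 1.0},
             "Verbosity": {"feedback": "", "score": 1.3}},
            {"FactuallyConsistent": {"feedback": "Missing a step.", "score": 0.0},
             "Verbosity": {"feedback": "", "score": 0.9}},
        ],
    }
)

# json_normalize expands the nested dict into dotted columns such as
# "FactuallyConsistent.score" and "Verbosity.feedback".
df = df.join(pd.json_normalize(df["property_results"])).drop(columns=["property_results"])

# Aggregate the same way the evaluator does: mean latency plus the mean of every score column.
agg_columns = ["latency"] + [col for col in df.columns if "score" in col]
print(df[agg_columns].mean())
# latency                      1.0
# FactuallyConsistent.score    0.5
# Verbosity.score              1.1
```

The evaluator then pivots this aggregated Series into a single row and prepends the LLM app config columns, which is what lands in `results_agg.csv`.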