feat: add micro example notebook
StijnGoossens committed Oct 13, 2023
1 parent 73792c6 commit 8e162dd
Showing 4 changed files with 375 additions and 20 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -4,15 +4,17 @@

Evaluates LLM-based applications.

**Check the `example.ipynb` notebook for an example of how to use this package.**

## To-do's
- [x] Convert EHBO notes into question-answer pairs, using OpenAI Function Calling.
- [x] Turn the question-answer pairs into a test set.
- [x] Build an LLM component to evaluate the given answers by comparing them with the reference answers.
- [x] Build an LLM 'app' that can answer the questions.
- [x] Evaluate the LLM app with the LLM evaluator.
- [x] Streamlit page to view the evaluation results.
- [ ] Combine the evaluation results into a single metric.
- [ ] Evaluate and compare different LLM apps (GPT-3.5, GPT-4, with RAG)
- [x] Combine the evaluation results into a single metric.
- [x] Evaluate and compare different LLM apps (GPT-3.5, GPT-4, with RAG)
- [ ] Streamlit page to visualize the comparison.
- [ ] Streamlit page to view, edit and add test cases.
- [ ] Integrate with MLflow for experiment tracking (?)
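
A minimal usage sketch of how these pieces fit together. The import paths, the `properties` list name, and the `Evaluator`/`evaluate()` signatures are assumptions inferred from `evaluator.py` and `eval_properties.py` below; `example.ipynb` remains the authoritative walkthrough.

```python
# Hypothetical wiring (editor's sketch, not part of this commit).
from llm_app_eval.eval_properties import properties  # assumed name of the EvalProperty list
from llm_app_eval.evaluator import Evaluator
from llm_app_eval.llm_app import BaseApp  # wraps GPT-3.5, GPT-4 or a RAG pipeline

test_set = load_test_cases()  # hypothetical helper returning a list of TestCase objects
llm_app = BaseApp()           # assumed to be constructible without arguments

evaluator = Evaluator(test_set=test_set, properties=properties)  # assumed constructor
df_agg = evaluator.evaluate(llm_app=llm_app)  # writes results.csv and results_agg.csv per run
print(df_agg)  # one row per LLM app configuration, one column per aggregated metric
```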
62 changes: 57 additions & 5 deletions src/llm_app_eval/eval_properties.py
@@ -1,15 +1,57 @@
from functools import lru_cache

import numpy as np
import openai
from evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase
from pydantic import BaseModel

PROPERTY_LLM = "gpt-3.5-turbo-0613"


@lru_cache
def get_embedding(text, model="text-embedding-ada-002"):
text = text.replace("\n", " ")
return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]

property_llm = "gpt-3.5-turbo-0613"

def cosine_similarity(a, b):
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
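

# --- Editor's illustration (not part of this commit): combining the two helpers above. ---
# Assumes openai < 1.0 (the legacy Embedding API that get_embedding calls) and that
# OPENAI_API_KEY is set. Paraphrases of the same instruction should score close to 1.0,
# unrelated text noticeably lower.
def _similarity_demo() -> float:
    emb_a = get_embedding("Press firmly on the wound to stop the bleeding.")
    emb_b = get_embedding("Apply firm pressure to the wound until the bleeding stops.")
    return cosine_similarity(emb_a, emb_b)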


def output_similarity(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
if test_case.reference_output and llm_app_result.answer:
app_output_emb = get_embedding(llm_app_result.answer)
reference_emb = get_embedding(test_case.reference_output.answer)
result = PropertyResult(
feedback="",
score=cosine_similarity(app_output_emb, reference_emb),
)
else:
result = None
return result


def output_verbosity(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
if test_case.reference_output and llm_app_result.answer:
result = PropertyResult(
feedback="", score=len(llm_app_result.answer) / len(test_case.reference_output.answer)
)
else:
result = None
return result
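

# --- Editor's note (not part of this commit): interpreting the two property scores above. ---
# output_similarity returns the embedding cosine similarity between the app answer and the
# reference answer (close to 1.0 for paraphrases). output_verbosity is a plain length ratio,
# for example:
#   reference = "Call 112 and start CPR immediately."
#   answer = "You should first call 112 and then you should immediately begin CPR."
#   len(answer) / len(reference)  # ~1.9, i.e. the answer is roughly twice as long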


class LlmPropertyResult(BaseModel):
feedback: str
pass_fail: bool


def evaluate_property_with_llm(
model: str, system_message: str, user_message: str
) -> PropertyResult:
return openai.ChatCompletion.create(
model=model,
response_model=PropertyResult,
response_model=LlmPropertyResult,
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": user_message},
@@ -20,7 +62,7 @@ def evaluate_property_with_llm(
def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
if test_case.reference_output and llm_app_result.answer:
result = evaluate_property_with_llm(
model=property_llm,
model=PROPERTY_LLM,
system_message="Evaluate the answer. The answer should be factually consistent with the reference answer. If not, explain why.",
user_message=f"Answer: {llm_app_result.answer}\nReference Answer: {test_case.reference_output.answer}",
)
@@ -32,7 +74,7 @@ def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
def improves_historical_answer(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
if test_case.test_input and test_case.historical_output and llm_app_result.answer:
result = evaluate_property_with_llm(
model=property_llm,
model=PROPERTY_LLM,
system_message="Evaluate the new answer. Is the new answer better than the old answer? Explain why.",
user_message=f"Question: {test_case.test_input.question}\nOld answer: {test_case.historical_output.answer}\nNew answer: {llm_app_result.answer}",
)
@@ -51,7 +93,7 @@ def takes_feedback_into_account(
and test_case.historical_feedback
):
result = evaluate_property_with_llm(
model=property_llm,
model=PROPERTY_LLM,
system_message="Evaluate the new answer. Does the new answer improve upon the old one by taking the feedback into account? Explain why.",
user_message=f"Question: {test_case.test_input.question}\nOld answer: {test_case.historical_output.answer}\nOld feedback: {test_case.historical_feedback}\nNew answer: {llm_app_result.answer}",
)
@@ -96,4 +138,14 @@ def length_within_bounds(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
description="The answer is max 20% longer than the reference answer.",
eval_func=length_within_bounds,
),
EvalProperty(
property_name="CosineSimilarity",
description="The answer is similar to the reference answer.",
eval_func=output_similarity,
),
EvalProperty(
property_name="Verbosity",
description="The answer is not too verbose.",
eval_func=output_verbosity,
),
]
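
One note on `evaluate_property_with_llm` above: `response_model` is not a parameter of the stock `openai.ChatCompletion.create` call in the pre-1.0 SDK, so the code presumably relies on the OpenAI client being patched, for example by the `instructor` library (an assumption; the patching is not shown in this diff). A minimal sketch under that assumption:

```python
# Editor's sketch (assumption): structured-output parsing via instructor with openai < 1.0.
import instructor
import openai
from pydantic import BaseModel

instructor.patch()  # monkey-patches openai.ChatCompletion.create to accept response_model


class LlmPropertyResult(BaseModel):
    feedback: str
    pass_fail: bool


result = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-0613",
    response_model=LlmPropertyResult,
    messages=[
        {"role": "system", "content": "Evaluate the answer. It should be factually consistent with the reference answer. If not, explain why."},
        {"role": "user", "content": "Answer: Press on the wound.\nReference Answer: Apply firm pressure to the wound."},
    ],
)
print(result.pass_fail, result.feedback)  # a parsed LlmPropertyResult instance
```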
61 changes: 48 additions & 13 deletions src/llm_app_eval/evaluator.py
@@ -1,8 +1,10 @@
import json
import os
import time
from datetime import datetime
from typing import Callable, Optional

import pandas as pd
from llm_app import BaseApp, InputFormat, OutputFormat
from pydantic import BaseModel
from tqdm import tqdm
@@ -23,15 +25,17 @@ class EvalProperty(BaseModel):


class PropertyResult(BaseModel):
feedback: str
pass_fail: bool
property_name: Optional[str] = None
feedback: Optional[str]
score: float


class TestCaseResult(BaseModel):
test_case_id: str
output: OutputFormat
property_results: list[PropertyResult]
property_results: dict[str, PropertyResult]
latency: float
cosine_similarity: Optional[float] = None
verbosity: Optional[float] = None


class Evaluator:
@@ -59,33 +63,64 @@ def evaluate(
os.makedirs(exp_dir, exist_ok=True)

# Loop over test cases
test_case_results = []
for test_case in tqdm(
self.test_set, desc="Evaluating test cases", unit="test case", total=len(self.test_set)
):
# Pass the test case to the LLM app
# Measure the time it takes to run the LLM app
start_time = time.time()
app_output = llm_app(app_input=test_case.test_input)
latency = time.time() - start_time
# Evaluate properties
property_results = []
property_results = {}
for prop in self.properties:
print(f"Evaluating property {prop.property_name}")
# print(f"Evaluating property {prop.property_name}")
r = prop.eval_func(test_case=test_case, llm_app_result=app_output)
# If the result is None, the property is not applicable to this test case, so skip it
if r:
# Store the property results per test case in a list
property_results.append(
PropertyResult(
property_name=prop.property_name,
feedback=r.feedback,
pass_fail=r.pass_fail,
)
property_results[prop.property_name] = PropertyResult(
feedback=r.feedback,
score=r.score if "score" in r.model_fields else float(r.pass_fail),
)
# Store results as JSON
tcr = TestCaseResult(
test_case_id=test_case.test_id, output=app_output, property_results=property_results
test_case_id=test_case.test_id,
output=app_output,
property_results=property_results,
latency=latency,
)
test_case_results.append(tcr)
tcr_json = tcr.model_dump_json()
with open(os.path.join(exp_dir, f"{tcr.test_case_id}.json"), "w") as f:
f.write(tcr_json)
# Save the Llm app config dict as JSON
with open(os.path.join(exp_dir, "llm_app.json"), "w") as f:
f.write(json.dumps(llm_app.cfg))

# Convert all test case results into a dataframe
df = pd.DataFrame([tcr.dict() for tcr in test_case_results])
# Split the `property_results` column into separate columns. The `property_results` column is a dict of dicts.
# Each top-level key is a property name. Each second-level key is a property result field (`feedback` and `score`).
# The column is split into a separate column for each combination of property name and result field,
# holding the corresponding `feedback` and `score` values.
df = df.join(pd.json_normalize(df["property_results"]))
# Drop the `property_results` column.
df = df.drop(columns=["property_results"])
# Save the dataframe as CSV
df.to_csv(os.path.join(exp_dir, "results.csv"), index=False)
# Aggregate the results by taking the mean over the test cases for the `latency` and all `score` columns.
agg_columns = ["latency"] + [col for col in df.columns if "score" in col]
df_agg = df[agg_columns].mean().reset_index()
df_agg.columns = ["metric", "value"]
# Pivot the metric column to get a column for each metric and a row for each LLM app.
df_agg = df_agg.pivot_table(index=None, columns="metric", values="value")
# Drop the `metric` index.
df_agg = df_agg.reset_index(drop=True)
# Add the llm app config dict as columns in front of the aggregated results.
df_agg = pd.concat([pd.json_normalize(llm_app.cfg), df_agg], axis=1)
# Save the aggregated results as CSV.
df_agg.to_csv(os.path.join(exp_dir, "results_agg.csv"), index=False)

return df_agg
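
To make the flattening and aggregation steps above concrete, here is a small standalone pandas sketch; the property names, scores, and latencies are made up for illustration.

```python
import pandas as pd

# Two fake test-case results in the shape the evaluator produces:
# property_results maps {property_name: {"feedback": ..., "score": ...}}.
df = pd.DataFrame(
    {
        "test_case_id": ["tc_1", "tc_2"],
        "latency": [1.2, 0.8],
        "property_results": [
            {"FactuallyConsistent": {"feedback": "", "score": 1.0},
             "Verbosity": {"feedback": "", "score": 1.3}},
            {"FactuallyConsistent": {"feedback": "Missing a step.", "score": 0.0},
             "Verbosity": {"feedback": "", "score": 0.9}},
        ],
    }
)

# json_normalize expands the nested dict into dotted columns such as
# "FactuallyConsistent.score" and "Verbosity.feedback".
df = df.join(pd.json_normalize(df["property_results"])).drop(columns=["property_results"])

# Aggregate the same way the evaluator does: mean latency plus the mean of every score column.
agg_columns = ["latency"] + [col for col in df.columns if "score" in col]
print(df[agg_columns].mean())
# latency                      1.0
# FactuallyConsistent.score    0.5
# Verbosity.score              1.1
```

The evaluator then pivots this aggregated Series into a single row and prepends the LLM app config columns, which is what lands in `results_agg.csv`.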