Merge pull request #230 from Kiln-AI/evals

Evals! See the commits for more details (over 100). docs: https://docs.getkiln.ai/docs/evaluations
Kiln-AI · Mar 1, 2025 · 76bfd3b · 76bfd3b
2 parents 9f76578 + 1c31181
commit 76bfd3b
Show file tree

Hide file tree

Showing 114 changed files with 11,797 additions and 1,091 deletions.
diff --git a/.cursorrules b/.cursorrules
@@ -1,3 +1,4 @@
  - Always assume pydantic 2 (not pydantic 1)
  - Always use pytest for tests
+ - The project supports Python 3.10 and above
 
diff --git a/README.md b/README.md
@@ -19,8 +19,8 @@
 | CI      | [![Build and Test](https://github.com/Kiln-AI/kiln/actions/workflows/build_and_test.yml/badge.svg)](https://github.com/Kiln-AI/kiln/actions/workflows/build_and_test.yml) [![Format and Lint](https://github.com/Kiln-AI/kiln/actions/workflows/format_and_lint.yml/badge.svg)](https://github.com/Kiln-AI/kiln/actions/workflows/format_and_lint.yml) [![Desktop Apps Build](https://github.com/Kiln-AI/kiln/actions/workflows/build_desktop.yml/badge.svg)](https://github.com/Kiln-AI/kiln/actions/workflows/build_desktop.yml) [![Web UI Build](https://github.com/Kiln-AI/kiln/actions/workflows/web_format_lint_build.yml/badge.svg)](https://github.com/Kiln-AI/kiln/actions/workflows/web_format_lint_build.yml) [![Test Count Badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/scosman/57742c1b1b60d597a6aba5d5148d728e/raw/test_count_kiln.json)](https://github.com/Kiln-AI/kiln/actions/workflows/test_count.yml) [![Test Coverage Badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/scosman/57742c1b1b60d597a6aba5d5148d728e/raw/library_coverage_kiln.json)](https://github.com/Kiln-AI/kiln/actions/workflows/test_count.yml) [![Docs](https://github.com/Kiln-AI/Kiln/actions/workflows/build_docs.yml/badge.svg)](https://github.com/Kiln-AI/Kiln/actions/workflows/build_docs.yml) |
 | Package | [![PyPI - Version](https://img.shields.io/pypi/v/kiln-ai.svg?logo=pypi&label=PyPI&logoColor=gold)](https://pypi.org/project/kiln-ai/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/kiln-ai.svg?logo=python&label=Python&logoColor=gold)](https://pypi.org/project/kiln-ai/)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
 | Meta    | [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv) [![linting - Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://github.com/pypa/hatch) [![types - Pyright](https://img.shields.io/badge/types-pyright-blue.svg)](https://github.com/microsoft/pyright) [![Docs](https://img.shields.io/badge/docs-pdoc-blue)](https://kiln-ai.github.io/Kiln/kiln_core_docs/index.html)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-| Apps    | [![MacOS](https://img.shields.io/badge/MacOS-black?logo=apple)](https://github.com/Kiln-AI/Kiln/releases/latest) [![Windows](https://img.shields.io/badge/Windows-0067b8.svg?logo=data:image/svg%2bxml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4KPHN2ZyBmaWxsPSIjZmZmIiB2aWV3Qm94PSIwIDAgMzIgMzIiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTE2Ljc0MiAxNi43NDJ2MTQuMjUzaDE0LjI1M3YtMTQuMjUzek0xLjAwNCAxNi43NDJ2MTQuMjUzaDE0LjI1NnYtMTQuMjUzek0xNi43NDIgMS4wMDR2MTQuMjU2aDE0LjI1M3YtMTQuMjU2ek0xLjAwNCAxLjAwNHYxNC4yNTZoMTQuMjU2di0xNC4yNTZ6Ij48L3BhdGg+Cjwvc3ZnPg==)](https://github.com/Kiln-AI/Kiln/releases/latest) [![Linux](https://img.shields.io/badge/Linux-444444?logo=linux&logoColor=ffffff)](https://github.com/Kiln-AI/Kiln/releases/latest) ![Github Downsloads](https://img.shields.io/github/downloads/kiln-ai/kiln/total)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
-| Connect  | [![Discord](https://img.shields.io/badge/Discord-Kiln_AI-blue?logo=Discord&logoColor=white)](https://discord.gg/sVJEzDGu) [![Newsletter](https://img.shields.io/badge/Newsletter-kilnai-blue?logo=Substack&logoColor=white)](https://kilnai.substack.com)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| Apps    | [![MacOS](https://img.shields.io/badge/MacOS-black?logo=apple)](https://github.com/Kiln-AI/Kiln/releases/latest) [![Windows](https://img.shields.io/badge/Windows-0067b8.svg?logo=data:image/svg%2bxml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4KPHN2ZyBmaWxsPSIjZmZmIiB2aWV3Qm94PSIwIDAgMzIgMzIiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTE2Ljc0MiAxNi43NDJ2MTQuMjUzaDE0LjI1M3YtMTQuMjUzek0xLjAwNCAxNi43NDJ2MTQuMjUzaDE0LjI1NnYtMTQuMjUzek0xNi43NDIgMS4wMDR2MTQuMjU2aDE0LjI1M3YtMTQuMjU2ek0xLjAwNCAxLjAwNHYxNC4yNTZoMTQuMjU2di0xNC4yNTZ6Ij48L3BhdGg+Cjwvc3ZnPg==)](https://github.com/Kiln-AI/Kiln/releases/latest) [![Linux](https://img.shields.io/badge/Linux-444444?logo=linux&logoColor=ffffff)](https://github.com/Kiln-AI/Kiln/releases/latest) ![Github Downsloads](https://img.shields.io/github/downloads/kiln-ai/kiln/total)                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| Connect | [![Discord](https://img.shields.io/badge/Discord-Kiln_AI-blue?logo=Discord&logoColor=white)](https://discord.gg/sVJEzDGu) [![Newsletter](https://img.shields.io/badge/Newsletter-kilnai-blue?logo=Substack&logoColor=white)](https://kilnai.substack.com)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
 
 [<img width="220" alt="Download button" src="https://github.com/user-attachments/assets/a5d51b8b-b30a-4a16-a902-ab6ef1d58dc0">](https://github.com/Kiln-AI/Kiln/releases/latest) [<img width="220" alt="Quick start button" src="https://github.com/user-attachments/assets/aff1b35f-72c0-4286-9b28-40a415558359">](https://docs.getkiln.ai/getting-started/quickstart)
 
@@ -61,6 +61,7 @@ Kiln is quite intuitive, so we suggest launching the desktop app and diving in.
 - [Fine Tuning LLM Models](https://docs.getkiln.ai/docs/fine-tuning-guide)
 - [Guide: Train a Reasoning Model](https://docs.getkiln.ai/docs/guide-train-a-reasoning-model)
 - [Reasoning & Chain of Thought](https://docs.getkiln.ai/docs/reasoning-and-chain-of-thought)
+- [Evaluators](https://docs.getkiln.ai/docs/evaluators)
 - [Synthetic Data Generation](https://docs.getkiln.ai/docs/synthetic-data-generation)
 - [Collaborating with Kiln](https://docs.getkiln.ai/docs/collaboration)
 - [Rating and Labeling Data](https://docs.getkiln.ai/docs/reviewing-and-rating)

diff --git a/app/desktop/desktop_server.py b/app/desktop/desktop_server.py
@@ -10,6 +10,7 @@
 
 from app.desktop.log_config import log_config
 from app.desktop.studio_server.data_gen_api import connect_data_gen_api
+from app.desktop.studio_server.eval_api import connect_evals_api
 from app.desktop.studio_server.finetune_api import connect_fine_tune_api
 from app.desktop.studio_server.prompt_api import connect_prompt_api
 from app.desktop.studio_server.provider_api import connect_provider_api
@@ -36,6 +37,7 @@ def make_app():
     connect_settings(app)
     connect_data_gen_api(app)
     connect_fine_tune_api(app)
+    connect_evals_api(app)
 
     # Important: webhost must be last, it handles all other URLs
     connect_webhost(app)

diff --git a/app/desktop/pyproject.toml b/app/desktop/pyproject.toml
@@ -8,6 +8,7 @@ dependencies = [
     "pillow>=11.0.0",
     "pystray>=0.19.5",
     "pyinstaller==6.11.1",
+    "scipy>=1.15.2",
 ]
 
 

diff --git a/app/desktop/studio_server/correlation_calculator.py b/app/desktop/studio_server/correlation_calculator.py
@@ -0,0 +1,110 @@
+import math
+from dataclasses import dataclass
+from typing import List
+
+from scipy import stats
+
+
+@dataclass
+class CorrelationScore:
+    measured_score: float
+    human_score: float
+    normalized_measured_score: float
+    normalized_human_score: float
+
+
+@dataclass
+class CorrelationResult:
+    mean_absolute_error: float
+    mean_normalized_absolute_error: float
+    mean_squared_error: float
+    mean_normalized_squared_error: float
+    spearman_correlation: float | None
+    pearson_correlation: float | None
+    kendalltau_correlation: float | None
+
+
+class CorrelationCalculator:
+    def __init__(self):
+        self.scores: List[CorrelationScore] = []
+
+    def add_score(self, score: CorrelationScore):
+        self.scores.append(score)
+
+    def calculate_correlation(self) -> CorrelationResult:
+        if len(self.scores) == 0:
+            raise ValueError("No scores to calculate correlation")
+
+        return CorrelationResult(
+            mean_absolute_error=self.calculate_mean_absolute_error(),
+            mean_normalized_absolute_error=self.calculate_mean_normalized_absolute_error(),
+            mean_squared_error=self.calculate_mean_squared_error(),
+            mean_normalized_squared_error=self.calculate_mean_normalized_squared_error(),
+            spearman_correlation=self.calculate_spearman_correlation(),
+            pearson_correlation=self.calculate_pearson_correlation(),
+            kendalltau_correlation=self.calculate_kendalltau_correlation(),
+        )
+
+    def calculate_mean_absolute_error(self) -> float:
+        total_absolute_error = sum(
+            abs(score.measured_score - score.human_score) for score in self.scores
+        )
+        return total_absolute_error / len(self.scores)
+
+    def calculate_mean_normalized_absolute_error(self) -> float:
+        total_normalized_absolute_error = sum(
+            abs(score.normalized_measured_score - score.normalized_human_score)
+            for score in self.scores
+        )
+        return total_normalized_absolute_error / len(self.scores)
+
+    def calculate_mean_squared_error(self) -> float:
+        total_squared_error = sum(
+            (score.measured_score - score.human_score) ** 2 for score in self.scores
+        )
+        return total_squared_error / len(self.scores)
+
+    def calculate_mean_normalized_squared_error(self) -> float:
+        total_normalized_squared_error = sum(
+            (score.normalized_measured_score - score.normalized_human_score) ** 2
+            for score in self.scores
+        )
+        return total_normalized_squared_error / len(self.scores)
+
+    def calculate_spearman_correlation(self) -> float | None:
+        if len(self.scores) < 2:
+            # If there is only one pair, no correlation
+            return None
+        x = [score.measured_score for score in self.scores]
+        y = [score.human_score for score in self.scores]
+        result = stats.spearmanr(x, y)
+        # library doesn't support proper types
+        correlation = result.__getattribute__("correlation")
+        if math.isnan(correlation) or not isinstance(correlation, float):
+            # Very small samples may have a NaN result (unknown correlation)
+            return None
+        return correlation
+
+    def calculate_pearson_correlation(self) -> float | None:
+        if len(self.scores) < 2:
+            # If there is only one pair, no correlation
+            return None
+        x = [score.measured_score for score in self.scores]
+        y = [score.human_score for score in self.scores]
+        result = stats.pearsonr(x, y)
+        if math.isnan(result.correlation):
+            # Very small samples may have a NaN result (unknown correlation)
+            return None
+        return result.correlation
+
+    def calculate_kendalltau_correlation(self) -> float | None:
+        if len(self.scores) < 2:
+            # If there is only one pair, no correlation
+            return None
+        x = [score.measured_score for score in self.scores]
+        y = [score.human_score for score in self.scores]
+        result = stats.kendalltau(x, y)
+        if math.isnan(result.correlation):
+            # Very small samples may have a NaN result (unknown correlation)
+            return None
+        return result.correlation
diff --git a/app/desktop/studio_server/data_gen_api.py b/app/desktop/studio_server/data_gen_api.py
@@ -5,9 +5,10 @@
     DataGenCategoriesTaskInput,
     DataGenSampleTask,
     DataGenSampleTaskInput,
+    wrap_task_with_guidance,
 )
-from kiln_ai.adapters.prompt_builders import prompt_builder_from_ui_name
-from kiln_ai.datamodel import DataSource, DataSourceType, TaskRun
+from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
+from kiln_ai.datamodel import DataSource, DataSourceType, PromptId, TaskRun
 from kiln_server.run_api import model_provider_from_string
 from kiln_server.task_api import task_from_id
 from pydantic import BaseModel, ConfigDict, Field
@@ -60,9 +61,13 @@ class DataGenSaveSamplesApiInput(BaseModel):
     )
     output_model_name: str = Field(description="The name of the model to use")
     output_provider: str = Field(description="The provider of the model to use")
-    prompt_method: str = Field(
+    prompt_method: PromptId = Field(
         description="The prompt method used to generate the output"
     )
+    human_guidance: str | None = Field(
+        description="Optional human guidance for generation",
+        default=None,
+    )
 
 
 def connect_data_gen_api(app: FastAPI):
@@ -122,7 +127,11 @@ async def save_sample(
     ) -> TaskRun:
         task = task_from_id(project_id, task_id)
 
-        prompt_builder = prompt_builder_from_ui_name(sample.prompt_method, task)
+        # Wrap the task instructions with human guidance, if provided
+        if sample.human_guidance is not None and sample.human_guidance.strip() != "":
+            task.instruction = wrap_task_with_guidance(
+                task.instruction, sample.human_guidance
+            )
 
         tags = ["synthetic"]
         if session_id:
@@ -132,8 +141,8 @@ async def save_sample(
             task,
             model_name=sample.output_model_name,
             provider=model_provider_from_string(sample.output_provider),
-            prompt_builder=prompt_builder,
-            tags=tags,
+            prompt_id=sample.prompt_method,
+            base_adapter_config=AdapterConfig(default_tags=tags),
         )
 
         properties: dict[str, str | int | float] = {