
Commit

More and better correlation coefficients for comparing eval configs to human scores
scosman committed Feb 27, 2025
1 parent 9f07168 commit 8102a92
Showing 8 changed files with 549 additions and 76 deletions.
1 change: 1 addition & 0 deletions app/desktop/pyproject.toml
@@ -8,6 +8,7 @@ dependencies = [
"pillow>=11.0.0",
"pystray>=0.19.5",
"pyinstaller==6.11.1",
"scipy>=1.15.2",
]


110 changes: 110 additions & 0 deletions app/desktop/studio_server/correlation_calculator.py
@@ -0,0 +1,110 @@
import math
from dataclasses import dataclass
from typing import List

from scipy import stats


@dataclass
class CorrelationScore:
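    # A single paired observation: the score produced by the eval config and the
    # human score for the same item, plus a normalized variant of each.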
measured_score: float
human_score: float
normalized_measured_score: float
normalized_human_score: float


@dataclass
class CorrelationResult:
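    # Aggregate comparison metrics: the error metrics are lower-is-better, and the
    # three correlation coefficients range from -1 to 1 (higher means closer
    # agreement with human scores).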
mean_absolute_error: float
mean_normalized_absolute_error: float
mean_squared_error: float
mean_normalized_squared_error: float
spearman_correlation: float
pearson_correlation: float
kendalltau_correlation: float


class CorrelationCalculator:
def __init__(self):
self.scores: List[CorrelationScore] = []

def add_score(self, score: CorrelationScore):
self.scores.append(score)

def calculate_correlation(self) -> CorrelationResult:
if len(self.scores) == 0:
raise ValueError("No scores to calculate correlation")

return CorrelationResult(
mean_absolute_error=self.calculate_mean_absolute_error(),
mean_normalized_absolute_error=self.calculate_mean_normalized_absolute_error(),
mean_squared_error=self.calculate_mean_squared_error(),
mean_normalized_squared_error=self.calculate_mean_normalized_squared_error(),
spearman_correlation=self.calculate_spearman_correlation(),
pearson_correlation=self.calculate_pearson_correlation(),
kendalltau_correlation=self.calculate_kendalltau_correlation(),
)

def calculate_mean_absolute_error(self) -> float:
total_absolute_error = sum(
abs(score.measured_score - score.human_score) for score in self.scores
)
return total_absolute_error / len(self.scores)

def calculate_mean_normalized_absolute_error(self) -> float:
total_normalized_absolute_error = sum(
abs(score.normalized_measured_score - score.normalized_human_score)
for score in self.scores
)
return total_normalized_absolute_error / len(self.scores)

def calculate_mean_squared_error(self) -> float:
total_squared_error = sum(
(score.measured_score - score.human_score) ** 2 for score in self.scores
)
return total_squared_error / len(self.scores)

def calculate_mean_normalized_squared_error(self) -> float:
total_normalized_squared_error = sum(
(score.normalized_measured_score - score.normalized_human_score) ** 2
for score in self.scores
)
return total_normalized_squared_error / len(self.scores)

def calculate_spearman_correlation(self) -> float:
if len(self.scores) < 2:
# If there is only one pair, return 0 = no correlation
return 0
x = [score.measured_score for score in self.scores]
y = [score.human_score for score in self.scores]
result = stats.spearmanr(x, y)
# library doesn't support proper types
        correlation = getattr(result, "correlation")
        if not isinstance(correlation, float) or math.isnan(correlation):
# Very small samples may have a NaN result (unknown correlation)
return 0
return correlation

def calculate_pearson_correlation(self) -> float:
if len(self.scores) < 2:
# If there is only one pair, return 0 = no correlation
return 0
x = [score.measured_score for score in self.scores]
y = [score.human_score for score in self.scores]
result = stats.pearsonr(x, y)
if math.isnan(result.correlation):
# Very small samples may have a NaN result (unknown correlation)
return 0
return result.correlation

def calculate_kendalltau_correlation(self) -> float:
if len(self.scores) < 2:
# If there is only one pair, return 0 = no correlation
return 0
x = [score.measured_score for score in self.scores]
y = [score.human_score for score in self.scores]
result = stats.kendalltau(x, y)
if math.isnan(result.correlation):
# Very small samples may have a NaN result (unknown correlation)
return 0
return result.correlation
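
For reference, a minimal usage sketch of the new calculator follows. It is not part of the commit; the import path is assumed from the file location, and the scores and the 1-5 rating normalization are made up for illustration:

from app.desktop.studio_server.correlation_calculator import (
    CorrelationCalculator,
    CorrelationScore,
)

# Hypothetical (eval score, human score) pairs on a 1-5 rating scale.
pairs = [(4.0, 5.0), (2.0, 2.0), (3.0, 4.0), (5.0, 5.0)]

calculator = CorrelationCalculator()
for measured, human in pairs:
    calculator.add_score(
        CorrelationScore(
            measured_score=measured,
            human_score=human,
            # Illustrative 0-1 normalization of a 1-5 rating; the API uses
            # normalize_rating for this step.
            normalized_measured_score=(measured - 1) / 4,
            normalized_human_score=(human - 1) / 4,
        )
    )

result = calculator.calculate_correlation()
print(result.mean_absolute_error, result.spearman_correlation, result.kendalltau_correlation)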
95 changes: 35 additions & 60 deletions app/desktop/studio_server/eval_api.py
@@ -32,6 +32,12 @@
from kiln_server.task_api import task_from_id
from pydantic import BaseModel

from .correlation_calculator import (
CorrelationCalculator,
CorrelationResult,
CorrelationScore,
)


def eval_from_id(project_id: str, task_id: str, eval_id: str) -> Eval:
task = task_from_id(project_id, task_id)
@@ -143,16 +149,9 @@ class EvalResultSummary(BaseModel):
dataset_size: int


class EvalConfigScoreSummary(BaseModel):
mean_absolute_error: float
mean_normalized_absolute_error: float
mean_squared_error: float
mean_normalized_squared_error: float


class EvalConfigCompareSummary(BaseModel):
# Summary of results. eval_config_id -> output_score_id -> ScoreSummary
results: Dict[str, Dict[str, EvalConfigScoreSummary]]
# Summary of results. eval_config_id -> output_score_id -> CorrelationResult
results: Dict[str, Dict[str, CorrelationResult]]
# eval_config_id -> percent of the dataset that has been processed (run with eval scores)
eval_config_percent_complete: Dict[str, float]
# The total size of the dataset used for the eval config comparisons (eval.eval_configs_filter_id set size)
@@ -589,12 +588,8 @@ async def get_eval_configs_score_summary(
for eval_config in eval_configs
}

# eval_config_id -> output_score_id -> scores/total
total_squared_error: Dict[str, Dict[str, float]] = {}
total_normalized_squared_error: Dict[str, Dict[str, float]] = {}
total_absolute_error: Dict[str, Dict[str, float]] = {}
total_normalized_absolute_error: Dict[str, Dict[str, float]] = {}
total_count: Dict[str, Dict[str, int]] = {}
# eval_config_id -> output_score_id -> correlation calculator
correlation_calculators: Dict[str, Dict[str, CorrelationCalculator]] = {}

# important: readonly makes this much faster
for eval_config in eval_configs:
@@ -631,62 +626,42 @@
# This score doesn't have both a human eval and eval score, so we can't compare
continue

if eval_config_id not in total_squared_error:
total_squared_error[eval_config_id] = {}
total_absolute_error[eval_config_id] = {}
total_count[eval_config_id] = {}
total_normalized_squared_error[eval_config_id] = {}
total_normalized_absolute_error[eval_config_id] = {}
if score_key not in total_squared_error[eval_config_id]:
total_squared_error[eval_config_id][score_key] = 0
total_absolute_error[eval_config_id][score_key] = 0
total_count[eval_config_id][score_key] = 0
total_normalized_squared_error[eval_config_id][score_key] = 0
total_normalized_absolute_error[eval_config_id][score_key] = 0
if eval_config_id not in correlation_calculators:
correlation_calculators[eval_config_id] = {}

if score_key not in correlation_calculators[eval_config_id]:
correlation_calculators[eval_config_id][score_key] = (
CorrelationCalculator()
)

normalized_eval_score = normalize_rating(
eval_score, output_score.type
)
normalized_human_score = normalize_rating(
human_score, output_score.type
)
total_squared_error[eval_config_id][score_key] += (
eval_score - human_score
) ** 2
total_normalized_squared_error[eval_config_id][score_key] += (
normalized_eval_score - normalized_human_score
) ** 2
total_absolute_error[eval_config_id][score_key] += abs(
eval_score - human_score
correlation_calculators[eval_config_id][score_key].add_score(
CorrelationScore(
measured_score=eval_score,
human_score=human_score,
normalized_measured_score=normalized_eval_score,
normalized_human_score=normalized_human_score,
)
)
total_normalized_absolute_error[eval_config_id][score_key] += abs(
normalized_eval_score - normalized_human_score
)
total_count[eval_config_id][score_key] += 1

# Convert to score summaries
results: Dict[str, Dict[str, EvalConfigScoreSummary]] = {}
for eval_config_id in total_count.keys():
results: Dict[str, Dict[str, CorrelationResult]] = {}
for eval_config_id in correlation_calculators.keys():
results[eval_config_id] = {}
for score_key in total_count[eval_config_id].keys():
count = total_count[eval_config_id][score_key]
if count > 0:
results[eval_config_id][score_key] = EvalConfigScoreSummary(
mean_squared_error=(
total_squared_error[eval_config_id][score_key] / count
),
mean_absolute_error=(
total_absolute_error[eval_config_id][score_key] / count
),
mean_normalized_squared_error=(
total_normalized_squared_error[eval_config_id][score_key]
/ count
),
mean_normalized_absolute_error=(
total_normalized_absolute_error[eval_config_id][score_key]
/ count
),
)
for score_key in correlation_calculators[eval_config_id].keys():
            if not correlation_calculators[eval_config_id][score_key].scores:
# No scores to calculate correlation for this pair
continue

correlation_result = correlation_calculators[eval_config_id][
score_key
].calculate_correlation()
results[eval_config_id][score_key] = correlation_result

# Calculate the percent of the dataset that has been processed
eval_config_percent_complete: Dict[str, float] = {}
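
For intuition about the three coefficients the comparison summary now reports (all computed with scipy.stats, as in the calculator above), here is a small self-contained sketch with made-up scores. Pearson measures linear agreement, while Spearman and Kendall's tau only compare rank order:

from scipy import stats

# Made-up eval scores vs. human scores, for illustration only.
eval_scores = [1.0, 2.0, 3.0, 4.0, 5.0]
human_scores = [1.0, 2.0, 4.0, 8.0, 16.0]  # same ordering, but not a linear relationship

print(stats.pearsonr(eval_scores, human_scores).statistic)   # ~0.93: strong but not perfectly linear
print(stats.spearmanr(eval_scores, human_scores).statistic)  # 1.0: identical ranking
print(stats.kendalltau(eval_scores, human_scores).statistic) # 1.0: identical ranking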
