
Commit

More and better correlation coefficients for comparing eval configs to human scores
scosman committed Feb 27, 2025
1 parent 9f07168 commit 8102a92
Showing 8 changed files with 549 additions and 76 deletions.
1 change: 1 addition & 0 deletions app/desktop/pyproject.toml
@@ -8,6 +8,7 @@ dependencies = [
"pillow>=11.0.0",
"pystray>=0.19.5",
"pyinstaller==6.11.1",
"scipy>=1.15.2",
]


110 changes: 110 additions & 0 deletions app/desktop/studio_server/correlation_calculator.py
@@ -0,0 +1,110 @@
import math
from dataclasses import dataclass
from typing import List

from scipy import stats


@dataclass
class CorrelationScore:
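    # A single paired observation: the score produced by the eval config and the
    # human score for the same item, plus a normalized variant of each.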
measured_score: float
human_score: float
normalized_measured_score: float
normalized_human_score: float


@dataclass
class CorrelationResult:
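    # Aggregate comparison metrics: the error metrics are lower-is-better, and the
    # three correlation coefficients range from -1 to 1 (higher means closer
    # agreement with human scores).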
mean_absolute_error: float
mean_normalized_absolute_error: float
mean_squared_error: float
mean_normalized_squared_error: float
spearman_correlation: float
pearson_correlation: float
kendalltau_correlation: float


class CorrelationCalculator:
def __init__(self):
self.scores: List[CorrelationScore] = []

def add_score(self, score: CorrelationScore):
self.scores.append(score)

def calculate_correlation(self) -> CorrelationResult:
if len(self.scores) == 0:
raise ValueError("No scores to calculate correlation")

return CorrelationResult(
mean_absolute_error=self.calculate_mean_absolute_error(),
mean_normalized_absolute_error=self.calculate_mean_normalized_absolute_error(),
mean_squared_error=self.calculate_mean_squared_error(),
mean_normalized_squared_error=self.calculate_mean_normalized_squared_error(),
spearman_correlation=self.calculate_spearman_correlation(),
pearson_correlation=self.calculate_pearson_correlation(),
kendalltau_correlation=self.calculate_kendalltau_correlation(),
)

def calculate_mean_absolute_error(self) -> float:
total_absolute_error = sum(
abs(score.measured_score - score.human_score) for score in self.scores
)
return total_absolute_error / len(self.scores)

def calculate_mean_normalized_absolute_error(self) -> float:
total_normalized_absolute_error = sum(
abs(score.normalized_measured_score - score.normalized_human_score)
for score in self.scores
)
return total_normalized_absolute_error / len(self.scores)

def calculate_mean_squared_error(self) -> float:
total_squared_error = sum(
(score.measured_score - score.human_score) ** 2 for score in self.scores
)
return total_squared_error / len(self.scores)

def calculate_mean_normalized_squared_error(self) -> float:
total_normalized_squared_error = sum(
(score.normalized_measured_score - score.normalized_human_score) ** 2
for score in self.scores
)
return total_normalized_squared_error / len(self.scores)

def calculate_spearman_correlation(self) -> float:
if len(self.scores) < 2:
# If there is only one pair, return 0 = no correlation
return 0
x = [score.measured_score for score in self.scores]
y = [score.human_score for score in self.scores]
result = stats.spearmanr(x, y)
# library doesn't support proper types
        correlation = getattr(result, "correlation")
        if not isinstance(correlation, float) or math.isnan(correlation):
# Very small samples may have a NaN result (unknown correlation)
return 0
return correlation

def calculate_pearson_correlation(self) -> float:
if len(self.scores) < 2:
# If there is only one pair, return 0 = no correlation
return 0
x = [score.measured_score for score in self.scores]
y = [score.human_score for score in self.scores]
result = stats.pearsonr(x, y)
if math.isnan(result.correlation):
# Very small samples may have a NaN result (unknown correlation)
return 0
return result.correlation

def calculate_kendalltau_correlation(self) -> float:
if len(self.scores) < 2:
# If there is only one pair, return 0 = no correlation
return 0
x = [score.measured_score for score in self.scores]
y = [score.human_score for score in self.scores]
result = stats.kendalltau(x, y)
if math.isnan(result.correlation):
# Very small samples may have a NaN result (unknown correlation)
return 0
return result.correlation
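
For reference, a minimal usage sketch of the new calculator follows. It is not part of the commit; the import path is assumed from the file location, and the scores and the 1-5 rating normalization are made up for illustration:

from app.desktop.studio_server.correlation_calculator import (
    CorrelationCalculator,
    CorrelationScore,
)

# Hypothetical (eval score, human score) pairs on a 1-5 rating scale.
pairs = [(4.0, 5.0), (2.0, 2.0), (3.0, 4.0), (5.0, 5.0)]

calculator = CorrelationCalculator()
for measured, human in pairs:
    calculator.add_score(
        CorrelationScore(
            measured_score=measured,
            human_score=human,
            # Illustrative 0-1 normalization of a 1-5 rating; the API uses
            # normalize_rating for this step.
            normalized_measured_score=(measured - 1) / 4,
            normalized_human_score=(human - 1) / 4,
        )
    )

result = calculator.calculate_correlation()
print(result.mean_absolute_error, result.spearman_correlation, result.kendalltau_correlation)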
95 changes: 35 additions & 60 deletions app/desktop/studio_server/eval_api.py
@@ -32,6 +32,12 @@
from kiln_server.task_api import task_from_id
from pydantic import BaseModel

from .correlation_calculator import (
CorrelationCalculator,
CorrelationResult,
CorrelationScore,
)


def eval_from_id(project_id: str, task_id: str, eval_id: str) -> Eval:
task = task_from_id(project_id, task_id)
@@ -143,16 +149,9 @@ class EvalResultSummary(BaseModel):
dataset_size: int


class EvalConfigScoreSummary(BaseModel):
mean_absolute_error: float
mean_normalized_absolute_error: float
mean_squared_error: float
mean_normalized_squared_error: float


class EvalConfigCompareSummary(BaseModel):
# Summary of results. eval_config_id -> output_score_id -> ScoreSummary
results: Dict[str, Dict[str, EvalConfigScoreSummary]]
# Summary of results. eval_config_id -> output_score_id -> CorrelationResult
results: Dict[str, Dict[str, CorrelationResult]]
# eval_config_id -> percent of the dataset that has been processed (run with eval scores)
eval_config_percent_complete: Dict[str, float]
# The total size of the dataset used for the eval config comparisons (eval.eval_configs_filter_id set size)
@@ -589,12 +588,8 @@ async def get_eval_configs_score_summary(
for eval_config in eval_configs
}

# eval_config_id -> output_score_id -> scores/total
total_squared_error: Dict[str, Dict[str, float]] = {}
total_normalized_squared_error: Dict[str, Dict[str, float]] = {}
total_absolute_error: Dict[str, Dict[str, float]] = {}
total_normalized_absolute_error: Dict[str, Dict[str, float]] = {}
total_count: Dict[str, Dict[str, int]] = {}
# eval_config_id -> output_score_id -> correlation calculator
correlation_calculators: Dict[str, Dict[str, CorrelationCalculator]] = {}

# important: readonly makes this much faster
for eval_config in eval_configs:
@@ -631,62 +626,42 @@
# This score doesn't have both a human eval and eval score, so we can't compare
continue

if eval_config_id not in total_squared_error:
total_squared_error[eval_config_id] = {}
total_absolute_error[eval_config_id] = {}
total_count[eval_config_id] = {}
total_normalized_squared_error[eval_config_id] = {}
total_normalized_absolute_error[eval_config_id] = {}
if score_key not in total_squared_error[eval_config_id]:
total_squared_error[eval_config_id][score_key] = 0
total_absolute_error[eval_config_id][score_key] = 0
total_count[eval_config_id][score_key] = 0
total_normalized_squared_error[eval_config_id][score_key] = 0
total_normalized_absolute_error[eval_config_id][score_key] = 0
if eval_config_id not in correlation_calculators:
correlation_calculators[eval_config_id] = {}

if score_key not in correlation_calculators[eval_config_id]:
correlation_calculators[eval_config_id][score_key] = (
CorrelationCalculator()
)

normalized_eval_score = normalize_rating(
eval_score, output_score.type
)
normalized_human_score = normalize_rating(
human_score, output_score.type
)
total_squared_error[eval_config_id][score_key] += (
eval_score - human_score
) ** 2
total_normalized_squared_error[eval_config_id][score_key] += (
normalized_eval_score - normalized_human_score
) ** 2
total_absolute_error[eval_config_id][score_key] += abs(
eval_score - human_score
correlation_calculators[eval_config_id][score_key].add_score(
CorrelationScore(
measured_score=eval_score,
human_score=human_score,
normalized_measured_score=normalized_eval_score,
normalized_human_score=normalized_human_score,
)
)
total_normalized_absolute_error[eval_config_id][score_key] += abs(
normalized_eval_score - normalized_human_score
)
total_count[eval_config_id][score_key] += 1

# Convert to score summaries
results: Dict[str, Dict[str, EvalConfigScoreSummary]] = {}
for eval_config_id in total_count.keys():
results: Dict[str, Dict[str, CorrelationResult]] = {}
for eval_config_id in correlation_calculators.keys():
results[eval_config_id] = {}
for score_key in total_count[eval_config_id].keys():
count = total_count[eval_config_id][score_key]
if count > 0:
results[eval_config_id][score_key] = EvalConfigScoreSummary(
mean_squared_error=(
total_squared_error[eval_config_id][score_key] / count
),
mean_absolute_error=(
total_absolute_error[eval_config_id][score_key] / count
),
mean_normalized_squared_error=(
total_normalized_squared_error[eval_config_id][score_key]
/ count
),
mean_normalized_absolute_error=(
total_normalized_absolute_error[eval_config_id][score_key]
/ count
),
)
for score_key in correlation_calculators[eval_config_id].keys():
            if not correlation_calculators[eval_config_id][score_key].scores:
# No scores to calculate correlation for this pair
continue

correlation_result = correlation_calculators[eval_config_id][
score_key
].calculate_correlation()
results[eval_config_id][score_key] = correlation_result

# Calculate the percent of the dataset that has been processed
eval_config_percent_complete: Dict[str, float] = {}
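
For intuition about the three coefficients the comparison summary now reports (all computed with scipy.stats, as in the calculator above), here is a small self-contained sketch with made-up scores. Pearson measures linear agreement, while Spearman and Kendall's tau only compare rank order:

from scipy import stats

# Made-up eval scores vs. human scores, for illustration only.
eval_scores = [1.0, 2.0, 3.0, 4.0, 5.0]
human_scores = [1.0, 2.0, 4.0, 8.0, 16.0]  # same ordering, but not a linear relationship

print(stats.pearsonr(eval_scores, human_scores).statistic)   # ~0.93: strong but not perfectly linear
print(stats.spearmanr(eval_scores, human_scores).statistic)  # 1.0: identical ranking
print(stats.kendalltau(eval_scores, human_scores).statistic) # 1.0: identical ranking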
