From 8102a92f4017dc74a82b3241912e62b729989f8c Mon Sep 17 00:00:00 2001 From: scosman Date: Thu, 27 Feb 2025 18:21:32 -0500 Subject: [PATCH] More and better correlation coefficients for comparing eval configs to human scores --- app/desktop/pyproject.toml | 1 + .../studio_server/correlation_calculator.py | 110 ++++++++ app/desktop/studio_server/eval_api.py | 95 +++---- .../test_correlation_calculator.py | 246 ++++++++++++++++++ app/desktop/studio_server/test_eval_api.py | 17 +- app/web_ui/src/lib/api_schema.d.ts | 30 ++- .../[eval_id]/eval_configs/+page.svelte | 68 ++++- uv.lock | 58 +++++ 8 files changed, 549 insertions(+), 76 deletions(-) create mode 100644 app/desktop/studio_server/correlation_calculator.py create mode 100644 app/desktop/studio_server/test_correlation_calculator.py diff --git a/app/desktop/pyproject.toml b/app/desktop/pyproject.toml index 1cf5e5e5..e28ea1c4 100644 --- a/app/desktop/pyproject.toml +++ b/app/desktop/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "pillow>=11.0.0", "pystray>=0.19.5", "pyinstaller==6.11.1", + "scipy>=1.15.2", ] diff --git a/app/desktop/studio_server/correlation_calculator.py b/app/desktop/studio_server/correlation_calculator.py new file mode 100644 index 00000000..0bbcde46 --- /dev/null +++ b/app/desktop/studio_server/correlation_calculator.py @@ -0,0 +1,110 @@ +import math +from dataclasses import dataclass +from typing import List + +from scipy import stats + + +@dataclass +class CorrelationScore: + measured_score: float + human_score: float + normalized_measured_score: float + normalized_human_score: float + + +@dataclass +class CorrelationResult: + mean_absolute_error: float + mean_normalized_absolute_error: float + mean_squared_error: float + mean_normalized_squared_error: float + spearman_correlation: float + pearson_correlation: float + kendalltau_correlation: float + + +class CorrelationCalculator: + def __init__(self): + self.scores: List[CorrelationScore] = [] + + def add_score(self, score: 
CorrelationScore): + self.scores.append(score) + + def calculate_correlation(self) -> CorrelationResult: + if len(self.scores) == 0: + raise ValueError("No scores to calculate correlation") + + return CorrelationResult( + mean_absolute_error=self.calculate_mean_absolute_error(), + mean_normalized_absolute_error=self.calculate_mean_normalized_absolute_error(), + mean_squared_error=self.calculate_mean_squared_error(), + mean_normalized_squared_error=self.calculate_mean_normalized_squared_error(), + spearman_correlation=self.calculate_spearman_correlation(), + pearson_correlation=self.calculate_pearson_correlation(), + kendalltau_correlation=self.calculate_kendalltau_correlation(), + ) + + def calculate_mean_absolute_error(self) -> float: + total_absolute_error = sum( + abs(score.measured_score - score.human_score) for score in self.scores + ) + return total_absolute_error / len(self.scores) + + def calculate_mean_normalized_absolute_error(self) -> float: + total_normalized_absolute_error = sum( + abs(score.normalized_measured_score - score.normalized_human_score) + for score in self.scores + ) + return total_normalized_absolute_error / len(self.scores) + + def calculate_mean_squared_error(self) -> float: + total_squared_error = sum( + (score.measured_score - score.human_score) ** 2 for score in self.scores + ) + return total_squared_error / len(self.scores) + + def calculate_mean_normalized_squared_error(self) -> float: + total_normalized_squared_error = sum( + (score.normalized_measured_score - score.normalized_human_score) ** 2 + for score in self.scores + ) + return total_normalized_squared_error / len(self.scores) + + def calculate_spearman_correlation(self) -> float: + if len(self.scores) < 2: + # If there is only one pair, return 0 = no correlation + return 0 + x = [score.measured_score for score in self.scores] + y = [score.human_score for score in self.scores] + result = stats.spearmanr(x, y) + # library doesn't support proper types + correlation = 
result.__getattribute__("correlation") + if math.isnan(correlation) or not isinstance(correlation, float): + # Very small samples may have a NaN result (unknown correlation) + return 0 + return correlation + + def calculate_pearson_correlation(self) -> float: + if len(self.scores) < 2: + # If there is only one pair, return 0 = no correlation + return 0 + x = [score.measured_score for score in self.scores] + y = [score.human_score for score in self.scores] + result = stats.pearsonr(x, y) + if math.isnan(result.correlation): + # Very small samples may have a NaN result (unknown correlation) + return 0 + return result.correlation + + def calculate_kendalltau_correlation(self) -> float: + if len(self.scores) < 2: + # If there is only one pair, return 0 = no correlation + return 0 + x = [score.measured_score for score in self.scores] + y = [score.human_score for score in self.scores] + result = stats.kendalltau(x, y) + if math.isnan(result.correlation): + # Very small samples may have a NaN result (unknown correlation) + return 0 + return result.correlation diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index 5dde89ae..d1fb9e38 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -32,6 +32,12 @@ from kiln_server.task_api import task_from_id from pydantic import BaseModel +from .correlation_calculator import ( + CorrelationCalculator, + CorrelationResult, + CorrelationScore, +) + def eval_from_id(project_id: str, task_id: str, eval_id: str) -> Eval: task = task_from_id(project_id, task_id) @@ -143,16 +149,9 @@ class EvalResultSummary(BaseModel): dataset_size: int -class EvalConfigScoreSummary(BaseModel): - mean_absolute_error: float - mean_normalized_absolute_error: float - mean_squared_error: float - mean_normalized_squared_error: float - - class EvalConfigCompareSummary(BaseModel): - # Summary of results. 
eval_config_id -> output_score_id -> ScoreSummary - results: Dict[str, Dict[str, EvalConfigScoreSummary]] + # Summary of results. eval_config_id -> output_score_id -> CorrelationResult + results: Dict[str, Dict[str, CorrelationResult]] # eval_config_id -> percent of the dataset that has been processed (run with eval scores) eval_config_percent_complete: Dict[str, float] # The total size of the dataset used for the eval config comparisons (eval.eval_configs_filter_id set size) @@ -589,12 +588,8 @@ async def get_eval_configs_score_summary( for eval_config in eval_configs } - # eval_config_id -> output_score_id -> scores/total - total_squared_error: Dict[str, Dict[str, float]] = {} - total_normalized_squared_error: Dict[str, Dict[str, float]] = {} - total_absolute_error: Dict[str, Dict[str, float]] = {} - total_normalized_absolute_error: Dict[str, Dict[str, float]] = {} - total_count: Dict[str, Dict[str, int]] = {} + # eval_config_id -> output_score_id -> correlation calculator + correlation_calculators: Dict[str, Dict[str, CorrelationCalculator]] = {} # important: readonly makes this much faster for eval_config in eval_configs: @@ -631,18 +626,13 @@ async def get_eval_configs_score_summary( # This score doesn't have both a human eval and eval score, so we can't compare continue - if eval_config_id not in total_squared_error: - total_squared_error[eval_config_id] = {} - total_absolute_error[eval_config_id] = {} - total_count[eval_config_id] = {} - total_normalized_squared_error[eval_config_id] = {} - total_normalized_absolute_error[eval_config_id] = {} - if score_key not in total_squared_error[eval_config_id]: - total_squared_error[eval_config_id][score_key] = 0 - total_absolute_error[eval_config_id][score_key] = 0 - total_count[eval_config_id][score_key] = 0 - total_normalized_squared_error[eval_config_id][score_key] = 0 - total_normalized_absolute_error[eval_config_id][score_key] = 0 + if eval_config_id not in correlation_calculators: + 
correlation_calculators[eval_config_id] = {} + + if score_key not in correlation_calculators[eval_config_id]: + correlation_calculators[eval_config_id][score_key] = ( + CorrelationCalculator() + ) normalized_eval_score = normalize_rating( eval_score, output_score.type @@ -650,43 +640,28 @@ async def get_eval_configs_score_summary( normalized_human_score = normalize_rating( human_score, output_score.type ) - total_squared_error[eval_config_id][score_key] += ( - eval_score - human_score - ) ** 2 - total_normalized_squared_error[eval_config_id][score_key] += ( - normalized_eval_score - normalized_human_score - ) ** 2 - total_absolute_error[eval_config_id][score_key] += abs( - eval_score - human_score + correlation_calculators[eval_config_id][score_key].add_score( + CorrelationScore( + measured_score=eval_score, + human_score=human_score, + normalized_measured_score=normalized_eval_score, + normalized_human_score=normalized_human_score, + ) ) - total_normalized_absolute_error[eval_config_id][score_key] += abs( - normalized_eval_score - normalized_human_score - ) - total_count[eval_config_id][score_key] += 1 # Convert to score summaries - results: Dict[str, Dict[str, EvalConfigScoreSummary]] = {} - for eval_config_id in total_count.keys(): + results: Dict[str, Dict[str, CorrelationResult]] = {} + for eval_config_id in correlation_calculators.keys(): results[eval_config_id] = {} - for score_key in total_count[eval_config_id].keys(): - count = total_count[eval_config_id][score_key] - if count > 0: - results[eval_config_id][score_key] = EvalConfigScoreSummary( - mean_squared_error=( - total_squared_error[eval_config_id][score_key] / count - ), - mean_absolute_error=( - total_absolute_error[eval_config_id][score_key] / count - ), - mean_normalized_squared_error=( - total_normalized_squared_error[eval_config_id][score_key] - / count - ), - mean_normalized_absolute_error=( - total_normalized_absolute_error[eval_config_id][score_key] - / count - ), - ) + for score_key in 
correlation_calculators[eval_config_id].keys(): + if not correlation_calculators[eval_config_id][score_key]: + # No scores to calculate correlation for this pair + continue + + correlation_result = correlation_calculators[eval_config_id][ + score_key + ].calculate_correlation() + results[eval_config_id][score_key] = correlation_result # Calculate the percent of the dataset that has been processed eval_config_percent_complete: Dict[str, float] = {} diff --git a/app/desktop/studio_server/test_correlation_calculator.py b/app/desktop/studio_server/test_correlation_calculator.py new file mode 100644 index 00000000..f396c1ad --- /dev/null +++ b/app/desktop/studio_server/test_correlation_calculator.py @@ -0,0 +1,246 @@ +import pytest + +from app.desktop.studio_server.correlation_calculator import ( + CorrelationCalculator, + CorrelationScore, +) + + +class TestCorrelationCalculator: + def create_correlation_scores(self, measured, human): + """Helper method to create correlation scores from raw data with normalization""" + scores = [] + + # Calculate normalized values + min_m, max_m = min(measured), max(measured) + min_h, max_h = min(human), max(human) + + for m, h in zip(measured, human): + norm_m = (m - min_m) / (max_m - min_m) if max_m != min_m else 0 + norm_h = (h - min_h) / (max_h - min_h) if max_h != min_h else 0 + scores.append( + CorrelationScore( + measured_score=m, + human_score=h, + normalized_measured_score=norm_m, + normalized_human_score=norm_h, + ) + ) + return scores + + @pytest.fixture + def perfect_correlation_data(self): + """Dataset with perfect correlation (r=1.0)""" + measured = list(range(10)) + human = list(range(10)) + return self.create_correlation_scores(measured, human) + + @pytest.fixture + def high_correlation_data(self): + """Dataset with high correlation (r≈0.9)""" + measured = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + human = [1.1, 2.2, 2.9, 3.8, 5.2, 5.8, 7.1, 8.3, 8.7, 10.2] + return self.create_correlation_scores(measured, human) + + 
@pytest.fixture + def no_correlation_data(self): + """Dataset with no correlation""" + measured = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + human = [5.5, 6.2, 4.8, 7.3, 2.1, 8.9, 3.7, 5.4, 6.8, 4.2] + return self.create_correlation_scores(measured, human) + + @pytest.fixture + def inverse_correlation_data(self): + """Dataset with inverse correlation (r≈-0.9)""" + measured = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + human = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + return self.create_correlation_scores(measured, human) + + @pytest.fixture + def single_data_point(self): + """Dataset with only one data point""" + return [ + CorrelationScore( + measured_score=5, + human_score=5, + normalized_measured_score=0.5, + normalized_human_score=0.5, + ) + ] + + @pytest.fixture + def two_data_points(self): + """Dataset with only two data points""" + measured = [1, 10] + human = [2, 9] + return self.create_correlation_scores(measured, human) + + def setup_calculator_with_data(self, data): + """Helper method to create and populate a calculator with data""" + calculator = CorrelationCalculator() + for score in data: + calculator.add_score(score) + return calculator + + def test_add_score(self): + """Test adding scores to the calculator""" + calculator = CorrelationCalculator() + score = CorrelationScore( + measured_score=5, + human_score=6, + normalized_measured_score=0.5, + normalized_human_score=0.6, + ) + + calculator.add_score(score) + assert len(calculator.scores) == 1 + assert calculator.scores[0] == score + + def test_empty_calculator(self): + """Test that calculating correlation with no scores raises an error""" + calculator = CorrelationCalculator() + + with pytest.raises(ValueError, match="No scores to calculate correlation"): + calculator.calculate_correlation() + + def test_perfect_correlation(self, perfect_correlation_data): + """Test correlation calculations with perfectly correlated data""" + calculator = CorrelationCalculator() + for score in perfect_correlation_data: + 
calculator.add_score(score) + + result = calculator.calculate_correlation() + + # Perfect correlation should have: + # - MAE and MSE of 0 (no error) + # - Correlation coefficients of 1.0 + assert result.mean_absolute_error == 0.0 + assert result.mean_normalized_absolute_error == 0.0 + assert result.mean_squared_error == 0.0 + assert result.mean_normalized_squared_error == 0.0 + assert result.spearman_correlation == pytest.approx(1.0) + assert result.pearson_correlation == pytest.approx(1.0) + assert result.kendalltau_correlation == pytest.approx(1.0) + + def test_high_correlation(self, high_correlation_data): + """Test correlation calculations with highly correlated data""" + calculator = CorrelationCalculator() + for score in high_correlation_data: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # High correlation should have: + # - Low but non-zero error metrics + # - Correlation coefficients close to 1.0 + assert 0 < result.mean_absolute_error < 1.0 + assert 0 < result.mean_normalized_absolute_error < 0.2 + assert 0 < result.mean_squared_error < 1.0 + assert 0 < result.mean_normalized_squared_error < 0.1 + assert result.spearman_correlation > 0.9 + assert result.pearson_correlation > 0.9 + assert result.kendalltau_correlation > 0.8 + + def test_no_correlation(self, no_correlation_data): + """Test correlation calculations with uncorrelated data""" + calculator = CorrelationCalculator() + for score in no_correlation_data: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # No correlation should have: + # - Higher error metrics + # - Correlation coefficients close to 0 + assert result.mean_absolute_error > 1.0 + assert result.mean_normalized_absolute_error > 0.2 + assert result.mean_squared_error > 2.0 + assert result.mean_normalized_squared_error > 0.1 + assert -0.3 < result.spearman_correlation < 0.3 + assert -0.3 < result.pearson_correlation < 0.3 + assert -0.3 < result.kendalltau_correlation < 
0.3 + + def test_inverse_correlation(self, inverse_correlation_data): + """Test correlation calculations with inversely correlated data""" + calculator = CorrelationCalculator() + for score in inverse_correlation_data: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # Inverse correlation should have: + # - Higher error metrics + # - Correlation coefficients close to -1.0 + assert result.mean_absolute_error > 4.0 + assert result.mean_normalized_absolute_error > 0.5 + assert result.mean_squared_error > 20.0 + assert result.mean_normalized_squared_error > 0.3 + assert result.spearman_correlation < -0.9 + assert result.pearson_correlation < -0.9 + assert result.kendalltau_correlation < -0.9 + + def test_single_data_point(self, single_data_point): + """Test correlation calculations with a single data point""" + calculator = CorrelationCalculator() + for score in single_data_point: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # Single data point should have: + # - Zero error (since the point matches itself) + # - Correlation coefficients of 0 (as defined in the implementation) + assert result.mean_absolute_error == 0.0 + assert result.mean_normalized_absolute_error == 0.0 + assert result.mean_squared_error == 0.0 + assert result.mean_normalized_squared_error == 0.0 + assert result.spearman_correlation == 0.0 + assert result.pearson_correlation == 0.0 + assert result.kendalltau_correlation == 0.0 + + def test_two_data_points(self, two_data_points): + """Test correlation calculations with two data points""" + calculator = CorrelationCalculator() + for score in two_data_points: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # Two data points with positive correlation should have: + # - Some error + # - Positive correlation coefficients + assert result.mean_absolute_error == 1.0 + assert result.mean_normalized_absolute_error == 0.0 + assert result.mean_squared_error == 
1.0 + assert result.mean_normalized_squared_error == 0.0 + assert result.spearman_correlation == pytest.approx(1.0) + assert result.pearson_correlation == pytest.approx(1.0) + assert result.kendalltau_correlation == pytest.approx(1.0) + + def test_individual_calculation_methods(self, high_correlation_data): + """Test that individual calculation methods match the combined result""" + calculator = CorrelationCalculator() + for score in high_correlation_data: + calculator.add_score(score) + + # Calculate individual metrics + mae = calculator.calculate_mean_absolute_error() + # Our spell checker thinks n-m-a-e is a misspelling of name :) + n_mae = calculator.calculate_mean_normalized_absolute_error() + mse = calculator.calculate_mean_squared_error() + nmse = calculator.calculate_mean_normalized_squared_error() + spearman = calculator.calculate_spearman_correlation() + pearson = calculator.calculate_pearson_correlation() + kendall = calculator.calculate_kendalltau_correlation() + + # Calculate combined result + result = calculator.calculate_correlation() + + # Verify they match + assert result.mean_absolute_error == mae + assert result.mean_normalized_absolute_error == n_mae + assert result.mean_squared_error == mse + assert result.mean_normalized_squared_error == nmse + assert result.spearman_correlation == spearman + assert result.pearson_correlation == pearson + assert result.kendalltau_correlation == kendall diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 29d174db..f4e922ff 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -758,7 +758,7 @@ class EvalCondigSummaryTestData: score1_overall_rating=5.0, eval_overall_rating=4.0, eval__score1_rating=4.0, - eval_config_id="ec2", + eval_config_id="ec1", skip_golden_tag=True, ), # Test 2: ec2 - Test multiple, and correct averaging @@ -925,12 +925,18 @@ class EvalCondigSummaryTestData: "mean_absolute_error": 
4.0, # error 4.0 "mean_normalized_squared_error": 1, # max error: 1 v 5 "mean_normalized_absolute_error": 1, # max error: 1 v 5 + "spearman_correlation": 0, # default value for 1 pair + "pearson_correlation": 0, + "kendalltau_correlation": 0, }, "score1": { "mean_squared_error": 2.25, # error (3.5-5.0)^2 "mean_absolute_error": 1.5, # error 1.5 "mean_normalized_squared_error": 0.140625, # hand calc "mean_normalized_absolute_error": 0.375, # 1.5/4 + "spearman_correlation": 0, # default value for 1 pair + "pearson_correlation": 0, + "kendalltau_correlation": 0, }, } # 1 of total_in_dataset eval configs are are in ec1 test @@ -943,12 +949,18 @@ class EvalCondigSummaryTestData: "mean_absolute_error": 1.5, # (1+2)/2 "mean_normalized_squared_error": 0.15625, # (0.25^2 + 0.5^2) / 2 "mean_normalized_absolute_error": 0.375, # (0.25 + 0.5) / 2 + "spearman_correlation": 0, + "pearson_correlation": 0, + "kendalltau_correlation": 0, }, "score1": { "mean_squared_error": 2.5, # (1^2+2^2)/2 "mean_absolute_error": 1.5, # (1+2)/2 "mean_normalized_squared_error": 0.15625, # (0.25^2 + 0.5^2) / 2 "mean_normalized_absolute_error": 0.375, # (0.25 + 0.5) / 2 + "spearman_correlation": 0.9999999999999999, + "pearson_correlation": 1, + "kendalltau_correlation": 1, }, } # 2 of total_in_dataset eval configs are are in ec2 test @@ -961,6 +973,9 @@ class EvalCondigSummaryTestData: "mean_absolute_error": 2, "mean_normalized_squared_error": 0.25, "mean_normalized_absolute_error": 0.5, + "spearman_correlation": 0, + "pearson_correlation": 0, + "kendalltau_correlation": 0, }, } # 2 of total_in_dataset eval configs are are in ec2 test diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index b00c118e..fe0857e0 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -984,6 +984,23 @@ export interface components { /** Remove Tags */ remove_tags?: string[] | null; }; + /** CorrelationResult */ + CorrelationResult: { + /** Mean Absolute 
Error */ + mean_absolute_error: number; + /** Mean Normalized Absolute Error */ + mean_normalized_absolute_error: number; + /** Mean Squared Error */ + mean_squared_error: number; + /** Mean Normalized Squared Error */ + mean_normalized_squared_error: number; + /** Spearman Correlation */ + spearman_correlation: number; + /** Pearson Correlation */ + pearson_correlation: number; + /** Kendalltau Correlation */ + kendalltau_correlation: number; + }; /** * CreateDatasetSplitRequest * @description Request to create a dataset split @@ -1386,7 +1403,7 @@ export interface components { /** Results */ results: { [key: string]: { - [key: string]: components["schemas"]["EvalConfigScoreSummary"]; + [key: string]: components["schemas"]["CorrelationResult"]; }; }; /** Eval Config Percent Complete */ @@ -1402,17 +1419,6 @@ export interface components { /** Not Rated Count */ not_rated_count: number; }; - /** EvalConfigScoreSummary */ - EvalConfigScoreSummary: { - /** Mean Absolute Error */ - mean_absolute_error: number; - /** Mean Normalized Absolute Error */ - mean_normalized_absolute_error: number; - /** Mean Squared Error */ - mean_squared_error: number; - /** Mean Normalized Squared Error */ - mean_normalized_squared_error: number; - }; /** * EvalConfigType * @enum {string} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte index 2b736b25..84ae0dd7 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte @@ -40,7 +40,16 @@ let score_summary: EvalConfigCompareSummary | null = null let score_summary_error: KilnError | null = null - let score_type: "mse" | "mae" | "norm_mse" | "norm_mae" = "norm_mse" + type ScoreType = + | "mse" + | "mae" + | "norm_mse" + | "norm_mae" + | "spearman" + 
| "pearson" + | "kendalltau" + + let score_type: ScoreType = "kendalltau" $: loading = eval_loading || eval_configs_loading // Score summary not blocking whole UI $: error = eval_error || eval_configs_error || score_summary_error @@ -248,7 +257,7 @@ function info_tooltip_text( rating_type: TaskOutputRatingType, - score_type: "mse" | "mae" | "norm_mse" | "norm_mae", + score_type: ScoreType, ) { let label = "" if (score_type === "mae") { @@ -259,6 +268,12 @@ label = "Normalized mean squared error" } else if (score_type === "norm_mae") { label = "Normalized mean absolute error" + } else if (score_type === "spearman") { + label = "Spearman's rank correlation" + } else if (score_type === "pearson") { + label = "Pearson's correlation" + } else if (score_type === "kendalltau") { + label = "Kendall Tau correlation" } label += " for " if (rating_type === "five_star") { @@ -354,6 +369,9 @@ ["norm_mae", "Normalized Mean Absolute Error"], ["mse", "Mean Squared Error"], ["mae", "Mean Absolute Error"], + ["spearman", "Spearman Rank Correlation"], + ["pearson", "Pearson Correlation"], + ["kendalltau", "Kendall Tau Correlation"], ]} bind:value={score_type} /> @@ -499,6 +517,12 @@ {scores.mean_normalized_squared_error.toFixed(3)} {:else if score_type === "norm_mae"} {scores.mean_normalized_absolute_error.toFixed(3)} + {:else if score_type === "spearman"} + {scores.spearman_correlation.toFixed(3)} + {:else if score_type === "pearson"} + {scores.pearson_correlation.toFixed(3)} + {:else if score_type === "kendalltau"} + {scores.kendalltau_correlation.toFixed(3)} {/if} {:else} unknown @@ -532,7 +556,7 @@ +
+
TL;DR
+
+ We suggest you use Kendall Tau correlation scores to compare results. +
+
+ Higher values are better. 1.0 is a perfect correlation between the + evaluator and human scores. 0 is no correlation. -1.0 is perfect negative + correlation. +
+
+ Subjective tasks will never reach a perfect 1.0 score, so don't worry if + your score isn't perfect. +
+
+
+ Spearman, Kendall Tau, and Pearson Correlation +
+
+ From -1 to 1, higher is better +
+
+ These are three scientific correlation coefficients. For all three, the + value tends to be high (close to 1) for samples with a strongly positive + correlation, low (close to -1) for samples with a strongly negative + correlation, and close to zero for samples with weak correlation. +
+
Mean Absolute Error
Lower is better
diff --git a/uv.lock b/uv.lock index 77f10d0e..6718115d 100644 --- a/uv.lock +++ b/uv.lock @@ -902,6 +902,7 @@ dependencies = [ { name = "pillow" }, { name = "pyinstaller" }, { name = "pystray" }, + { name = "scipy" }, ] [package.metadata] @@ -910,6 +911,7 @@ requires-dist = [ { name = "pillow", specifier = ">=11.0.0" }, { name = "pyinstaller", specifier = "==6.11.1" }, { name = "pystray", specifier = ">=0.19.5" }, + { name = "scipy", specifier = ">=1.15.2" }, ] [[package]] @@ -1985,6 +1987,62 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/c0/b0fba8259b61c938c9733da9346b9f93e00881a9db22aafdd72f6ae0ec05/s3transfer-0.10.3-py3-none-any.whl", hash = "sha256:263ed587a5803c6c708d3ce44dc4dfedaab4c1a32e8329bab818933d79ddcf5d", size = 82625 }, ] +[[package]] +name = "scipy" +version = "1.15.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/b9/31ba9cd990e626574baf93fbc1ac61cf9ed54faafd04c479117517661637/scipy-1.15.2.tar.gz", hash = "sha256:cd58a314d92838f7e6f755c8a2167ead4f27e1fd5c1251fd54289569ef3495ec", size = 59417316 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/df/ef233fff6838fe6f7840d69b5ef9f20d2b5c912a8727b21ebf876cb15d54/scipy-1.15.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a2ec871edaa863e8213ea5df811cd600734f6400b4af272e1c011e69401218e9", size = 38692502 }, + { url = "https://files.pythonhosted.org/packages/5c/20/acdd4efb8a68b842968f7bc5611b1aeb819794508771ad104de418701422/scipy-1.15.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6f223753c6ea76983af380787611ae1291e3ceb23917393079dcc746ba60cfb5", size = 30085508 }, + { url = "https://files.pythonhosted.org/packages/42/55/39cf96ca7126f1e78ee72a6344ebdc6702fc47d037319ad93221063e6cf4/scipy-1.15.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ecf797d2d798cf7c838c6d98321061eb3e72a74710e6c40540f0e8087e3b499e", size = 22359166 }, + { url = 
"https://files.pythonhosted.org/packages/51/48/708d26a4ab8a1441536bf2dfcad1df0ca14a69f010fba3ccbdfc02df7185/scipy-1.15.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:9b18aa747da280664642997e65aab1dd19d0c3d17068a04b3fe34e2559196cb9", size = 25112047 }, + { url = "https://files.pythonhosted.org/packages/dd/65/f9c5755b995ad892020381b8ae11f16d18616208e388621dfacc11df6de6/scipy-1.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87994da02e73549dfecaed9e09a4f9d58a045a053865679aeb8d6d43747d4df3", size = 35536214 }, + { url = "https://files.pythonhosted.org/packages/de/3c/c96d904b9892beec978562f64d8cc43f9cca0842e65bd3cd1b7f7389b0ba/scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69ea6e56d00977f355c0f84eba69877b6df084516c602d93a33812aa04d90a3d", size = 37646981 }, + { url = "https://files.pythonhosted.org/packages/3d/74/c2d8a24d18acdeae69ed02e132b9bc1bb67b7bee90feee1afe05a68f9d67/scipy-1.15.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:888307125ea0c4466287191e5606a2c910963405ce9671448ff9c81c53f85f58", size = 37230048 }, + { url = "https://files.pythonhosted.org/packages/42/19/0aa4ce80eca82d487987eff0bc754f014dec10d20de2f66754fa4ea70204/scipy-1.15.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9412f5e408b397ff5641080ed1e798623dbe1ec0d78e72c9eca8992976fa65aa", size = 40010322 }, + { url = "https://files.pythonhosted.org/packages/d0/d2/f0683b7e992be44d1475cc144d1f1eeae63c73a14f862974b4db64af635e/scipy-1.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:b5e025e903b4f166ea03b109bb241355b9c42c279ea694d8864d033727205e65", size = 41233385 }, + { url = "https://files.pythonhosted.org/packages/40/1f/bf0a5f338bda7c35c08b4ed0df797e7bafe8a78a97275e9f439aceb46193/scipy-1.15.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:92233b2df6938147be6fa8824b8136f29a18f016ecde986666be5f4d686a91a4", size = 38703651 }, + { url = 
"https://files.pythonhosted.org/packages/de/54/db126aad3874601048c2c20ae3d8a433dbfd7ba8381551e6f62606d9bd8e/scipy-1.15.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:62ca1ff3eb513e09ed17a5736929429189adf16d2d740f44e53270cc800ecff1", size = 30102038 }, + { url = "https://files.pythonhosted.org/packages/61/d8/84da3fffefb6c7d5a16968fe5b9f24c98606b165bb801bb0b8bc3985200f/scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c6676490ad76d1c2894d77f976144b41bd1a4052107902238047fb6a473e971", size = 22375518 }, + { url = "https://files.pythonhosted.org/packages/44/78/25535a6e63d3b9c4c90147371aedb5d04c72f3aee3a34451f2dc27c0c07f/scipy-1.15.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:a8bf5cb4a25046ac61d38f8d3c3426ec11ebc350246a4642f2f315fe95bda655", size = 25142523 }, + { url = "https://files.pythonhosted.org/packages/e0/22/4b4a26fe1cd9ed0bc2b2cb87b17d57e32ab72c346949eaf9288001f8aa8e/scipy-1.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a8e34cf4c188b6dd004654f88586d78f95639e48a25dfae9c5e34a6dc34547e", size = 35491547 }, + { url = "https://files.pythonhosted.org/packages/32/ea/564bacc26b676c06a00266a3f25fdfe91a9d9a2532ccea7ce6dd394541bc/scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28a0d2c2075946346e4408b211240764759e0fabaeb08d871639b5f3b1aca8a0", size = 37634077 }, + { url = "https://files.pythonhosted.org/packages/43/c2/bfd4e60668897a303b0ffb7191e965a5da4056f0d98acfb6ba529678f0fb/scipy-1.15.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:42dabaaa798e987c425ed76062794e93a243be8f0f20fff6e7a89f4d61cb3d40", size = 37231657 }, + { url = "https://files.pythonhosted.org/packages/4a/75/5f13050bf4f84c931bcab4f4e83c212a36876c3c2244475db34e4b5fe1a6/scipy-1.15.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f5e296ec63c5da6ba6fa0343ea73fd51b8b3e1a300b0a8cae3ed4b1122c7462", size = 40035857 }, + { url = 
"https://files.pythonhosted.org/packages/b9/8b/7ec1832b09dbc88f3db411f8cdd47db04505c4b72c99b11c920a8f0479c3/scipy-1.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:597a0c7008b21c035831c39927406c6181bcf8f60a73f36219b69d010aa04737", size = 41217654 }, + { url = "https://files.pythonhosted.org/packages/4b/5d/3c78815cbab499610f26b5bae6aed33e227225a9fa5290008a733a64f6fc/scipy-1.15.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c4697a10da8f8765bb7c83e24a470da5797e37041edfd77fd95ba3811a47c4fd", size = 38756184 }, + { url = "https://files.pythonhosted.org/packages/37/20/3d04eb066b471b6e171827548b9ddb3c21c6bbea72a4d84fc5989933910b/scipy-1.15.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:869269b767d5ee7ea6991ed7e22b3ca1f22de73ab9a49c44bad338b725603301", size = 30163558 }, + { url = "https://files.pythonhosted.org/packages/a4/98/e5c964526c929ef1f795d4c343b2ff98634ad2051bd2bbadfef9e772e413/scipy-1.15.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:bad78d580270a4d32470563ea86c6590b465cb98f83d760ff5b0990cb5518a93", size = 22437211 }, + { url = "https://files.pythonhosted.org/packages/1d/cd/1dc7371e29195ecbf5222f9afeedb210e0a75057d8afbd942aa6cf8c8eca/scipy-1.15.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b09ae80010f52efddb15551025f9016c910296cf70adbf03ce2a8704f3a5ad20", size = 25232260 }, + { url = "https://files.pythonhosted.org/packages/f0/24/1a181a9e5050090e0b5138c5f496fee33293c342b788d02586bc410c6477/scipy-1.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6fd6eac1ce74a9f77a7fc724080d507c5812d61e72bd5e4c489b042455865e", size = 35198095 }, + { url = "https://files.pythonhosted.org/packages/c0/53/eaada1a414c026673eb983f8b4a55fe5eb172725d33d62c1b21f63ff6ca4/scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b871df1fe1a3ba85d90e22742b93584f8d2b8e6124f8372ab15c71b73e428b8", size = 37297371 }, + { url = 
"https://files.pythonhosted.org/packages/e9/06/0449b744892ed22b7e7b9a1994a866e64895363572677a316a9042af1fe5/scipy-1.15.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:03205d57a28e18dfd39f0377d5002725bf1f19a46f444108c29bdb246b6c8a11", size = 36872390 }, + { url = "https://files.pythonhosted.org/packages/6a/6f/a8ac3cfd9505ec695c1bc35edc034d13afbd2fc1882a7c6b473e280397bb/scipy-1.15.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:601881dfb761311045b03114c5fe718a12634e5608c3b403737ae463c9885d53", size = 39700276 }, + { url = "https://files.pythonhosted.org/packages/f5/6f/e6e5aff77ea2a48dd96808bb51d7450875af154ee7cbe72188afb0b37929/scipy-1.15.2-cp312-cp312-win_amd64.whl", hash = "sha256:e7c68b6a43259ba0aab737237876e5c2c549a031ddb7abc28c7b47f22e202ded", size = 40942317 }, + { url = "https://files.pythonhosted.org/packages/53/40/09319f6e0f276ea2754196185f95cd191cb852288440ce035d5c3a931ea2/scipy-1.15.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:01edfac9f0798ad6b46d9c4c9ca0e0ad23dbf0b1eb70e96adb9fa7f525eff0bf", size = 38717587 }, + { url = "https://files.pythonhosted.org/packages/fe/c3/2854f40ecd19585d65afaef601e5e1f8dbf6758b2f95b5ea93d38655a2c6/scipy-1.15.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:08b57a9336b8e79b305a143c3655cc5bdbe6d5ece3378578888d2afbb51c4e37", size = 30100266 }, + { url = "https://files.pythonhosted.org/packages/dd/b1/f9fe6e3c828cb5930b5fe74cb479de5f3d66d682fa8adb77249acaf545b8/scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:54c462098484e7466362a9f1672d20888f724911a74c22ae35b61f9c5919183d", size = 22373768 }, + { url = "https://files.pythonhosted.org/packages/15/9d/a60db8c795700414c3f681908a2b911e031e024d93214f2d23c6dae174ab/scipy-1.15.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:cf72ff559a53a6a6d77bd8eefd12a17995ffa44ad86c77a5df96f533d4e6c6bb", size = 25154719 }, + { url = 
"https://files.pythonhosted.org/packages/37/3b/9bda92a85cd93f19f9ed90ade84aa1e51657e29988317fabdd44544f1dd4/scipy-1.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9de9d1416b3d9e7df9923ab23cd2fe714244af10b763975bea9e4f2e81cebd27", size = 35163195 }, + { url = "https://files.pythonhosted.org/packages/03/5a/fc34bf1aa14dc7c0e701691fa8685f3faec80e57d816615e3625f28feb43/scipy-1.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb530e4794fc8ea76a4a21ccb67dea33e5e0e60f07fc38a49e821e1eae3b71a0", size = 37255404 }, + { url = "https://files.pythonhosted.org/packages/4a/71/472eac45440cee134c8a180dbe4c01b3ec247e0338b7c759e6cd71f199a7/scipy-1.15.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5ea7ed46d437fc52350b028b1d44e002646e28f3e8ddc714011aaf87330f2f32", size = 36860011 }, + { url = "https://files.pythonhosted.org/packages/01/b3/21f890f4f42daf20e4d3aaa18182dddb9192771cd47445aaae2e318f6738/scipy-1.15.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:11e7ad32cf184b74380f43d3c0a706f49358b904fa7d5345f16ddf993609184d", size = 39657406 }, + { url = "https://files.pythonhosted.org/packages/0d/76/77cf2ac1f2a9cc00c073d49e1e16244e389dd88e2490c91d84e1e3e4d126/scipy-1.15.2-cp313-cp313-win_amd64.whl", hash = "sha256:a5080a79dfb9b78b768cebf3c9dcbc7b665c5875793569f48bf0e2b1d7f68f6f", size = 40961243 }, + { url = "https://files.pythonhosted.org/packages/4c/4b/a57f8ddcf48e129e6054fa9899a2a86d1fc6b07a0e15c7eebff7ca94533f/scipy-1.15.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:447ce30cee6a9d5d1379087c9e474628dab3db4a67484be1b7dc3196bfb2fac9", size = 38870286 }, + { url = "https://files.pythonhosted.org/packages/0c/43/c304d69a56c91ad5f188c0714f6a97b9c1fed93128c691148621274a3a68/scipy-1.15.2-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c90ebe8aaa4397eaefa8455a8182b164a6cc1d59ad53f79943f266d99f68687f", size = 30141634 }, + { url = 
"https://files.pythonhosted.org/packages/44/1a/6c21b45d2548eb73be9b9bff421aaaa7e85e22c1f9b3bc44b23485dfce0a/scipy-1.15.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:def751dd08243934c884a3221156d63e15234a3155cf25978b0a668409d45eb6", size = 22415179 }, + { url = "https://files.pythonhosted.org/packages/74/4b/aefac4bba80ef815b64f55da06f62f92be5d03b467f2ce3668071799429a/scipy-1.15.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:302093e7dfb120e55515936cb55618ee0b895f8bcaf18ff81eca086c17bd80af", size = 25126412 }, + { url = "https://files.pythonhosted.org/packages/b1/53/1cbb148e6e8f1660aacd9f0a9dfa2b05e9ff1cb54b4386fe868477972ac2/scipy-1.15.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd5b77413e1855351cdde594eca99c1f4a588c2d63711388b6a1f1c01f62274", size = 34952867 }, + { url = "https://files.pythonhosted.org/packages/2c/23/e0eb7f31a9c13cf2dca083828b97992dd22f8184c6ce4fec5deec0c81fcf/scipy-1.15.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d0194c37037707b2afa7a2f2a924cf7bac3dc292d51b6a925e5fcb89bc5c776", size = 36890009 }, + { url = "https://files.pythonhosted.org/packages/03/f3/e699e19cabe96bbac5189c04aaa970718f0105cff03d458dc5e2b6bd1e8c/scipy-1.15.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:bae43364d600fdc3ac327db99659dcb79e6e7ecd279a75fe1266669d9a652828", size = 36545159 }, + { url = "https://files.pythonhosted.org/packages/af/f5/ab3838e56fe5cc22383d6fcf2336e48c8fe33e944b9037fbf6cbdf5a11f8/scipy-1.15.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f031846580d9acccd0044efd1a90e6f4df3a6e12b4b6bd694a7bc03a89892b28", size = 39136566 }, + { url = "https://files.pythonhosted.org/packages/0a/c8/b3f566db71461cabd4b2d5b39bcc24a7e1c119535c8361f81426be39bb47/scipy-1.15.2-cp313-cp313t-win_amd64.whl", hash = "sha256:fe8a9eb875d430d81755472c5ba75e84acc980e4a8f6204d402849234d3017db", size = 40477705 }, +] + [[package]] name = "setuptools" version = "75.3.0"