From 8102a92f4017dc74a82b3241912e62b729989f8c Mon Sep 17 00:00:00 2001
From: scosman <scosman@users.noreply.github.com>
Date: Thu, 27 Feb 2025 18:21:32 -0500
Subject: [PATCH] More and better correlation coefficients for comparing eval
 configs to human scores

---
 app/desktop/pyproject.toml                    |   1 +
 .../studio_server/correlation_calculator.py   | 110 ++++++++
 app/desktop/studio_server/eval_api.py         |  95 +++----
 .../test_correlation_calculator.py            | 246 ++++++++++++++++++
 app/desktop/studio_server/test_eval_api.py    |  17 +-
 app/web_ui/src/lib/api_schema.d.ts            |  30 ++-
 .../[eval_id]/eval_configs/+page.svelte       |  68 ++++-
 uv.lock                                       |  58 +++++
 8 files changed, 549 insertions(+), 76 deletions(-)
 create mode 100644 app/desktop/studio_server/correlation_calculator.py
 create mode 100644 app/desktop/studio_server/test_correlation_calculator.py

diff --git a/app/desktop/pyproject.toml b/app/desktop/pyproject.toml
index 1cf5e5e5..e28ea1c4 100644
--- a/app/desktop/pyproject.toml
+++ b/app/desktop/pyproject.toml
@@ -8,6 +8,7 @@ dependencies = [
     "pillow>=11.0.0",
     "pystray>=0.19.5",
     "pyinstaller==6.11.1",
+    "scipy>=1.15.2",
 ]
 
 
diff --git a/app/desktop/studio_server/correlation_calculator.py b/app/desktop/studio_server/correlation_calculator.py
new file mode 100644
index 00000000..0bbcde46
--- /dev/null
+++ b/app/desktop/studio_server/correlation_calculator.py
@@ -0,0 +1,110 @@
+import math
+from dataclasses import dataclass
+from typing import List
+
+from scipy import stats
+
+
+@dataclass
+class CorrelationScore:
+    measured_score: float
+    human_score: float
+    normalized_measured_score: float
+    normalized_human_score: float
+
+
+@dataclass
+class CorrelationResult:
+    mean_absolute_error: float
+    mean_normalized_absolute_error: float
+    mean_squared_error: float
+    mean_normalized_squared_error: float
+    spearman_correlation: float
+    pearson_correlation: float
+    kendalltau_correlation: float
+
+
+class CorrelationCalculator:
+    def __init__(self):
+        self.scores: List[CorrelationScore] = []
+
+    def add_score(self, score: CorrelationScore):
+        self.scores.append(score)
+
+    def calculate_correlation(self) -> CorrelationResult:
+        if len(self.scores) == 0:
+            raise ValueError("No scores to calculate correlation")
+
+        return CorrelationResult(
+            mean_absolute_error=self.calculate_mean_absolute_error(),
+            mean_normalized_absolute_error=self.calculate_mean_normalized_absolute_error(),
+            mean_squared_error=self.calculate_mean_squared_error(),
+            mean_normalized_squared_error=self.calculate_mean_normalized_squared_error(),
+            spearman_correlation=self.calculate_spearman_correlation(),
+            pearson_correlation=self.calculate_pearson_correlation(),
+            kendalltau_correlation=self.calculate_kendalltau_correlation(),
+        )
+
+    def calculate_mean_absolute_error(self) -> float:
+        total_absolute_error = sum(
+            abs(score.measured_score - score.human_score) for score in self.scores
+        )
+        return total_absolute_error / len(self.scores)
+
+    def calculate_mean_normalized_absolute_error(self) -> float:
+        total_normalized_absolute_error = sum(
+            abs(score.normalized_measured_score - score.normalized_human_score)
+            for score in self.scores
+        )
+        return total_normalized_absolute_error / len(self.scores)
+
+    def calculate_mean_squared_error(self) -> float:
+        total_squared_error = sum(
+            (score.measured_score - score.human_score) ** 2 for score in self.scores
+        )
+        return total_squared_error / len(self.scores)
+
+    def calculate_mean_normalized_squared_error(self) -> float:
+        total_normalized_squared_error = sum(
+            (score.normalized_measured_score - score.normalized_human_score) ** 2
+            for score in self.scores
+        )
+        return total_normalized_squared_error / len(self.scores)
+
+    def calculate_spearman_correlation(self) -> float:
+        if len(self.scores) < 2:
+            # If there is only one pair, return 0 = no correlation
+            return 0
+        x = [score.measured_score for score in self.scores]
+        y = [score.human_score for score in self.scores]
+        result = stats.spearmanr(x, y)
+        # library doesn't support proper types
+        correlation = result.__getattribute__("correlation")
+        if math.isnan(correlation) or not isinstance(correlation, float):
+            # Very small samples may have a NaN result (unknown correlation)
+            return 0
+        return correlation
+
+    def calculate_pearson_correlation(self) -> float:
+        if len(self.scores) < 2:
+            # If there is only one pair, return 0 = no correlation
+            return 0
+        x = [score.measured_score for score in self.scores]
+        y = [score.human_score for score in self.scores]
+        result = stats.pearsonr(x, y)
+        if math.isnan(result.correlation):
+            # Very small samples may have a NaN result (unknown correlation)
+            return 0
+        return result.correlation
+
+    def calculate_kendalltau_correlation(self) -> float:
+        if len(self.scores) < 2:
+            # If there is only one pair, return 0 = no correlation
+            return 0
+        x = [score.measured_score for score in self.scores]
+        y = [score.human_score for score in self.scores]
+        result = stats.kendalltau(x, y)
+        if math.isnan(result.correlation):
+            # Very small samples may have a NaN result (unknown correlation)
+            return 0
+        return result.correlation
diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py
index 5dde89ae..d1fb9e38 100644
--- a/app/desktop/studio_server/eval_api.py
+++ b/app/desktop/studio_server/eval_api.py
@@ -32,6 +32,12 @@
 from kiln_server.task_api import task_from_id
 from pydantic import BaseModel
 
+from .correlation_calculator import (
+    CorrelationCalculator,
+    CorrelationResult,
+    CorrelationScore,
+)
+
 
 def eval_from_id(project_id: str, task_id: str, eval_id: str) -> Eval:
     task = task_from_id(project_id, task_id)
@@ -143,16 +149,9 @@ class EvalResultSummary(BaseModel):
     dataset_size: int
 
 
-class EvalConfigScoreSummary(BaseModel):
-    mean_absolute_error: float
-    mean_normalized_absolute_error: float
-    mean_squared_error: float
-    mean_normalized_squared_error: float
-
-
 class EvalConfigCompareSummary(BaseModel):
-    # Summary of results. eval_config_id -> output_score_id -> ScoreSummary
-    results: Dict[str, Dict[str, EvalConfigScoreSummary]]
+    # Summary of results. eval_config_id -> output_score_id -> CorrelationResult
+    results: Dict[str, Dict[str, CorrelationResult]]
     # eval_config_id -> percent of the dataset that has been processed (run with eval scores)
     eval_config_percent_complete: Dict[str, float]
     # The total size of the dataset used for the eval config comparisons (eval.eval_configs_filter_id set size)
@@ -589,12 +588,8 @@ async def get_eval_configs_score_summary(
             for eval_config in eval_configs
         }
 
-        # eval_config_id -> output_score_id -> scores/total
-        total_squared_error: Dict[str, Dict[str, float]] = {}
-        total_normalized_squared_error: Dict[str, Dict[str, float]] = {}
-        total_absolute_error: Dict[str, Dict[str, float]] = {}
-        total_normalized_absolute_error: Dict[str, Dict[str, float]] = {}
-        total_count: Dict[str, Dict[str, int]] = {}
+        # eval_config_id -> output_score_id -> correlation calculator
+        correlation_calculators: Dict[str, Dict[str, CorrelationCalculator]] = {}
 
         # important: readonly makes this much faster
         for eval_config in eval_configs:
@@ -631,18 +626,13 @@ async def get_eval_configs_score_summary(
                         # This score doesn't have both a human eval and eval score, so we can't compare
                         continue
 
-                    if eval_config_id not in total_squared_error:
-                        total_squared_error[eval_config_id] = {}
-                        total_absolute_error[eval_config_id] = {}
-                        total_count[eval_config_id] = {}
-                        total_normalized_squared_error[eval_config_id] = {}
-                        total_normalized_absolute_error[eval_config_id] = {}
-                    if score_key not in total_squared_error[eval_config_id]:
-                        total_squared_error[eval_config_id][score_key] = 0
-                        total_absolute_error[eval_config_id][score_key] = 0
-                        total_count[eval_config_id][score_key] = 0
-                        total_normalized_squared_error[eval_config_id][score_key] = 0
-                        total_normalized_absolute_error[eval_config_id][score_key] = 0
+                    if eval_config_id not in correlation_calculators:
+                        correlation_calculators[eval_config_id] = {}
+
+                    if score_key not in correlation_calculators[eval_config_id]:
+                        correlation_calculators[eval_config_id][score_key] = (
+                            CorrelationCalculator()
+                        )
 
                     normalized_eval_score = normalize_rating(
                         eval_score, output_score.type
@@ -650,43 +640,28 @@ async def get_eval_configs_score_summary(
                     normalized_human_score = normalize_rating(
                         human_score, output_score.type
                     )
-                    total_squared_error[eval_config_id][score_key] += (
-                        eval_score - human_score
-                    ) ** 2
-                    total_normalized_squared_error[eval_config_id][score_key] += (
-                        normalized_eval_score - normalized_human_score
-                    ) ** 2
-                    total_absolute_error[eval_config_id][score_key] += abs(
-                        eval_score - human_score
+                    correlation_calculators[eval_config_id][score_key].add_score(
+                        CorrelationScore(
+                            measured_score=eval_score,
+                            human_score=human_score,
+                            normalized_measured_score=normalized_eval_score,
+                            normalized_human_score=normalized_human_score,
+                        )
                     )
-                    total_normalized_absolute_error[eval_config_id][score_key] += abs(
-                        normalized_eval_score - normalized_human_score
-                    )
-                    total_count[eval_config_id][score_key] += 1
 
         # Convert to score summaries
-        results: Dict[str, Dict[str, EvalConfigScoreSummary]] = {}
-        for eval_config_id in total_count.keys():
+        results: Dict[str, Dict[str, CorrelationResult]] = {}
+        for eval_config_id in correlation_calculators.keys():
             results[eval_config_id] = {}
-            for score_key in total_count[eval_config_id].keys():
-                count = total_count[eval_config_id][score_key]
-                if count > 0:
-                    results[eval_config_id][score_key] = EvalConfigScoreSummary(
-                        mean_squared_error=(
-                            total_squared_error[eval_config_id][score_key] / count
-                        ),
-                        mean_absolute_error=(
-                            total_absolute_error[eval_config_id][score_key] / count
-                        ),
-                        mean_normalized_squared_error=(
-                            total_normalized_squared_error[eval_config_id][score_key]
-                            / count
-                        ),
-                        mean_normalized_absolute_error=(
-                            total_normalized_absolute_error[eval_config_id][score_key]
-                            / count
-                        ),
-                    )
+            for score_key in correlation_calculators[eval_config_id].keys():
+                if not correlation_calculators[eval_config_id][score_key]:
+                    # No scores to calculate correlation for this pair
+                    continue
+
+                correlation_result = correlation_calculators[eval_config_id][
+                    score_key
+                ].calculate_correlation()
+                results[eval_config_id][score_key] = correlation_result
 
         # Calculate the percent of the dataset that has been processed
         eval_config_percent_complete: Dict[str, float] = {}
diff --git a/app/desktop/studio_server/test_correlation_calculator.py b/app/desktop/studio_server/test_correlation_calculator.py
new file mode 100644
index 00000000..f396c1ad
--- /dev/null
+++ b/app/desktop/studio_server/test_correlation_calculator.py
@@ -0,0 +1,246 @@
+import pytest
+
+from app.desktop.studio_server.correlation_calculator import (
+    CorrelationCalculator,
+    CorrelationScore,
+)
+
+
+class TestCorrelationCalculator:
+    def create_correlation_scores(self, measured, human):
+        """Helper method to create correlation scores from raw data with normalization"""
+        scores = []
+
+        # Calculate normalized values
+        min_m, max_m = min(measured), max(measured)
+        min_h, max_h = min(human), max(human)
+
+        for m, h in zip(measured, human):
+            norm_m = (m - min_m) / (max_m - min_m) if max_m != min_m else 0
+            norm_h = (h - min_h) / (max_h - min_h) if max_h != min_h else 0
+            scores.append(
+                CorrelationScore(
+                    measured_score=m,
+                    human_score=h,
+                    normalized_measured_score=norm_m,
+                    normalized_human_score=norm_h,
+                )
+            )
+        return scores
+
+    @pytest.fixture
+    def perfect_correlation_data(self):
+        """Dataset with perfect correlation (r=1.0)"""
+        measured = list(range(10))
+        human = list(range(10))
+        return self.create_correlation_scores(measured, human)
+
+    @pytest.fixture
+    def high_correlation_data(self):
+        """Dataset with high correlation (r≈0.9)"""
+        measured = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        human = [1.1, 2.2, 2.9, 3.8, 5.2, 5.8, 7.1, 8.3, 8.7, 10.2]
+        return self.create_correlation_scores(measured, human)
+
+    @pytest.fixture
+    def no_correlation_data(self):
+        """Dataset with no correlation"""
+        measured = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        human = [5.5, 6.2, 4.8, 7.3, 2.1, 8.9, 3.7, 5.4, 6.8, 4.2]
+        return self.create_correlation_scores(measured, human)
+
+    @pytest.fixture
+    def inverse_correlation_data(self):
+        """Dataset with perfect inverse correlation (r=-1.0)"""
+        measured = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        human = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+        return self.create_correlation_scores(measured, human)
+
+    @pytest.fixture
+    def single_data_point(self):
+        """Dataset with only one data point"""
+        return [
+            CorrelationScore(
+                measured_score=5,
+                human_score=5,
+                normalized_measured_score=0.5,
+                normalized_human_score=0.5,
+            )
+        ]
+
+    @pytest.fixture
+    def two_data_points(self):
+        """Dataset with only two data points"""
+        measured = [1, 10]
+        human = [2, 9]
+        return self.create_correlation_scores(measured, human)
+
+    def setup_calculator_with_data(self, data):
+        """Helper method to create and populate a calculator with data"""
+        calculator = CorrelationCalculator()
+        for score in data:
+            calculator.add_score(score)
+        return calculator
+
+    def test_add_score(self):
+        """Test adding scores to the calculator"""
+        calculator = CorrelationCalculator()
+        score = CorrelationScore(
+            measured_score=5,
+            human_score=6,
+            normalized_measured_score=0.5,
+            normalized_human_score=0.6,
+        )
+
+        calculator.add_score(score)
+        assert len(calculator.scores) == 1
+        assert calculator.scores[0] == score
+
+    def test_empty_calculator(self):
+        """Test that calculating correlation with no scores raises an error"""
+        calculator = CorrelationCalculator()
+
+        with pytest.raises(ValueError, match="No scores to calculate correlation"):
+            calculator.calculate_correlation()
+
+    def test_perfect_correlation(self, perfect_correlation_data):
+        """Test correlation calculations with perfectly correlated data"""
+        calculator = CorrelationCalculator()
+        for score in perfect_correlation_data:
+            calculator.add_score(score)
+
+        result = calculator.calculate_correlation()
+
+        # Perfect correlation should have:
+        # - MAE and MSE of 0 (no error)
+        # - Correlation coefficients of 1.0
+        assert result.mean_absolute_error == 0.0
+        assert result.mean_normalized_absolute_error == 0.0
+        assert result.mean_squared_error == 0.0
+        assert result.mean_normalized_squared_error == 0.0
+        assert result.spearman_correlation == pytest.approx(1.0)
+        assert result.pearson_correlation == pytest.approx(1.0)
+        assert result.kendalltau_correlation == pytest.approx(1.0)
+
+    def test_high_correlation(self, high_correlation_data):
+        """Test correlation calculations with highly correlated data"""
+        calculator = CorrelationCalculator()
+        for score in high_correlation_data:
+            calculator.add_score(score)
+
+        result = calculator.calculate_correlation()
+
+        # High correlation should have:
+        # - Low but non-zero error metrics
+        # - Correlation coefficients close to 1.0
+        assert 0 < result.mean_absolute_error < 1.0
+        assert 0 < result.mean_normalized_absolute_error < 0.2
+        assert 0 < result.mean_squared_error < 1.0
+        assert 0 < result.mean_normalized_squared_error < 0.1
+        assert result.spearman_correlation > 0.9
+        assert result.pearson_correlation > 0.9
+        assert result.kendalltau_correlation > 0.8
+
+    def test_no_correlation(self, no_correlation_data):
+        """Test correlation calculations with uncorrelated data"""
+        calculator = CorrelationCalculator()
+        for score in no_correlation_data:
+            calculator.add_score(score)
+
+        result = calculator.calculate_correlation()
+
+        # No correlation should have:
+        # - Higher error metrics
+        # - Correlation coefficients close to 0
+        assert result.mean_absolute_error > 1.0
+        assert result.mean_normalized_absolute_error > 0.2
+        assert result.mean_squared_error > 2.0
+        assert result.mean_normalized_squared_error > 0.1
+        assert -0.3 < result.spearman_correlation < 0.3
+        assert -0.3 < result.pearson_correlation < 0.3
+        assert -0.3 < result.kendalltau_correlation < 0.3
+
+    def test_inverse_correlation(self, inverse_correlation_data):
+        """Test correlation calculations with inversely correlated data"""
+        calculator = CorrelationCalculator()
+        for score in inverse_correlation_data:
+            calculator.add_score(score)
+
+        result = calculator.calculate_correlation()
+
+        # Inverse correlation should have:
+        # - Higher error metrics
+        # - Correlation coefficients close to -1.0
+        assert result.mean_absolute_error > 4.0
+        assert result.mean_normalized_absolute_error > 0.5
+        assert result.mean_squared_error > 20.0
+        assert result.mean_normalized_squared_error > 0.3
+        assert result.spearman_correlation < -0.9
+        assert result.pearson_correlation < -0.9
+        assert result.kendalltau_correlation < -0.9
+
+    def test_single_data_point(self, single_data_point):
+        """Test correlation calculations with a single data point"""
+        calculator = CorrelationCalculator()
+        for score in single_data_point:
+            calculator.add_score(score)
+
+        result = calculator.calculate_correlation()
+
+        # Single data point should have:
+        # - Zero error (since the measured score equals the human score)
+        # - Correlation coefficients of 0 (as defined in the implementation)
+        assert result.mean_absolute_error == 0.0
+        assert result.mean_normalized_absolute_error == 0.0
+        assert result.mean_squared_error == 0.0
+        assert result.mean_normalized_squared_error == 0.0
+        assert result.spearman_correlation == 0.0
+        assert result.pearson_correlation == 0.0
+        assert result.kendalltau_correlation == 0.0
+
+    def test_two_data_points(self, two_data_points):
+        """Test correlation calculations with two data points"""
+        calculator = CorrelationCalculator()
+        for score in two_data_points:
+            calculator.add_score(score)
+
+        result = calculator.calculate_correlation()
+
+        # Two data points with positive correlation should have:
+        # - Some error
+        # - Positive correlation coefficients
+        assert result.mean_absolute_error == 1.0
+        assert result.mean_normalized_absolute_error == 0.0
+        assert result.mean_squared_error == 1.0
+        assert result.mean_normalized_squared_error == 0.0
+        assert result.spearman_correlation == pytest.approx(1.0)
+        assert result.pearson_correlation == pytest.approx(1.0)
+        assert result.kendalltau_correlation == pytest.approx(1.0)
+
+    def test_individual_calculation_methods(self, high_correlation_data):
+        """Test that individual calculation methods match the combined result"""
+        calculator = CorrelationCalculator()
+        for score in high_correlation_data:
+            calculator.add_score(score)
+
+        # Calculate individual metrics
+        mae = calculator.calculate_mean_absolute_error()
+        # Our spell checker thinks n-m-a-e is a misspelling of name :)
+        n_mae = calculator.calculate_mean_normalized_absolute_error()
+        mse = calculator.calculate_mean_squared_error()
+        nmse = calculator.calculate_mean_normalized_squared_error()
+        spearman = calculator.calculate_spearman_correlation()
+        pearson = calculator.calculate_pearson_correlation()
+        kendall = calculator.calculate_kendalltau_correlation()
+
+        # Calculate combined result
+        result = calculator.calculate_correlation()
+
+        # Verify they match
+        assert result.mean_absolute_error == mae
+        assert result.mean_normalized_absolute_error == n_mae
+        assert result.mean_squared_error == mse
+        assert result.mean_normalized_squared_error == nmse
+        assert result.spearman_correlation == spearman
+        assert result.pearson_correlation == pearson
+        assert result.kendalltau_correlation == kendall
diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py
index 29d174db..f4e922ff 100644
--- a/app/desktop/studio_server/test_eval_api.py
+++ b/app/desktop/studio_server/test_eval_api.py
@@ -758,7 +758,7 @@ class EvalCondigSummaryTestData:
             score1_overall_rating=5.0,
             eval_overall_rating=4.0,
             eval__score1_rating=4.0,
-            eval_config_id="ec2",
+            eval_config_id="ec1",
             skip_golden_tag=True,
         ),
         # Test 2: ec2 - Test multiple, and correct averaging
@@ -925,12 +925,18 @@ class EvalCondigSummaryTestData:
             "mean_absolute_error": 4.0,  # error 4.0
             "mean_normalized_squared_error": 1,  # max error: 1 v 5
             "mean_normalized_absolute_error": 1,  # max error: 1 v 5
+            "spearman_correlation": 0,  # default value for 1 pair
+            "pearson_correlation": 0,
+            "kendalltau_correlation": 0,
         },
         "score1": {
             "mean_squared_error": 2.25,  # error (3.5-5.0)^2
             "mean_absolute_error": 1.5,  # error 1.5
             "mean_normalized_squared_error": 0.140625,  # hand calc
             "mean_normalized_absolute_error": 0.375,  # 1.5/4
+            "spearman_correlation": 0,  # default value for 1 pair
+            "pearson_correlation": 0,
+            "kendalltau_correlation": 0,
         },
     }
     # 1 of total_in_dataset eval configs are are in ec1 test
@@ -943,12 +949,18 @@ class EvalCondigSummaryTestData:
             "mean_absolute_error": 1.5,  # (1+2)/2
             "mean_normalized_squared_error": 0.15625,  # (0.25^2 + 0.5^2) / 2
             "mean_normalized_absolute_error": 0.375,  # (0.25 + 0.5) / 2
+            "spearman_correlation": 0,
+            "pearson_correlation": 0,
+            "kendalltau_correlation": 0,
         },
         "score1": {
             "mean_squared_error": 2.5,  # (1^2+2^2)/2
             "mean_absolute_error": 1.5,  # (1+2)/2
             "mean_normalized_squared_error": 0.15625,  # (0.25^2 + 0.5^2) / 2
             "mean_normalized_absolute_error": 0.375,  # (0.25 + 0.5) / 2
+            "spearman_correlation": 0.9999999999999999,
+            "pearson_correlation": 1,
+            "kendalltau_correlation": 1,
         },
     }
     # 2 of total_in_dataset eval configs are are in ec2 test
@@ -961,6 +973,9 @@ class EvalCondigSummaryTestData:
             "mean_absolute_error": 2,
             "mean_normalized_squared_error": 0.25,
             "mean_normalized_absolute_error": 0.5,
+            "spearman_correlation": 0,
+            "pearson_correlation": 0,
+            "kendalltau_correlation": 0,
         },
     }
     # 2 of total_in_dataset eval configs are are in ec2 test
diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts
index b00c118e..fe0857e0 100644
--- a/app/web_ui/src/lib/api_schema.d.ts
+++ b/app/web_ui/src/lib/api_schema.d.ts
@@ -984,6 +984,23 @@ export interface components {
             /** Remove Tags */
             remove_tags?: string[] | null;
         };
+        /** CorrelationResult */
+        CorrelationResult: {
+            /** Mean Absolute Error */
+            mean_absolute_error: number;
+            /** Mean Normalized Absolute Error */
+            mean_normalized_absolute_error: number;
+            /** Mean Squared Error */
+            mean_squared_error: number;
+            /** Mean Normalized Squared Error */
+            mean_normalized_squared_error: number;
+            /** Spearman Correlation */
+            spearman_correlation: number;
+            /** Pearson Correlation */
+            pearson_correlation: number;
+            /** Kendalltau Correlation */
+            kendalltau_correlation: number;
+        };
         /**
          * CreateDatasetSplitRequest
          * @description Request to create a dataset split
@@ -1386,7 +1403,7 @@ export interface components {
             /** Results */
             results: {
                 [key: string]: {
-                    [key: string]: components["schemas"]["EvalConfigScoreSummary"];
+                    [key: string]: components["schemas"]["CorrelationResult"];
                 };
             };
             /** Eval Config Percent Complete */
@@ -1402,17 +1419,6 @@ export interface components {
             /** Not Rated Count */
             not_rated_count: number;
         };
-        /** EvalConfigScoreSummary */
-        EvalConfigScoreSummary: {
-            /** Mean Absolute Error */
-            mean_absolute_error: number;
-            /** Mean Normalized Absolute Error */
-            mean_normalized_absolute_error: number;
-            /** Mean Squared Error */
-            mean_squared_error: number;
-            /** Mean Normalized Squared Error */
-            mean_normalized_squared_error: number;
-        };
         /**
          * EvalConfigType
          * @enum {string}
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte
index 2b736b25..84ae0dd7 100644
--- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte
+++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte
@@ -40,7 +40,16 @@
   let score_summary: EvalConfigCompareSummary | null = null
   let score_summary_error: KilnError | null = null
 
-  let score_type: "mse" | "mae" | "norm_mse" | "norm_mae" = "norm_mse"
+  type ScoreType =
+    | "mse"
+    | "mae"
+    | "norm_mse"
+    | "norm_mae"
+    | "spearman"
+    | "pearson"
+    | "kendalltau"
+
+  let score_type: ScoreType = "kendalltau"
 
   $: loading = eval_loading || eval_configs_loading // Score summary not blocking whole UI
   $: error = eval_error || eval_configs_error || score_summary_error
@@ -248,7 +257,7 @@
 
   function info_tooltip_text(
     rating_type: TaskOutputRatingType,
-    score_type: "mse" | "mae" | "norm_mse" | "norm_mae",
+    score_type: ScoreType,
   ) {
     let label = ""
     if (score_type === "mae") {
@@ -259,6 +268,12 @@
       label = "Normalized mean squared error"
     } else if (score_type === "norm_mae") {
       label = "Normalized mean absolute error"
+    } else if (score_type === "spearman") {
+      label = "Spearman's rank correlation"
+    } else if (score_type === "pearson") {
+      label = "Pearson's correlation"
+    } else if (score_type === "kendalltau") {
+      label = "Kendall Tau correlation"
     }
     label += " for "
     if (rating_type === "five_star") {
@@ -354,6 +369,9 @@
                 ["norm_mae", "Normalized Mean Absolute Error"],
                 ["mse", "Mean Squared Error"],
                 ["mae", "Mean Absolute Error"],
+                ["spearman", "Spearman Rank Correlation"],
+                ["pearson", "Pearson Correlation"],
+                ["kendalltau", "Kendall Tau Correlation"],
               ]}
               bind:value={score_type}
             />
@@ -499,6 +517,12 @@
                           {scores.mean_normalized_squared_error.toFixed(3)}
                         {:else if score_type === "norm_mae"}
                           {scores.mean_normalized_absolute_error.toFixed(3)}
+                        {:else if score_type === "spearman"}
+                          {scores.spearman_correlation.toFixed(3)}
+                        {:else if score_type === "pearson"}
+                          {scores.pearson_correlation.toFixed(3)}
+                        {:else if score_type === "kendalltau"}
+                          {scores.kendalltau_correlation.toFixed(3)}
                         {/if}
                       {:else}
                         unknown
@@ -532,7 +556,7 @@
 
 <Dialog
   bind:this={score_legend_dialog}
-  title="Score Legend"
+  title="Score Types Explained"
   action_buttons={[
     {
       label: "Close",
@@ -544,6 +568,44 @@
     Each score is a correlation score between the evaluator's score and the
     human score added through the dataset tab.
   </div>
+  <div class="m-8 font-light text-sm">
+    <div class="font-extrabold">TL;DR</div>
+    <div class="mb-2">
+      We suggest you use Kendall Tau correlation scores to compare results.
+    </div>
+    <div class="mb-2">
+      Higher values are better. 1.0 is a perfect correlation between the
+      evaluator and human scores. 0 is no correlation. -1.0 is perfect negative
+      correlation.
+    </div>
+    <div>
+      Subjective tasks will never reach a perfect 1.0 score, so don't worry if
+      your score isn't perfect.
+    </div>
+  </div>
+  <div class="font-medium mt-5">
+    Spearman, Kendall Tau, and Pearson Correlation
+  </div>
+  <div class="text-sm text-gray-500 font-medium mb-1">
+    From -1 to 1, higher is better
+  </div>
+  <div class="font-light text-sm">
+    These are three scientific correlation coefficients. For all three, the
+    value tends to be high (close to 1) for samples with a strongly positive
+    correlation, low (close to -1) for samples with a strongly negative
+    correlation, and close to zero for samples with weak correlation.
+  </div>
+  <ul class="list-disc text-sm text-gray-500 pl-5 pt-2">
+    <li>
+      Spearman evaluates the rank of the scores and is less sensitive to
+      absolute values than Pearson.
+    </li>
+    <li>
+      Kendall Tau evaluates pair order, is more robust to outliers, and performs
+      better on small datasets.
+    </li>
+    <li>Pearson evaluates linear correlation.</li>
+  </ul>
   <div class="font-medium mt-5">Mean Absolute Error</div>
   <div class="text-sm text-gray-500 font-medium mb-1">Lower is better</div>
   <div class="font-light text-sm">
diff --git a/uv.lock b/uv.lock
index 77f10d0e..6718115d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -902,6 +902,7 @@ dependencies = [
     { name = "pillow" },
     { name = "pyinstaller" },
     { name = "pystray" },
+    { name = "scipy" },
 ]
 
 [package.metadata]
@@ -910,6 +911,7 @@ requires-dist = [
     { name = "pillow", specifier = ">=11.0.0" },
     { name = "pyinstaller", specifier = "==6.11.1" },
     { name = "pystray", specifier = ">=0.19.5" },
+    { name = "scipy", specifier = ">=1.15.2" },
 ]
 
 [[package]]
@@ -1985,6 +1987,62 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e5/c0/b0fba8259b61c938c9733da9346b9f93e00881a9db22aafdd72f6ae0ec05/s3transfer-0.10.3-py3-none-any.whl", hash = "sha256:263ed587a5803c6c708d3ce44dc4dfedaab4c1a32e8329bab818933d79ddcf5d", size = 82625 },
 ]
 
+[[package]]
+name = "scipy"
+version = "1.15.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b7/b9/31ba9cd990e626574baf93fbc1ac61cf9ed54faafd04c479117517661637/scipy-1.15.2.tar.gz", hash = "sha256:cd58a314d92838f7e6f755c8a2167ead4f27e1fd5c1251fd54289569ef3495ec", size = 59417316 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/95/df/ef233fff6838fe6f7840d69b5ef9f20d2b5c912a8727b21ebf876cb15d54/scipy-1.15.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a2ec871edaa863e8213ea5df811cd600734f6400b4af272e1c011e69401218e9", size = 38692502 },
+    { url = "https://files.pythonhosted.org/packages/5c/20/acdd4efb8a68b842968f7bc5611b1aeb819794508771ad104de418701422/scipy-1.15.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6f223753c6ea76983af380787611ae1291e3ceb23917393079dcc746ba60cfb5", size = 30085508 },
+    { url = "https://files.pythonhosted.org/packages/42/55/39cf96ca7126f1e78ee72a6344ebdc6702fc47d037319ad93221063e6cf4/scipy-1.15.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ecf797d2d798cf7c838c6d98321061eb3e72a74710e6c40540f0e8087e3b499e", size = 22359166 },
+    { url = "https://files.pythonhosted.org/packages/51/48/708d26a4ab8a1441536bf2dfcad1df0ca14a69f010fba3ccbdfc02df7185/scipy-1.15.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:9b18aa747da280664642997e65aab1dd19d0c3d17068a04b3fe34e2559196cb9", size = 25112047 },
+    { url = "https://files.pythonhosted.org/packages/dd/65/f9c5755b995ad892020381b8ae11f16d18616208e388621dfacc11df6de6/scipy-1.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87994da02e73549dfecaed9e09a4f9d58a045a053865679aeb8d6d43747d4df3", size = 35536214 },
+    { url = "https://files.pythonhosted.org/packages/de/3c/c96d904b9892beec978562f64d8cc43f9cca0842e65bd3cd1b7f7389b0ba/scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69ea6e56d00977f355c0f84eba69877b6df084516c602d93a33812aa04d90a3d", size = 37646981 },
+    { url = "https://files.pythonhosted.org/packages/3d/74/c2d8a24d18acdeae69ed02e132b9bc1bb67b7bee90feee1afe05a68f9d67/scipy-1.15.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:888307125ea0c4466287191e5606a2c910963405ce9671448ff9c81c53f85f58", size = 37230048 },
+    { url = "https://files.pythonhosted.org/packages/42/19/0aa4ce80eca82d487987eff0bc754f014dec10d20de2f66754fa4ea70204/scipy-1.15.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9412f5e408b397ff5641080ed1e798623dbe1ec0d78e72c9eca8992976fa65aa", size = 40010322 },
+    { url = "https://files.pythonhosted.org/packages/d0/d2/f0683b7e992be44d1475cc144d1f1eeae63c73a14f862974b4db64af635e/scipy-1.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:b5e025e903b4f166ea03b109bb241355b9c42c279ea694d8864d033727205e65", size = 41233385 },
+    { url = "https://files.pythonhosted.org/packages/40/1f/bf0a5f338bda7c35c08b4ed0df797e7bafe8a78a97275e9f439aceb46193/scipy-1.15.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:92233b2df6938147be6fa8824b8136f29a18f016ecde986666be5f4d686a91a4", size = 38703651 },
+    { url = "https://files.pythonhosted.org/packages/de/54/db126aad3874601048c2c20ae3d8a433dbfd7ba8381551e6f62606d9bd8e/scipy-1.15.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:62ca1ff3eb513e09ed17a5736929429189adf16d2d740f44e53270cc800ecff1", size = 30102038 },
+    { url = "https://files.pythonhosted.org/packages/61/d8/84da3fffefb6c7d5a16968fe5b9f24c98606b165bb801bb0b8bc3985200f/scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c6676490ad76d1c2894d77f976144b41bd1a4052107902238047fb6a473e971", size = 22375518 },
+    { url = "https://files.pythonhosted.org/packages/44/78/25535a6e63d3b9c4c90147371aedb5d04c72f3aee3a34451f2dc27c0c07f/scipy-1.15.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:a8bf5cb4a25046ac61d38f8d3c3426ec11ebc350246a4642f2f315fe95bda655", size = 25142523 },
+    { url = "https://files.pythonhosted.org/packages/e0/22/4b4a26fe1cd9ed0bc2b2cb87b17d57e32ab72c346949eaf9288001f8aa8e/scipy-1.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a8e34cf4c188b6dd004654f88586d78f95639e48a25dfae9c5e34a6dc34547e", size = 35491547 },
+    { url = "https://files.pythonhosted.org/packages/32/ea/564bacc26b676c06a00266a3f25fdfe91a9d9a2532ccea7ce6dd394541bc/scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28a0d2c2075946346e4408b211240764759e0fabaeb08d871639b5f3b1aca8a0", size = 37634077 },
+    { url = "https://files.pythonhosted.org/packages/43/c2/bfd4e60668897a303b0ffb7191e965a5da4056f0d98acfb6ba529678f0fb/scipy-1.15.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:42dabaaa798e987c425ed76062794e93a243be8f0f20fff6e7a89f4d61cb3d40", size = 37231657 },
+    { url = "https://files.pythonhosted.org/packages/4a/75/5f13050bf4f84c931bcab4f4e83c212a36876c3c2244475db34e4b5fe1a6/scipy-1.15.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f5e296ec63c5da6ba6fa0343ea73fd51b8b3e1a300b0a8cae3ed4b1122c7462", size = 40035857 },
+    { url = "https://files.pythonhosted.org/packages/b9/8b/7ec1832b09dbc88f3db411f8cdd47db04505c4b72c99b11c920a8f0479c3/scipy-1.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:597a0c7008b21c035831c39927406c6181bcf8f60a73f36219b69d010aa04737", size = 41217654 },
+    { url = "https://files.pythonhosted.org/packages/4b/5d/3c78815cbab499610f26b5bae6aed33e227225a9fa5290008a733a64f6fc/scipy-1.15.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c4697a10da8f8765bb7c83e24a470da5797e37041edfd77fd95ba3811a47c4fd", size = 38756184 },
+    { url = "https://files.pythonhosted.org/packages/37/20/3d04eb066b471b6e171827548b9ddb3c21c6bbea72a4d84fc5989933910b/scipy-1.15.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:869269b767d5ee7ea6991ed7e22b3ca1f22de73ab9a49c44bad338b725603301", size = 30163558 },
+    { url = "https://files.pythonhosted.org/packages/a4/98/e5c964526c929ef1f795d4c343b2ff98634ad2051bd2bbadfef9e772e413/scipy-1.15.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:bad78d580270a4d32470563ea86c6590b465cb98f83d760ff5b0990cb5518a93", size = 22437211 },
+    { url = "https://files.pythonhosted.org/packages/1d/cd/1dc7371e29195ecbf5222f9afeedb210e0a75057d8afbd942aa6cf8c8eca/scipy-1.15.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b09ae80010f52efddb15551025f9016c910296cf70adbf03ce2a8704f3a5ad20", size = 25232260 },
+    { url = "https://files.pythonhosted.org/packages/f0/24/1a181a9e5050090e0b5138c5f496fee33293c342b788d02586bc410c6477/scipy-1.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6fd6eac1ce74a9f77a7fc724080d507c5812d61e72bd5e4c489b042455865e", size = 35198095 },
+    { url = "https://files.pythonhosted.org/packages/c0/53/eaada1a414c026673eb983f8b4a55fe5eb172725d33d62c1b21f63ff6ca4/scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b871df1fe1a3ba85d90e22742b93584f8d2b8e6124f8372ab15c71b73e428b8", size = 37297371 },
+    { url = "https://files.pythonhosted.org/packages/e9/06/0449b744892ed22b7e7b9a1994a866e64895363572677a316a9042af1fe5/scipy-1.15.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:03205d57a28e18dfd39f0377d5002725bf1f19a46f444108c29bdb246b6c8a11", size = 36872390 },
+    { url = "https://files.pythonhosted.org/packages/6a/6f/a8ac3cfd9505ec695c1bc35edc034d13afbd2fc1882a7c6b473e280397bb/scipy-1.15.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:601881dfb761311045b03114c5fe718a12634e5608c3b403737ae463c9885d53", size = 39700276 },
+    { url = "https://files.pythonhosted.org/packages/f5/6f/e6e5aff77ea2a48dd96808bb51d7450875af154ee7cbe72188afb0b37929/scipy-1.15.2-cp312-cp312-win_amd64.whl", hash = "sha256:e7c68b6a43259ba0aab737237876e5c2c549a031ddb7abc28c7b47f22e202ded", size = 40942317 },
+    { url = "https://files.pythonhosted.org/packages/53/40/09319f6e0f276ea2754196185f95cd191cb852288440ce035d5c3a931ea2/scipy-1.15.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:01edfac9f0798ad6b46d9c4c9ca0e0ad23dbf0b1eb70e96adb9fa7f525eff0bf", size = 38717587 },
+    { url = "https://files.pythonhosted.org/packages/fe/c3/2854f40ecd19585d65afaef601e5e1f8dbf6758b2f95b5ea93d38655a2c6/scipy-1.15.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:08b57a9336b8e79b305a143c3655cc5bdbe6d5ece3378578888d2afbb51c4e37", size = 30100266 },
+    { url = "https://files.pythonhosted.org/packages/dd/b1/f9fe6e3c828cb5930b5fe74cb479de5f3d66d682fa8adb77249acaf545b8/scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:54c462098484e7466362a9f1672d20888f724911a74c22ae35b61f9c5919183d", size = 22373768 },
+    { url = "https://files.pythonhosted.org/packages/15/9d/a60db8c795700414c3f681908a2b911e031e024d93214f2d23c6dae174ab/scipy-1.15.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:cf72ff559a53a6a6d77bd8eefd12a17995ffa44ad86c77a5df96f533d4e6c6bb", size = 25154719 },
+    { url = "https://files.pythonhosted.org/packages/37/3b/9bda92a85cd93f19f9ed90ade84aa1e51657e29988317fabdd44544f1dd4/scipy-1.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9de9d1416b3d9e7df9923ab23cd2fe714244af10b763975bea9e4f2e81cebd27", size = 35163195 },
+    { url = "https://files.pythonhosted.org/packages/03/5a/fc34bf1aa14dc7c0e701691fa8685f3faec80e57d816615e3625f28feb43/scipy-1.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb530e4794fc8ea76a4a21ccb67dea33e5e0e60f07fc38a49e821e1eae3b71a0", size = 37255404 },
+    { url = "https://files.pythonhosted.org/packages/4a/71/472eac45440cee134c8a180dbe4c01b3ec247e0338b7c759e6cd71f199a7/scipy-1.15.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5ea7ed46d437fc52350b028b1d44e002646e28f3e8ddc714011aaf87330f2f32", size = 36860011 },
+    { url = "https://files.pythonhosted.org/packages/01/b3/21f890f4f42daf20e4d3aaa18182dddb9192771cd47445aaae2e318f6738/scipy-1.15.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:11e7ad32cf184b74380f43d3c0a706f49358b904fa7d5345f16ddf993609184d", size = 39657406 },
+    { url = "https://files.pythonhosted.org/packages/0d/76/77cf2ac1f2a9cc00c073d49e1e16244e389dd88e2490c91d84e1e3e4d126/scipy-1.15.2-cp313-cp313-win_amd64.whl", hash = "sha256:a5080a79dfb9b78b768cebf3c9dcbc7b665c5875793569f48bf0e2b1d7f68f6f", size = 40961243 },
+    { url = "https://files.pythonhosted.org/packages/4c/4b/a57f8ddcf48e129e6054fa9899a2a86d1fc6b07a0e15c7eebff7ca94533f/scipy-1.15.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:447ce30cee6a9d5d1379087c9e474628dab3db4a67484be1b7dc3196bfb2fac9", size = 38870286 },
+    { url = "https://files.pythonhosted.org/packages/0c/43/c304d69a56c91ad5f188c0714f6a97b9c1fed93128c691148621274a3a68/scipy-1.15.2-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c90ebe8aaa4397eaefa8455a8182b164a6cc1d59ad53f79943f266d99f68687f", size = 30141634 },
+    { url = "https://files.pythonhosted.org/packages/44/1a/6c21b45d2548eb73be9b9bff421aaaa7e85e22c1f9b3bc44b23485dfce0a/scipy-1.15.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:def751dd08243934c884a3221156d63e15234a3155cf25978b0a668409d45eb6", size = 22415179 },
+    { url = "https://files.pythonhosted.org/packages/74/4b/aefac4bba80ef815b64f55da06f62f92be5d03b467f2ce3668071799429a/scipy-1.15.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:302093e7dfb120e55515936cb55618ee0b895f8bcaf18ff81eca086c17bd80af", size = 25126412 },
+    { url = "https://files.pythonhosted.org/packages/b1/53/1cbb148e6e8f1660aacd9f0a9dfa2b05e9ff1cb54b4386fe868477972ac2/scipy-1.15.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd5b77413e1855351cdde594eca99c1f4a588c2d63711388b6a1f1c01f62274", size = 34952867 },
+    { url = "https://files.pythonhosted.org/packages/2c/23/e0eb7f31a9c13cf2dca083828b97992dd22f8184c6ce4fec5deec0c81fcf/scipy-1.15.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d0194c37037707b2afa7a2f2a924cf7bac3dc292d51b6a925e5fcb89bc5c776", size = 36890009 },
+    { url = "https://files.pythonhosted.org/packages/03/f3/e699e19cabe96bbac5189c04aaa970718f0105cff03d458dc5e2b6bd1e8c/scipy-1.15.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:bae43364d600fdc3ac327db99659dcb79e6e7ecd279a75fe1266669d9a652828", size = 36545159 },
+    { url = "https://files.pythonhosted.org/packages/af/f5/ab3838e56fe5cc22383d6fcf2336e48c8fe33e944b9037fbf6cbdf5a11f8/scipy-1.15.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f031846580d9acccd0044efd1a90e6f4df3a6e12b4b6bd694a7bc03a89892b28", size = 39136566 },
+    { url = "https://files.pythonhosted.org/packages/0a/c8/b3f566db71461cabd4b2d5b39bcc24a7e1c119535c8361f81426be39bb47/scipy-1.15.2-cp313-cp313t-win_amd64.whl", hash = "sha256:fe8a9eb875d430d81755472c5ba75e84acc980e4a8f6204d402849234d3017db", size = 40477705 },
+]
+
 [[package]]
 name = "setuptools"
 version = "75.3.0"