diff --git a/README.md b/README.md
index f32e578..e8032eb 100644
--- a/README.md
+++ b/README.md
@@ -133,7 +133,44 @@ pinpoint hard-to-find papers. The ATD, on the other hand, measures performance
 throughout the entire screening process, eliminating reliance on arbitrary
 cut-off values, and can be used to compare different models.
 
+### Loss
+The Loss metric evaluates the performance of an active learning model by
+quantifying how closely it approximates the ideal screening process. The
+result is normalized between the ideal recall curve and the worst possible
+recall curve.
+While metrics like WSS, Recall, and ERF evaluate performance at specific
+points on the recall curve, the Loss metric provides an overall measure of
+performance.
+
+To compute the loss, we start with three key concepts:
+
+1. **Optimal AUC**: This is the area under a "perfect recall curve," where
+   relevant records are identified as early as possible. Mathematically, it is
+   computed as $Nx \times Ny - \frac{Ny \times (Ny - 1)}{2}$, where $Nx$ is the
+   total number of records and $Ny$ is the number of relevant records.
+
+2. **Worst AUC**: This represents the area under a worst-case recall curve,
+   where all relevant records appear at the end of the screening process. It
+   is calculated as $\frac{Ny \times (Ny + 1)}{2}$.
+
+3. **Actual AUC**: This is the area under the recall curve produced by the model
+   during the screening process. It is obtained by summing the cumulative
+   recall values for the labeled records.
+
+The normalized loss is calculated as the difference between the optimal AUC
+and the actual AUC, divided by the difference between the optimal AUC and the
+worst AUC.
+
+$$\text{Normalized Loss} = \frac{Ny \times \left(Nx - \frac{Ny - 1}{2}\right) -
+\sum \text{Cumulative Recall}}{Ny \times (Nx - Ny)}$$
+
+The lower the loss, the closer the model is to the perfect recall curve and the
+better the performance.
+
+![Recall plot illustrating loss metric](https://github.com/jteijema/asreview-insights/blob/loss-metric/figures/loss_metric_example.png?raw=true)
+
+In this figure, the green area between the recall curve and the perfect recall line is the lost performance, which is normalized by the total area (green and red combined).
 
 ## Basic usage
 
@@ -467,6 +504,11 @@ which results in
                     ]
                 ]
             },
+            {
+                "id": "loss",
+                "title": "Loss",
+                "value": 0.01707543880041846
+            },
             {
                 "id": "erf",
                 "title": "Extra Relevant record Found",
diff --git a/asreviewcontrib/insights/algorithms.py b/asreviewcontrib/insights/algorithms.py
index 11fd3ce..20b02db 100644
--- a/asreviewcontrib/insights/algorithms.py
+++ b/asreviewcontrib/insights/algorithms.py
@@ -19,6 +19,36 @@ def _recall_values(labels, x_absolute=False, y_absolute=False):
     return x.tolist(), y.tolist()
 
 
+def _loss_value(labels):
+    Ny = sum(labels)
+    Nx = len(labels)
+
+    if Ny == 0 or Nx == Ny:
+        raise ValueError("Need both 0 and 1 labels")
+
+    # The normalized loss is computed based on:
+
+    # 1. The "optimal" AUC, representing the area under an optimal recall
+    #    curve, is the total area, Nx * Ny, minus the area above the stepwise
+    #    curve, (Ny * (Ny - 1)) / 2. This combines to Ny * (Nx - (Ny - 1) / 2).
+    #
+    # 2. The "actual" AUC is the cumulative recall sum, calculated with
+    #    np.cumsum(labels).sum().
+    #
+    # 3. The "worst" AUC, where all positive labels are clustered at the end, is
+    #    calculated as (Ny * (Ny + 1)) / 2. To normalize, we need the difference
+    #    between the optimal and worst AUCs. We simplify this difference:
+    #
+    #    (Nx * Ny - ((Ny * (Ny - 1)) / 2)) - ((Ny * (Ny + 1)) / 2)
+    #
+    #    This simplifies to Ny * (Nx - Ny), which is the denominator of the
+    #    normalized loss.
+    #
+    # Finally, we compute the normalized loss as:
+    # (optimal - actual) / (optimal - worst).
+
+    return float((Ny * (Nx - (Ny - 1) / 2) - np.cumsum(labels).sum()) / (Ny * (Nx - Ny)))  # noqa: E501
+
 def _wss_values(labels, x_absolute=False, y_absolute=False):
     n_docs = len(labels)
     n_pos_docs = sum(labels)
diff --git a/asreviewcontrib/insights/metrics.py b/asreviewcontrib/insights/metrics.py
index 2898a9b..ecee421 100644
--- a/asreviewcontrib/insights/metrics.py
+++ b/asreviewcontrib/insights/metrics.py
@@ -6,6 +6,7 @@
 from asreviewcontrib.insights.algorithms import _erf_values
 from asreviewcontrib.insights.algorithms import _fn_values
 from asreviewcontrib.insights.algorithms import _fp_values
+from asreviewcontrib.insights.algorithms import _loss_value
 from asreviewcontrib.insights.algorithms import _recall_values
 from asreviewcontrib.insights.algorithms import _tn_values
 from asreviewcontrib.insights.algorithms import _tp_values
@@ -169,6 +170,20 @@ def _tnr(labels, intercept, x_absolute=False):
     return _slice_metric(x, y, intercept)
 
 
+def loss(state_obj, priors=False):
+    """Compute the loss for an active learning problem.
+
+    Computes the loss for an active learning problem where all relevant
+    records have to be seen by a human.
+
+    See the inline documentation for details on the loss calculation.
+
+    Returns:
+        float: The loss value.
+    """
+    labels = _pad_simulation_labels(state_obj, priors=priors)
+
+    return _loss_value(labels)
 
 def get_metrics(
     state_obj,
@@ -225,6 +240,11 @@
                     "title": "Work Saved over Sampling",
                     "value": [(i, v) for i, v in zip(wss, wss_values)],
                 },
+                {
+                    "id": "loss",
+                    "title": "Loss",
+                    "value": _loss_value(labels),
+                },
                 {
                     "id": "erf",
                     "title": "Extra Relevant record Found",
diff --git a/figures/loss_metric_example.png b/figures/loss_metric_example.png
new file mode 100644
index 0000000..47e1f5f
Binary files /dev/null and b/figures/loss_metric_example.png differ
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 2fdca7a..aba6a01 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -1,17 +1,24 @@
 from pathlib import Path
 
+import numpy as np
 from asreview import open_state
 from numpy import array_equal
 from numpy.testing import assert_almost_equal
+from numpy.testing import assert_raises
 
+from asreviewcontrib.insights.algorithms import _loss_value
+from asreviewcontrib.insights.metrics import _erf
 from asreviewcontrib.insights.metrics import _recall
 from asreviewcontrib.insights.metrics import _time_to_discovery
+from asreviewcontrib.insights.metrics import _wss
 from asreviewcontrib.insights.metrics import get_metrics
+from asreviewcontrib.insights.metrics import loss
 from asreviewcontrib.insights.metrics import recall
 
 TEST_ASREVIEW_FILES = Path(Path(__file__).parent, "asreview_files")
 
+
 def test_metric_recall_small_data():
     labels = [1, 1, 1, 0]
     r = _recall(labels, 0.5)
@@ -111,3 +118,47 @@ def test_label_padding():
         stop_if_full = get_metrics(s)
 
         assert stop_if_min == stop_if_full
+
+def test_loss():
+    with open_state(
+        Path(TEST_ASREVIEW_FILES, "sim_van_de_schoot_2017_stop_if_min.asreview")
+    ) as s:
+        loss_value = loss(s)
+        assert_almost_equal(loss_value, 0.011592855205548452)
+
+def test_loss_value_function(seed=None):
+    test_cases = [
+        ([1, 0], 0),
+        ([0, 1], 1),
+        ([1, 1, 0, 0, 0], 0),
+        ([0, 0, 0, 1, 1], 1),
+        ([1, 0, 1], 0.5)
+    ]
+
+    for labels, expected_value in test_cases:
+        loss_value = _loss_value(labels)
+        assert_almost_equal(loss_value, expected_value)
+
+    error_cases = [[0, 0, 0], [0], [1]]
+    for labels in error_cases:
+        with assert_raises(ValueError):
+            _loss_value(labels)
+
+    if seed is not None:
+        np.random.seed(seed)
+
+    for _ in range(100):
+        length = np.random.randint(2, 100)
+        labels = np.random.randint(0, 2, length)
+
+        # Ensure labels are not all 0 or all 1
+        if np.all(labels == 0) or np.all(labels == 1):
+            labels[np.random.randint(0, length)] = 1 - labels[0]
+
+        loss_value = _loss_value(labels)
+        assert 0 <= loss_value <= 1
+
+def test_single_value_formats():
+    assert isinstance(_wss([1, 1, 0, 0], 0.5), float)
+    assert isinstance(_loss_value([1, 1, 0, 0]), float)
+    assert isinstance(_erf([1, 1, 0, 0], 0.5), float)
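As a quick sanity check of the loss formula described in the README changes above, the sketch below recomputes the normalized loss for the `[1, 0, 1]` case from `test_loss_value_function`. The `normalized_loss` helper name is illustrative and not part of the patch; it mirrors the arithmetic of `_loss_value` and assumes only `numpy` is available.

```python
import numpy as np


def normalized_loss(labels):
    # Illustrative re-implementation of the patch's _loss_value arithmetic.
    Ny, Nx = sum(labels), len(labels)
    optimal = Ny * (Nx - (Ny - 1) / 2)   # AUC of the perfect recall curve
    actual = np.cumsum(labels).sum()     # AUC of the observed recall curve
    worst_gap = Ny * (Nx - Ny)           # optimal AUC minus worst-case AUC
    return float((optimal - actual) / worst_gap)


# [1, 0, 1]: optimal = 2 * (3 - 0.5) = 5, actual = 1 + 1 + 2 = 4,
# denominator = 2 * (3 - 2) = 2, so the loss is (5 - 4) / 2 = 0.5.
print(normalized_loss([1, 0, 1]))  # 0.5, matching the test case above
```

A loss of 0 corresponds to the perfect recall curve (the `[1, 0]` and `[1, 1, 0, 0, 0]` test cases), while a loss of 1 means all relevant records were screened last.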