From 48f236e9c9a850c39504c9da9d50d0debc07bf91 Mon Sep 17 00:00:00 2001
From: Julie Alberge
Date: Thu, 11 Jan 2024 10:37:27 +0100
Subject: [PATCH] removing comp_risks_loss

---
 hazardous/_comp_risks_loss.py           | 365 ------------------------
 hazardous/tests/test_comp_risks_loss.py |  57 ----
 2 files changed, 422 deletions(-)
 delete mode 100644 hazardous/_comp_risks_loss.py
 delete mode 100644 hazardous/tests/test_comp_risks_loss.py

diff --git a/hazardous/_comp_risks_loss.py b/hazardous/_comp_risks_loss.py
deleted file mode 100644
index 12c209f..0000000
--- a/hazardous/_comp_risks_loss.py
+++ /dev/null
@@ -1,365 +0,0 @@
-import numpy as np
-from scipy.special import xlogy
-from sklearn._loss._loss import CyHalfBinomialLoss
-from sklearn._loss.link import Interval, LogitLink, MultinomialLogit
-from sklearn._loss.loss import BaseLoss
-
-
-def sum_exp_minus(i, raw_prediction):
-    """Return the max-shifted exponentials of row i, their max and their sum."""
-    p = raw_prediction[i]
-    max_value = np.max(p)
-    exp_p = np.exp(p - max_value)
-    sum_exps = np.sum(exp_p)
-
-    return exp_p, max_value, sum_exps
-
-
-class _MultinomialBinaryLoss:
-    """One-vs-rest binary cross-entropy loss with multinomial logit link.
-
-    Each class k >= 1 contributes the binary cross-entropy of the softmax
-    probability p_k = softmax(raw_prediction)_k against (y_true == k);
-    class 0 is treated as the baseline and receives no gradient.
-
-    Domain:
-    y_true in {0, 1, 2, 3, .., n_classes - 1}
-    y_pred in (0, 1)**n_classes, i.e. interval with boundaries excluded
-
-    Link:
-    y_pred = softmax(raw_prediction)
-
-    Note: Label encoding is built-in, i.e. {0, 1, 2, 3, .., n_classes - 1} is
-    mapped to (y_true == k) for k = 0 .. n_classes - 1, which is either 0 or 1.
-    """
-
-    # Note that we do not assume memory alignment/contiguity of 2d arrays.
-    # There seems to be little benefit in doing so. Benchmarks proving the
-    # opposite are welcome.
-    def loss(
-        self,
-        y_true,  # IN
-        raw_prediction,  # IN
-        sample_weight,  # IN
-        loss_out,  # OUT
-        n_threads=1,
-    ):
-        y_true = y_true.astype(int)
-        n_samples = y_true.shape[0]
-        n_classes = raw_prediction.shape[1]
-
-        # We assume n_samples > n_classes. In this case, having the inner loop
-        # over n_classes is a good default.
-        # TODO: If every memoryview is contiguous and raw_prediction is
-        # f-contiguous, can we write a better algorithm (loops) to improve
-        # performance?
-
-        for i in range(n_samples):
-            p, max_value, sum_exps = sum_exp_minus(i, raw_prediction)
-            log_sum_exps = np.log(sum_exps)
-            loss_out[i] = n_classes * log_sum_exps + max_value
-
-            for k in range(1, n_classes):
-                # label decode y_true
-                if y_true[i] == k:
-                    # -log(p_k) = log(sum_exps) + max_value - raw_prediction_k
-                    loss_out[i] -= raw_prediction[i, k]
-                else:
-                    # -log(1 - p_k) = log(sum_exps) - log(sum_exps - exp_p_k)
-                    loss_out[i] -= np.log(sum_exps - p[k])
-            if sample_weight is not None:
-                loss_out[i] *= sample_weight[i]
-
-        return np.asarray(loss_out)
-
-    def loss_gradient(
-        self,
-        y_true,  # IN
-        raw_prediction,  # IN
-        sample_weight,  # IN
-        loss_out,  # OUT
-        gradient_out,  # OUT
-        n_threads=1,
-    ):
-        y_true = y_true.astype(int)
-        n_samples = y_true.shape[0]
-        n_classes = raw_prediction.shape[1]
-        for i in range(n_samples):
-            p, max_value, sum_exps = sum_exp_minus(i, raw_prediction)
-            log_sum_exps = np.log(sum_exps)
-            # Same accumulation as in loss(); p is still unnormalized here.
-            loss_out[i] = n_classes * log_sum_exps + max_value
-            for k in range(1, n_classes):
-                # label decode y_true
-                if y_true[i] == k:
-                    loss_out[i] -= raw_prediction[i, k]
-                else:
-                    loss_out[i] -= np.log(sum_exps - p[k])
-            # Normalize only after the loss terms:
-            # p_k = y_pred_k = prob of class k.
-            p /= sum_exps
-            for k in range(1, n_classes):
-                # gradient_k = (p_k - (y_true == k)) * sw
-                gradient_out[i, k] = p[k] - 1 * (y_true[i] == k)
-            if sample_weight is not None:
-                loss_out[i] *= sample_weight[i]
-                gradient_out[i, :] *= sample_weight[i]
-        gradient_out[:, 0] = 0.0
-        return np.asarray(loss_out), np.asarray(gradient_out)
-
-    def gradient(
-        self,
-        y_true,  # IN
-        raw_prediction,  # IN
-        sample_weight,  # IN
-        gradient_out,  # OUT
-        n_threads=1,
-    ):
-        n_samples = y_true.shape[0]
-        n_classes = raw_prediction.shape[1]
-        for i in range(n_samples):
-            p, _, sum_exps = sum_exp_minus(i, raw_prediction)
-            p /= sum_exps  # p_k = y_pred_k = prob of class k
-            for k in range(1, n_classes):
-                gradient_out[i, k] = p[k] - 1 * (y_true[i] == k)
-            if sample_weight is not None:
-                gradient_out[i] *= sample_weight[i]
-
-        gradient_out[:, 0] = 0.0
-        return np.asarray(gradient_out)
-
-    def gradient_hessian(
-        self,
-        y_true,  # IN
-        raw_prediction,  # IN
-        sample_weight,  # IN
-        gradient_out,  # OUT
-        hessian_out,  # OUT
-        n_threads=1,
-    ):
-        y_true = y_true.astype(int)
-        n_samples = y_true.shape[0]
-        n_classes = raw_prediction.shape[1]
-        if sample_weight is None:
-            sample_weight = np.ones(n_samples)
-        for i in range(n_samples):
-            p, _, sum_exps = sum_exp_minus(i, raw_prediction)
-            p /= sum_exps
-
-            for k in range(1, n_classes):
-                # p_k = y_pred_k = prob of class k
-                # gradient_k = (p_k - (y_true == k)) * sw
-                # hessian_k = p_k * (1 - p_k) * sw
-                gradient_out[i, k] = (p[k] - 1 * (y_true[i] == k)) * sample_weight[i]
-                hessian_out[i, k] = p[k] * (1 - p[k]) * sample_weight[i]
-
-        gradient_out[:, 0] = 0.0
-        hessian_out[:, 0] = 0.0
-        return np.asarray(gradient_out), np.asarray(hessian_out)
-
-    # This method simplifies the implementation of hessp in linear models,
-    # i.e. the matrix-vector product of the full hessian, not only of the
-    # diagonal (in the classes) approximation as implemented above.
-    def gradient_proba(
-        self,
-        y_true,  # IN
-        raw_prediction,  # IN
-        sample_weight,  # IN
-        gradient_out,  # OUT
-        proba_out,  # OUT
-        n_threads=1,
-    ):
-        y_true = y_true.astype(int)
-        n_samples = y_true.shape[0]
-        n_classes = raw_prediction.shape[1]
-        if sample_weight is None:
-            sample_weight = np.ones(n_samples)
-        for i in range(n_samples):
-            p, _, sum_exps = sum_exp_minus(i, raw_prediction)
-            p /= sum_exps
-            proba_out[i, 0] = p[0]  # also fill the baseline class
-            for k in range(1, n_classes):
-                proba_out[i, k] = p[k]  # y_pred_k = prob of class k
-                # gradient_k = (p_k - (y_true == k)) * sw
-                gradient_out[i, k] = (p[k] - 1 * (y_true[i] == k)) * sample_weight[i]
-        gradient_out[:, 0] = 0.0
-        return np.asarray(gradient_out), np.asarray(proba_out)
-
-
-class MultinomialBinaryLoss(BaseLoss):
-    """Categorical cross-entropy loss, for multiclass classification.
-
-    Domain:
-    y_true in {0, 1, 2, 3, .., n_classes - 1}
-    y_pred has n_classes elements, each element in (0, 1)
-
-    Link:
-    y_pred = softmax(raw_prediction)
-
-    Note: We assume y_true to be already label encoded. The inverse link is
-    softmax, but the full link function is the symmetric multinomial logit
-    function.
-
-    For a given sample x_i, the categorical cross-entropy loss is defined as
-    the negative log-likelihood of the multinomial distribution; it
-    generalizes the binary cross-entropy to more than 2 classes::
-
-        loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
-                 - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)
-
-    See [1].
-
-    Note that for the hessian, we only calculate the diagonal part in the
-    classes: if the full hessian for classes k and l and sample i is H_i_k_l,
-    we calculate H_i_k_k, i.e. k=l.
-
-    References
-    ----------
-    .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
-        "A Blockwise Descent Algorithm for Group-penalized Multiresponse and
-        Multinomial Regression".
-        <1311.6529>`
-    """
-
-    is_multiclass = True
-
-    def __init__(self, sample_weight=None, n_classes=3):
-        self.n_classes = n_classes
-
-        if n_classes == 1:
-            super().__init__(
-                closs=CyHalfBinomialLoss(),
-                link=LogitLink(),
-                n_classes=1,
-            )
-            self.interval_y_true = Interval(0, 1, True, True)
-        else:
-            super().__init__(
-                closs=_MultinomialBinaryLoss(),
-                link=MultinomialLogit(),
-                n_classes=n_classes,
-            )
-            self.interval_y_true = Interval(0, n_classes, True, False)
-        # Set after super().__init__(), which would otherwise reset the
-        # intervals to their defaults.
-        self.interval_y_pred = Interval(0, 1, False, False)
-
-    def in_y_true_range(self, y):
-        """Return True if y is in the valid range of y_true.
-
-        Parameters
-        ----------
-        y : ndarray
-        """
-        return self.interval_y_true.includes(y) and np.all(y.astype(int) == y)
-
-    def fit_intercept_only(self, y_true, sample_weight=None):
-        """Compute raw_prediction of an intercept-only model.
-
-        This is the multinomial logit (the link) of the weighted average of
-        the target, i.e. over the samples axis=0.
-        """
-        if self.n_classes > 1:
-            out = np.zeros(self.n_classes, dtype=y_true.dtype)
-            eps = np.finfo(y_true.dtype).eps
-            for k in range(self.n_classes):
-                out[k] = np.average(y_true == k, weights=sample_weight, axis=0)
-                out[k] = np.clip(out[k], eps, 1 - eps)
-            return self.link.link(out[None, :]).reshape(-1)
-
-        # Binary case: defer to the BaseLoss implementation.
-        return super().fit_intercept_only(y_true, sample_weight=sample_weight)
-
-    def predict_proba(self, raw_prediction):
-        """Predict probabilities.
-
-        Parameters
-        ----------
-        raw_prediction : array of shape (n_samples, n_classes)
-            Raw prediction values (in link space).
-
-        Returns
-        -------
-        proba : array of shape (n_samples, n_classes)
-            Element-wise class probabilities.
-        """
-        if self.n_classes == 1:
-            if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
-                raw_prediction = raw_prediction.squeeze(1)
-            proba = np.empty(
-                (raw_prediction.shape[0], 2), dtype=raw_prediction.dtype
-            )
-            proba[:, 1] = self.link.inverse(raw_prediction)
-            proba[:, 0] = 1 - proba[:, 1]
-            return proba
-        return self.link.inverse(raw_prediction)
-
-    def constant_to_optimal_zero(self, y_true, sample_weight=None):
-        # Only meaningful for the binary case (n_classes == 1): the term is
-        # non-zero only if y_true is a probability strictly between 0 and 1.
-        term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true)
-        if sample_weight is not None:
-            term *= sample_weight
-        return term
-
-    def gradient_proba(
-        self,
-        y_true,
-        raw_prediction,
-        sample_weight=None,
-        gradient_out=None,
-        proba_out=None,
-        n_threads=1,
-    ):
-        """Compute gradient and class probabilities for raw_prediction.
-
-        Parameters
-        ----------
-        y_true : C-contiguous array of shape (n_samples,)
-            Observed, true target values.
-        raw_prediction : array of shape (n_samples, n_classes)
-            Raw prediction values (in link space).
-        sample_weight : None or C-contiguous array of shape (n_samples,)
-            Sample weights.
-        gradient_out : None or array of shape (n_samples, n_classes)
-            A location into which the gradient is stored. If None, a new array
-            might be created.
-        proba_out : None or array of shape (n_samples, n_classes)
-            A location into which the class probabilities are stored. If None,
-            a new array might be created.
-        n_threads : int, default=1
-            Might use openmp thread parallelism.
-
-        Returns
-        -------
-        gradient : array of shape (n_samples, n_classes)
-            Element-wise gradients.
-
-        proba : array of shape (n_samples, n_classes)
-            Element-wise class probabilities.
-        """
-        if self.n_classes == 1:
-            # Binary case: compute the gradient with the BaseLoss
-            # implementation and the probabilities from the logit link.
-            gradient = self.gradient(
-                y_true,
-                raw_prediction,
-                sample_weight=sample_weight,
-                gradient_out=gradient_out,
-                n_threads=n_threads,
-            )
-            return gradient, self.predict_proba(raw_prediction)
-
-        if gradient_out is None:
-            if proba_out is None:
-                gradient_out = np.empty_like(raw_prediction)
-                proba_out = np.empty_like(raw_prediction)
-            else:
-                gradient_out = np.empty_like(proba_out)
-        elif proba_out is None:
-            proba_out = np.empty_like(gradient_out)
-
-        return self.closs.gradient_proba(
-            y_true=y_true,
-            raw_prediction=raw_prediction,
-            sample_weight=sample_weight,
-            gradient_out=gradient_out,
-            proba_out=proba_out,
-            n_threads=n_threads,
-        )
diff --git a/hazardous/tests/test_comp_risks_loss.py b/hazardous/tests/test_comp_risks_loss.py
deleted file mode 100644
index c9f8f47..0000000
--- a/hazardous/tests/test_comp_risks_loss.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import numpy as np
-import pytest
-from numpy.testing import assert_almost_equal
-
-from hazardous._comp_risks_loss import _MultinomialBinaryLoss, sum_exp_minus
-
-
-def test_multinomial_binary_loss():
-    """Test the multinomial binary loss function."""
-    # Test the loss function with a single observation.
-    y_true = np.array([1])
-    raw_prediction = np.array([[0.1, 0.8, 0.1]])
-    loss_out = np.empty(1)
-    sample_weight = np.array([1])
-
-    loss = _MultinomialBinaryLoss().loss(
-        y_true, raw_prediction, sample_weight, loss_out
-    )
-
-    exp_p, max_value, sum_exps = sum_exp_minus(0, raw_prediction)
-
-    assert max_value == 0.8
-    assert sum_exps == pytest.approx(1 + 2 * np.exp(-0.7))
-
-    proba = np.exp(raw_prediction)
-    proba /= proba.sum()
-    # One-vs-rest binary cross-entropy with y_true == 1, i.e.
-    # -log(1 - p_0) - log(p_1) - log(1 - p_2).
-    expected_loss = -np.log(np.abs(np.array([1, 0, 1]) - proba)).sum()
-
-    assert loss == pytest.approx(expected_loss)
-
-
-def test_multinomial_binary_gradient_hessian():
-    """Test the multinomial binary gradient and hessian functions."""
-    # Test the gradient and hessian with a single observation.
-    y_true = np.array([1])
-    raw_prediction = np.array([[0.1, 0.8, 0.1]])
-    gradient_out = np.empty((1, 3))
-    hessian_out = np.empty((1, 3))
-    sample_weight = np.array([1])
-
-    grad, hessian = _MultinomialBinaryLoss().gradient_hessian(
-        y_true, raw_prediction, sample_weight, gradient_out, hessian_out
-    )
-
-    proba = np.exp(raw_prediction[0] - 0.8)
-    proba /= proba.sum()
-
-    neg_g1 = proba[1] / (1 - proba[0])
-    neg_g2 = proba[1] / (1 - proba[2])
-    pos_g = 2 * proba[1] - 1
-    expected_grad = np.array([[neg_g1, pos_g, neg_g2]])
-
-    neg_h1 = proba[1] * (1 - proba[0] - proba[1])
-    neg_h2 = proba[1] * (1 - proba[2] - proba[1])
-    pos_h = 2 * proba[1] * (1 - proba[1])
-    expected_hessian = np.array([[neg_h1, pos_h, neg_h2]])
-
-    assert_almost_equal(grad, expected_grad)
-    assert_almost_equal(hessian, expected_hessian)
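
Two standalone sketches follow. Neither is part of the patch; the variable names in them are illustrative and only NumPy (plus SciPy for the second) is assumed. The first checks the log-sum-exp identities that the deleted loss method accumulates per class: after subtracting the per-sample maximum before exponentiating (the stabilization implemented by sum_exp_minus), the one-vs-rest terms -log(p_k) and -log(1 - p_k) of the softmax probabilities reduce to log(sum_exps) + max_value - raw[k] and log(sum_exps) - log(sum_exps - exp_p[k]), which is exactly what the inner loop subtracts.

import numpy as np

raw = np.array([0.1, 0.8, 0.1])  # raw_prediction for a single sample
max_value = raw.max()
exp_p = np.exp(raw - max_value)  # shifted exponentials, as in sum_exp_minus
sum_exps = exp_p.sum()
proba = exp_p / sum_exps  # softmax probabilities

for k in range(1, raw.shape[0]):
    # term accumulated when y_true == k: -log(p_k)
    assert np.isclose(-np.log(proba[k]), np.log(sum_exps) + max_value - raw[k])
    # term accumulated when y_true != k: -log(1 - p_k)
    assert np.isclose(
        -np.log(1 - proba[k]), np.log(sum_exps) - np.log(sum_exps - exp_p[k])
    )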
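
The second sketch checks, by central finite differences, the per-class derivatives quoted in the deleted comments, gradient_k = p_k - (y_true == k) and hessian_k = p_k * (1 - p_k): these are the gradient and the diagonal of the hessian of the multinomial cross-entropy stated in the class docstring, loss_i = log(sum(exp(raw_k), k)) - raw_y.

import numpy as np
from scipy.special import logsumexp, softmax


def multinomial_ce(y, raw):
    # loss_i = log(sum(exp(raw_k), k)) - raw_y, as in the class docstring
    return logsumexp(raw) - raw[y]


raw = np.array([0.1, 0.8, 0.1])
y = 1
proba = softmax(raw)
eps = 1e-4
for k in range(raw.shape[0]):
    step = np.zeros_like(raw)
    step[k] = eps
    f_plus = multinomial_ce(y, raw + step)
    f_minus = multinomial_ce(y, raw - step)
    f_zero = multinomial_ce(y, raw)
    # gradient_k = p_k - (y_true == k)
    fd_grad = (f_plus - f_minus) / (2 * eps)
    assert np.isclose(fd_grad, proba[k] - (y == k), atol=1e-6)
    # hessian_k = p_k * (1 - p_k), the diagonal (k == l) part only
    fd_hess = (f_plus - 2 * f_zero + f_minus) / eps**2
    assert np.isclose(fd_hess, proba[k] * (1 - proba[k]), atol=1e-4)

Note that the deleted class pairs the one-vs-rest loss() with this multinomial gradient and hessian; the sketch only verifies the gradient and hessian formulas themselves.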