Vendorize copt #88

Merged · 5 commits · Sep 11, 2024

15 changes: 9 additions & 6 deletions groupyr/_base.py
@@ -1,7 +1,6 @@
 """Create base classes based on the sparse group lasso."""

 import contextlib
-import copt as cp
 import numpy as np
 import warnings

@@ -18,6 +17,10 @@
     check_is_fitted,
 )

+from groupyr._copt.loss import SquareLoss, HuberLoss, LogLoss
+import groupyr._copt.utils as cp_utils
+from groupyr._copt.proximal_gradient import minimize_proximal_gradient
+
 from ._prox import SparseGroupL1
 from .utils import check_groups

@@ -212,14 +215,14 @@ def fit(self, X, y, loss="squared_loss"):
         coef = np.zeros(n_features)

         if loss == "huber":
-            f = cp.loss.HuberLoss(X, y)
+            f = HuberLoss(X, y)
         elif loss == "log":
-            f = cp.loss.LogLoss(X, y)
+            f = LogLoss(X, y)
         else:
-            f = cp.loss.SquareLoss(X, y)
+            f = SquareLoss(X, y)

         if self.include_solver_trace:
-            self.solver_trace_ = cp.utils.Trace(f)
+            self.solver_trace_ = cp_utils.Trace(f)
         else:
             self.solver_trace_ = None

@@ -255,7 +258,7 @@ def fit(self, X, y, loss="squared_loss"):
             if self.suppress_solver_warnings:
                 warnings.filterwarnings("ignore", category=RuntimeWarning)

-            pgd = cp.minimize_proximal_gradient(
+            pgd = minimize_proximal_gradient(
                 f.f_grad,
                 coef,
                 sg1.prox,
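
Note: the edit above only swaps import paths; the vendored HuberLoss/LogLoss/SquareLoss, Trace, and minimize_proximal_gradient are intended as drop-in replacements for their copt counterparts, so the call pattern in fit is unchanged. Below is a minimal sketch of that call pattern for orientation. The toy data, the soft-threshold stand-in for SparseGroupL1.prox, and the jac=True / callback= arguments (based on the upstream copt API, not visible in this diff) are illustrative assumptions, not groupyr's actual configuration.

# Hypothetical, self-contained sketch (not part of this PR) of the solver
# call that _base.py makes after vendorization.
import numpy as np

from groupyr._copt.loss import SquareLoss
import groupyr._copt.utils as cp_utils
from groupyr._copt.proximal_gradient import minimize_proximal_gradient

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 6))
y = X @ np.array([1.5, -2.0, 0.0, 0.0, 3.0, 0.0]) + 0.1 * rng.normal(size=50)

f = SquareLoss(X, y)        # vendored loss, same interface as copt.loss.SquareLoss
trace = cp_utils.Trace(f)   # optional: records objective values per iteration

def l1_prox(x, step_size):
    # stand-in for SparseGroupL1.prox (plain soft-thresholding, arbitrary penalty)
    return np.sign(x) * np.maximum(np.abs(x) - 0.05 * step_size, 0.0)

res = minimize_proximal_gradient(
    f.f_grad,                 # returns (loss, gradient)
    np.zeros(X.shape[1]),     # initial coefficients, as in fit()
    l1_prox,
    jac=True,                 # f_grad supplies the gradient (copt convention)
    callback=trace,
)
print(res.x)                  # sparse-ish coefficient estimate
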
32 changes: 32 additions & 0 deletions groupyr/_copt/__init__.py
@@ -0,0 +1,32 @@
# Components of the copt package (https://github.com/openopt/copt) are used in
# this software. The copt package is distributed under the following license:

# Copyright (c) 2007–2018 Fabian Pedregosa.
# All rights reserved.


# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# a. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# b. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# c. Neither the name of the Scikit-learn Developers nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written
# permission.


# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.

from .proximal_gradient import minimize_proximal_gradient
306 changes: 306 additions & 0 deletions groupyr/_copt/loss.py
@@ -0,0 +1,306 @@
import numpy as np
from scipy import sparse, special
from scipy.sparse import linalg as splinalg
from sklearn.utils.extmath import safe_sparse_dot

from . import utils

# import prange


class LogLoss:
r"""Logistic loss function.

The logistic loss function is defined as

.. math::
        -\frac{1}{n}\sum_{i=1}^n b_i \log(\sigma(\boldsymbol{a}_i^T \boldsymbol{x}))
        + (1 - b_i) \log(1 - \sigma(\boldsymbol{a}_i^T \boldsymbol{x}))

where :math:`\sigma` is the sigmoid function
:math:`\sigma(t) = 1/(1 + e^{-t})`.

    The input vector b satisfies :math:`0 \leq b_i \leq 1`. When it comes from
class labels, it should have the values 0 or 1.

References:
http://fa.bianp.net/blog/2019/evaluate_logistic/
"""

def __init__(self, A, b, alpha=0.0):
if A is None:
A = sparse.eye(b.size, b.size, format="csr")
self.A = A
if np.max(b) > 1 or np.min(b) < 0:
raise ValueError("b can only contain values between 0 and 1 ")
if not A.shape[0] == b.size:
raise ValueError("Dimensions of A and b do not coincide")
self.b = b
self.alpha = alpha
self.intercept = False

def __call__(self, x):
return self.f_grad(x, return_gradient=False)

def _sigma(self, z, idx):
z0 = np.zeros_like(z)
tmp = np.exp(-z[idx])
z0[idx] = 1 / (1 + tmp)
tmp = np.exp(z[~idx])
z0[~idx] = tmp / (1 + tmp)
return z0

def logsig(self, x):
"""Compute log(1 / (1 + exp(-t))) component-wise."""
out = np.zeros_like(x)
idx0 = x < -33
out[idx0] = x[idx0]
idx1 = (x >= -33) & (x < -18)
out[idx1] = x[idx1] - np.exp(x[idx1])
idx2 = (x >= -18) & (x < 37)
out[idx2] = -np.log1p(np.exp(-x[idx2]))
idx3 = x >= 37
out[idx3] = -np.exp(-x[idx3])
return out

def expit_b(self, x, b):
"""Compute sigmoid(x) - b."""
idx = x < 0
out = np.zeros_like(x)
exp_x = np.exp(x[idx])
b_idx = b[idx]
out[idx] = ((1 - b_idx) * exp_x - b_idx) / (1 + exp_x)
exp_nx = np.exp(-x[~idx])
b_nidx = b[~idx]
out[~idx] = ((1 - b_nidx) - b_nidx * exp_nx) / (1 + exp_nx)
return out

def f_grad(self, x, return_gradient=True):
if self.intercept:
x_, c = x[:-1], x[-1]
else:
x_, c = x, 0.0
z = safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c
loss = np.mean((1 - self.b) * z - self.logsig(z))
penalty = safe_sparse_dot(x_.T, x_, dense_output=True).ravel()[0]
loss += 0.5 * self.alpha * penalty

if not return_gradient:
return loss

z0_b = self.expit_b(z, self.b)

grad = utils.safe_sparse_add(
self.A.T.dot(z0_b) / self.A.shape[0], self.alpha * x_
)
grad = np.asarray(grad).ravel()
grad_c = z0_b.mean()
        if self.intercept:
            return loss, np.concatenate((grad, [grad_c]))

return loss, grad

def hessian_mv(self, x):
"""Return a callable that returns matrix-vector products with the Hessian."""

n_samples, n_features = self.A.shape
if self.intercept:
x_, c = x[:-1], x[-1]
else:
x_, c = x, 0.0

z = special.expit(safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c)

# The mat-vec product of the Hessian
d = z * (1 - z)
if sparse.issparse(self.A):
dX = safe_sparse_dot(
sparse.dia_matrix((d, 0), shape=(n_samples, n_samples)), self.A
)
else:
# Precompute as much as possible
dX = d[:, np.newaxis] * self.A

if self.intercept:
# Calculate the double derivative with respect to intercept
# In the case of sparse matrices this returns a matrix object.
dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))

def _Hs(s):
ret = np.empty_like(s)
ret[:n_features] = self.A.T.dot(dX.dot(s[:n_features]))
ret[:n_features] += self.alpha * s[:n_features]

# For the fit intercept case.
if self.intercept:
ret[:n_features] += s[-1] * dd_intercept
ret[-1] = dd_intercept.dot(s[:n_features])
ret[-1] += d.sum() * s[-1]
return ret / n_samples

return _Hs

def hessian_trace(self, x):
"""Return a callable that returns matrix-vector products with the Hessian."""

n_samples, n_features = self.A.shape
if self.intercept:
x_, c = x[:-1], x[-1]
else:
x_, c = x, 0.0

z = special.expit(safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c)

# The mat-vec product of the Hessian
d = z * (1 - z)
if sparse.issparse(self.A):
dX = safe_sparse_dot(
sparse.dia_matrix((d, 0), shape=(n_samples, n_samples)), self.A
)
else:
# Precompute as much as possible
dX = d[:, np.newaxis] * self.A

if self.intercept:
# Calculate the double derivative with respect to intercept
# In the case of sparse matrices this returns a matrix object.
dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))

def _Hs(s):
ret = np.empty_like(s)
ret[:n_features] = self.A.T.dot(dX.dot(s[:n_features]))
ret[:n_features] += self.alpha * s[:n_features]

# For the fit intercept case.
if self.intercept:
ret[:n_features] += s[-1] * dd_intercept
ret[-1] = dd_intercept.dot(s[:n_features])
ret[-1] += d.sum() * s[-1]
return ret / n_samples

return _Hs

@property
def partial_deriv(self):
"""Note: this will ignore the regularization parameter alpha"""

@utils.njit(parallel=True)
def log_deriv(p, y):
# derivative of logistic loss
# same as in lightning (with minus sign)
out = np.zeros_like(p)
for i in utils.prange(p.size):
if p[i] < 0:
exp_p = np.exp(p[i])
out[i] = ((1 - y[i]) * exp_p - y[i]) / (1 + exp_p)
else:
exp_nx = np.exp(-p[i])
out[i] = ((1 - y[i]) - y[i] * exp_nx) / (1 + exp_nx)
return out

return log_deriv

@property
def lipschitz(self):
s = splinalg.svds(self.A, k=1, return_singular_vectors=False)[0]
return 0.25 * (s * s) / self.A.shape[0] + self.alpha

@property
def max_lipschitz(self):
from sklearn.utils.extmath import row_norms

max_squared_sum = row_norms(self.A, squared=True).max()

return 0.25 * max_squared_sum + self.alpha


class SquareLoss:
r"""Squared loss.

    The squared loss is defined as

    .. math::
        \frac{1}{2n}\|A x - b\|^2 + \frac{1}{2} \alpha \|x\|^2

    where :math:`\|\cdot\|` is the Euclidean norm.
"""

def __init__(self, A, b, alpha=0):
if A is None:
A = sparse.eye(b.size, b.size, format="csr")
self.b = b
self.alpha = alpha
self.A = A
self.name = "square"

def __call__(self, x):
z = safe_sparse_dot(self.A, x, dense_output=True).ravel() - self.b
pen = self.alpha * safe_sparse_dot(x.T, x, dense_output=True).ravel()[0]
return 0.5 * (z * z).mean() + 0.5 * pen

def f_grad(self, x, return_gradient=True):
z = safe_sparse_dot(self.A, x, dense_output=True).ravel() - self.b
pen = self.alpha * safe_sparse_dot(x.T, x, dense_output=True).ravel()[0]
loss = 0.5 * (z * z).mean() + 0.5 * pen
if not return_gradient:
return loss
grad = utils.safe_sparse_add(
self.A.T.dot(z) / self.A.shape[0], self.alpha * x.T
)
return loss, np.asarray(grad).ravel()

@property
def partial_deriv(self):
@utils.njit
def square_deriv(p, y):
return p - y

return square_deriv

@property
def lipschitz(self):
s = splinalg.svds(self.A, k=1, return_singular_vectors=False)[0]
return (s * s) / self.A.shape[0] + self.alpha

@property
def max_lipschitz(self):
from sklearn.utils.extmath import row_norms

max_squared_sum = row_norms(self.A, squared=True).max()

return max_squared_sum + self.alpha


class HuberLoss:
"""Huber loss"""

def __init__(self, A, b, alpha=0, delta=1):
self.delta = delta
self.A = A
self.b = b
self.alpha = alpha
self.name = "huber"

def __call__(self, x):
return self.f_grad(x, return_gradient=False)

def f_grad(self, x, return_gradient=True):
z = safe_sparse_dot(self.A, x, dense_output=True).ravel() - self.b
idx = np.abs(z) < self.delta
loss = 0.5 * np.sum(z[idx] * z[idx])
loss += np.sum(self.delta * (np.abs(z[~idx]) - 0.5 * self.delta))
loss = (
loss / z.size
+ 0.5 * self.alpha * safe_sparse_dot(x.T, x, dense_output=True).ravel()[0]
)
if not return_gradient:
return loss
grad = self.A[idx].T.dot(z[idx]) / self.A.shape[0] + self.alpha * x.T
grad = np.asarray(grad)
grad += self.A[~idx].T.dot(self.delta * np.sign(z[~idx])) / self.A.shape[0]
return loss, np.asarray(grad).ravel()

@property
def lipschitz(self):
s = splinalg.svds(self.A, k=1, return_singular_vectors=False)[0]
return (s * s) / self.A.shape[0] + self.alpha
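
Since each loss's f_grad returns a (loss, gradient) pair, a quick way to sanity-check the vendored module is a finite-difference comparison against the analytic gradients. This is a minimal sketch for illustration only (it assumes the vendored groupyr._copt package is importable); it is not part of the PR's test suite.

import numpy as np
from scipy.optimize import check_grad

from groupyr._copt.loss import SquareLoss, HuberLoss, LogLoss

rng = np.random.default_rng(42)
A = rng.normal(size=(40, 5))
b_reg = A @ rng.normal(size=5) + 0.1 * rng.normal(size=40)
b_clf = (b_reg > np.median(b_reg)).astype(float)  # {0, 1} labels, as LogLoss requires

x0 = rng.normal(size=5)
for f in (SquareLoss(A, b_reg), HuberLoss(A, b_reg), LogLoss(A, b_clf)):
    err = check_grad(
        lambda x: f.f_grad(x, return_gradient=False),  # objective value only
        lambda x: f.f_grad(x)[1],                      # analytic gradient
        x0,
    )
    print(type(f).__name__, err)  # each error should be small (roughly 1e-6 or less)
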