Commit

Merge pull request #88 from arokem/vendorize_copt
Vendorize copt
arokem authored Sep 11, 2024
2 parents dee645c + 1a6bdb7 commit 16aee82
Showing 7 changed files with 972 additions and 7 deletions.
15 changes: 9 additions & 6 deletions groupyr/_base.py
@@ -1,7 +1,6 @@
"""Create base classes based on the sparse group lasso."""

import contextlib
import copt as cp
import numpy as np
import warnings

@@ -18,6 +17,10 @@
check_is_fitted,
)

from groupyr._copt.loss import SquareLoss, HuberLoss, LogLoss
import groupyr._copt.utils as cp_utils
from groupyr._copt.proximal_gradient import minimize_proximal_gradient

from ._prox import SparseGroupL1
from .utils import check_groups

@@ -212,14 +215,14 @@ def fit(self, X, y, loss="squared_loss"):
coef = np.zeros(n_features)

if loss == "huber":
f = cp.loss.HuberLoss(X, y)
f = HuberLoss(X, y)
elif loss == "log":
f = cp.loss.LogLoss(X, y)
f = LogLoss(X, y)
else:
f = cp.loss.SquareLoss(X, y)
f = SquareLoss(X, y)

if self.include_solver_trace:
self.solver_trace_ = cp.utils.Trace(f)
self.solver_trace_ = cp_utils.Trace(f)
else:
self.solver_trace_ = None

@@ -255,7 +258,7 @@ def fit(self, X, y, loss="squared_loss"):
if self.suppress_solver_warnings:
warnings.filterwarnings("ignore", category=RuntimeWarning)

pgd = cp.minimize_proximal_gradient(
pgd = minimize_proximal_gradient(
f.f_grad,
coef,
sg1.prox,
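The hunk above swaps the external copt dependency for the vendored copies under groupyr._copt: the loss objects keep the same f_grad interface, and the solver keeps the same positional arguments (objective gradient, initial coefficients, proximal operator from SparseGroupL1). Below is a minimal, self-contained sketch of that call pattern, not groupyr's actual fit code: the keyword arguments after sg1.prox are cut off in this hunk, so jac=True is an assumption here, and a plain L1 soft-thresholding prox stands in for SparseGroupL1(...).prox.

import numpy as np

from groupyr._copt.loss import SquareLoss
from groupyr._copt.proximal_gradient import minimize_proximal_gradient

rng = np.random.default_rng(0)
X = rng.normal(size=(30, 6))
y = X @ np.array([1.5, 0.0, 0.0, -2.0, 0.0, 0.0]) + 0.05 * rng.normal(size=30)

f = SquareLoss(X, y)  # previously copt.loss.SquareLoss


def l1_prox(v, step_size, reg=0.1):
    # soft-thresholding; a stand-in for groupyr's SparseGroupL1(...).prox
    return np.sign(v) * np.maximum(np.abs(v) - reg * step_size, 0.0)


result = minimize_proximal_gradient(  # previously copt.minimize_proximal_gradient
    f.f_grad,              # returns (loss, gradient)
    np.zeros(X.shape[1]),  # initial coefficients
    l1_prox,               # proximal operator, called as prox(v, step_size)
    jac=True,              # assumed: f_grad already returns the gradient
)
print(result.x)            # sparse coefficient estimate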
32 changes: 32 additions & 0 deletions groupyr/_copt/__init__.py
@@ -0,0 +1,32 @@
# Components of the copt package (https://github.com/openopt/copt) are used in
# this software. The copt package is distributed under the following license:

# Copyright (c) 2007–2018 Fabian Pedregosa.
# All rights reserved.


# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# a. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# b. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# c. Neither the name of the Scikit-learn Developers nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written
# permission.


# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.

from .proximal_gradient import minimize_proximal_gradient
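The import at the end of this file re-exports the vendored solver at the subpackage level. As a small illustrative check (not code from the repository), either import path below should resolve to the same function:

from groupyr._copt import minimize_proximal_gradient as from_package
from groupyr._copt.proximal_gradient import minimize_proximal_gradient as from_module

assert from_package is from_module  # __init__ simply re-exports the module-level function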
306 changes: 306 additions & 0 deletions groupyr/_copt/loss.py
@@ -0,0 +1,306 @@
import numpy as np
from scipy import sparse, special
from scipy.sparse import linalg as splinalg
from sklearn.utils.extmath import safe_sparse_dot

from . import utils

# import prange


class LogLoss:
r"""Logistic loss function.
The logistic loss function is defined as
.. math::
-\frac{1}{n}\sum_{i=1}^n b_i \log(\sigma(\bs{a}_i^T \bs{x}))
+ (1 - b_i) \log(1 - \sigma(\bs{a}_i^T \bs{x}))
where :math:`\sigma` is the sigmoid function
:math:`\sigma(t) = 1/(1 + e^{-t})`.
The input vector b verifies :math:`0 \leq b_i \leq 1`. When it comes from
class labels, it should have the values 0 or 1.
References:
http://fa.bianp.net/blog/2019/evaluate_logistic/
"""

def __init__(self, A, b, alpha=0.0):
if A is None:
A = sparse.eye(b.size, b.size, format="csr")
self.A = A
if np.max(b) > 1 or np.min(b) < 0:
raise ValueError("b can only contain values between 0 and 1 ")
if not A.shape[0] == b.size:
raise ValueError("Dimensions of A and b do not coincide")
self.b = b
self.alpha = alpha
self.intercept = False

def __call__(self, x):
return self.f_grad(x, return_gradient=False)

def _sigma(self, z, idx):
z0 = np.zeros_like(z)
tmp = np.exp(-z[idx])
z0[idx] = 1 / (1 + tmp)
tmp = np.exp(z[~idx])
z0[~idx] = tmp / (1 + tmp)
return z0

def logsig(self, x):
"""Compute log(1 / (1 + exp(-t))) component-wise."""
out = np.zeros_like(x)
idx0 = x < -33
out[idx0] = x[idx0]
idx1 = (x >= -33) & (x < -18)
out[idx1] = x[idx1] - np.exp(x[idx1])
idx2 = (x >= -18) & (x < 37)
out[idx2] = -np.log1p(np.exp(-x[idx2]))
idx3 = x >= 37
out[idx3] = -np.exp(-x[idx3])
return out

def expit_b(self, x, b):
"""Compute sigmoid(x) - b."""
idx = x < 0
out = np.zeros_like(x)
exp_x = np.exp(x[idx])
b_idx = b[idx]
out[idx] = ((1 - b_idx) * exp_x - b_idx) / (1 + exp_x)
exp_nx = np.exp(-x[~idx])
b_nidx = b[~idx]
out[~idx] = ((1 - b_nidx) - b_nidx * exp_nx) / (1 + exp_nx)
return out

def f_grad(self, x, return_gradient=True):
if self.intercept:
x_, c = x[:-1], x[-1]
else:
x_, c = x, 0.0
z = safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c
loss = np.mean((1 - self.b) * z - self.logsig(z))
penalty = safe_sparse_dot(x_.T, x_, dense_output=True).ravel()[0]
loss += 0.5 * self.alpha * penalty

if not return_gradient:
return loss

z0_b = self.expit_b(z, self.b)

grad = utils.safe_sparse_add(
self.A.T.dot(z0_b) / self.A.shape[0], self.alpha * x_
)
grad = np.asarray(grad).ravel()
grad_c = z0_b.mean()
        if self.intercept:
            # also return the loss so both branches share the (loss, grad) signature
            return loss, np.concatenate((grad, [grad_c]))

return loss, grad

def hessian_mv(self, x):
"""Return a callable that returns matrix-vector products with the Hessian."""

n_samples, n_features = self.A.shape
if self.intercept:
x_, c = x[:-1], x[-1]
else:
x_, c = x, 0.0

z = special.expit(safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c)

# The mat-vec product of the Hessian
d = z * (1 - z)
if sparse.issparse(self.A):
dX = safe_sparse_dot(
sparse.dia_matrix((d, 0), shape=(n_samples, n_samples)), self.A
)
else:
# Precompute as much as possible
dX = d[:, np.newaxis] * self.A

if self.intercept:
# Calculate the double derivative with respect to intercept
# In the case of sparse matrices this returns a matrix object.
dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))

def _Hs(s):
ret = np.empty_like(s)
ret[:n_features] = self.A.T.dot(dX.dot(s[:n_features]))
ret[:n_features] += self.alpha * s[:n_features]

# For the fit intercept case.
if self.intercept:
ret[:n_features] += s[-1] * dd_intercept
ret[-1] = dd_intercept.dot(s[:n_features])
ret[-1] += d.sum() * s[-1]
return ret / n_samples

return _Hs

def hessian_trace(self, x):
"""Return a callable that returns matrix-vector products with the Hessian."""

n_samples, n_features = self.A.shape
if self.intercept:
x_, c = x[:-1], x[-1]
else:
x_, c = x, 0.0

z = special.expit(safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c)

# The mat-vec product of the Hessian
d = z * (1 - z)
if sparse.issparse(self.A):
dX = safe_sparse_dot(
sparse.dia_matrix((d, 0), shape=(n_samples, n_samples)), self.A
)
else:
# Precompute as much as possible
dX = d[:, np.newaxis] * self.A

if self.intercept:
# Calculate the double derivative with respect to intercept
# In the case of sparse matrices this returns a matrix object.
dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))

def _Hs(s):
ret = np.empty_like(s)
ret[:n_features] = self.A.T.dot(dX.dot(s[:n_features]))
ret[:n_features] += self.alpha * s[:n_features]

# For the fit intercept case.
if self.intercept:
ret[:n_features] += s[-1] * dd_intercept
ret[-1] = dd_intercept.dot(s[:n_features])
ret[-1] += d.sum() * s[-1]
return ret / n_samples

return _Hs

@property
def partial_deriv(self):
"""Note: this will ignore the regularization parameter alpha"""

@utils.njit(parallel=True)
def log_deriv(p, y):
# derivative of logistic loss
# same as in lightning (with minus sign)
out = np.zeros_like(p)
for i in utils.prange(p.size):
if p[i] < 0:
exp_p = np.exp(p[i])
out[i] = ((1 - y[i]) * exp_p - y[i]) / (1 + exp_p)
else:
exp_nx = np.exp(-p[i])
out[i] = ((1 - y[i]) - y[i] * exp_nx) / (1 + exp_nx)
return out

return log_deriv

@property
def lipschitz(self):
s = splinalg.svds(self.A, k=1, return_singular_vectors=False)[0]
return 0.25 * (s * s) / self.A.shape[0] + self.alpha

@property
def max_lipschitz(self):
from sklearn.utils.extmath import row_norms

max_squared_sum = row_norms(self.A, squared=True).max()

return 0.25 * max_squared_sum + self.alpha


class SquareLoss:
r"""Squared loss.
The Squared loss is defined as
.. math::
\frac{1}{2n}\|A x - b\|^2 + \frac{1}{2} \alpha \|x\|^2
where :math:`\|\cdot\|` is the euclidean norm.
"""

def __init__(self, A, b, alpha=0):
if A is None:
A = sparse.eye(b.size, b.size, format="csr")
self.b = b
self.alpha = alpha
self.A = A
self.name = "square"

def __call__(self, x):
z = safe_sparse_dot(self.A, x, dense_output=True).ravel() - self.b
pen = self.alpha * safe_sparse_dot(x.T, x, dense_output=True).ravel()[0]
return 0.5 * (z * z).mean() + 0.5 * pen

def f_grad(self, x, return_gradient=True):
z = safe_sparse_dot(self.A, x, dense_output=True).ravel() - self.b
pen = self.alpha * safe_sparse_dot(x.T, x, dense_output=True).ravel()[0]
loss = 0.5 * (z * z).mean() + 0.5 * pen
if not return_gradient:
return loss
grad = utils.safe_sparse_add(
self.A.T.dot(z) / self.A.shape[0], self.alpha * x.T
)
return loss, np.asarray(grad).ravel()

@property
def partial_deriv(self):
@utils.njit
def square_deriv(p, y):
return p - y

return square_deriv

@property
def lipschitz(self):
s = splinalg.svds(self.A, k=1, return_singular_vectors=False)[0]
return (s * s) / self.A.shape[0] + self.alpha

@property
def max_lipschitz(self):
from sklearn.utils.extmath import row_norms

max_squared_sum = row_norms(self.A, squared=True).max()

return max_squared_sum + self.alpha


class HuberLoss:
"""Huber loss"""

def __init__(self, A, b, alpha=0, delta=1):
self.delta = delta
self.A = A
self.b = b
self.alpha = alpha
self.name = "huber"

def __call__(self, x):
return self.f_grad(x, return_gradient=False)

def f_grad(self, x, return_gradient=True):
z = safe_sparse_dot(self.A, x, dense_output=True).ravel() - self.b
idx = np.abs(z) < self.delta
loss = 0.5 * np.sum(z[idx] * z[idx])
loss += np.sum(self.delta * (np.abs(z[~idx]) - 0.5 * self.delta))
loss = (
loss / z.size
+ 0.5 * self.alpha * safe_sparse_dot(x.T, x, dense_output=True).ravel()[0]
)
if not return_gradient:
return loss
grad = self.A[idx].T.dot(z[idx]) / self.A.shape[0] + self.alpha * x.T
grad = np.asarray(grad)
grad += self.A[~idx].T.dot(self.delta * np.sign(z[~idx])) / self.A.shape[0]
return loss, np.asarray(grad).ravel()

@property
def lipschitz(self):
s = splinalg.svds(self.A, k=1, return_singular_vectors=False)[0]
return (s * s) / self.A.shape[0] + self.alpha
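For reference, the three vendored loss classes above share one small interface: calling the object returns the loss value, f_grad returns a (loss, gradient) pair, and the lipschitz property gives a smoothness bound used for step sizes. The snippet below exercises that interface on synthetic data and compares the SquareLoss gradient against a finite-difference estimate; the shapes, seed, and data are arbitrary illustrations, not tests from the repository.

import numpy as np
from scipy.optimize import check_grad

from groupyr._copt.loss import SquareLoss, HuberLoss, LogLoss

rng = np.random.default_rng(42)
A = rng.normal(size=(50, 4))
b_reg = A @ np.array([1.0, -2.0, 0.0, 0.5]) + 0.1 * rng.normal(size=50)
b_clf = (b_reg > 0).astype(float)  # LogLoss requires targets in [0, 1]
x = rng.normal(size=4)

for f in (SquareLoss(A, b_reg), HuberLoss(A, b_reg, delta=1.0), LogLoss(A, b_clf)):
    loss, grad = f.f_grad(x)       # objective value and gradient together
    assert np.isclose(loss, f(x))  # __call__ returns the value alone
    print(type(f).__name__, loss, grad.shape, f.lipschitz)

# finite-difference sanity check of the SquareLoss gradient (should print a small number)
sq = SquareLoss(A, b_reg)
print(check_grad(lambda w: sq.f_grad(w, return_gradient=False),
                 lambda w: sq.f_grad(w)[1], x))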