Skip to content

Commit

Permalink
Randomly pivoted Cholesky and positive definite kernels (supersedes PR
Browse files Browse the repository at this point in the history
…#85) (#100)

Add initial support for positive definite kernels (see
/misc/rl_pdkernels.hh).
* Various helper functions for evaluating the Gaussian kernel (or any
kernel that primarily relies on the Euclidean distance matrix).
* Add a block_arrowhead_multiply function to avoid duplicate kernel
evaluations when accessing a kernel matrix as a black-box linear
operator.
* Add an RBFKernelMatrix class that represents a collection of
regularized RBF kernel matrices. This satisfies the
SymmetricLinearOperator concept.


add randomly pivoted Cholesky (see /comps/rl_rpchol.hh):
 * Utility functions
* compute_columns evaluates kernel matrices in parallel where the kernel
matrix is specified by a function handle callable as ``K_stateless(i,
j)``.
   * pack_selected_rows
   * downdate_d_and_cdf
* rp_cholesky: implements Algorithm 4 from
https://arxiv.org/abs/2304.12465.

Add a new driver, krill_full_rpchol, in ``drivers/rl_krill.hh``, which
basically implements
[arXiv:2304.12465's](https://arxiv.org/pdf/2304.12465) Algorithm 1.
  • Loading branch information
rileyjmurray authored Feb 13, 2025
1 parent 8ea1196 commit 66ac027
Show file tree
Hide file tree
Showing 11 changed files with 1,299 additions and 1 deletion.
3 changes: 3 additions & 0 deletions RandLAPACK.hh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "RandLAPACK/misc/rl_util.hh"
#include "RandLAPACK/misc/rl_linops.hh"
#include "RandLAPACK/misc/rl_gen.hh"
#include "RandLAPACK/misc/rl_pdkernels.hh"

// Computational routines
#include "RandLAPACK/comps/rl_determiter.hh"
Expand All @@ -20,13 +21,15 @@
#include "RandLAPACK/comps/rl_syps.hh"
#include "RandLAPACK/comps/rl_syrf.hh"
#include "RandLAPACK/comps/rl_orth.hh"
#include "RandLAPACK/comps/rl_rpchol.hh"

// Drivers
#include "RandLAPACK/drivers/rl_rsvd.hh"
#include "RandLAPACK/drivers/rl_cqrrpt.hh"
#include "RandLAPACK/drivers/rl_bqrrp.hh"
#include "RandLAPACK/drivers/rl_revd2.hh"
#include "RandLAPACK/drivers/rl_rbki.hh"
#include "RandLAPACK/drivers/rl_krill.hh"

// Cuda functions - issues with linking/visibility when present if the below is uncommented.
// A temporary fix is to add the below directly in the test/benchmark files.
Expand Down
2 changes: 2 additions & 0 deletions RandLAPACK/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@ set(RandLAPACK_cxx_sources
rl_rf.hh
rl_syps.hh
rl_syrf.hh
rl_rpchol.hh
rl_gen.hh
rl_blaspp.hh
rl_linops.hh
rl_pdkernels.hh

rl_cusolver.hh
rl_cuda_kernels.cuh
Expand Down
23 changes: 23 additions & 0 deletions RandLAPACK/comps/rl_preconditioners.hh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "rl_orth.hh"
#include "rl_syps.hh"
#include "rl_syrf.hh"
#include "rl_rpchol.hh"
#include "rl_revd2.hh"

#include <RandBLAS.hh>
Expand Down Expand Up @@ -337,4 +338,26 @@ RandBLAS::RNGState<RNG> nystrom_pc_data(
}


/**
* TODO: make an overload of rpchol_pc_data that omits "n" and assumes A implements
* some linear operator interface.
*/

template <typename T, typename STATE, typename FUNC>
STATE rpchol_pc_data(
int64_t n, FUNC &A_stateless, int64_t &k, int64_t b, T* V, T* eigvals, STATE state
) {
std::vector<int64_t> selection(k, -1);
state = RandLAPACK::rp_cholesky(n, A_stateless, k, selection.data(), V, b, state);
// ^ A_stateless \approx VV'; need to convert VV' into its eigendecomposition.
std::vector<T> work(k*k, 0.0);
lapack::gesdd(lapack::Job::OverwriteVec, n, k, V, n, eigvals, nullptr, 1, work.data(), k);
// V has been overwritten with its (nontrivial) left singular vectors
for (int64_t i = 0; i < k; ++i)
eigvals[i] = std::pow(eigvals[i], 2);
return state;
}



} // end namespace RandLAPACK
201 changes: 201 additions & 0 deletions RandLAPACK/comps/rl_rpchol.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#pragma once

#include "rl_lapackpp.hh"
#include <blas.hh>
#include <RandBLAS.hh>
#include <algorithm>
#include <vector>
#include <set>

namespace RandLAPACK {

namespace _rpchol_impl {

using std::vector;
using blas::Layout;

template <typename T, typename FUNC_T>
void compute_columns(
Layout layout, int64_t N, FUNC_T &K_stateless, vector<int64_t> &col_indices, T* buff
) {
randblas_require(layout == Layout::ColMajor);
int64_t num_cols = col_indices.size();
#pragma omp parallel for collapse(2)
for (int64_t ell = 0; ell < num_cols; ++ell) {
for (int64_t i = 0; i < N; ++i) {
int64_t j = col_indices[ell];
buff[i + ell*N] = K_stateless(i, j);
}
}
return;
}

template <typename T>
void pack_selected_rows(
Layout layout, int64_t rows_mat, int64_t cols_mat, T* mat, vector<int64_t> &row_indices, T* submat
) {
randblas_require(layout == Layout::ColMajor);
int64_t num_rows = row_indices.size();
for (int64_t i = 0; i < num_rows; ++i) {
blas::copy(cols_mat, mat + row_indices[i], rows_mat, submat + i, num_rows);
}
return;
}

template <typename T>
int downdate_d_and_cdf(Layout layout, int64_t N, vector<int64_t> &indices, T* F_panel, vector<T> &d, vector<T> &cdf) {
randblas_require(layout == Layout::ColMajor);
int64_t cols_F_panel = indices.size();
for (int64_t j = 0; j < cols_F_panel; ++j) {
for (int64_t i = 0; i < N; ++i) {
T val = F_panel[i + j*N];
d[i] -= val*val;
}
}
// Then, to accound for the possibility of rounding errors, manually zero-out everything in "indices."
for (auto i : indices)
d[i] = 0.0;
cdf = d;
try {
RandBLAS::weights_to_cdf(N, cdf.data());
} catch(RandBLAS::Error &e) {
std::string message{e.what()};
if (message.find("sum >=") != std::string::npos) {
return 1;
} else if (message.find("val >= error_if_below") != std::string::npos) {
return 2;
}
}
return 0;
}

} // end namespace RandLAPACK::_rpchol_impl

/***
* Computes a rank-k approximation of an implicit n-by-n matrix whose (i,j)^{th}
* entry is A_stateless(i,j), where A_stateless is a stateless function. We build
* the approximation iteratively and increase the rank by at most "b" at each iteration.
*
* Implements Algorithm 4 from https://arxiv.org/abs/2304.12465.
*
* Here's example code where the implict matrix is given by a squared exponential kernel:
*
* // Assume we've already defined ...
* // X : a rows_x by cols_x double-precision matrix (suitably standardized)
* // where each column defines a datapoint.
* // bandwidth : scale for the squared exponential kernel
*
* auto A = [X, rows_x, cols_x, bandwidth](int64_t i, int64_t j) {
* double out = 0;
* double* Xi = X + i*rows_x;
* double* Xj = X + j*rows_x;
* for (int64_t ell = 0; ell < rows_x) {
* double val = (Xi[ell] - Xj[ell]) / (std::sqrt(2)*bandwidth);
* out += val*val;
* }
* out = std::exp(out);
* return out;
* };
* std::vector<double> F(rows_x*k, 0.0);
* std::vector<int64_t> selection(k);
* RandBLAS::RNGState state_in(0);
* auto state_out = rp_cholesky(cols_x, A, k, selection.data(), F.data(), 64, state_in);
*
* Notes
* -----
* Compare to
* https://github.com/eepperly/Robust-randomized-preconditioning-for-kernel-ridge-regression/blob/main/code/choleskybase.m
*
*/
template <typename T, typename FUNC_T, typename STATE, typename CALLBACK>
STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F, int64_t b, STATE state, CALLBACK &cb) {
// TODO: make this function robust to rank-deficient matrices.
using RandBLAS::sample_indices_iid;
using RandBLAS::weights_to_cdf;
using blas::Op;
using blas::Uplo;
using std::cout;
auto layout = blas::Layout::ColMajor;
auto uplo = blas::Uplo::Upper;

std::vector<T> work_mat(b*k, 0.0);
std::vector<T> d(n, 0.0);
std::vector<T> cdf(n);

std::vector<int64_t> Sprime{};

for (int64_t i = 0; i < n; ++i)
d[i] = A_stateless(i,i);
cdf = d;
weights_to_cdf(n, cdf.data());
int w_status = 0;
int c_status = 0;
int64_t ell = 0;
while (ell < k && w_status == 0 && c_status == 0) {
//
// 1. Compute the next block of column indices
//
int64_t curr_B = std::min(b, k - ell);
Sprime.resize(curr_B);
state = sample_indices_iid(n, cdf.data(), curr_B, Sprime.data(), state);
std::sort( Sprime.begin(), Sprime.end() );
Sprime.erase( unique( Sprime.begin(), Sprime.end() ), Sprime.end() );
int64_t ell_incr = Sprime.size();

//
// 2. Compute F_panel: the next block of ell_incr columns in F.
//
T* F_panel = F + ell*n;
//
// 2.1. Overwrite F_panel with the matrix "G" from Line 5 of [arXiv:2304.12465, Algorithm 4].
//
// First we compute a submatrix of columns of A and then we downdate with GEMM.
// The downdate is delicate since the output matrix shares a buffer with one of the
// input matrices, but it's okay since they're non-overlapping regions of that buffer.
//
_rpchol_impl::compute_columns(layout, n, A_stateless, Sprime, F_panel);
// ^ F_panel = A(:, Sprime).
_rpchol_impl::pack_selected_rows(layout, n, ell, F, Sprime, work_mat.data());
// ^ work_mat is a copy of F(Sprime, 1:ell).
blas::gemm(
layout, Op::NoTrans, Op::Trans, n, ell_incr, ell,
-1.0, F, n, work_mat.data(), ell_incr, 1.0, F_panel, n
);
//
// 2.2. Execute Lines 6 and 7 of [arXiv:2304.12465, Algorithm 4].
//
_rpchol_impl::pack_selected_rows(layout, n, ell_incr, F_panel, Sprime, work_mat.data());
c_status = lapack::potrf(uplo, ell_incr, work_mat.data(), ell_incr);
if (c_status) {
ell_incr = c_status - 1;
Sprime.resize(ell_incr);
}
blas::trsm(
layout, blas::Side::Right, uplo, Op::NoTrans, blas::Diag::NonUnit,
n, ell_incr, 1.0, work_mat.data(), ell_incr, F_panel, n
);

//
// 3. Update S, d, cdf and ell.
//
std::copy(Sprime.begin(), Sprime.end(), S + ell);
w_status = _rpchol_impl::downdate_d_and_cdf(layout, n, Sprime, F_panel, d, cdf);
ell = ell + ell_incr;
}
if (w_status) { cout << "downdate_d_and_cdf failed with exit code " << w_status << ".\n"; }
if (c_status) { cout << "Cholesky failed with exit code " << c_status << ".\n"; }
if (w_status || c_status) { cout << "returning with approximation rank " << ell << "\n"; }
k = ell;
cb(k);
return state;
}

template <typename T, typename FUNC_T, typename STATE>
STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F, int64_t b, STATE state) {
auto cb = [](int64_t i) { return i ;};
rp_cholesky(n, A_stateless, k, S, F, b, state, cb);
return state;
}


}
Loading

0 comments on commit 66ac027

Please sign in to comment.