Randomly pivoted Cholesky and positive definite kernels (supersedes PR #85) #100

Merged
merged 8 commits on Feb 13, 2025
3 changes: 3 additions & 0 deletions RandLAPACK.hh
@@ -10,6 +10,7 @@
#include "RandLAPACK/misc/rl_util.hh"
#include "RandLAPACK/misc/rl_linops.hh"
#include "RandLAPACK/misc/rl_gen.hh"
#include "RandLAPACK/misc/rl_pdkernels.hh"

// Computational routines
#include "RandLAPACK/comps/rl_determiter.hh"
@@ -20,13 +21,15 @@
#include "RandLAPACK/comps/rl_syps.hh"
#include "RandLAPACK/comps/rl_syrf.hh"
#include "RandLAPACK/comps/rl_orth.hh"
#include "RandLAPACK/comps/rl_rpchol.hh"

// Drivers
#include "RandLAPACK/drivers/rl_rsvd.hh"
#include "RandLAPACK/drivers/rl_cqrrpt.hh"
#include "RandLAPACK/drivers/rl_bqrrp.hh"
#include "RandLAPACK/drivers/rl_revd2.hh"
#include "RandLAPACK/drivers/rl_rbki.hh"
#include "RandLAPACK/drivers/rl_krill.hh"

// CUDA functions - there are linking/visibility issues if the include below is uncommented.
// A temporary fix is to add that include directly in the test/benchmark files.
2 changes: 2 additions & 0 deletions RandLAPACK/CMakeLists.txt
@@ -14,9 +14,11 @@ set(RandLAPACK_cxx_sources
rl_rf.hh
rl_syps.hh
rl_syrf.hh
rl_rpchol.hh
rl_gen.hh
rl_blaspp.hh
rl_linops.hh
rl_pdkernels.hh

rl_cusolver.hh
rl_cuda_kernels.cuh
23 changes: 23 additions & 0 deletions RandLAPACK/comps/rl_preconditioners.hh
@@ -8,6 +8,7 @@
#include "rl_orth.hh"
#include "rl_syps.hh"
#include "rl_syrf.hh"
#include "rl_rpchol.hh"
#include "rl_revd2.hh"

#include <RandBLAS.hh>
@@ -337,4 +338,26 @@ RandBLAS::RNGState<RNG> nystrom_pc_data(
}


/**
* TODO: make an overload of rpchol_pc_data that omits "n" and assumes A implements
* some linear operator interface.
*/

template <typename T, typename STATE, typename FUNC>
STATE rpchol_pc_data(
int64_t n, FUNC &A_stateless, int64_t &k, int64_t b, T* V, T* eigvals, STATE state
) {
std::vector<int64_t> selection(k, -1);
state = RandLAPACK::rp_cholesky(n, A_stateless, k, selection.data(), V, b, state);
// ^ A_stateless \approx VV'; need to convert VV' into its eigendecomposition.
std::vector<T> work(k*k, 0.0);
lapack::gesdd(lapack::Job::OverwriteVec, n, k, V, n, eigvals, nullptr, 1, work.data(), k);
// V has been overwritten with its (nontrivial) left singular vectors
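// Since V = U diag(s) W' implies VV' = U diag(s.^2) U', the eigenvalues of the
// approximation are the squares of the singular values computed above.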
for (int64_t i = 0; i < k; ++i)
eigvals[i] = std::pow(eigvals[i], 2);
return state;
}



} // end namespace RandLAPACK
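For orientation, here is a minimal usage sketch of the new rpchol_pc_data routine (not part of this diff). The kernel, problem sizes, and variable names are illustrative assumptions, and RandLAPACK.hh / RandBLAS.hh are assumed to be on the include path:

#include <RandBLAS.hh>
#include <RandLAPACK.hh>
#include <cmath>
#include <cstdint>
#include <vector>

int main() {
    int64_t n_points = 1000, k_target = 50, block_size = 8;
    // Toy positive definite kernel on index distance (exponential covariance).
    auto A_fun = [](int64_t i, int64_t j) {
        return std::exp(-std::fabs((double) (i - j)) / 64.0);
    };
    std::vector<double> V(n_points * k_target, 0.0);   // eigenvector factor, n_points-by-k_target
    std::vector<double> eigvals(k_target, 0.0);        // eigenvalues of the approximation
    RandBLAS::RNGState state(0);
    state = RandLAPACK::rpchol_pc_data(
        n_points, A_fun, k_target, block_size, V.data(), eigvals.data(), state
    );
    // On exit, A_fun is approximated by V * diag(eigvals) * V'; k_target may have been
    // reduced if rp_cholesky terminated early.
    return 0;
}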
201 changes: 201 additions & 0 deletions RandLAPACK/comps/rl_rpchol.hh
@@ -0,0 +1,201 @@
#pragma once

#include "rl_lapackpp.hh"
#include <blas.hh>
#include <RandBLAS.hh>
#include <algorithm>
#include <vector>
#include <set>

namespace RandLAPACK {

namespace _rpchol_impl {

using std::vector;
using blas::Layout;

template <typename T, typename FUNC_T>
void compute_columns(
Layout layout, int64_t N, FUNC_T &K_stateless, vector<int64_t> &col_indices, T* buff
) {
randblas_require(layout == Layout::ColMajor);
int64_t num_cols = col_indices.size();
#pragma omp parallel for collapse(2)
for (int64_t ell = 0; ell < num_cols; ++ell) {
for (int64_t i = 0; i < N; ++i) {
int64_t j = col_indices[ell];
buff[i + ell*N] = K_stateless(i, j);
}
}
return;
}

template <typename T>
void pack_selected_rows(
Layout layout, int64_t rows_mat, int64_t cols_mat, T* mat, vector<int64_t> &row_indices, T* submat
) {
randblas_require(layout == Layout::ColMajor);
int64_t num_rows = row_indices.size();
for (int64_t i = 0; i < num_rows; ++i) {
blas::copy(cols_mat, mat + row_indices[i], rows_mat, submat + i, num_rows);
}
return;
}

template <typename T>
int downdate_d_and_cdf(Layout layout, int64_t N, vector<int64_t> &indices, T* F_panel, vector<T> &d, vector<T> &cdf) {
randblas_require(layout == Layout::ColMajor);
int64_t cols_F_panel = indices.size();
for (int64_t j = 0; j < cols_F_panel; ++j) {
for (int64_t i = 0; i < N; ++i) {
T val = F_panel[i + j*N];
d[i] -= val*val;
}
}
// Then, to account for the possibility of rounding errors, manually zero out everything in "indices."
for (auto i : indices)
d[i] = 0.0;
cdf = d;
try {
RandBLAS::weights_to_cdf(N, cdf.data());
} catch(RandBLAS::Error &e) {
std::string message{e.what()};
if (message.find("sum >=") != std::string::npos) {
return 1;
} else if (message.find("val >= error_if_below") != std::string::npos) {
return 2;
}
}
return 0;
}

} // end namespace RandLAPACK::_rpchol_impl

/***
* Computes a rank-k approximation of an implicit n-by-n matrix whose (i,j)^{th}
* entry is A_stateless(i,j), where A_stateless is a stateless function. The
* approximation is built iteratively, increasing its rank by at most "b" per
* iteration. On exit, A_stateless is approximated by F*F', where F is n-by-k
* in column-major order and the k selected pivot indices are stored in S.
*
* Implements Algorithm 4 from https://arxiv.org/abs/2304.12465.
*
* Here's example code where the implicit matrix is given by a squared exponential kernel:
*
* // Assume we've already defined ...
* // X : a rows_x by cols_x double-precision matrix (suitably standardized)
* // where each column defines a datapoint.
* // bandwidth : scale for the squared exponential kernel
* // k : target rank of the low-rank approximation
*
* auto A = [X, rows_x, cols_x, bandwidth](int64_t i, int64_t j) {
* double out = 0;
* double* Xi = X + i*rows_x;
* double* Xj = X + j*rows_x;
* for (int64_t ell = 0; ell < rows_x; ++ell) {
* double val = (Xi[ell] - Xj[ell]) / (std::sqrt(2)*bandwidth);
* out += val*val;
* }
* out = std::exp(-out);
* return out;
* };
* std::vector<double> F(cols_x*k, 0.0);
* std::vector<int64_t> selection(k);
* RandBLAS::RNGState state_in(0);
* auto state_out = rp_cholesky(cols_x, A, k, selection.data(), F.data(), 64, state_in);
*
* Notes
* -----
* Compare to
* https://github.com/eepperly/Robust-randomized-preconditioning-for-kernel-ridge-regression/blob/main/code/choleskybase.m
*
*/
template <typename T, typename FUNC_T, typename STATE, typename CALLBACK>
STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F, int64_t b, STATE state, CALLBACK &cb) {
// TODO: make this function robust to rank-deficient matrices.
using RandBLAS::sample_indices_iid;
using RandBLAS::weights_to_cdf;
using blas::Op;
using blas::Uplo;
using std::cout;
auto layout = blas::Layout::ColMajor;
auto uplo = blas::Uplo::Upper;

std::vector<T> work_mat(b*k, 0.0);
std::vector<T> d(n, 0.0);
std::vector<T> cdf(n);

std::vector<int64_t> Sprime{};

for (int64_t i = 0; i < n; ++i)
d[i] = A_stateless(i,i);
cdf = d;
weights_to_cdf(n, cdf.data());
int w_status = 0;
int c_status = 0;
int64_t ell = 0;
while (ell < k && w_status == 0 && c_status == 0) {
//
// 1. Compute the next block of column indices
//
int64_t curr_B = std::min(b, k - ell);
Sprime.resize(curr_B);
state = sample_indices_iid(n, cdf.data(), curr_B, Sprime.data(), state);
std::sort( Sprime.begin(), Sprime.end() );
Sprime.erase( unique( Sprime.begin(), Sprime.end() ), Sprime.end() );
int64_t ell_incr = Sprime.size();

//
// 2. Compute F_panel: the next block of ell_incr columns in F.
//
T* F_panel = F + ell*n;
//
// 2.1. Overwrite F_panel with the matrix "G" from Line 5 of [arXiv:2304.12465, Algorithm 4].
//
// First we compute a submatrix of columns of A and then we downdate with GEMM.
// The downdate is delicate since the output matrix shares a buffer with one of the
// input matrices, but it's okay since they're non-overlapping regions of that buffer.
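// In matrix terms, this step forms G = A(:, Sprime) - F(:, 1:ell) * F(Sprime, 1:ell)',
// with F(:, 1:ell) and F_panel occupying disjoint column ranges of the buffer F.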
//
_rpchol_impl::compute_columns(layout, n, A_stateless, Sprime, F_panel);
// ^ F_panel = A(:, Sprime).
_rpchol_impl::pack_selected_rows(layout, n, ell, F, Sprime, work_mat.data());
// ^ work_mat is a copy of F(Sprime, 1:ell).
blas::gemm(
layout, Op::NoTrans, Op::Trans, n, ell_incr, ell,
-1.0, F, n, work_mat.data(), ell_incr, 1.0, F_panel, n
);
//
// 2.2. Execute Lines 6 and 7 of [arXiv:2304.12465, Algorithm 4].
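// That is: Cholesky-factor F_panel(Sprime, :) = R'R with R upper triangular, then
// overwrite F_panel with F_panel * inv(R).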
//
_rpchol_impl::pack_selected_rows(layout, n, ell_incr, F_panel, Sprime, work_mat.data());
c_status = lapack::potrf(uplo, ell_incr, work_mat.data(), ell_incr);
if (c_status) {
ell_incr = c_status - 1;
Sprime.resize(ell_incr);
}
blas::trsm(
layout, blas::Side::Right, uplo, Op::NoTrans, blas::Diag::NonUnit,
n, ell_incr, 1.0, work_mat.data(), ell_incr, F_panel, n
);

//
// 3. Update S, d, cdf and ell.
//
std::copy(Sprime.begin(), Sprime.end(), S + ell);
w_status = _rpchol_impl::downdate_d_and_cdf(layout, n, Sprime, F_panel, d, cdf);
ell = ell + ell_incr;
}
if (w_status) { cout << "downdate_d_and_cdf failed with exit code " << w_status << ".\n"; }
if (c_status) { cout << "Cholesky failed with exit code " << c_status << ".\n"; }
if (w_status || c_status) { cout << "returning with approximation rank " << ell << "\n"; }
k = ell;
cb(k);
return state;
}

template <typename T, typename FUNC_T, typename STATE>
STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F, int64_t b, STATE state) {
auto cb = [](int64_t i) { return i; };
rp_cholesky(n, A_stateless, k, S, F, b, state, cb);
return state;
}


} // end namespace RandLAPACK
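And a comparable sketch for the callback overload of rp_cholesky (again, the kernel, sizes, and the report lambda are illustrative assumptions rather than code from this PR). The callback receives the final approximation rank, which can be smaller than the requested k when potrf detects rank deficiency:

#include <RandBLAS.hh>
#include <RandLAPACK.hh>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    int64_t n = 500, k = 40, b = 8;
    // Toy positive definite kernel on index distance.
    auto A_fun = [](int64_t i, int64_t j) {
        return std::exp(-std::fabs((double) (i - j)) / 32.0);
    };
    std::vector<double> F(n * k, 0.0);          // factor with A_fun approximately equal to F * F'
    std::vector<int64_t> selection(k, -1);      // pivot indices chosen by the algorithm
    auto report = [](int64_t final_rank) {
        std::cout << "rp_cholesky finished with rank " << final_rank << "\n";
    };
    RandBLAS::RNGState state(0);
    state = RandLAPACK::rp_cholesky(n, A_fun, k, selection.data(), F.data(), b, state, report);
    return 0;
}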