Randomly pivoted Cholesky and positive definite kernels (supersedes PR …

…#85) (#100) Add initial support for positive definite kernels (see /misc/rl_pdkernels.hh). * Various helper functions for evaluating the Gaussian kernel (or any kernel that primarily relies on the Euclidean distance matrix). * Add a block_arrowhead_multiply function to avoid duplicate kernel evaluations when accessing a kernel matrix as a black-box linear operator. * Add an RBFKernelMatrix class that represents a collection of regularized RBF kernel matrices. This satisfies the SymmetricLinearOperator concept. add randomly pivoted Cholesky (see /comps/rl_rpchol.hh): * Utility functions * compute_columns evaluates kernel matrices in parallel where the kernel matrix is specified by a function handle callable as ``K_stateless(i, j)``. * pack_selected_rows * downdate_d_and_cdf * rp_cholesky: implements Algorithm 4 from https://arxiv.org/abs/2304.12465. Add a new driver, krill_full_rpchol, in ``drivers/rl_krill.hh``, which basically implements [arXiv:2304.12465's](https://arxiv.org/pdf/2304.12465) Algorithm 1.
BallisticLA · Feb 13, 2025 · 66ac027 · 66ac027
1 parent 8ea1196
commit 66ac027
Show file tree

Hide file tree

Showing 11 changed files with 1,299 additions and 1 deletion.
diff --git a/RandLAPACK.hh b/RandLAPACK.hh
@@ -10,6 +10,7 @@
 #include "RandLAPACK/misc/rl_util.hh"
 #include "RandLAPACK/misc/rl_linops.hh"
 #include "RandLAPACK/misc/rl_gen.hh"
+#include "RandLAPACK/misc/rl_pdkernels.hh"
 
 // Computational routines
 #include "RandLAPACK/comps/rl_determiter.hh"
@@ -20,13 +21,15 @@
 #include "RandLAPACK/comps/rl_syps.hh"
 #include "RandLAPACK/comps/rl_syrf.hh"
 #include "RandLAPACK/comps/rl_orth.hh"
+#include "RandLAPACK/comps/rl_rpchol.hh"
 
 // Drivers
 #include "RandLAPACK/drivers/rl_rsvd.hh"
 #include "RandLAPACK/drivers/rl_cqrrpt.hh"
 #include "RandLAPACK/drivers/rl_bqrrp.hh"
 #include "RandLAPACK/drivers/rl_revd2.hh"
 #include "RandLAPACK/drivers/rl_rbki.hh"
+#include "RandLAPACK/drivers/rl_krill.hh"
 
 // Cuda functions - issues with linking/visibility when present if the below is uncommented.
 // A temporary fix is to add the below directly in the test/benchmark files.

diff --git a/RandLAPACK/CMakeLists.txt b/RandLAPACK/CMakeLists.txt
@@ -14,9 +14,11 @@ set(RandLAPACK_cxx_sources
     rl_rf.hh
     rl_syps.hh
     rl_syrf.hh
+    rl_rpchol.hh
     rl_gen.hh
     rl_blaspp.hh
     rl_linops.hh
+    rl_pdkernels.hh
 
     rl_cusolver.hh
     rl_cuda_kernels.cuh

diff --git a/RandLAPACK/comps/rl_preconditioners.hh b/RandLAPACK/comps/rl_preconditioners.hh
@@ -8,6 +8,7 @@
 #include "rl_orth.hh"
 #include "rl_syps.hh"
 #include "rl_syrf.hh"
+#include "rl_rpchol.hh"
 #include "rl_revd2.hh"
 
 #include <RandBLAS.hh>
@@ -337,4 +338,26 @@ RandBLAS::RNGState<RNG> nystrom_pc_data(
 }
 
 
+/**
+ * TODO: make an overload of rpchol_pc_data that omits "n" and assumes A implements
+ * some linear operator interface.
+ */
+
+template <typename T, typename STATE, typename FUNC>
+STATE rpchol_pc_data(
+    int64_t n, FUNC &A_stateless, int64_t &k, int64_t b, T* V, T* eigvals, STATE state
+) {
+    std::vector<int64_t> selection(k, -1);
+    state = RandLAPACK::rp_cholesky(n, A_stateless, k, selection.data(), V, b, state);
+    // ^ A_stateless \approx VV'; need to convert VV' into its eigendecomposition.
+    std::vector<T> work(k*k, 0.0);
+    lapack::gesdd(lapack::Job::OverwriteVec, n, k, V, n, eigvals, nullptr, 1, work.data(), k);
+    // V has been overwritten with its (nontrivial) left singular vectors
+    for (int64_t i = 0; i < k; ++i) 
+        eigvals[i] = std::pow(eigvals[i], 2);
+    return state;
+}
+
+
+
 }  // end namespace RandLAPACK
diff --git a/RandLAPACK/comps/rl_rpchol.hh b/RandLAPACK/comps/rl_rpchol.hh
@@ -0,0 +1,201 @@
+#pragma once
+
+#include "rl_lapackpp.hh"
+#include <blas.hh>
+#include <RandBLAS.hh>
+#include <algorithm>
+#include <vector>
+#include <set>
+
+namespace RandLAPACK {
+
+namespace _rpchol_impl {
+
+using std::vector;
+using blas::Layout;
+
+template <typename T, typename FUNC_T>
+void compute_columns(
+    Layout layout, int64_t N, FUNC_T &K_stateless, vector<int64_t> &col_indices, T* buff
+) {
+    randblas_require(layout == Layout::ColMajor);
+    int64_t num_cols = col_indices.size();
+    #pragma omp parallel for collapse(2)
+    for (int64_t ell = 0; ell < num_cols; ++ell) {
+        for (int64_t i = 0; i < N; ++i) {
+            int64_t j = col_indices[ell];
+            buff[i + ell*N] = K_stateless(i, j);
+        }
+    }
+    return;
+}
+
+template <typename T>
+void pack_selected_rows(
+    Layout layout, int64_t rows_mat, int64_t cols_mat, T* mat, vector<int64_t> &row_indices, T* submat
+) {
+    randblas_require(layout == Layout::ColMajor);
+    int64_t num_rows = row_indices.size();
+    for (int64_t i = 0; i < num_rows; ++i) {
+        blas::copy(cols_mat, mat + row_indices[i], rows_mat, submat + i, num_rows);
+    }
+    return;
+}
+
+template <typename T>
+int downdate_d_and_cdf(Layout layout, int64_t N, vector<int64_t> &indices, T* F_panel, vector<T> &d, vector<T> &cdf) {
+    randblas_require(layout == Layout::ColMajor);
+    int64_t cols_F_panel = indices.size();
+    for (int64_t j = 0; j < cols_F_panel; ++j) {
+        for (int64_t i = 0; i < N; ++i) {
+            T val = F_panel[i + j*N];
+            d[i] -= val*val;
+        }
+    }
+    // Then, to accound for the possibility of rounding errors, manually zero-out everything in "indices."
+    for (auto i : indices)
+        d[i] = 0.0;
+    cdf = d;
+    try {
+        RandBLAS::weights_to_cdf(N, cdf.data());
+    } catch(RandBLAS::Error &e) {
+        std::string message{e.what()};
+        if (message.find("sum >=") != std::string::npos) {
+            return 1;
+        } else if (message.find("val >= error_if_below") != std::string::npos) {
+            return 2;
+        }
+    }
+    return 0;
+}
+
+} // end namespace RandLAPACK::_rpchol_impl
+
+/***
+ * Computes a rank-k approximation of an implicit n-by-n matrix whose (i,j)^{th}
+ * entry is A_stateless(i,j), where A_stateless is a stateless function. We build
+ * the approximation iteratively and increase the rank by at most "b" at each iteration.
+ * 
+ * Implements Algorithm 4 from https://arxiv.org/abs/2304.12465.
+ * 
+ * Here's example code where the implict matrix is given by a squared exponential kernel:
+ * 
+ *      // Assume we've already defined ...
+ *      //         X  : a rows_x by cols_x double-precision matrix (suitably standardized)
+ *      //              where each column defines a datapoint.
+ *      //  bandwidth : scale for the squared exponential kernel    
+ * 
+ *      auto A = [X, rows_x, cols_x, bandwidth](int64_t i, int64_t j) {
+ *          double out = 0;
+ *          double* Xi = X + i*rows_x;
+ *          double* Xj = X + j*rows_x;
+ *          for (int64_t ell = 0; ell < rows_x) {
+ *              double val = (Xi[ell] - Xj[ell]) / (std::sqrt(2)*bandwidth);
+ *              out += val*val;
+ *          }
+ *          out = std::exp(out);
+ *          return out;
+ *      };
+ *      std::vector<double> F(rows_x*k, 0.0);
+ *      std::vector<int64_t> selection(k);
+ *      RandBLAS::RNGState state_in(0);
+ *      auto state_out = rp_cholesky(cols_x, A, k, selection.data(), F.data(), 64, state_in);
+ * 
+ * Notes
+ * -----
+ * Compare to 
+ * https://github.com/eepperly/Robust-randomized-preconditioning-for-kernel-ridge-regression/blob/main/code/choleskybase.m
+ * 
+ */
+template <typename T, typename FUNC_T, typename STATE, typename CALLBACK>
+STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S,  T* F, int64_t b, STATE state, CALLBACK &cb) {
+    // TODO: make this function robust to rank-deficient matrices. 
+    using RandBLAS::sample_indices_iid;
+    using RandBLAS::weights_to_cdf;
+    using blas::Op;
+    using blas::Uplo;
+    using std::cout;
+    auto layout = blas::Layout::ColMajor;
+    auto uplo = blas::Uplo::Upper;
+
+    std::vector<T> work_mat(b*k, 0.0);
+    std::vector<T> d(n, 0.0);
+    std::vector<T> cdf(n);
+
+    std::vector<int64_t> Sprime{};
+
+    for (int64_t i = 0; i < n; ++i)
+        d[i] = A_stateless(i,i);
+    cdf = d;
+    weights_to_cdf(n, cdf.data());
+    int w_status = 0;
+    int c_status = 0;
+    int64_t ell  = 0;
+    while (ell < k && w_status == 0 && c_status == 0) {
+        //
+        //  1. Compute the next block of column indices
+        //
+        int64_t curr_B = std::min(b, k - ell);
+        Sprime.resize(curr_B);
+        state = sample_indices_iid(n, cdf.data(), curr_B, Sprime.data(), state);
+        std::sort( Sprime.begin(), Sprime.end() );
+        Sprime.erase( unique( Sprime.begin(), Sprime.end() ), Sprime.end() );
+        int64_t ell_incr = Sprime.size();
+
+        //
+        //  2. Compute F_panel: the next block of ell_incr columns in F.
+        //
+        T* F_panel = F + ell*n;
+        //
+        //      2.1. Overwrite F_panel with the matrix "G" from Line 5 of [arXiv:2304.12465, Algorithm 4].
+        //
+        //           First we compute a submatrix of columns of A and then we downdate with GEMM.
+        //           The downdate is delicate since the output matrix shares a buffer with one of the
+        //           input matrices, but it's okay since they're non-overlapping regions of that buffer.
+        //
+        _rpchol_impl::compute_columns(layout, n, A_stateless, Sprime, F_panel);
+        //           ^ F_panel = A(:, Sprime).
+        _rpchol_impl::pack_selected_rows(layout, n, ell, F, Sprime, work_mat.data());
+        //           ^ work_mat is a copy of F(Sprime, 1:ell).
+        blas::gemm(
+            layout, Op::NoTrans, Op::Trans, n, ell_incr, ell,
+            -1.0, F, n, work_mat.data(), ell_incr, 1.0, F_panel, n
+        );
+        //
+        //      2.2. Execute Lines 6 and 7 of [arXiv:2304.12465, Algorithm 4].     
+        //
+        _rpchol_impl::pack_selected_rows(layout, n, ell_incr, F_panel, Sprime, work_mat.data());
+        c_status = lapack::potrf(uplo, ell_incr, work_mat.data(), ell_incr);
+        if (c_status) {
+            ell_incr = c_status - 1;
+            Sprime.resize(ell_incr);
+        }
+        blas::trsm(
+            layout, blas::Side::Right, uplo, Op::NoTrans, blas::Diag::NonUnit,
+            n, ell_incr, 1.0, work_mat.data(), ell_incr, F_panel, n
+        );
+
+        //
+        // 3. Update S, d, cdf and ell.
+        //
+        std::copy(Sprime.begin(), Sprime.end(), S + ell);
+        w_status = _rpchol_impl::downdate_d_and_cdf(layout, n, Sprime, F_panel, d, cdf);
+        ell = ell + ell_incr;
+    }
+    if (w_status) { cout << "downdate_d_and_cdf failed with exit code " << w_status << ".\n"; } 
+    if (c_status) { cout << "Cholesky failed with exit code " << c_status << ".\n"; }
+    if (w_status || c_status) { cout << "returning with approximation rank " << ell << "\n"; }
+    k = ell;
+    cb(k);
+    return state;
+}
+
+template <typename T, typename FUNC_T, typename STATE>
+STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S,  T* F, int64_t b, STATE state) {
+    auto cb = [](int64_t i) { return i ;};
+    rp_cholesky(n, A_stateless, k, S, F, b, state, cb);
+    return state;
+}
+
+
+}