Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RowMajor CQRRPT [WiP] #95

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .github/workflows/core-linux.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,6 @@ jobs:
steps:
- uses: actions/checkout@v2

- name: webfactory/ssh-agent
uses: webfactory/[email protected]
with:
ssh-private-key: ${{ secrets.PRIVATE_CLONE_SSH_KEY }}

- name: submodule update
run: |
git submodule update --init --recursive
Expand Down
5 changes: 0 additions & 5 deletions .github/workflows/core-macos.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,6 @@ jobs:
steps:
- uses: actions/checkout@v2

- name: webfactory/ssh-agent
uses: webfactory/[email protected]
with:
ssh-private-key: ${{ secrets.PRIVATE_CLONE_SSH_KEY }}

- name: submodule update
run: |
git submodule update --init --recursive
Expand Down
39 changes: 23 additions & 16 deletions RandLAPACK/drivers/rl_cqrrpt.hh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ class CQRRPTalg {
int64_t ldr,
int64_t* J,
T d_factor,
RandBLAS::RNGState<RNG> &state
RandBLAS::RNGState<RNG> &state,
Layout layout
) = 0;
};

Expand All @@ -49,8 +50,8 @@ class CQRRPT : public CQRRPTalg<T, RNG> {
///
///
/// The algorithm optionally computes a condition number of a preconditioned matrix A through a 'cond_check'
/// parameter, which defaults to 0. This requires extra n * (m + 1) * sizeof(T) bytes of space, which will be
/// internally allocated by a utility routine.
/// parameter, which defaults to 0. This requires extra n * (m + 1) * sizeof(T) bytes of space, which will be
/// internally allocated by a utility routine.
/// A computation is handled by a utility method that finds the l2 condition number by computing all singular
/// values of the R-factor via an appropriate LAPACK function.
CQRRPT(
Expand All @@ -70,7 +71,7 @@ class CQRRPT : public CQRRPTalg<T, RNG> {
/// A[:, J] = QR,
/// where Q and R are of size m-by-k and k-by-n, with rank(A) = k.
/// Detailed description of this algorithm may be found in Section 5.1.2.
/// of "the RandLAPACK book".
/// of "the RandLAPACK book".
///
/// @param[in] m
/// The number of rows in the matrix A.
Expand Down Expand Up @@ -116,7 +117,8 @@ class CQRRPT : public CQRRPTalg<T, RNG> {
int64_t ldr,
int64_t* J,
T d_factor,
RandBLAS::RNGState<RNG> &state
RandBLAS::RNGState<RNG> &state,
Layout layout=Layout::ColMajor
) override;

public:
Expand Down Expand Up @@ -149,7 +151,8 @@ int CQRRPT<T, RNG>::call(
int64_t ldr,
int64_t* J,
T d_factor,
RandBLAS::RNGState<RNG> &state
RandBLAS::RNGState<RNG> &state,
Layout layout
){
///--------------------TIMING VARS--------------------/
high_resolution_clock::time_point saso_t_stop;
Expand Down Expand Up @@ -193,16 +196,17 @@ int CQRRPT<T, RNG>::call(

if(this -> timing)
saso_t_start = high_resolution_clock::now();

/// Generating a SASO
RandBLAS::SparseDist DS(d, m, this->nnz);
RandBLAS::SparseSkOp<T, RNG> S(DS, state);
RandBLAS::fill_sparse(S);
state = S.next_state;

/// Applying a SASO
auto layout_flag_1 = layout == Layout::ColMajor ? Op::NoTrans : Op::Trans;
RandBLAS::sketch_general(
Layout::ColMajor, Op::NoTrans, Op::NoTrans,
layout, Op::NoTrans, layout_flag_1,
d, n, m, (T) 1.0, S, 0, 0, A, lda, (T) 0.0, A_hat, d
);

Expand All @@ -225,8 +229,8 @@ int CQRRPT<T, RNG>::call(
}

/// Using naive rank estimation to ensure that R used for preconditioning is invertible.
/// The actual rank estimate k will be computed a posteriori.
/// Using R[i,i] to approximate the i-th singular value of A_hat.
/// The actual rank estimate k will be computed a posteriori.
/// Using R[i,i] to approximate the i-th singular value of A_hat.
/// Truncate at the largest i where R[i,i] / R[0,0] >= eps.
for(i = 0; i < n; ++i) {
if(std::abs(A_hat[i * d + i]) / std::abs(A_hat[0]) < eps_initial_rank_estimation) {
Expand All @@ -241,7 +245,8 @@ int CQRRPT<T, RNG>::call(

/// Extracting a k by k R representation
T* R_sp = R;
lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, ldr);
auto layout_flag_2 = layout == Layout::ColMajor ? MatrixType::Upper : MatrixType::Lower;
lapack::lacpy(layout_flag_2, k, k, A_hat, d, R_sp, ldr);

if(this -> timing)
a_mod_piv_t_start = high_resolution_clock::now();
Expand All @@ -256,16 +261,17 @@ int CQRRPT<T, RNG>::call(
}

// A_pre * R_sp = AP
blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp, ldr, A, lda);
blas::trsm(layout, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp, ldr, A, lda);

if(this -> timing) {
a_mod_trsm_t_stop = high_resolution_clock::now();
cholqr_t_start = high_resolution_clock::now();
}

// Do Cholesky QR
blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R_sp, ldr);
lapack::potrf(Uplo::Upper, k, R_sp, ldr);
blas::syrk(layout, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R_sp, ldr);
auto layout_flag_3 = layout == Layout::ColMajor ? Uplo::Upper : Uplo::Lower;
lapack::potrf(layout_flag_3, k, R_sp, ldr);

// Re-estimate rank after we have the R-factor form Cholesky QR.
// The strategy here is the same as in naive rank estimation.
Expand All @@ -286,13 +292,14 @@ int CQRRPT<T, RNG>::call(
}
}

blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp, ldr, A, lda);
blas::trsm(layout, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp, ldr, A, lda);

if(this -> timing)
cholqr_t_stop = high_resolution_clock::now();

// Get the final R-factor -- undoing the preconditioning
blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, A_hat, d, R_sp, ldr);
RandLAPACK::util::get_U(new_rank, new_rank, R_sp, ldr);
blas::trmm(layout, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, A_hat, d, R_sp, ldr);

// Set the rank parameter to the value comuted a posteriori.
this->rank = k;
Expand Down
25 changes: 25 additions & 0 deletions RandLAPACK/misc/rl_util.hh
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,31 @@ void col_swap(
}
}

template <typename t>
void col_swap_row_major(
int64_t m,
int64_t n,
int64_t k,
t* a,
int64_t lda,
std::vector<int64_t> idx
) {
if (k > n)
throw std::runtime_error("invalid rank parameter.");

int64_t i, j;
for (i = 0, j = 0; i < k; ++i) {
j = idx[i] - 1; // adjust for zero-based index
// use blas to swap the whole columns
blas::swap(m, &a[i], lda, &a[j], lda);

// swap idx array elements
// find idx element with value i and assign it to j
auto it = std::find(idx.begin() + i, idx.begin() + k, i + 1);
idx[it - idx.begin()] = j + 1;
}
}

/// Checks if the given size is larger than available.
/// If so, resizes the vector.
template <typename T>
Expand Down
103 changes: 79 additions & 24 deletions test/drivers/test_cqrrpt.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
#include "rl_gen.hh"
#include "rl_util.hh"

#include <RandBLAS.hh>
#include <fstream>
Expand All @@ -22,41 +23,61 @@ class TestCQRRPT : public ::testing::Test
int64_t col;
int64_t rank; // has to be modifiable
std::vector<T> A;
std::vector<T> A_row;
std::vector<T> R;
std::vector<int64_t> J;
std::vector<T> A_cpy1;
std::vector<T> A_cpy2;
std::vector<T> I_ref;

CQRRPTTestData(int64_t m, int64_t n, int64_t k) :
A(m * n, 0.0),
Layout layout;

CQRRPTTestData(int64_t m, int64_t n, int64_t k, Layout layout=Layout::ColMajor):
row(m),
col(n),
rank(k),
A(m * n, 0.0),
A_row(m * n, 0.0),
R(n * n, 0.0),
J(n, 0),
J(n, 0),
A_cpy1(m * n, 0.0),
A_cpy2(m * n, 0.0),
I_ref(k * k, 0.0)
{
row = m;
col = n;
rank = k;
}
I_ref(k * k, 0.0),
layout(layout)
{}

};

template<typename T>
void ColMajor2RowMajor(
CQRRPTTestData<T> &all_data
) {
// Iterate over each row for the output format
for (int64_t i = 0; i < all_data.row; ++i) {
// Copy a single row from A_col to A_row
blas::copy(all_data.col, all_data.A.data() + i, all_data.row, all_data.A_row.data() + i * all_data.col, 1);
}
}

template <typename T>
static void norm_and_copy_computational_helper(T &norm_A, CQRRPTTestData<T> &all_data) {
//NOTE: This function is safe for use with RowMajor inputs as well
auto m = all_data.row;
auto n = all_data.col;
auto lda = all_data.layout == Layout::ColMajor ? m : n;

lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), lda);
lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), lda);

lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), m);
lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), m);
norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m);
norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), lda);
}


/// This routine also appears in benchmarks, but idk if it should be put into utils
template <typename T>
static void
error_check(T &norm_A, CQRRPTTestData<T> &all_data) {
//NOTE: This function should be safe for both RowMajor and ColMajor matrices

auto m = all_data.row;
auto n = all_data.col;
Expand All @@ -73,26 +94,27 @@ class TestCQRRPT : public ::testing::Test

// Check orthogonality of Q
// Q' * Q - I = 0
blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, m, -1.0, I_ref_dat, k);
auto ldq = all_data.layout == Layout::ColMajor ? m : n;
blas::syrk(all_data.layout, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, ldq, -1.0, I_ref_dat, k);
T norm_0 = lapack::lansy(lapack::Norm::Fro, Uplo::Upper, k, I_ref_dat, k);

// A - QR
blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, m, R_dat, n, -1.0, A_dat, m);
blas::gemm(all_data.layout, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, ldq, R_dat, n, -1.0, A_dat, ldq);

// Implementing max col norm metric
T max_col_norm = 0.0;
T col_norm = 0.0;
int max_idx = 0;
for(int i = 0; i < n; ++i) {
col_norm = blas::nrm2(m, &A_dat[m * i], 1);
col_norm = blas::nrm2(m, &A_dat[ldq * i], 1);
if(max_col_norm < col_norm) {
max_col_norm = col_norm;
max_idx = i;
}
}
T col_norm_A = blas::nrm2(n, &A_cpy_dat[m * max_idx], 1);
T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, m);
T col_norm_A = blas::nrm2(n, &A_cpy_dat[ldq * max_idx], 1);
T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, ldq);

printf("REL NORM OF AP - QR: %15e\n", norm_AQR / norm_A);
printf("MAX COL NORM METRIC: %15e\n", max_col_norm / col_norm_A);
printf("FRO NORM OF (Q'Q - I)/sqrt(n): %2e\n\n", norm_0 / std::sqrt((T) n));
Expand All @@ -107,7 +129,7 @@ class TestCQRRPT : public ::testing::Test
/// Computes QR factorzation, and computes A[:, J] - QR.
template <typename T, typename RNG, typename alg_type>
static void test_CQRRPT_general(
T d_factor,
T d_factor,
T norm_A,
CQRRPTTestData<T> &all_data,
alg_type &CQRRPT,
Expand All @@ -116,15 +138,21 @@ class TestCQRRPT : public ::testing::Test
auto m = all_data.row;
auto n = all_data.col;

CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state);
CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state, all_data.layout);

all_data.rank = CQRRPT.rank;
printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank);

RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J);
RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J);
if(all_data.layout == Layout::ColMajor) {
RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J);
RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J);
}
else {
RandLAPACK::util::col_swap_row_major(m, n, n, all_data.A_cpy1.data(), n, all_data.J);
RandLAPACK::util::col_swap_row_major(m, n, n, all_data.A_cpy2.data(), n, all_data.J);
}

error_check(norm_A, all_data);
error_check(norm_A, all_data);
}
};

Expand Down Expand Up @@ -153,6 +181,33 @@ TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp) {
test_CQRRPT_general(d_factor, norm_A, all_data, CQRRPT, state);
}

TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp_RowMajor) {
int64_t m = 10;
int64_t n = 5;
int64_t k = 5;
double d_factor = 2;
double norm_A = 0;
double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
auto state = RandBLAS::RNGState();
Layout layout = Layout::RowMajor;

CQRRPTTestData<double> all_data(m, n, k, layout);
RandLAPACK::CQRRPT<double, r123::Philox4x32> CQRRPT(false, tol);
CQRRPT.nnz = 2;
CQRRPT.no_hqrrp = 1;

RandLAPACK::gen::mat_gen_info<double> m_info(m, n, RandLAPACK::gen::polynomial);
m_info.cond_num = 2;
m_info.rank = k;
m_info.exponent = 2.0;
RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state);

ColMajor2RowMajor(all_data);

norm_and_copy_computational_helper(norm_A, all_data);
test_CQRRPT_general(d_factor, norm_A, all_data, CQRRPT, state);
}

TEST_F(TestCQRRPT, CQRRPT_low_rank_with_hqrrp) {
int64_t m = 10000;
int64_t n = 200;
Expand Down
Loading