From 1b586b004130e85bf4799dbfcff6d5bc1dfc3caf Mon Sep 17 00:00:00 2001 From: Aryaman Jeendgar Date: Thu, 16 Jan 2025 16:43:28 -0800 Subject: [PATCH 1/7] Testing infrastructure + initial changes --- RandLAPACK/drivers/rl_cqrrpt.hh | 41 ++++++++--------- RandLAPACK/misc/rl_util.hh | 25 ++++++++++ test/drivers/test_cqrrpt.cc | 82 +++++++++++++++++++-------------- 3 files changed, 92 insertions(+), 56 deletions(-) diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh index be1b2caa..67826de2 100644 --- a/RandLAPACK/drivers/rl_cqrrpt.hh +++ b/RandLAPACK/drivers/rl_cqrrpt.hh @@ -49,8 +49,8 @@ class CQRRPT : public CQRRPTalg { /// /// /// The algorithm optionally computes a condition number of a preconditioned matrix A through a 'cond_check' - /// parameter, which defaults to 0. This requires extra n * (m + 1) * sizeof(T) bytes of space, which will be - /// internally allocated by a utility routine. + /// parameter, which defaults to 0. This requires extra n * (m + 1) * sizeof(T) bytes of space, which will be + /// internally allocated by a utility routine. /// A computation is handled by a utility method that finds the l2 condition number by computing all singular /// values of the R-factor via an appropriate LAPACK function. CQRRPT( @@ -70,7 +70,7 @@ class CQRRPT : public CQRRPTalg { /// A[:, J] = QR, /// where Q and R are of size m-by-k and k-by-n, with rank(A) = k. /// Detailed description of this algorithm may be found in Section 5.1.2. - /// of "the RandLAPACK book". + /// of "the RandLAPACK book". /// /// @param[in] m /// The number of rows in the matrix A. @@ -128,6 +128,7 @@ class CQRRPT : public CQRRPTalg { std::vector times; // tuning SASOS + int num_threads; int64_t nnz; // HQRRP-related @@ -193,12 +194,11 @@ int CQRRPT::call( if(this -> timing) saso_t_start = high_resolution_clock::now(); - + /// Generating a SASO - RandBLAS::SparseDist DS(d, m, this->nnz); + RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz}; RandBLAS::SparseSkOp S(DS, state); - RandBLAS::fill_sparse(S); - state = S.next_state; + state = RandBLAS::fill_sparse(S); /// Applying a SASO RandBLAS::sketch_general( @@ -225,8 +225,8 @@ int CQRRPT::call( } /// Using naive rank estimation to ensure that R used for preconditioning is invertible. - /// The actual rank estimate k will be computed a posteriori. - /// Using R[i,i] to approximate the i-th singular value of A_hat. + /// The actual rank estimate k will be computed a posteriori. + /// Using R[i,i] to approximate the i-th singular value of A_hat. /// Truncate at the largest i where R[i,i] / R[0,0] >= eps. for(i = 0; i < n; ++i) { if(std::abs(A_hat[i * d + i]) / std::abs(A_hat[0]) < eps_initial_rank_estimation) { @@ -239,9 +239,7 @@ int CQRRPT::call( if(this -> timing) rank_reveal_t_stop = high_resolution_clock::now(); - /// Extracting a k by k R representation - T* R_sp = R; - lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, ldr); + lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R, ldr); if(this -> timing) a_mod_piv_t_start = high_resolution_clock::now(); @@ -256,7 +254,7 @@ int CQRRPT::call( } // A_pre * R_sp = AP - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp, ldr, A, lda); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R, ldr, A, lda); if(this -> timing) { a_mod_trsm_t_stop = high_resolution_clock::now(); @@ -264,8 +262,8 @@ int CQRRPT::call( } // Do Cholesky QR - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R_sp, ldr); - lapack::potrf(Uplo::Upper, k, R_sp, ldr); + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R, ldr); + lapack::potrf(Uplo::Upper, k, R, ldr); // Re-estimate rank after we have the R-factor form Cholesky QR. // The strategy here is the same as in naive rank estimation. @@ -273,11 +271,11 @@ int CQRRPT::call( // Note that the diagonal of R_sp may not be sorted, so we need to keep the running max/min // We expect the loss in the orthogonality of Q to be approximately equal to u * cond(R_sp)^2, where u is the unit roundoff for the numerical type T. new_rank = k; - running_max = R_sp[0]; - running_min = R_sp[0]; + running_max = R[0]; + running_min = R[0]; for(i = 0; i < k; ++i) { - curr_entry = std::abs(R_sp[i * ldr + i]); + curr_entry = std::abs(R[i * ldr + i]); running_max = std::max(running_max, curr_entry); running_min = std::min(running_min, curr_entry); if(running_max / running_min >= std::sqrt(this->eps / std::numeric_limits::epsilon())) { @@ -286,13 +284,14 @@ int CQRRPT::call( } } - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp, ldr, A, lda); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R, ldr, A, lda); if(this -> timing) cholqr_t_stop = high_resolution_clock::now(); - // Get the final R-factor -- undoing the preconditioning - blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, A_hat, d, R_sp, ldr); + // Get the final R-factor. + RandLAPACK::util::get_U(new_rank, new_rank, R, ldr); + blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, A_hat, d, R, ldr); // Set the rank parameter to the value comuted a posteriori. this->rank = k; diff --git a/RandLAPACK/misc/rl_util.hh b/RandLAPACK/misc/rl_util.hh index 5aebf5bd..3f8d6df3 100644 --- a/RandLAPACK/misc/rl_util.hh +++ b/RandLAPACK/misc/rl_util.hh @@ -167,6 +167,31 @@ void col_swap( } } +template +void col_swap_row_major( + int64_t m, + int64_t n, + int64_t k, + t* a, + int64_t lda, + std::vector idx +) { + if (k > n) + throw std::runtime_error("invalid rank parameter."); + + int64_t i, j; + for (i = 0, j = 0; i < k; ++i) { + j = idx[i] - 1; // adjust for zero-based index + // use blas to swap the whole columns + blas::swap(m, &a[i], lda, &a[j], lda); + + // swap idx array elements + // find idx element with value i and assign it to j + auto it = std::find(idx.begin() + i, idx.begin() + k, i + 1); + idx[it - idx.begin()] = j + 1; + } +} + /// Checks if the given size is larger than available. /// If so, resizes the vector. template diff --git a/test/drivers/test_cqrrpt.cc b/test/drivers/test_cqrrpt.cc index 1335f56a..7ce9e926 100644 --- a/test/drivers/test_cqrrpt.cc +++ b/test/drivers/test_cqrrpt.cc @@ -2,6 +2,7 @@ #include "rl_blaspp.hh" #include "rl_lapackpp.hh" #include "rl_gen.hh" +#include "rl_util.hh" #include #include @@ -22,34 +23,52 @@ class TestCQRRPT : public ::testing::Test int64_t col; int64_t rank; // has to be modifiable std::vector A; + std::vector A_row; std::vector R; std::vector J; std::vector A_cpy1; std::vector A_cpy2; std::vector I_ref; - CQRRPTTestData(int64_t m, int64_t n, int64_t k) : - A(m * n, 0.0), + Layout layout; + + CQRRPTTestData(int64_t m, int64_t n, int64_t k, Layout layout=Layout::ColMajor): + row(m), + col(n), + rank(k), + layout(layout), + A(m * n, 0.0), + A_row(m * n, 0.0), R(n * n, 0.0), - J(n, 0), + J(n, 0), A_cpy1(m * n, 0.0), A_cpy2(m * n, 0.0), - I_ref(k * k, 0.0) - { - row = m; - col = n; - rank = k; - } + I_ref(k * k, 0.0){} + }; + template + void ColMajor2RowMajor( + CQRRPTTestData &all_data + ) { + // Iterate over each row for the output format + for (int64_t i = 0; i < all_data.row; ++i) { + // Copy a single row from A_col to A_row + blas::copy(all_data.col, all_data.A.data() + i, all_data.row, all_data.A_row.data() + i * all_data.col, 1); + } + } + template static void norm_and_copy_computational_helper(T &norm_A, CQRRPTTestData &all_data) { + //NOTE: This function is safe for use with RowMajor inputs as well auto m = all_data.row; auto n = all_data.col; + auto lda = all_data.layout == Layout::ColMajor ? m : n; + + lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), lda); + lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), lda); - lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), m); - lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), m); - norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); + norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), lda); } @@ -57,6 +76,7 @@ class TestCQRRPT : public ::testing::Test template static void error_check(T &norm_A, CQRRPTTestData &all_data) { + //NOTE: This function should be safe for both RowMajor and ColMajor matrices auto m = all_data.row; auto n = all_data.col; @@ -73,33 +93,18 @@ class TestCQRRPT : public ::testing::Test // Check orthogonality of Q // Q' * Q - I = 0 - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, m, -1.0, I_ref_dat, k); + auto ldq = all_data.layout == Layout::ColMajor ? m : n; + blas::syrk(all_data.layout, Uplo::Upper, Op::Trans, k, m, 1.0, Q_dat, ldq, -1.0, I_ref_dat, k); T norm_0 = lapack::lansy(lapack::Norm::Fro, Uplo::Upper, k, I_ref_dat, k); // A - QR - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, m, R_dat, n, -1.0, A_dat, m); - - // Implementing max col norm metric - T max_col_norm = 0.0; - T col_norm = 0.0; - int max_idx = 0; - for(int i = 0; i < n; ++i) { - col_norm = blas::nrm2(m, &A_dat[m * i], 1); - if(max_col_norm < col_norm) { - max_col_norm = col_norm; - max_idx = i; - } - } - T col_norm_A = blas::nrm2(n, &A_cpy_dat[m * max_idx], 1); - T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, m); - + blas::gemm(all_data.layout, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, ldq, R_dat, n, -1.0, A_dat, ldq); + T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, ldq); printf("REL NORM OF AP - QR: %15e\n", norm_AQR / norm_A); - printf("MAX COL NORM METRIC: %15e\n", max_col_norm / col_norm_A); printf("FRO NORM OF (Q'Q - I)/sqrt(n): %2e\n\n", norm_0 / std::sqrt((T) n)); T atol = std::pow(std::numeric_limits::epsilon(), 0.75); ASSERT_LE(norm_AQR, atol * norm_A); - ASSERT_LE(max_col_norm, atol * col_norm_A); ASSERT_LE(norm_0, atol * std::sqrt((T) n)); } @@ -107,7 +112,7 @@ class TestCQRRPT : public ::testing::Test /// Computes QR factorzation, and computes A[:, J] - QR. template static void test_CQRRPT_general( - T d_factor, + T d_factor, T norm_A, CQRRPTTestData &all_data, alg_type &CQRRPT, @@ -116,15 +121,22 @@ class TestCQRRPT : public ::testing::Test auto m = all_data.row; auto n = all_data.col; + //NOTE: should be the sole remaining piece CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state); all_data.rank = CQRRPT.rank; printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank); - RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J); - RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J); + if(all_data.layout == Layout::ColMajor) { + RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy1.data(), m, all_data.J); + RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J); + } + else { + RandLAPACK::util::col_swap_row_major(m, n, n, all_data.A_cpy1.data(), n, all_data.J); + RandLAPACK::util::col_swap_row_major(m, n, n, all_data.A_cpy2.data(), n, all_data.J); + } - error_check(norm_A, all_data); + error_check(norm_A, all_data); } }; From 3569b5f5556f3bb02ed3a938fd7a9972f6ed37d4 Mon Sep 17 00:00:00 2001 From: Aryaman Jeendgar Date: Fri, 17 Jan 2025 03:00:16 -0800 Subject: [PATCH 2/7] Cleaned --- RandLAPACK/drivers/rl_cqrrpt.hh | 40 ++++++++++++++++++--------------- test/drivers/test_cqrrpt.cc | 25 +++++++++++++++++---- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh index 67826de2..eeae595a 100644 --- a/RandLAPACK/drivers/rl_cqrrpt.hh +++ b/RandLAPACK/drivers/rl_cqrrpt.hh @@ -30,7 +30,8 @@ class CQRRPTalg { int64_t ldr, int64_t* J, T d_factor, - RandBLAS::RNGState &state + RandBLAS::RNGState &state, + Layout layout ) = 0; }; @@ -116,7 +117,8 @@ class CQRRPT : public CQRRPTalg { int64_t ldr, int64_t* J, T d_factor, - RandBLAS::RNGState &state + RandBLAS::RNGState &state, + Layout layout=Layout::ColMajor ) override; public: @@ -128,7 +130,6 @@ class CQRRPT : public CQRRPTalg { std::vector times; // tuning SASOS - int num_threads; int64_t nnz; // HQRRP-related @@ -150,7 +151,8 @@ int CQRRPT::call( int64_t ldr, int64_t* J, T d_factor, - RandBLAS::RNGState &state + RandBLAS::RNGState &state, + Layout layout ){ ///--------------------TIMING VARS--------------------/ high_resolution_clock::time_point saso_t_stop; @@ -196,13 +198,14 @@ int CQRRPT::call( saso_t_start = high_resolution_clock::now(); /// Generating a SASO - RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz}; + RandBLAS::SparseDist DS(d, m, this->nnz); RandBLAS::SparseSkOp S(DS, state); - state = RandBLAS::fill_sparse(S); + RandBLAS::fill_sparse(S); + state = S.next_state; /// Applying a SASO RandBLAS::sketch_general( - Layout::ColMajor, Op::NoTrans, Op::NoTrans, + layout, Op::NoTrans, Op::NoTrans, d, n, m, (T) 1.0, S, 0, 0, A, lda, (T) 0.0, A_hat, d ); @@ -239,7 +242,9 @@ int CQRRPT::call( if(this -> timing) rank_reveal_t_stop = high_resolution_clock::now(); - lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R, ldr); + /// Extracting a k by k R representation + T* R_sp = R; + lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, ldr); if(this -> timing) a_mod_piv_t_start = high_resolution_clock::now(); @@ -254,7 +259,7 @@ int CQRRPT::call( } // A_pre * R_sp = AP - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R, ldr, A, lda); + blas::trsm(layout, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp, ldr, A, lda); if(this -> timing) { a_mod_trsm_t_stop = high_resolution_clock::now(); @@ -262,8 +267,8 @@ int CQRRPT::call( } // Do Cholesky QR - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R, ldr); - lapack::potrf(Uplo::Upper, k, R, ldr); + blas::syrk(layout, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R_sp, ldr); + lapack::potrf(Uplo::Upper, k, R_sp, ldr); // Re-estimate rank after we have the R-factor form Cholesky QR. // The strategy here is the same as in naive rank estimation. @@ -271,11 +276,11 @@ int CQRRPT::call( // Note that the diagonal of R_sp may not be sorted, so we need to keep the running max/min // We expect the loss in the orthogonality of Q to be approximately equal to u * cond(R_sp)^2, where u is the unit roundoff for the numerical type T. new_rank = k; - running_max = R[0]; - running_min = R[0]; + running_max = R_sp[0]; + running_min = R_sp[0]; for(i = 0; i < k; ++i) { - curr_entry = std::abs(R[i * ldr + i]); + curr_entry = std::abs(R_sp[i * ldr + i]); running_max = std::max(running_max, curr_entry); running_min = std::min(running_min, curr_entry); if(running_max / running_min >= std::sqrt(this->eps / std::numeric_limits::epsilon())) { @@ -284,14 +289,13 @@ int CQRRPT::call( } } - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R, ldr, A, lda); + blas::trsm(layout, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp, ldr, A, lda); if(this -> timing) cholqr_t_stop = high_resolution_clock::now(); - // Get the final R-factor. - RandLAPACK::util::get_U(new_rank, new_rank, R, ldr); - blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, A_hat, d, R, ldr); + // Get the final R-factor -- undoing the preconditioning + blas::trmm(layout, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, A_hat, d, R_sp, ldr); // Set the rank parameter to the value comuted a posteriori. this->rank = k; diff --git a/test/drivers/test_cqrrpt.cc b/test/drivers/test_cqrrpt.cc index 7ce9e926..d50544fa 100644 --- a/test/drivers/test_cqrrpt.cc +++ b/test/drivers/test_cqrrpt.cc @@ -36,14 +36,16 @@ class TestCQRRPT : public ::testing::Test row(m), col(n), rank(k), - layout(layout), A(m * n, 0.0), A_row(m * n, 0.0), R(n * n, 0.0), J(n, 0), A_cpy1(m * n, 0.0), A_cpy2(m * n, 0.0), - I_ref(k * k, 0.0){} + A_cpy3(m * n, 0.0), + I_ref(k * k, 0.0), + layout(layout) + {} }; @@ -99,12 +101,28 @@ class TestCQRRPT : public ::testing::Test // A - QR blas::gemm(all_data.layout, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, ldq, R_dat, n, -1.0, A_dat, ldq); + + // Implementing max col norm metric + T max_col_norm = 0.0; + T col_norm = 0.0; + int max_idx = 0; + for(int i = 0; i < n; ++i) { + col_norm = blas::nrm2(m, &A_dat[ldq * i], 1); + if(max_col_norm < col_norm) { + max_col_norm = col_norm; + max_idx = i; + } + } + T col_norm_A = blas::nrm2(n, &A_cpy_dat[ldq * max_idx], 1); T norm_AQR = lapack::lange(Norm::Fro, m, n, A_dat, ldq); + printf("REL NORM OF AP - QR: %15e\n", norm_AQR / norm_A); + printf("MAX COL NORM METRIC: %15e\n", max_col_norm / col_norm_A); printf("FRO NORM OF (Q'Q - I)/sqrt(n): %2e\n\n", norm_0 / std::sqrt((T) n)); T atol = std::pow(std::numeric_limits::epsilon(), 0.75); ASSERT_LE(norm_AQR, atol * norm_A); + ASSERT_LE(max_col_norm, atol * col_norm_A); ASSERT_LE(norm_0, atol * std::sqrt((T) n)); } @@ -121,8 +139,7 @@ class TestCQRRPT : public ::testing::Test auto m = all_data.row; auto n = all_data.col; - //NOTE: should be the sole remaining piece - CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state); + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state, all_data.layout); all_data.rank = CQRRPT.rank; printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank); From 278f6277cd455db58656442608b376657ffc3d1d Mon Sep 17 00:00:00 2001 From: Aryaman Jeendgar Date: Fri, 17 Jan 2025 10:59:22 -0800 Subject: [PATCH 3/7] Added a `RowMajor` test --- test/drivers/test_cqrrpt.cc | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/test/drivers/test_cqrrpt.cc b/test/drivers/test_cqrrpt.cc index d50544fa..50a93ea3 100644 --- a/test/drivers/test_cqrrpt.cc +++ b/test/drivers/test_cqrrpt.cc @@ -42,7 +42,6 @@ class TestCQRRPT : public ::testing::Test J(n, 0), A_cpy1(m * n, 0.0), A_cpy2(m * n, 0.0), - A_cpy3(m * n, 0.0), I_ref(k * k, 0.0), layout(layout) {} @@ -182,6 +181,33 @@ TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp) { test_CQRRPT_general(d_factor, norm_A, all_data, CQRRPT, state); } +TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp_RowMajor) { + int64_t m = 10; + int64_t n = 5; + int64_t k = 5; + double d_factor = 2; + double norm_A = 0; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + Layout layout = Layout::RowMajor; + + CQRRPTTestData all_data(m, n, k, layout); + RandLAPACK::CQRRPT CQRRPT(false, tol); + CQRRPT.nnz = 2; + CQRRPT.no_hqrrp = 1; + + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::polynomial); + m_info.cond_num = 2; + m_info.rank = k; + m_info.exponent = 2.0; + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + + ColMajor2RowMajor(all_data); + + norm_and_copy_computational_helper(norm_A, all_data); + test_CQRRPT_general(d_factor, norm_A, all_data, CQRRPT, state); +} + TEST_F(TestCQRRPT, CQRRPT_low_rank_with_hqrrp) { int64_t m = 10000; int64_t n = 200; From 7c441a44dfe8084b4eebb5d6d363e35dee162c8d Mon Sep 17 00:00:00 2001 From: Aryaman Jeendgar Date: Fri, 17 Jan 2025 11:02:55 -0800 Subject: [PATCH 4/7] Zeroing out below-diagonal entries of `R` --- RandLAPACK/drivers/rl_cqrrpt.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh index eeae595a..c366b9bc 100644 --- a/RandLAPACK/drivers/rl_cqrrpt.hh +++ b/RandLAPACK/drivers/rl_cqrrpt.hh @@ -295,6 +295,7 @@ int CQRRPT::call( cholqr_t_stop = high_resolution_clock::now(); // Get the final R-factor -- undoing the preconditioning + RandLAPACK::util::get_U(new_rank, new_rank, R_sp, ldr); blas::trmm(layout, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, A_hat, d, R_sp, ldr); // Set the rank parameter to the value comuted a posteriori. From 5693c96131834e5b003ea697ea659250bceb241a Mon Sep 17 00:00:00 2001 From: Aryaman Jeendgar Date: Fri, 17 Jan 2025 13:23:19 -0800 Subject: [PATCH 5/7] CI fix --- .github/workflows/core-linux.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/core-linux.yaml b/.github/workflows/core-linux.yaml index 13fe44e9..b992dadc 100644 --- a/.github/workflows/core-linux.yaml +++ b/.github/workflows/core-linux.yaml @@ -14,11 +14,6 @@ jobs: steps: - uses: actions/checkout@v2 - - name: webfactory/ssh-agent - uses: webfactory/ssh-agent@v0.5.4 - with: - ssh-private-key: ${{ secrets.PRIVATE_CLONE_SSH_KEY }} - - name: submodule update run: | git submodule update --init --recursive From 8584bfaa7252551abfa6768ffc6acbfefa9c0754 Mon Sep 17 00:00:00 2001 From: Aryaman Jeendgar Date: Fri, 17 Jan 2025 13:38:11 -0800 Subject: [PATCH 6/7] MacOS CI --- .github/workflows/core-macos.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/core-macos.yaml b/.github/workflows/core-macos.yaml index 76a8c59b..d8a45f7f 100644 --- a/.github/workflows/core-macos.yaml +++ b/.github/workflows/core-macos.yaml @@ -14,11 +14,6 @@ jobs: steps: - uses: actions/checkout@v2 - - name: webfactory/ssh-agent - uses: webfactory/ssh-agent@v0.5.4 - with: - ssh-private-key: ${{ secrets.PRIVATE_CLONE_SSH_KEY }} - - name: submodule update run: | git submodule update --init --recursive From d0fd7399227ee464262114a5b4f54635eaeb5ef9 Mon Sep 17 00:00:00 2001 From: Aryaman Jeendgar Date: Fri, 17 Jan 2025 15:09:09 -0800 Subject: [PATCH 7/7] Making `potrf | geqp3` happy --- RandLAPACK/drivers/rl_cqrrpt.hh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh index c366b9bc..8d040c02 100644 --- a/RandLAPACK/drivers/rl_cqrrpt.hh +++ b/RandLAPACK/drivers/rl_cqrrpt.hh @@ -204,8 +204,9 @@ int CQRRPT::call( state = S.next_state; /// Applying a SASO + auto layout_flag_1 = layout == Layout::ColMajor ? Op::NoTrans : Op::Trans; RandBLAS::sketch_general( - layout, Op::NoTrans, Op::NoTrans, + layout, Op::NoTrans, layout_flag_1, d, n, m, (T) 1.0, S, 0, 0, A, lda, (T) 0.0, A_hat, d ); @@ -244,7 +245,8 @@ int CQRRPT::call( /// Extracting a k by k R representation T* R_sp = R; - lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, ldr); + auto layout_flag_2 = layout == Layout::ColMajor ? MatrixType::Upper : MatrixType::Lower; + lapack::lacpy(layout_flag_2, k, k, A_hat, d, R_sp, ldr); if(this -> timing) a_mod_piv_t_start = high_resolution_clock::now(); @@ -268,7 +270,8 @@ int CQRRPT::call( // Do Cholesky QR blas::syrk(layout, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R_sp, ldr); - lapack::potrf(Uplo::Upper, k, R_sp, ldr); + auto layout_flag_3 = layout == Layout::ColMajor ? Uplo::Upper : Uplo::Lower; + lapack::potrf(layout_flag_3, k, R_sp, ldr); // Re-estimate rank after we have the R-factor form Cholesky QR. // The strategy here is the same as in naive rank estimation.