From 26e3952ee35d67ff69d56c61cf5326b1a3360e19 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 23 Oct 2023 08:45:10 -0700 Subject: [PATCH 01/56] Adding RBKI files and a Gemm vs ormqr benchmark --- RandLAPACK/CMakeLists.txt | 1 + RandLAPACK/drivers/rl_rbki.hh | 155 ++++++++++++++++++++++++++++++++++ benchmark/CMakeLists.txt | 2 +- benchmark/Gemm_vs_ormqr.cc | 76 +++++++++++++++++ test/CMakeLists.txt | 1 + test/drivers/test_rbki.cc | 9 ++ 6 files changed, 243 insertions(+), 1 deletion(-) create mode 100644 RandLAPACK/drivers/rl_rbki.hh create mode 100644 benchmark/Gemm_vs_ormqr.cc create mode 100644 test/drivers/test_rbki.cc diff --git a/RandLAPACK/CMakeLists.txt b/RandLAPACK/CMakeLists.txt index d8f0f694..cd25e91a 100644 --- a/RandLAPACK/CMakeLists.txt +++ b/RandLAPACK/CMakeLists.txt @@ -1,5 +1,6 @@ set(RandLAPACK_cxx_sources + #rl_rbki.hh rl_lapackpp.hh rl_cqrrpt.hh rl_cqrrp.hh diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh new file mode 100644 index 00000000..a5d872c6 --- /dev/null +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -0,0 +1,155 @@ +#ifndef randlapack_rbki_h +#define randlapack_rbki_h + +#include "rl_util.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_hqrrp.hh" + +#include +#include +#include +#include +#include + +using namespace std::chrono; + +namespace RandLAPACK { + +template +class RBKIalg { + public: + + virtual ~RBKIalg() {} + + virtual int call( + int64_t m, + int64_t n, + std::vector &A, + int64_t d, + std::vector &R, + std::vector &J, + RandBLAS::RNGState &state + ) = 0; +}; + +template +class RBKI : public RBKIalg { + public: + + RBKI( + bool verb, + bool time_subroutines, + T ep + ) { + verbosity = verb; + timing = time_subroutines; + eps = ep; + no_hqrrp = 1; + nb_alg = 64; + oversampling = 10; + use_cholqr = 0; + panel_pivoting = 1; + naive_rank_estimate = 1; + use_fro_norm = 1; + cond_check = 0; + } + + int call( + int64_t m, + int64_t n, + std::vector &A, + int64_t d, + std::vector &R, + std::vector &J, + RandBLAS::RNGState &state + ) override; + + public: + bool verbosity; + bool timing; + bool cond_check; + T eps; + int64_t rank; + + // 10 entries + std::vector times; + + // tuning SASOS + int num_threads; + int64_t nnz; + + // Buffers + std::vector A_hat; + std::vector tau; + std::vector R_sp; + + // HQRRP-related + int no_hqrrp; + int64_t nb_alg; + int64_t oversampling; + int64_t panel_pivoting; + int64_t use_cholqr; + + // Rank estimate-related + int naive_rank_estimate; + int use_fro_norm; + + // Preconditioning-related + T cond_num_A_pre; + T cond_num_A_norm_pre; +}; + +// ----------------------------------------------------------------------------- +template +int RBKI::call( + int64_t m, + int64_t n, + T* &A, + int64_t lda, + int64_t k, + T tol, + T* &U, + T* &V, + T* &S, + RandBLAS::RNGState &state +){ + int64_t iter = 0; + + // Sketching operator space. + T* Y = ( T * ) calloc( n * k, sizeof( T ) ); + // X_i space + T* Y = ( T * ) calloc( m * k, sizeof( T ) ); + // tau space for QR + T* tau = ( T * ) calloc( k, sizeof( T ) ); + + // Pre-conpute Fro norm of an input matrix. + T norm_A = lapack::lange(Norm::Fro, m, n, A, lda); + T sq_tol = std::pow(tol, 2); + + // Generate a dense Gaussian random matrx. + RandBLAS::DenseDist D(n, k); + state = RandBLAS::fill_dense(D, Y, state).second; + + // [X_i, ~] = qr(A * Y_i, 0) + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y, n, 0.0, X, m); + lapack::geqrf(m, k, X, k, tau); + // Y_i = A' * X_i + // Below operation will instead return Y_i' because ORMQR() does not have an option to transpose + // a non-inplicit matrix. + lapack::ormqr(Side::Left, Op::Trans, k, n, m, X, m, tau, A, lda); + + // Iterate until in-loop termination criteria is met. + while(1) { + if (i % 2 == 0) { + + } + else { + + } + } + + return 0; +} +} // end namespace RandLAPACK +#endif \ No newline at end of file diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index ed608603..40a78494 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -60,4 +60,4 @@ add_benchmark(NAME CQRRPT_pivot_quality CXX_SOURCES bench_CQRRPT/CQRRPT_pivo # CQRRP benchmarks add_benchmark(NAME CQRRP_speed_comparisons CXX_SOURCES bench_CQRRP/CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) add_benchmark(NAME CQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) \ No newline at end of file +add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) diff --git a/benchmark/Gemm_vs_ormqr.cc b/benchmark/Gemm_vs_ormqr.cc new file mode 100644 index 00000000..1f587f84 --- /dev/null +++ b/benchmark/Gemm_vs_ormqr.cc @@ -0,0 +1,76 @@ +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" + +#include +#include +#include +/* +Auxillary benchmark routine, computes flops using GEMM for a given system +*/ + +using namespace std::chrono; +using namespace RandLAPACK; + +template +static void +test_speed(int64_t m, int64_t n, int64_t runs, RandBLAS::RNGState const_state) { + + // Matrix to decompose. + std::vector A(m * n, 0.0); + // Matrix to apply the Q-factor to. + std::vector B1(m * n, 0.0); + std::vector B2(m * n, 0.0); + std::vector Product(n * n, 0.0); + std::vector tau(n, 0.0); + + T* A_dat = A.data(); + T* B1_dat = B1.data(); + T* B2_dat = B2.data(); + T* Product_dat = Product.data(); + T* tau_dat = tau.data(); + + T mean_gflop_rate_gemm = 0; + T mean_gflop_rate_ormqr = 0; + + for (int i = 0; i < runs; ++i) { + auto state = const_state; + + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info, A, state); + RandLAPACK::gen::mat_gen(m_info, B1, state); + lapack::lacpy(MatrixType::General, m, n, B1_dat, m, B2_dat, m); + + // Get the implicit Q-factor in A_dat + lapack::geqrf(m, n, A_dat, m, tau_dat); + + auto start_ormqr = high_resolution_clock::now(); + lapack::ormqr(Side::Left, Op::Trans, m, n, n, A_dat, m, tau_dat, B1_dat, m); + auto stop_ormqr = high_resolution_clock::now(); + long dur_ormqr = duration_cast(stop_ormqr - start_ormqr).count(); + + auto start_gemm = high_resolution_clock::now(); + lapack::ungqr(m, n, n, A_dat, m, tau_dat); + gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, n, m, 1.0, A_dat, m, B2_dat, m, 0.0, Product_dat, n); + auto stop_gemm = high_resolution_clock::now(); + long dur_gemm = duration_cast(stop_gemm - start_gemm).count(); + + T gflop_count_gemm = (2 * std::pow(n, 2) * m) / std::pow(10, 9); + if (i != 0) { + mean_gflop_rate_gemm += gflop_count_gemm / dur_gemm; + mean_gflop_rate_ormqr += gflop_count_gemm / dur_ormqr; + } + } + + printf("%f %f\n", mean_gflop_rate_gemm / (runs - 1), mean_gflop_rate_ormqr / (runs - 1)); +} + +int main() { + auto state = RandBLAS::RNGState(); + test_speed(std::pow(2, 10), std::pow(2, 5), 10, state); + test_speed(std::pow(2, 11), std::pow(2, 6), 10, state); + test_speed(std::pow(2, 12), std::pow(2, 7), 10, state); + test_speed(std::pow(2, 13), std::pow(2, 8), 10, state); + test_speed(std::pow(2, 14), std::pow(2, 9), 10, state); + test_speed(std::pow(2, 15), std::pow(2, 10), 10, state); + return 0; +} \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4264986c..c3d3f39e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,6 +18,7 @@ if (GTest_FOUND) drivers/test_cqrrp.cc drivers/test_revd2.cc drivers/test_hqrrp.cc + #drivers/test_rbki.cc ) add_executable(RandLAPACK_tests ${RandLAPACK_test_srcs}) diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc new file mode 100644 index 00000000..ebbc5740 --- /dev/null +++ b/test/drivers/test_rbki.cc @@ -0,0 +1,9 @@ +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_gen.hh" + +#include +#include +#include + From dd5dd14dac9e57b44ee4d5d07efe21d42340009b Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Fri, 27 Oct 2023 07:43:34 -0700 Subject: [PATCH 02/56] R not transposed --- RandLAPACK.hh | 1 + RandLAPACK/CMakeLists.txt | 2 +- RandLAPACK/drivers/rl_rbki.hh | 238 ++++++++++++++++++++++++---------- test/CMakeLists.txt | 2 +- test/drivers/test_cqrrp.cc | 70 +++++++++- test/drivers/test_rbki.cc | 75 +++++++++++ 6 files changed, 318 insertions(+), 70 deletions(-) diff --git a/RandLAPACK.hh b/RandLAPACK.hh index 36e9cdd4..875d1774 100644 --- a/RandLAPACK.hh +++ b/RandLAPACK.hh @@ -21,5 +21,6 @@ #include "RandLAPACK/drivers/rl_cqrrpt.hh" #include "RandLAPACK/drivers/rl_cqrrp.hh" #include "RandLAPACK/drivers/rl_revd2.hh" +#include "RandLAPACK/drivers/rl_rbki.hh" #endif diff --git a/RandLAPACK/CMakeLists.txt b/RandLAPACK/CMakeLists.txt index cd25e91a..3c288286 100644 --- a/RandLAPACK/CMakeLists.txt +++ b/RandLAPACK/CMakeLists.txt @@ -1,6 +1,6 @@ set(RandLAPACK_cxx_sources - #rl_rbki.hh + rl_rbki.hh rl_lapackpp.hh rl_cqrrpt.hh rl_cqrrp.hh diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index a5d872c6..2f999aba 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -25,10 +25,12 @@ class RBKIalg { virtual int call( int64_t m, int64_t n, - std::vector &A, - int64_t d, - std::vector &R, - std::vector &J, + T* A, + int64_t lda, + int64_t k, + T* U, + T* V, + T* Sigma, RandBLAS::RNGState &state ) = 0; }; @@ -44,60 +46,25 @@ class RBKI : public RBKIalg { ) { verbosity = verb; timing = time_subroutines; - eps = ep; - no_hqrrp = 1; - nb_alg = 64; - oversampling = 10; - use_cholqr = 0; - panel_pivoting = 1; - naive_rank_estimate = 1; - use_fro_norm = 1; - cond_check = 0; + tol = ep; } int call( int64_t m, int64_t n, - std::vector &A, - int64_t d, - std::vector &R, - std::vector &J, + T* A, + int64_t lda, + int64_t k, + T* U, + T* V, + T* Sigma, RandBLAS::RNGState &state ) override; public: bool verbosity; bool timing; - bool cond_check; - T eps; - int64_t rank; - - // 10 entries - std::vector times; - - // tuning SASOS - int num_threads; - int64_t nnz; - - // Buffers - std::vector A_hat; - std::vector tau; - std::vector R_sp; - - // HQRRP-related - int no_hqrrp; - int64_t nb_alg; - int64_t oversampling; - int64_t panel_pivoting; - int64_t use_cholqr; - - // Rank estimate-related - int naive_rank_estimate; - int use_fro_norm; - - // Preconditioning-related - T cond_num_A_pre; - T cond_num_A_norm_pre; + T tol; }; // ----------------------------------------------------------------------------- @@ -105,50 +72,187 @@ template int RBKI::call( int64_t m, int64_t n, - T* &A, + T* A, int64_t lda, int64_t k, - T tol, - T* &U, - T* &V, - T* &S, + T* U, + T* V, + T* Sigma, RandBLAS::RNGState &state ){ - int64_t iter = 0; + int64_t iter, iter_od, iter_ev = 0; + T norm_R = 0; - // Sketching operator space. - T* Y = ( T * ) calloc( n * k, sizeof( T ) ); - // X_i space - T* Y = ( T * ) calloc( m * k, sizeof( T ) ); + // Space for Y_i and Y_odd. (maybe needs to be n by m + k) + T* Y = ( T * ) calloc( n * m, sizeof( T ) ); + // Space for X_i and X_ev. (maybe needs to be m by m + k) + T* X = ( T * ) calloc( m * m, sizeof( T ) ); // tau space for QR T* tau = ( T * ) calloc( k, sizeof( T ) ); + // + T* R = ( T * ) calloc( n * n, sizeof( T ) ); + T* S = ( T * ) calloc( n * n, sizeof( T ) ); + + // Pointers allocation + // This will be offset by n * k at every even iteration. + T* Y_i = Y; + // This stays the same throughout execution. + T* Y_od = Y; + T* R_i = NULL; + T* R_ii = R; + + T* X_i = X; //&X_ev[m * k]; + T* X_ev = X; + T* S_i = S; + T* S_ii = &S[k]; // Pre-conpute Fro norm of an input matrix. T norm_A = lapack::lange(Norm::Fro, m, n, A, lda); - T sq_tol = std::pow(tol, 2); + T sq_tol = std::pow(this->tol, 2); // Generate a dense Gaussian random matrx. RandBLAS::DenseDist D(n, k); - state = RandBLAS::fill_dense(D, Y, state).second; + state = RandBLAS::fill_dense(D, Y_i, state).second; + + char name [] = "A input"; + RandBLAS::util::print_colmaj(m, n, A, name); + + char name1 [] = "Y sketching"; + RandBLAS::util::print_colmaj(n, k, Y_i, name1); + char name2 [] = "Y_od"; + char name3 [] = "R"; + + char name4 [] = "X_ev"; + char name5 [] = "S"; + + char name6 [] = "Y_i"; + char name7 [] = "X_i"; + + // [X_ev, ~] = qr(A * Y_i, 0) + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); + lapack::geqrf(m, k, X_i, m, tau); + // Convert X_i into an explicit form. It is now stored in X_ev as it should be + lapack::ungqr(m, k, k, X_i, m, tau); - // [X_i, ~] = qr(A * Y_i, 0) - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y, n, 0.0, X, m); - lapack::geqrf(m, k, X, k, tau); - // Y_i = A' * X_i - // Below operation will instead return Y_i' because ORMQR() does not have an option to transpose - // a non-inplicit matrix. - lapack::ormqr(Side::Left, Op::Trans, k, n, m, X, m, tau, A, lda); + // Advance odd iteration count; + ++iter_od; // Iterate until in-loop termination criteria is met. while(1) { - if (i % 2 == 0) { + if (iter % 2 == 0) { + // Y_i = A' * X_i + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, k, m, 1.0, A, m, X_i, m, 0.0, Y_i, n); + + if (iter == 2) { + printf("PROBLEM CASE START\n"); + + //RandBLAS::util::print_colmaj(m, n, A, name); + RandBLAS::util::print_colmaj(m, k, X_i, name7); + + char name8 [] = "Y_i = A' * X_i"; + RandBLAS::util::print_colmaj(n, k, Y_i, name8); + } + + + // Move the X_i pointer; + X_i = &X_i[m * k]; + + if (iter != 0) { + // R_i = Y_od' * Y_i + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_ev * k, k, n, 1.0, Y_od, n, Y_i, n, 0.0, R_i, n); + + if (iter == 2) { + char name9 [] = "R_i = Y_od' * Y_i"; + RandBLAS::util::print_colmaj(n, n, R, name9); + } + + // Y_i = Y_i - Y_od * R_i + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, n, k, iter_ev * k, -1.0, Y_od, n, R_i, n, 1.0, Y_i, n); + } + + RandBLAS::util::print_colmaj(n, k, Y_i, name6); + + // [Y_i, R_ii] = qr(Y_i, 0) + std::fill(&tau[0], &tau[k], 0.0); + lapack::geqrf(n, k, Y_i, n, tau); + + // Early termination + // if (abs(R(end)) <= sqrt(eps('double'))) + if(std::abs(Y_i[n * k - 1]) < std::sqrt(std::numeric_limits::epsilon())) + break; + + // Copy R_ii over to R's space under R_i (offset down by iter * k) + lapack::lacpy(MatrixType::Upper, k, k, Y_i, n, R_ii, n); + // Convert Y_i into an explicit form. It is now stored in Y_odd as it should be + lapack::ungqr(n, k, k, Y_i, n, tau); + + //RandBLAS::util::print_colmaj(n, m, Y_od, name2); + RandBLAS::util::print_colmaj(n, n, R, name3); + + // Advance R pointers + iter == 0 ? R_i = &R_ii[n * k] : R_i = &R_i[n * k]; + R_ii = &R_ii[(n + 1) * k]; + + // Advance even iteration count; + ++iter_ev; + + if(iter == 4) + return 0; } else { + // X_i = A * Y_i + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); + + // Move the X_i pointer; + Y_i = &Y_i[n * k]; + // S_i = X_ev' * X_i + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, S_i, n); + + //X_i = X_i - X_ev * S_i; + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, S_i, n, 1.0, X_i, m); + + RandBLAS::util::print_colmaj(m, k, X_i, name7); + + // [X_i, S_ii] = qr(X_i, 0); + std::fill(&tau[0], &tau[k], 0.0); + lapack::geqrf(m, k, X_i, m, tau); + + // Early termination + // if (abs(S(end)) <= sqrt(eps('double'))) + if(std::abs(X_i[m * k - 1]) < std::sqrt(std::numeric_limits::epsilon())) + break; + + // Copy S_ii over to S's space under S_i (offset down by iter * k) + lapack::lacpy(MatrixType::Upper, k, k, X_i, m, S_ii, n); + // Convert X_i into an explicit form. It is now stored in X_ev as it should be + lapack::ungqr(m, k, k, X_i, m, tau); + + RandBLAS::util::print_colmaj(m, m, X_ev, name4); + RandBLAS::util::print_colmaj(n, n, S, name5); + + // Advance R pointers + S_i = &S_i[n * k]; + S_ii = &S_ii[(n + 1) * k]; + + // Advance odd iteration count; + ++iter_od; } + ++iter; + norm_R = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, n, n, R, n); + printf("norm_R: %e\n", norm_R); } + /* + if (iter % 2 == 0) { + lapack::gesdd(Job::SomeVec, n, iter * k, R, n, Sigma, U_hat, n, VT_hat, n); + } else { + lapack::gesdd(Job::SomeVec, iter * k, iter * k, S, n, Sigma, U_hat, n, VT_hat, n); + //blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter * k, k, m, 1.0, X_ev, m, U_hat, m, 0.0, &S[n * iter * k], n); + //blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter * k, k, m, 1.0, Y_od, m, V_hat, m, 0.0, &S[n * iter * k], n); + } + */ return 0; } } // end namespace RandLAPACK diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c3d3f39e..3fb4c8b6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ if (GTest_FOUND) drivers/test_cqrrp.cc drivers/test_revd2.cc drivers/test_hqrrp.cc - #drivers/test_rbki.cc + drivers/test_rbki.cc ) add_executable(RandLAPACK_tests ${RandLAPACK_test_srcs}) diff --git a/test/drivers/test_cqrrp.cc b/test/drivers/test_cqrrp.cc index 29db6af3..2129f1c9 100644 --- a/test/drivers/test_cqrrp.cc +++ b/test/drivers/test_cqrrp.cc @@ -225,4 +225,72 @@ TEST_F(TestCQRRP, CQRRP_blocked_low_rank) { #if !defined(__APPLE__) test_CQRRP_general>(d_factor, norm_A, all_data, CQRRP_blocked, state); #endif -} \ No newline at end of file +} + + + +// Note: If Subprocess killed exception -> reload vscode +TEST_F(TestCQRRP, something) { + int64_t m = 10; + int64_t n = 5; + auto state = RandBLAS::RNGState(); + + std::vector A(m * n, 0.0); + std::vector B(m * n, 0.0); + std::vector C(m * 2 * n, 0.0); + std::vector D(m * n, 0.0); + std::vector D_cpy(m * n, 0.0); + std::vector D_space(m * n, 0.0); + + std::vector tau(n * 2, 0.0); + + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info, A, state); + RandLAPACK::gen::mat_gen(m_info, B, state); + RandLAPACK::gen::mat_gen(m_info, D, state); + lapack::lacpy(MatrixType::General, m, n, D.data(), m, D_cpy.data(), m); + + lapack::geqrf(m, n, A.data(), m, tau.data()); + lapack::geqrf(m, n, B.data(), m, tau.data() + n); + + // Method 1 + lapack::lacpy(MatrixType::Lower, m, n, A.data(), m, C.data(), m); + lapack::lacpy(MatrixType::Lower, m, n, B.data(), m, C.data() + (m * n), m); + lapack::ormqr(Side::Left, Op::NoTrans, m, n, m, C.data(), m, tau.data(), D.data(), m); + + char name [] = "D through ormqr"; + RandBLAS::util::print_colmaj(m, n, D.data(), name); + + // Method 2 + lapack::ungqr(m, n, n, A.data(), m, tau.data()); + lapack::ungqr(m, n, n, B.data(), m, tau.data() + n); + + lapack::lacpy(MatrixType::General, m, n, A.data(), m, C.data(), m); + lapack::lacpy(MatrixType::General, m, n, B.data(), m, C.data() + (m * n), m); + + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, m, n, m, 1.0, C.data(), m, D_cpy.data(), m, 0.0, D_space.data(), m); + + char name1 [] = "D through gemm"; + RandBLAS::util::print_colmaj(m, n, D_space.data(), name1); +} + +/* +// Note: If Subprocess killed exception -> reload vscode +TEST_F(TestCQRRP, something2) { + int64_t m = 10; + int64_t n = 5; + auto state = RandBLAS::RNGState(); + + std::vector A(m * n, 0.0); + std::vector tau(n * 2, 0.0); + + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info, A, state); + + lapack::geqr(m, n, A.data(), m, tau.data(), -1); + int64_t tsize = (int64_t) t_3[0]; + t_3.resize(tsize); + auto sart_geqr = high_resolution_clock::now(); + lapack::geqr(m, n, A_1.data(), m, t_3.data(), tsize); +} +*/ \ No newline at end of file diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc index ebbc5740..81814580 100644 --- a/test/drivers/test_rbki.cc +++ b/test/drivers/test_rbki.cc @@ -7,3 +7,78 @@ #include #include + +class TestRBKI : public ::testing::Test +{ + protected: + + virtual void SetUp() {}; + + virtual void TearDown() {}; + + template + struct RBKITestData { + int64_t row; + int64_t col; + int64_t rank; // has to be modifiable + std::vector A; + std::vector U; + std::vector V; + std::vector Sigma; + std::vector A_cpy; + + RBKITestData(int64_t m, int64_t n, int64_t k) : + A(m * n, 0.0), + U(m * n, 0.0), + V(n * n, 0.0), + Sigma(n, 0.0), + A_cpy(m * n, 0.0) + { + row = m; + col = n; + rank = k; + } + }; + + template + static void norm_and_copy_computational_helper(T &norm_A, RBKITestData &all_data) { + auto m = all_data.row; + auto n = all_data.col; + + lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy.data(), m); + norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); + } + + template + static void test_RBKI_general( + T norm_A, + RBKITestData &all_data, + alg_type &RBKI, + RandBLAS::RNGState &state) { + + auto m = all_data.row; + auto n = all_data.col; + auto k = all_data.rank; + + RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); + } +}; + +// Note: If Subprocess killed exception -> reload vscode +TEST_F(TestRBKI, RBKI_basic) { + int64_t m = 10; + int64_t n = 8; + int64_t k = 2; + double norm_A = 0; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + + RBKITestData all_data(m, n, k); + RandLAPACK::RBKI RBKI(false, false, tol); + + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + + norm_and_copy_computational_helper(norm_A, all_data); + test_RBKI_general>(norm_A, all_data, RBKI, state); +} From c2fa6982303738c46eb2a342a4f2f33834246dc7 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 31 Oct 2023 09:24:51 -0700 Subject: [PATCH 03/56] Works for small cases, print statements in --- RandLAPACK/drivers/rl_cqrrp.hh | 1 + RandLAPACK/drivers/rl_rbki.hh | 155 ++++++++++++++++++++------------- test/drivers/test_rbki.cc | 18 +++- 3 files changed, 112 insertions(+), 62 deletions(-) diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh index 90eec124..765df493 100644 --- a/RandLAPACK/drivers/rl_cqrrp.hh +++ b/RandLAPACK/drivers/rl_cqrrp.hh @@ -324,6 +324,7 @@ int CQRRP_blocked::call( // Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk. // Get a transpose of A_sk + #pragma omp parallel for for(i = 0; i < cols; ++i) blas::copy(sampling_dimension, &A_sk[i * d], 1, &A_sk_trans[i], n); // Perform a row-pivoted LU on a transpose of A_sk diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 2f999aba..a8f9f717 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -29,7 +29,7 @@ class RBKIalg { int64_t lda, int64_t k, T* U, - T* V, + T* VT, T* Sigma, RandBLAS::RNGState &state ) = 0; @@ -56,7 +56,7 @@ class RBKI : public RBKIalg { int64_t lda, int64_t k, T* U, - T* V, + T* VT, T* Sigma, RandBLAS::RNGState &state ) override; @@ -76,11 +76,11 @@ int RBKI::call( int64_t lda, int64_t k, T* U, - T* V, + T* VT, T* Sigma, RandBLAS::RNGState &state ){ - int64_t iter, iter_od, iter_ev = 0; + int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0; T norm_R = 0; // Space for Y_i and Y_odd. (maybe needs to be n by m + k) @@ -91,7 +91,7 @@ int RBKI::call( T* tau = ( T * ) calloc( k, sizeof( T ) ); // T* R = ( T * ) calloc( n * n, sizeof( T ) ); - T* S = ( T * ) calloc( n * n, sizeof( T ) ); + T* S = ( T * ) calloc( (n + k) * n, sizeof( T ) ); // Pointers allocation // This will be offset by n * k at every even iteration. @@ -106,19 +106,23 @@ int RBKI::call( T* S_i = S; T* S_ii = &S[k]; + T* U_hat = NULL; + T* VT_hat = NULL; + // Pre-conpute Fro norm of an input matrix. T norm_A = lapack::lange(Norm::Fro, m, n, A, lda); T sq_tol = std::pow(this->tol, 2); + T threshold = std::sqrt(1 - sq_tol) * norm_A; // Generate a dense Gaussian random matrx. RandBLAS::DenseDist D(n, k); state = RandBLAS::fill_dense(D, Y_i, state).second; char name [] = "A input"; - RandBLAS::util::print_colmaj(m, n, A, name); + //RandBLAS::util::print_colmaj(m, n, A, name); char name1 [] = "Y sketching"; - RandBLAS::util::print_colmaj(n, k, Y_i, name1); + //RandBLAS::util::print_colmaj(n, k, Y_i, name1); char name2 [] = "Y_od"; char name3 [] = "R"; @@ -143,62 +147,48 @@ int RBKI::call( // Y_i = A' * X_i blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, k, m, 1.0, A, m, X_i, m, 0.0, Y_i, n); - if (iter == 2) { - printf("PROBLEM CASE START\n"); - - //RandBLAS::util::print_colmaj(m, n, A, name); - RandBLAS::util::print_colmaj(m, k, X_i, name7); - - char name8 [] = "Y_i = A' * X_i"; - RandBLAS::util::print_colmaj(n, k, Y_i, name8); - } - - // Move the X_i pointer; X_i = &X_i[m * k]; if (iter != 0) { - // R_i = Y_od' * Y_i - blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_ev * k, k, n, 1.0, Y_od, n, Y_i, n, 0.0, R_i, n); - - if (iter == 2) { - char name9 [] = "R_i = Y_od' * Y_i"; - RandBLAS::util::print_colmaj(n, n, R, name9); - } + // R_i' = Y_i' * Y_od + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, k, iter_ev * k, n, 1.0, Y_i, n, Y_od, n, 0.0, R_i, n); // Y_i = Y_i - Y_od * R_i - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, n, k, iter_ev * k, -1.0, Y_od, n, R_i, n, 1.0, Y_i, n); + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, R_i, n, 1.0, Y_i, n); } - - RandBLAS::util::print_colmaj(n, k, Y_i, name6); + //RandBLAS::util::print_colmaj(n, k, Y_i, name6); // [Y_i, R_ii] = qr(Y_i, 0) std::fill(&tau[0], &tau[k], 0.0); lapack::geqrf(n, k, Y_i, n, tau); - // Early termination - // if (abs(R(end)) <= sqrt(eps('double'))) - if(std::abs(Y_i[n * k - 1]) < std::sqrt(std::numeric_limits::epsilon())) - break; + // Copy R_ii over to R's space under R_i (offset down by iter_ev * k) + #pragma omp parallel for + for(i = 0; i < k; ++i) + blas::copy(i + 1, &Y_i[i * n], 1, &R_ii[i], n); - // Copy R_ii over to R's space under R_i (offset down by iter * k) - lapack::lacpy(MatrixType::Upper, k, k, Y_i, n, R_ii, n); // Convert Y_i into an explicit form. It is now stored in Y_odd as it should be lapack::ungqr(n, k, k, Y_i, n, tau); - + //RandBLAS::util::print_colmaj(n, m, Y_od, name2); - RandBLAS::util::print_colmaj(n, n, R, name3); + //RandBLAS::util::print_colmaj(n, n, R, name3); + + // Early termination + // if (abs(R(end)) <= sqrt(eps('double'))) + if(std::abs(R_ii[n + k - 1]) < std::sqrt(std::numeric_limits::epsilon())) + { + printf("TERMINATION 1\n"); + break; + } // Advance R pointers - iter == 0 ? R_i = &R_ii[n * k] : R_i = &R_i[n * k]; + iter == 0 ? R_i = &R_ii[k] : R_i = &R_i[k]; R_ii = &R_ii[(n + 1) * k]; // Advance even iteration count; ++iter_ev; - - if(iter == 4) - return 0; } else { // X_i = A * Y_i @@ -208,10 +198,10 @@ int RBKI::call( Y_i = &Y_i[n * k]; // S_i = X_ev' * X_i - blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, S_i, n); + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, S_i, n + k); //X_i = X_i - X_ev * S_i; - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, S_i, n, 1.0, X_i, m); + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, S_i, n + k, 1.0, X_i, m); RandBLAS::util::print_colmaj(m, k, X_i, name7); @@ -219,40 +209,83 @@ int RBKI::call( std::fill(&tau[0], &tau[k], 0.0); lapack::geqrf(m, k, X_i, m, tau); - // Early termination - // if (abs(S(end)) <= sqrt(eps('double'))) - if(std::abs(X_i[m * k - 1]) < std::sqrt(std::numeric_limits::epsilon())) - break; - - // Copy S_ii over to S's space under S_i (offset down by iter * k) - lapack::lacpy(MatrixType::Upper, k, k, X_i, m, S_ii, n); + // Copy S_ii over to S's space under S_i (offset down by iter_od * k) + lapack::lacpy(MatrixType::Upper, k, k, X_i, m, S_ii, n + k); // Convert X_i into an explicit form. It is now stored in X_ev as it should be lapack::ungqr(m, k, k, X_i, m, tau); - RandBLAS::util::print_colmaj(m, m, X_ev, name4); - RandBLAS::util::print_colmaj(n, n, S, name5); + //RandBLAS::util::print_colmaj(m, m, X_ev, name4); + //RandBLAS::util::print_colmaj(n + k, n, S, name5); + + // Early termination + // if (abs(S(end)) <= sqrt(eps('double'))) + if(std::abs(S_ii[n + k + k - 1]) < std::sqrt(std::numeric_limits::epsilon())) + { + printf("TERMINATION 2\n"); + break; + } // Advance R pointers - S_i = &S_i[n * k]; - S_ii = &S_ii[(n + 1) * k]; + S_i = &S_i[(n + k) * k]; + S_ii = &S_ii[((n + k) + 1) * k]; // Advance odd iteration count; ++iter_od; } ++iter; norm_R = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, n, n, R, n); - printf("norm_R: %e\n", norm_R); + //printf("norm_R: %e\n", norm_R); + + //norm(R, 'fro') > sqrt(1 - sq_tol) * norm_A + if(norm_R > threshold) + { + printf("TERMINATION 3\n"); + break; + } } - /* + + iter % 2 == 0 ? end_rows = k * (iter_ev + 1), end_cols = k * iter_ev : end_rows = k * (iter_od + 1), end_cols = k * iter_od; + + U_hat = ( T * ) calloc( end_rows * end_cols, sizeof( T ) ); + VT_hat = ( T * ) calloc( end_cols * end_cols, sizeof( T ) ); + if (iter % 2 == 0) { - lapack::gesdd(Job::SomeVec, n, iter * k, R, n, Sigma, U_hat, n, VT_hat, n); + // [U_hat, Sigma, V_hat] = svd(R') + lapack::gesdd(Job::SomeVec, end_rows, end_cols, R, n, Sigma, U_hat, end_rows, VT_hat, end_cols); + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); + // V = Y_od * V_hat + // We actually perform VT = V_hat' * Y_odd' + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); + + } else { + + // [U_hat, Sigma, V_hat] = svd(S) + lapack::gesdd(Job::SomeVec, end_rows, end_cols, S, n + k, Sigma, U_hat, end_rows, VT_hat, end_cols); + // U = X_ev * U_hat + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); + // V = Y_od * V_hat + // We actually perform VT = V_hat' * Y_odd' + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); + } - } else { - lapack::gesdd(Job::SomeVec, iter * k, iter * k, S, n, Sigma, U_hat, n, VT_hat, n); - //blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter * k, k, m, 1.0, X_ev, m, U_hat, m, 0.0, &S[n * iter * k], n); - //blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter * k, k, m, 1.0, Y_od, m, V_hat, m, 0.0, &S[n * iter * k], n); + //RandBLAS::util::print_colmaj(m, m, X_ev, name4); + //char name10 [] = "U_hat"; + //RandBLAS::util::print_colmaj(end_rows, end_cols, U_hat, name10); + //RandBLAS::util::print_colmaj(n, m, Y_od, name2); + char name11 [] = "VT_hat"; + //RandBLAS::util::print_colmaj(end_cols, end_cols, VT_hat, name11); + + //char name12 [] = "U"; + //RandBLAS::util::print_colmaj(m, end_cols, U, name12); + char name13 [] = "VT"; + //RandBLAS::util::print_colmaj(n, n, VT, name13); + + + + for(int j = 0; j < end_cols; ++j) { + printf("%e\n", *(Sigma +j)); } - */ + return 0; } } // end namespace RandLAPACK diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc index 81814580..0fc30201 100644 --- a/test/drivers/test_rbki.cc +++ b/test/drivers/test_rbki.cc @@ -26,13 +26,15 @@ class TestRBKI : public ::testing::Test std::vector V; std::vector Sigma; std::vector A_cpy; + std::vector Sigma_exact; RBKITestData(int64_t m, int64_t n, int64_t k) : A(m * n, 0.0), U(m * n, 0.0), V(n * n, 0.0), Sigma(n, 0.0), - A_cpy(m * n, 0.0) + A_cpy(m * n, 0.0), + Sigma_exact(n, 0.0) { row = m; col = n; @@ -61,6 +63,20 @@ class TestRBKI : public ::testing::Test auto k = all_data.rank; RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); + // Compute singular values via a deterministic method + lapack::gesdd(Job::NoVec, m, n, all_data.A_cpy.data(), m, all_data.Sigma_exact.data(), NULL, m, NULL, n); + + // Find diff between singular values computed by two methods + int cnt = -1; + std::for_each(all_data.Sigma.data(), all_data.Sigma.data() + n, + // Lambda expression begins + [&cnt, &all_data](T &entry) { + entry -= all_data.Sigma_exact[++cnt]; + } + ); + T norm = blas::nrm2(n, all_data.Sigma.data(), 1); + printf("||A_svd - A_rbki||_F: %e\n", norm); + } }; From bf2ccb1bfe85bdd6f54bc8356d37c8fc0009a01c Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 6 Nov 2023 09:50:02 -0800 Subject: [PATCH 04/56] Seems to be working --- RandLAPACK/drivers/rl_rbki.hh | 61 ++++++++++++++++++----------------- test/drivers/test_rbki.cc | 15 ++++++--- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index a8f9f717..d501c384 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -82,30 +82,37 @@ int RBKI::call( ){ int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0; T norm_R = 0; + int64_t space_rows = k * std::ceil(m / (T) k); + printf("%ld\n", space_rows); - // Space for Y_i and Y_odd. (maybe needs to be n by m + k) + // We need a full copy of X and Y all the way through the algorithm + // due to an operation with X_odd and Y_odd happening at the end. + // Space for Y_i and Y_odd. T* Y = ( T * ) calloc( n * m, sizeof( T ) ); // Space for X_i and X_ev. (maybe needs to be m by m + k) - T* X = ( T * ) calloc( m * m, sizeof( T ) ); + T* X = ( T * ) calloc( m * (m + k), sizeof( T ) ); // tau space for QR T* tau = ( T * ) calloc( k, sizeof( T ) ); - // + // While R and S matrices are structured (both band), we cannot make use of this structure through + // BLAS-level functions. + // Note also that we store a transposed version of R. T* R = ( T * ) calloc( n * n, sizeof( T ) ); T* S = ( T * ) calloc( (n + k) * n, sizeof( T ) ); // Pointers allocation - // This will be offset by n * k at every even iteration. + // Below pointers will be offset by (n or m) * k at every even iteration. T* Y_i = Y; - // This stays the same throughout execution. + T* X_i = X; + // Below pointers stay the same throughout the alg. T* Y_od = Y; + T* Y_od = Y; + T* X_ev = X; + // S and S pointers are offset at every step. T* R_i = NULL; T* R_ii = R; - - T* X_i = X; //&X_ev[m * k]; - T* X_ev = X; T* S_i = S; T* S_ii = &S[k]; - + // Pre-decloration of SVD-related buffers. T* U_hat = NULL; T* VT_hat = NULL; @@ -135,7 +142,7 @@ int RBKI::call( // [X_ev, ~] = qr(A * Y_i, 0) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); lapack::geqrf(m, k, X_i, m, tau); - // Convert X_i into an explicit form. It is now stored in X_ev as it should be + // Convert X_i into an explicit form. It is now stored in X_ev as it should be. lapack::ungqr(m, k, k, X_i, m, tau); // Advance odd iteration count; @@ -153,8 +160,7 @@ int RBKI::call( if (iter != 0) { // R_i' = Y_i' * Y_od blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, k, iter_ev * k, n, 1.0, Y_i, n, Y_od, n, 0.0, R_i, n); - - // Y_i = Y_i - Y_od * R_i + // Y_i = Y_i - Y_od * R_i blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, R_i, n, 1.0, Y_i, n); } @@ -164,12 +170,12 @@ int RBKI::call( std::fill(&tau[0], &tau[k], 0.0); lapack::geqrf(n, k, Y_i, n, tau); - // Copy R_ii over to R's space under R_i (offset down by iter_ev * k) + // Copy R_ii over to R's (in transposed format). #pragma omp parallel for for(i = 0; i < k; ++i) blas::copy(i + 1, &Y_i[i * n], 1, &R_ii[i], n); - // Convert Y_i into an explicit form. It is now stored in Y_odd as it should be + // Convert Y_i into an explicit form. It is now stored in Y_odd as it should be. lapack::ungqr(n, k, k, Y_i, n, tau); //RandBLAS::util::print_colmaj(n, m, Y_od, name2); @@ -177,9 +183,9 @@ int RBKI::call( // Early termination // if (abs(R(end)) <= sqrt(eps('double'))) - if(std::abs(R_ii[n + k - 1]) < std::sqrt(std::numeric_limits::epsilon())) + if(std::abs(R_ii[(n + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { - printf("TERMINATION 1\n"); + printf("TERMINATION 1 at iteration %d\n", iter_ev); break; } @@ -196,14 +202,13 @@ int RBKI::call( // Move the X_i pointer; Y_i = &Y_i[n * k]; - + // S_i = X_ev' * X_i blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, S_i, n + k); - //X_i = X_i - X_ev * S_i; blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, S_i, n + k, 1.0, X_i, m); - RandBLAS::util::print_colmaj(m, k, X_i, name7); + //RandBLAS::util::print_colmaj(m, k, X_i, name7); // [X_i, S_ii] = qr(X_i, 0); std::fill(&tau[0], &tau[k], 0.0); @@ -214,14 +219,14 @@ int RBKI::call( // Convert X_i into an explicit form. It is now stored in X_ev as it should be lapack::ungqr(m, k, k, X_i, m, tau); - //RandBLAS::util::print_colmaj(m, m, X_ev, name4); + //RandBLAS::util::print_colmaj(m, m + k, X_ev, name4); //RandBLAS::util::print_colmaj(n + k, n, S, name5); // Early termination // if (abs(S(end)) <= sqrt(eps('double'))) - if(std::abs(S_ii[n + k + k - 1]) < std::sqrt(std::numeric_limits::epsilon())) + if(std::abs(S_ii[((n + k) + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { - printf("TERMINATION 2\n"); + printf("TERMINATION 2 at iteration %d\n", iter_od); break; } @@ -249,6 +254,8 @@ int RBKI::call( U_hat = ( T * ) calloc( end_rows * end_cols, sizeof( T ) ); VT_hat = ( T * ) calloc( end_cols * end_cols, sizeof( T ) ); + printf("rows: %ld, cols: %ld\n", end_rows, end_cols); + if (iter % 2 == 0) { // [U_hat, Sigma, V_hat] = svd(R') lapack::gesdd(Job::SomeVec, end_rows, end_cols, R, n, Sigma, U_hat, end_rows, VT_hat, end_cols); @@ -262,10 +269,10 @@ int RBKI::call( // [U_hat, Sigma, V_hat] = svd(S) lapack::gesdd(Job::SomeVec, end_rows, end_cols, S, n + k, Sigma, U_hat, end_rows, VT_hat, end_cols); // U = X_ev * U_hat - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); + //blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); // V = Y_od * V_hat // We actually perform VT = V_hat' * Y_odd' - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); + //blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); } //RandBLAS::util::print_colmaj(m, m, X_ev, name4); @@ -279,12 +286,6 @@ int RBKI::call( //RandBLAS::util::print_colmaj(m, end_cols, U, name12); char name13 [] = "VT"; //RandBLAS::util::print_colmaj(n, n, VT, name13); - - - - for(int j = 0; j < end_cols; ++j) { - printf("%e\n", *(Sigma +j)); - } return 0; } diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc index 0fc30201..536db2f6 100644 --- a/test/drivers/test_rbki.cc +++ b/test/drivers/test_rbki.cc @@ -68,13 +68,18 @@ class TestRBKI : public ::testing::Test // Find diff between singular values computed by two methods int cnt = -1; - std::for_each(all_data.Sigma.data(), all_data.Sigma.data() + n, + + for(int i = 0; i < n; ++i) { + //printf("%e, %e\n", all_data.Sigma[i], all_data.Sigma_exact[i]); + } + + std::for_each(all_data.Sigma.data(), all_data.Sigma.data() + k, // Lambda expression begins [&cnt, &all_data](T &entry) { entry -= all_data.Sigma_exact[++cnt]; } ); - T norm = blas::nrm2(n, all_data.Sigma.data(), 1); + T norm = blas::nrm2(k, all_data.Sigma.data(), 1); printf("||A_svd - A_rbki||_F: %e\n", norm); } @@ -82,9 +87,9 @@ class TestRBKI : public ::testing::Test // Note: If Subprocess killed exception -> reload vscode TEST_F(TestRBKI, RBKI_basic) { - int64_t m = 10; - int64_t n = 8; - int64_t k = 2; + int64_t m = 4000; + int64_t n = 200; + int64_t k = 100; double norm_A = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); From 4bd772beeb086b765357b61bb5bc8759e1ddbe8b Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 6 Nov 2023 10:30:21 -0800 Subject: [PATCH 05/56] Cleanup --- RandLAPACK/drivers/rl_rbki.hh | 57 +++++------------------------------ test/drivers/test_rbki.cc | 6 ++-- 2 files changed, 11 insertions(+), 52 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index d501c384..2db62d7f 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -105,7 +105,6 @@ int RBKI::call( T* X_i = X; // Below pointers stay the same throughout the alg. T* Y_od = Y; - T* Y_od = Y; T* X_ev = X; // S and S pointers are offset at every step. T* R_i = NULL; @@ -126,18 +125,10 @@ int RBKI::call( state = RandBLAS::fill_dense(D, Y_i, state).second; char name [] = "A input"; - //RandBLAS::util::print_colmaj(m, n, A, name); + RandBLAS::util::print_colmaj(m, n, A, name); char name1 [] = "Y sketching"; - //RandBLAS::util::print_colmaj(n, k, Y_i, name1); - char name2 [] = "Y_od"; - char name3 [] = "R"; - - char name4 [] = "X_ev"; - char name5 [] = "S"; - - char name6 [] = "Y_i"; - char name7 [] = "X_i"; + RandBLAS::util::print_colmaj(n, k, Y_i, name1); // [X_ev, ~] = qr(A * Y_i, 0) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); @@ -164,8 +155,6 @@ int RBKI::call( blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, R_i, n, 1.0, Y_i, n); } - //RandBLAS::util::print_colmaj(n, k, Y_i, name6); - // [Y_i, R_ii] = qr(Y_i, 0) std::fill(&tau[0], &tau[k], 0.0); lapack::geqrf(n, k, Y_i, n, tau); @@ -178,9 +167,6 @@ int RBKI::call( // Convert Y_i into an explicit form. It is now stored in Y_odd as it should be. lapack::ungqr(n, k, k, Y_i, n, tau); - //RandBLAS::util::print_colmaj(n, m, Y_od, name2); - //RandBLAS::util::print_colmaj(n, n, R, name3); - // Early termination // if (abs(R(end)) <= sqrt(eps('double'))) if(std::abs(R_ii[(n + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) @@ -208,8 +194,6 @@ int RBKI::call( //X_i = X_i - X_ev * S_i; blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, S_i, n + k, 1.0, X_i, m); - //RandBLAS::util::print_colmaj(m, k, X_i, name7); - // [X_i, S_ii] = qr(X_i, 0); std::fill(&tau[0], &tau[k], 0.0); lapack::geqrf(m, k, X_i, m, tau); @@ -219,9 +203,6 @@ int RBKI::call( // Convert X_i into an explicit form. It is now stored in X_ev as it should be lapack::ungqr(m, k, k, X_i, m, tau); - //RandBLAS::util::print_colmaj(m, m + k, X_ev, name4); - //RandBLAS::util::print_colmaj(n + k, n, S, name5); - // Early termination // if (abs(S(end)) <= sqrt(eps('double'))) if(std::abs(S_ii[((n + k) + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) @@ -233,18 +214,14 @@ int RBKI::call( // Advance R pointers S_i = &S_i[(n + k) * k]; S_ii = &S_ii[((n + k) + 1) * k]; - // Advance odd iteration count; ++iter_od; } ++iter; norm_R = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, n, n, R, n); - //printf("norm_R: %e\n", norm_R); //norm(R, 'fro') > sqrt(1 - sq_tol) * norm_A - if(norm_R > threshold) - { - printf("TERMINATION 3\n"); + if(norm_R > threshold) { break; } } @@ -259,33 +236,15 @@ int RBKI::call( if (iter % 2 == 0) { // [U_hat, Sigma, V_hat] = svd(R') lapack::gesdd(Job::SomeVec, end_rows, end_cols, R, n, Sigma, U_hat, end_rows, VT_hat, end_cols); - blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); - // V = Y_od * V_hat - // We actually perform VT = V_hat' * Y_odd' - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); - } else { - // [U_hat, Sigma, V_hat] = svd(S) lapack::gesdd(Job::SomeVec, end_rows, end_cols, S, n + k, Sigma, U_hat, end_rows, VT_hat, end_cols); - // U = X_ev * U_hat - //blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); - // V = Y_od * V_hat - // We actually perform VT = V_hat' * Y_odd' - //blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); } - - //RandBLAS::util::print_colmaj(m, m, X_ev, name4); - //char name10 [] = "U_hat"; - //RandBLAS::util::print_colmaj(end_rows, end_cols, U_hat, name10); - //RandBLAS::util::print_colmaj(n, m, Y_od, name2); - char name11 [] = "VT_hat"; - //RandBLAS::util::print_colmaj(end_cols, end_cols, VT_hat, name11); - - //char name12 [] = "U"; - //RandBLAS::util::print_colmaj(m, end_cols, U, name12); - char name13 [] = "VT"; - //RandBLAS::util::print_colmaj(n, n, VT, name13); + // U = X_ev * U_hat + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); + // V = Y_od * V_hat + // We actually perform VT = V_hat' * Y_odd' + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); return 0; } diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc index 536db2f6..d0e0b848 100644 --- a/test/drivers/test_rbki.cc +++ b/test/drivers/test_rbki.cc @@ -87,9 +87,9 @@ class TestRBKI : public ::testing::Test // Note: If Subprocess killed exception -> reload vscode TEST_F(TestRBKI, RBKI_basic) { - int64_t m = 4000; - int64_t n = 200; - int64_t k = 100; + int64_t m = 10; + int64_t n = 8; + int64_t k = 4; double norm_A = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); From 596425860ca8a0b73961324a2d3610c085616f8c Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 6 Nov 2023 11:49:10 -0800 Subject: [PATCH 06/56] Cleanup --- RandLAPACK/drivers/rl_rbki.hh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 2db62d7f..2c1ac89d 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -124,12 +124,6 @@ int RBKI::call( RandBLAS::DenseDist D(n, k); state = RandBLAS::fill_dense(D, Y_i, state).second; - char name [] = "A input"; - RandBLAS::util::print_colmaj(m, n, A, name); - - char name1 [] = "Y sketching"; - RandBLAS::util::print_colmaj(n, k, Y_i, name1); - // [X_ev, ~] = qr(A * Y_i, 0) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); lapack::geqrf(m, k, X_i, m, tau); @@ -231,7 +225,7 @@ int RBKI::call( U_hat = ( T * ) calloc( end_rows * end_cols, sizeof( T ) ); VT_hat = ( T * ) calloc( end_cols * end_cols, sizeof( T ) ); - printf("rows: %ld, cols: %ld\n", end_rows, end_cols); + //printf("rows: %ld, cols: %ld\n", end_rows, end_cols); if (iter % 2 == 0) { // [U_hat, Sigma, V_hat] = svd(R') From e3cc36f87f05e30a5152eea771ca245549991271 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 7 Nov 2023 05:59:00 -0800 Subject: [PATCH 07/56] Update --- RandLAPACK/drivers/rl_rbki.hh | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 2c1ac89d..eb37e53d 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -19,9 +19,7 @@ namespace RandLAPACK { template class RBKIalg { public: - virtual ~RBKIalg() {} - virtual int call( int64_t m, int64_t n, @@ -38,7 +36,6 @@ class RBKIalg { template class RBKI : public RBKIalg { public: - RBKI( bool verb, bool time_subroutines, @@ -48,7 +45,6 @@ class RBKI : public RBKIalg { timing = time_subroutines; tol = ep; } - int call( int64_t m, int64_t n, @@ -60,7 +56,6 @@ class RBKI : public RBKIalg { T* Sigma, RandBLAS::RNGState &state ) override; - public: bool verbosity; bool timing; @@ -163,8 +158,7 @@ int RBKI::call( // Early termination // if (abs(R(end)) <= sqrt(eps('double'))) - if(std::abs(R_ii[(n + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) - { + if(std::abs(R_ii[(n + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { printf("TERMINATION 1 at iteration %d\n", iter_ev); break; } @@ -199,8 +193,7 @@ int RBKI::call( // Early termination // if (abs(S(end)) <= sqrt(eps('double'))) - if(std::abs(S_ii[((n + k) + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) - { + if(std::abs(S_ii[((n + k) + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { printf("TERMINATION 2 at iteration %d\n", iter_od); break; } From 78cca0b1a468df13fa3068d157272cbde4b77f91 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 13 Nov 2023 13:30:56 -0800 Subject: [PATCH 08/56] Update --- .../bench_RBKI/RBKI_speed_comparisons.cc | 198 ++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 benchmark/bench_RBKI/RBKI_speed_comparisons.cc diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc new file mode 100644 index 00000000..0d29797b --- /dev/null +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -0,0 +1,198 @@ +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_gen.hh" + +#include +#include + +template +struct QR_benchmark_data { + int64_t row; + int64_t col; + T tolerance; + T sampling_factor; + std::vector A; + std::vector R; + std::vector tau; + std::vector J; + + QR_benchmark_data(int64_t m, int64_t n, T tol, T d_factor) : + A(m * n, 0.0), + R(n * n, 0.0), + tau(n, 0.0), + J(n, 0) + { + row = m; + col = n; + tolerance = tol; + sampling_factor = d_factor; + } +}; + +// Re-generate and clear data +template +static void data_regen(RandLAPACK::gen::mat_gen_info m_info, + QR_benchmark_data &all_data, + RandBLAS::RNGState &state) { + + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + std::fill(all_data.R.begin(), all_data.R.end(), 0.0); + std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); + std::fill(all_data.J.begin(), all_data.J.end(), 0); +} + +template +static std::vector call_all_algs( + RandLAPACK::gen::mat_gen_info m_info, + int64_t numruns, + int64_t n, + QR_benchmark_data &all_data, + RandBLAS::RNGState &state) { + + auto m = all_data.row; + auto tol = all_data.tolerance; + auto d_factor = all_data.sampling_factor; + + // Additional params setup. + RandLAPACK::CQRRPT CQRRPT(true, true, tol); + CQRRPT.nnz = 4; + CQRRPT.num_threads = 48; + + // timing vars + long dur_cqrrpt = 0; + long dur_geqp3 = 0; + long dur_geqr = 0; + long dur_geqpt = 0; + long dur_geqrf = 0; + long dur_scholqr = 0; + long t_cqrrpt_best = 0; + long t_geqp3_best = 0; + long t_geqr_best = 0; + long t_geqpt_best = 0; + long t_geqrf_best = 0; + long t_scholqr_best = 0; + + // Making sure the states are unchanged + auto state_gen = state; + auto state_alg = state; + + for (int i = 0; i < numruns; ++i) { + printf("Iteration %d start.\n", i); + // Testing GEQP3 + auto start_geqp3 = high_resolution_clock::now(); + lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); + auto stop_geqp3 = high_resolution_clock::now(); + dur_geqp3 = duration_cast(stop_geqp3 - start_geqp3).count(); + + state_gen = state; + data_regen(m_info, all_data, state_gen); + + // Testing GEQRF + auto start_geqrf = high_resolution_clock::now(); + lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); + auto stop_geqrf = high_resolution_clock::now(); + dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); + + state_gen = state; + data_regen(m_info, all_data, state_gen); + + // Testing CQRRPT + auto start_cqrrp = high_resolution_clock::now(); + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg); + auto stop_cqrrp = high_resolution_clock::now(); + dur_cqrrpt = duration_cast(stop_cqrrp - start_cqrrp).count(); + + state_gen = state; + state_alg = state; + data_regen(m_info, all_data, state_gen); + + // Testing SCHOLQR3 + auto start_scholqr = high_resolution_clock::now(); + //--------------------------------------------------------------------------------------------------------------------------// + T norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); + T shift = 11 * std::numeric_limits::epsilon() * n * std::pow(norm_A, 2); + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, 1.0, all_data.A.data(), m, 0.0, all_data.R.data(), n); + for (int i = 0; i < n; ++i) + all_data.R[i * (n + 1)] += shift; + lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); + // CholeskyQR2 + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, 1.0, all_data.A.data(), m, 0.0, all_data.R.data(), n); + lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); + // CholeskyQR3 + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, 1.0, all_data.A.data(), m, 0.0, all_data.R.data(), n); + lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); + //--------------------------------------------------------------------------------------------------------------------------// + auto stop_scholqr = high_resolution_clock::now(); + dur_scholqr = duration_cast(stop_scholqr - start_scholqr).count(); + + auto state_gen = state; + data_regen(m_info, all_data, state_gen); + + // Testing GEQR + GEQPT + auto start_geqpt = high_resolution_clock::now(); + auto start_geqr = high_resolution_clock::now(); +#if !defined(__APPLE__) + // GEQR(A) part + lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), -1); + int64_t tsize = (int64_t) all_data.tau[0]; + all_data.tau.resize(tsize); + lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize); +#endif + auto stop_geqr = high_resolution_clock::now(); + dur_geqr = duration_cast(stop_geqr - start_geqr).count(); +#if !defined(__APPLE__) + // GEQP3(R) part + lapack::lacpy(MatrixType::Upper, n, n, all_data.A.data(), m, all_data.R.data(), n); + lapack::geqp3(n, n, all_data.R.data(), n, all_data.J.data(), all_data.tau.data()); +#endif + auto stop_geqpt = high_resolution_clock::now(); + dur_geqpt = duration_cast(stop_geqpt - start_geqpt).count(); + + state_gen = state; + data_regen(m_info, all_data, state_gen); + + i == 0 ? t_cqrrpt_best = dur_cqrrpt : (dur_cqrrpt < t_cqrrpt_best) ? t_cqrrpt_best = dur_cqrrpt : NULL; + i == 0 ? t_geqpt_best = dur_geqpt : (dur_geqpt < t_geqpt_best) ? t_geqpt_best = dur_geqpt : NULL; + } + + std::vector res{t_cqrrpt_best, t_geqpt_best}; + + return res; +} + +int main() { + // Declare parameters + int64_t m = std::pow(2, 17); + int64_t n_start = std::pow(2, 9); + int64_t n_stop = std::pow(2, 13); + double d_factor = 1.25; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + auto state_constant = state; + // Timing results + std::vector res; + // Number of algorithm runs. We only record best times. + int64_t numruns = 1; + + // Allocate basic workspace + QR_benchmark_data all_data(m, n_stop, tol, d_factor); + // Generate the input matrix - gaussian suffices for performance tests. + RandLAPACK::gen::mat_gen_info m_info(m, n_stop, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + + // Declare a data file + std::fstream file("RBKI_speed_comp_" + std::to_string(m) + + "_col_start_" + std::to_string(n_start) + + "_col_stop_" + std::to_string(n_stop) + + "_d_factor_" + std::to_string(d_factor) + + ".dat", std::fstream::app); + + for (;n_start <= n_stop; n_start *= 2) { + res = call_all_algs(m_info, numruns, n_start, all_data, state_constant); + file << res[0] << ", " << res[1] << ", " << res[2] << ", " << res[3] << ", " << res[4] << ", " << res[5] << ",\n"; + } +} \ No newline at end of file From d5b4d2cd48fdaf93962b6666ac352982e7989879 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Fri, 17 Nov 2023 06:38:18 -0800 Subject: [PATCH 09/56] Benchmark update --- RandLAPACK/drivers/rl_rbki.hh | 24 ++- .../bench_RBKI/RBKI_speed_comparisons.cc | 182 ++++++------------ test/drivers/test_rbki.cc | 6 +- 3 files changed, 80 insertions(+), 132 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index eb37e53d..71f8bd53 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -60,6 +60,7 @@ class RBKI : public RBKIalg { bool verbosity; bool timing; T tol; + int num_krylov_iters; }; // ----------------------------------------------------------------------------- @@ -83,16 +84,16 @@ int RBKI::call( // We need a full copy of X and Y all the way through the algorithm // due to an operation with X_odd and Y_odd happening at the end. // Space for Y_i and Y_odd. - T* Y = ( T * ) calloc( n * m, sizeof( T ) ); + T* Y = ( T * ) calloc( n * m, sizeof( T ) ); // Space for X_i and X_ev. (maybe needs to be m by m + k) - T* X = ( T * ) calloc( m * (m + k), sizeof( T ) ); + T* X = ( T * ) calloc( m * (m + k), sizeof( T ) ); // tau space for QR T* tau = ( T * ) calloc( k, sizeof( T ) ); // While R and S matrices are structured (both band), we cannot make use of this structure through // BLAS-level functions. // Note also that we store a transposed version of R. - T* R = ( T * ) calloc( n * n, sizeof( T ) ); - T* S = ( T * ) calloc( (n + k) * n, sizeof( T ) ); + T* R = ( T * ) calloc( n * n, sizeof( T ) ); + T* S = ( T * ) calloc( (n + k) * n, sizeof( T ) ); // Pointers allocation // Below pointers will be offset by (n or m) * k at every even iteration. @@ -159,7 +160,7 @@ int RBKI::call( // Early termination // if (abs(R(end)) <= sqrt(eps('double'))) if(std::abs(R_ii[(n + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { - printf("TERMINATION 1 at iteration %d\n", iter_ev); + printf("TERMINATION 1 at iteration %ld\n", iter_ev); break; } @@ -194,7 +195,7 @@ int RBKI::call( // Early termination // if (abs(S(end)) <= sqrt(eps('double'))) if(std::abs(S_ii[((n + k) + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { - printf("TERMINATION 2 at iteration %d\n", iter_od); + printf("TERMINATION 2 at iteration %ld\n", iter_od); break; } @@ -213,9 +214,10 @@ int RBKI::call( } } + this->num_krylov_iters = iter; iter % 2 == 0 ? end_rows = k * (iter_ev + 1), end_cols = k * iter_ev : end_rows = k * (iter_od + 1), end_cols = k * iter_od; - U_hat = ( T * ) calloc( end_rows * end_cols, sizeof( T ) ); + U_hat = ( T * ) calloc( end_rows * end_cols, sizeof( T ) ); VT_hat = ( T * ) calloc( end_cols * end_cols, sizeof( T ) ); //printf("rows: %ld, cols: %ld\n", end_rows, end_cols); @@ -233,6 +235,14 @@ int RBKI::call( // We actually perform VT = V_hat' * Y_odd' blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); + free(Y); + free(X); + free(tau); + free(R); + free(S); + free(U_hat); + free(VT_hat); + return 0; } } // end namespace RandLAPACK diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 0d29797b..40a02b7d 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -7,192 +7,130 @@ #include template -struct QR_benchmark_data { +struct RBKI_benchmark_data { int64_t row; int64_t col; - T tolerance; - T sampling_factor; + int64_t rank; // has to be modifiable + T tolerance; std::vector A; - std::vector R; - std::vector tau; - std::vector J; + std::vector U; + std::vector V; + std::vector Sigma; + std::vector A_cpy; + std::vector Sigma_exact; - QR_benchmark_data(int64_t m, int64_t n, T tol, T d_factor) : + RBKI_benchmark_data(int64_t m, int64_t n, int64_t k, T tol) : A(m * n, 0.0), - R(n * n, 0.0), - tau(n, 0.0), - J(n, 0) + U(m * n, 0.0), + V(n * n, 0.0), + Sigma(n, 0.0), + A_cpy(m * n, 0.0), + Sigma_exact(n, 0.0) { - row = m; - col = n; - tolerance = tol; - sampling_factor = d_factor; + row = m; + col = n; + rank = k; + tolerance = tol; } }; // Re-generate and clear data template static void data_regen(RandLAPACK::gen::mat_gen_info m_info, - QR_benchmark_data &all_data, + RBKI_benchmark_data &all_data, RandBLAS::RNGState &state) { RandLAPACK::gen::mat_gen(m_info, all_data.A, state); - std::fill(all_data.R.begin(), all_data.R.end(), 0.0); - std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); - std::fill(all_data.J.begin(), all_data.J.end(), 0); + std::fill(all_data.U.begin(), all_data.U.end(), 0.0); + std::fill(all_data.V.begin(), all_data.V.end(), 0.0); + std::fill(all_data.Sigma.begin(), all_data.Sigma.end(), 0.0); } template static std::vector call_all_algs( RandLAPACK::gen::mat_gen_info m_info, int64_t numruns, - int64_t n, - QR_benchmark_data &all_data, + int64_t k, + RBKI_benchmark_data &all_data, RandBLAS::RNGState &state) { auto m = all_data.row; + auto n = all_data.col; auto tol = all_data.tolerance; - auto d_factor = all_data.sampling_factor; // Additional params setup. - RandLAPACK::CQRRPT CQRRPT(true, true, tol); - CQRRPT.nnz = 4; - CQRRPT.num_threads = 48; + RandLAPACK::RBKI RBKI(false, false, tol); // timing vars - long dur_cqrrpt = 0; - long dur_geqp3 = 0; - long dur_geqr = 0; - long dur_geqpt = 0; - long dur_geqrf = 0; - long dur_scholqr = 0; - long t_cqrrpt_best = 0; - long t_geqp3_best = 0; - long t_geqr_best = 0; - long t_geqpt_best = 0; - long t_geqrf_best = 0; - long t_scholqr_best = 0; - + long dur_rbki = 0; + long dur_other = 0; + long t_rbki_best = 0; + long t_other_best = 0; + // Making sure the states are unchanged auto state_gen = state; auto state_alg = state; for (int i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); - // Testing GEQP3 - auto start_geqp3 = high_resolution_clock::now(); - lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); - auto stop_geqp3 = high_resolution_clock::now(); - dur_geqp3 = duration_cast(stop_geqp3 - start_geqp3).count(); + + // Testing RBKI + auto start_rbki = high_resolution_clock::now(); + RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); + auto stop_rbki = high_resolution_clock::now(); + dur_rbki = duration_cast(stop_rbki - start_rbki).count(); state_gen = state; data_regen(m_info, all_data, state_gen); - // Testing GEQRF - auto start_geqrf = high_resolution_clock::now(); - lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); - auto stop_geqrf = high_resolution_clock::now(); - dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); - - state_gen = state; - data_regen(m_info, all_data, state_gen); - - // Testing CQRRPT - auto start_cqrrp = high_resolution_clock::now(); - CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg); - auto stop_cqrrp = high_resolution_clock::now(); - dur_cqrrpt = duration_cast(stop_cqrrp - start_cqrrp).count(); - - state_gen = state; - state_alg = state; - data_regen(m_info, all_data, state_gen); - - // Testing SCHOLQR3 - auto start_scholqr = high_resolution_clock::now(); - //--------------------------------------------------------------------------------------------------------------------------// - T norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); - T shift = 11 * std::numeric_limits::epsilon() * n * std::pow(norm_A, 2); - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, 1.0, all_data.A.data(), m, 0.0, all_data.R.data(), n); - for (int i = 0; i < n; ++i) - all_data.R[i * (n + 1)] += shift; - lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); - // CholeskyQR2 - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, 1.0, all_data.A.data(), m, 0.0, all_data.R.data(), n); - lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); - // CholeskyQR3 - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, 1.0, all_data.A.data(), m, 0.0, all_data.R.data(), n); - lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); - //--------------------------------------------------------------------------------------------------------------------------// - auto stop_scholqr = high_resolution_clock::now(); - dur_scholqr = duration_cast(stop_scholqr - start_scholqr).count(); - - auto state_gen = state; - data_regen(m_info, all_data, state_gen); - - // Testing GEQR + GEQPT - auto start_geqpt = high_resolution_clock::now(); - auto start_geqr = high_resolution_clock::now(); -#if !defined(__APPLE__) - // GEQR(A) part - lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), -1); - int64_t tsize = (int64_t) all_data.tau[0]; - all_data.tau.resize(tsize); - lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize); -#endif - auto stop_geqr = high_resolution_clock::now(); - dur_geqr = duration_cast(stop_geqr - start_geqr).count(); -#if !defined(__APPLE__) - // GEQP3(R) part - lapack::lacpy(MatrixType::Upper, n, n, all_data.A.data(), m, all_data.R.data(), n); - lapack::geqp3(n, n, all_data.R.data(), n, all_data.J.data(), all_data.tau.data()); -#endif - auto stop_geqpt = high_resolution_clock::now(); - dur_geqpt = duration_cast(stop_geqpt - start_geqpt).count(); + // Testing Other + auto start_other = high_resolution_clock::now(); + /// RIVAL ALGORITHM CALL + auto stop_other = high_resolution_clock::now(); + dur_other = duration_cast(stop_other - start_other).count(); state_gen = state; data_regen(m_info, all_data, state_gen); - i == 0 ? t_cqrrpt_best = dur_cqrrpt : (dur_cqrrpt < t_cqrrpt_best) ? t_cqrrpt_best = dur_cqrrpt : NULL; - i == 0 ? t_geqpt_best = dur_geqpt : (dur_geqpt < t_geqpt_best) ? t_geqpt_best = dur_geqpt : NULL; + i == 0 ? t_rbki_best = dur_rbki : (dur_rbki < t_rbki_best) ? t_rbki_best = dur_rbki : NULL; + i == 0 ? t_other_best = dur_other : (dur_other < t_other_best) ? t_other_best = dur_other : NULL; } - std::vector res{t_cqrrpt_best, t_geqpt_best}; + std::vector res{t_rbki_best, t_other_best}; return res; } int main() { // Declare parameters - int64_t m = std::pow(2, 17); - int64_t n_start = std::pow(2, 9); - int64_t n_stop = std::pow(2, 13); - double d_factor = 1.25; + int64_t m = std::pow(10, 3); + int64_t n = std::pow(10, 3); + int64_t k_start = 100; + int64_t k_stop = 100; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; // Timing results std::vector res; // Number of algorithm runs. We only record best times. - int64_t numruns = 1; + int64_t numruns = 5; // Allocate basic workspace - QR_benchmark_data all_data(m, n_stop, tol, d_factor); + RBKI_benchmark_data all_data(m, n, k_stop, tol); + // Generate the input matrix - gaussian suffices for performance tests. - RandLAPACK::gen::mat_gen_info m_info(m, n_stop, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); RandLAPACK::gen::mat_gen(m_info, all_data.A, state); // Declare a data file - std::fstream file("RBKI_speed_comp_" + std::to_string(m) - + "_col_start_" + std::to_string(n_start) - + "_col_stop_" + std::to_string(n_stop) - + "_d_factor_" + std::to_string(d_factor) + std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) + + "_n_" + std::to_string(n) + + "_k_start_" + std::to_string(k_start) + + "_k_stop_" + std::to_string(k_stop) + ".dat", std::fstream::app); - for (;n_start <= n_stop; n_start *= 2) { - res = call_all_algs(m_info, numruns, n_start, all_data, state_constant); - file << res[0] << ", " << res[1] << ", " << res[2] << ", " << res[3] << ", " << res[4] << ", " << res[5] << ",\n"; + for (;k_start <= k_stop; k_start *= 2) { + res = call_all_algs(m_info, numruns, k_start, all_data, state_constant); + file << res[0] << ", " << res[1] << ",\n"; } } \ No newline at end of file diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc index d0e0b848..536db2f6 100644 --- a/test/drivers/test_rbki.cc +++ b/test/drivers/test_rbki.cc @@ -87,9 +87,9 @@ class TestRBKI : public ::testing::Test // Note: If Subprocess killed exception -> reload vscode TEST_F(TestRBKI, RBKI_basic) { - int64_t m = 10; - int64_t n = 8; - int64_t k = 4; + int64_t m = 4000; + int64_t n = 200; + int64_t k = 100; double norm_A = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); From 21c8a1c7f0fd02e130f4ad301c4c17d256cc61a1 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Fri, 17 Nov 2023 14:15:51 -0800 Subject: [PATCH 10/56] Trying to add matrix files processing capability; having an issue with get_line function locally. --- RandLAPACK/misc/rl_gen.hh | 40 ++++++++++++++++++- benchmark/CMakeLists.txt | 2 + .../bench_RBKI/RBKI_speed_comparisons.cc | 37 +++++++++++++++++ 3 files changed, 77 insertions(+), 2 deletions(-) diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 895f2fd0..5debfd6d 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -24,7 +24,8 @@ enum mat_type { spiked, adverserial, bad_cholqr, - kahan}; + kahan, + custom_input}; /// A struct containing info about a given matrix to be generated by mat_gen(). /// Requires only the size and type of a matrix by default, but can have other optional parameters. @@ -41,6 +42,7 @@ struct mat_gen_info { bool check_true_rank; T theta; T perturb; + std::string filename; mat_gen_info(int64_t m, int64_t n, mat_type t) { rows = m; @@ -415,11 +417,40 @@ void gen_kahan_mat( blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, m, m, 1.0, S.data(), m, C.data(), m, 1.0, A.data(), m); } +/// Generates Kahan matrix +template +void process_input_mat( + int64_t &m, + int64_t &n, + std::vector &A, + std::string filename +) { + std::string line; + std::string line_entry; + double value; + + // Read input file + std::ifstream inputMat(filename); + + // Count numcols + std::istringstream iss(line); + while (iss >> line_entry) + ++n; + // Count numrows. + while (std::getline(inputMat, line)) + ++m; + + // Place the contents of a file into the matrix space. + double value; + while (inputMat >> value) + A.push_back(value); +} + /// 'Entry point' routine for matrix generation. /// Calls functions for different mat type to fill the contents of a provided standard vector. template void mat_gen( - mat_gen_info info, + mat_gen_info &info, std::vector &A, RandBLAS::RNGState &state ) { @@ -474,6 +505,11 @@ void mat_gen( RandLAPACK::gen::gen_kahan_mat(m, n, A, info.theta, info.perturb); } break; + case custom_input: { + // Generates Kahan Matrix + RandLAPACK::gen::process_input_mat(m, n, A, info.filename); + } + break; default: throw std::runtime_error(std::string("Unrecognized case.")); break; diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 40a78494..99dfe76a 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -61,3 +61,5 @@ add_benchmark(NAME CQRRPT_pivot_quality CXX_SOURCES bench_CQRRPT/CQRRPT_pivo add_benchmark(NAME CQRRP_speed_comparisons CXX_SOURCES bench_CQRRP/CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) add_benchmark(NAME CQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) + +add_benchmark(NAME RBKI_speed_comparisons CXX_SOURCES bench_RBKI/RBKI_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 40a02b7d..665a3905 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -101,6 +101,7 @@ static std::vector call_all_algs( return res; } +/* int main() { // Declare parameters int64_t m = std::pow(10, 3); @@ -122,6 +123,42 @@ int main() { RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + // Declare a data file + std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) + + "_n_" + std::to_string(n) + + "_k_start_" + std::to_string(k_start) + + "_k_stop_" + std::to_string(k_stop) + + ".dat", std::fstream::app); + + for (;k_start <= k_stop; k_start *= 2) { + res = call_all_algs(m_info, numruns, k_start, all_data, state_constant); + file << res[0] << ", " << res[1] << ",\n"; + } +} +*/ + +int main() { + // Declare parameters + int64_t m = std::pow(10, 3); + int64_t n = std::pow(10, 3); + int64_t k_start = 100; + int64_t k_stop = 100; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + auto state_constant = state; + // Timing results + std::vector res; + // Number of algorithm runs. We only record best times. + int64_t numruns = 5; + + // Allocate basic workspace + RBKI_benchmark_data all_data(m, n, k_stop, tol); + + // Generate the input matrix - gaussian suffices for performance tests. + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::custom_input); + custom_input.filename = argv[1]; + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + // Declare a data file std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) + "_n_" + std::to_string(n) From 56574e388aa533ff469c82e82385f23b96fd85de Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 20 Nov 2023 09:23:16 -0800 Subject: [PATCH 11/56] Added capabilities to read input matrix. --- RandLAPACK/misc/rl_gen.hh | 84 +++++++++++-------- .../bench_RBKI/RBKI_speed_comparisons.cc | 57 ++++++++++++- 2 files changed, 101 insertions(+), 40 deletions(-) diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 5debfd6d..4c7f145f 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -10,6 +10,8 @@ #include #include #include +#include +#include namespace RandLAPACK::gen { @@ -42,9 +44,10 @@ struct mat_gen_info { bool check_true_rank; T theta; T perturb; - std::string filename; + char* filename; + int workspace_query_mod; - mat_gen_info(int64_t m, int64_t n, mat_type t) { + mat_gen_info(int64_t& m, int64_t& n, mat_type t) { rows = m; cols = n; m_type = t; @@ -402,7 +405,6 @@ void gen_kahan_mat( T theta, T perturb ) { - std::vector S(m * m, 0.0); std::vector C(m * m, 0.0); @@ -423,27 +425,39 @@ void process_input_mat( int64_t &m, int64_t &n, std::vector &A, - std::string filename + char* filename, + int& workspace_query_mod ) { - std::string line; - std::string line_entry; - double value; - - // Read input file - std::ifstream inputMat(filename); - - // Count numcols - std::istringstream iss(line); - while (iss >> line_entry) - ++n; - // Count numrows. - while (std::getline(inputMat, line)) + // We only check the size of the input data. + if (workspace_query_mod) { + std::string line; + std::string line_entry; + + // Read input file + std::ifstream inputMat(filename); + + // Count numcols. + std::getline(inputMat, line); + std::istringstream lineStream(line); + while (lineStream >> line_entry) + ++n; + + // Count numrows - already got through row 1. ++m; + while (std::getline(inputMat, line)) + ++m; - // Place the contents of a file into the matrix space. - double value; - while (inputMat >> value) - A.push_back(value); + // Exit querying mod. + workspace_query_mod = 0; + } else { + double value; + // Read input file + std::ifstream inputMat(filename); + + // Place the contents of a file into the matrix space. + while (inputMat >> value) + A.push_back(value); + } } /// 'Entry point' routine for matrix generation. @@ -454,60 +468,56 @@ void mat_gen( std::vector &A, RandBLAS::RNGState &state ) { - // Base parameters - int64_t m = info.rows; - int64_t n = info.cols; - int64_t k = info.rank; - T* A_dat = RandLAPACK::util::upsize(m * n, A); + T* A_dat = RandLAPACK::util::upsize(info.rows * info.cols, A); switch(info.m_type) { case polynomial: // Generating matrix with polynomially decaying singular values - RandLAPACK::gen::gen_poly_mat(m, n, A, k, info.cond_num, info.exponent, info.diag, state); + RandLAPACK::gen::gen_poly_mat(info.rows, info.cols, A, info.rank, info.cond_num, info.exponent, info.diag, state); break; case exponential: // Generating matrix with exponentially decaying singular values - RandLAPACK::gen::gen_exp_mat(m, n, A, k, info.cond_num, info.diag, state); + RandLAPACK::gen::gen_exp_mat(info.rows, info.cols, A, info.rank, info.cond_num, info.diag, state); break; break; case gaussian: { // Gaussian random matrix - RandBLAS::DenseDist D(m, n); + RandBLAS::DenseDist D(info.rows, info.cols); state = RandBLAS::fill_dense(D, A_dat, state).second; } break; case step: { // Generating matrix with a staircase-like spectrum - RandLAPACK::gen::gen_step_mat(m, n, A, k, info.cond_num, info.diag, state); + RandLAPACK::gen::gen_step_mat(info.rows, info.cols, A, info.rank, info.cond_num, info.diag, state); } break; case spiked: { // This matrix may be numerically rank deficient - RandLAPACK::gen::gen_spiked_mat(m, n, A, info.scaling, state); + RandLAPACK::gen::gen_spiked_mat(info.rows, info.cols, A, info.scaling, state); if(info.check_true_rank) - k = RandLAPACK::util::rank_check(m, n, A); + info.rank = RandLAPACK::util::rank_check(info.rows, info.cols, A); } break; case adverserial: { // This matrix may be numerically rank deficient - RandLAPACK::gen::gen_oleg_adversarial_mat(m, n, A, info.scaling, state); + RandLAPACK::gen::gen_oleg_adversarial_mat(info.rows, info.cols, A, info.scaling, state); if(info.check_true_rank) - k = RandLAPACK::util::rank_check(m, n, A); + info.rank = RandLAPACK::util::rank_check(info.rows, info.cols, A); } break; case bad_cholqr: { // Per Oleg's suggestion, this is supposed to make QB fail with CholQR for orth/stab - RandLAPACK::gen::gen_bad_cholqr_mat(m, n, A, k, info.cond_num, info.diag, state); + RandLAPACK::gen::gen_bad_cholqr_mat(info.rows, info.cols, A, info.rank, info.cond_num, info.diag, state); } break; case kahan: { // Generates Kahan Matrix - RandLAPACK::gen::gen_kahan_mat(m, n, A, info.theta, info.perturb); + RandLAPACK::gen::gen_kahan_mat(info.rows, info.cols, A, info.theta, info.perturb); } break; case custom_input: { // Generates Kahan Matrix - RandLAPACK::gen::process_input_mat(m, n, A, info.filename); + RandLAPACK::gen::process_input_mat(info.rows, info.cols, A, info.filename, info.workspace_query_mod); } break; default: diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 665a3905..fcaf525a 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -137,7 +137,9 @@ int main() { } */ -int main() { +/* + +int main(int argc, char *argv[]) { // Declare parameters int64_t m = std::pow(10, 3); int64_t n = std::pow(10, 3); @@ -156,18 +158,67 @@ int main() { // Generate the input matrix - gaussian suffices for performance tests. RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::custom_input); - custom_input.filename = argv[1]; + + m_info.filename = argv[1]; + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + printf("rows %ld, cols %ld\n", m_info.rows, m_info.cols); + // Declare a data file std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) + "_n_" + std::to_string(n) + "_k_start_" + std::to_string(k_start) + "_k_stop_" + std::to_string(k_stop) - + ".dat", std::fstream::app); + + ".dat", std::fstream::app); + for (;k_start <= k_stop; k_start *= 2) { + res = call_all_algs(m_info, numruns, k_start, all_data, state_constant); + file << res[0] << ", " << res[1] << ",\n"; + } +} +*/ + + +int main(int argc, char *argv[]) { + int64_t m = 0; + int64_t n = 0; + int64_t k_start = 0; + int64_t k_stop = 0; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + + // Generate the input matrix. + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::custom_input); + m_info.filename = argv[1]; + m_info.workspace_query_mod = 1; + // Workspace query; + std::vector buf; + RandLAPACK::gen::mat_gen(m_info, buf, state); + // Allocate basic workspace. + RBKI_benchmark_data all_data(m, n, k_stop, tol); + // Fill the data matrix; + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + + // Update basic params. + m = m_info.rows; + n = m_info.cols; + k_start = std::max((int64_t) 1, n / 100); + k_stop = n; + + printf("rows %ld, cols %ld\n", m_info.rows, m_info.cols); + + // Declare a data file + std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) + + "_n_" + std::to_string(n) + + "_k_start_" + std::to_string(k_start) + + "_k_stop_" + std::to_string(k_stop) + + ".dat", std::fstream::app); + +/* for (;k_start <= k_stop; k_start *= 2) { res = call_all_algs(m_info, numruns, k_start, all_data, state_constant); file << res[0] << ", " << res[1] << ",\n"; } +*/ } \ No newline at end of file From 28bcd2048af3ab0f3da12779c9a898fdd08b04aa Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 20 Nov 2023 10:40:51 -0800 Subject: [PATCH 12/56] Reworking matrix generators. --- RandLAPACK/drivers/rl_hqrrp.hh | 10 +- RandLAPACK/misc/rl_gen.hh | 206 +++++++++++++++++---------------- RandLAPACK/misc/rl_util.hh | 6 +- test/comps/test_qb.cc | 2 +- test/drivers/test_revd2.cc | 6 +- test/drivers/test_rsvd.cc | 4 +- 6 files changed, 118 insertions(+), 116 deletions(-) diff --git a/RandLAPACK/drivers/rl_hqrrp.hh b/RandLAPACK/drivers/rl_hqrrp.hh index 8a568376..1dd6f2d3 100644 --- a/RandLAPACK/drivers/rl_hqrrp.hh +++ b/RandLAPACK/drivers/rl_hqrrp.hh @@ -98,7 +98,7 @@ void _LAPACK_lafrb( & m_, & n_, & k_, (double *) buff_U, & ldim_U, (double *) buff_T, & ldim_T, (double *) buff_B, & ldim_B, (double *) buff_W, & ldim_W #ifdef LAPACK_FORTRAN_STRLEN_END - //, 1, 1, 1, 1 + , 1, 1, 1, 1 #endif ); } else if (typeid(T) == typeid(float)) { @@ -106,7 +106,7 @@ void _LAPACK_lafrb( & m_, & n_, & k_, (float *) buff_U, & ldim_U, (float *) buff_T, & ldim_T, (float *) buff_B, & ldim_B, (float *) buff_W, & ldim_W #ifdef LAPACK_FORTRAN_STRLEN_END - //, 1, 1, 1, 1 + , 1, 1, 1, 1 #endif ); } else { @@ -136,7 +136,7 @@ void _LAPACK_larf( (double *) C, & ldc_, (double *) work #ifdef LAPACK_FORTRAN_STRLEN_END - //, 1 + , 1 #endif ); } else if (typeid(T) == typeid(float)) { @@ -146,7 +146,7 @@ void _LAPACK_larf( (float *) C, & ldc_, (float *) work #ifdef LAPACK_FORTRAN_STRLEN_END - //, 1 + , 1 #endif ); } else { @@ -398,7 +398,7 @@ static int64_t NoFLA_QRP_downdate_partial_norms( char dlmach_param = 'E'; tol3z = sqrt( LAPACK_dlamch( & dlmach_param #ifdef LAPACK_FORTRAN_STRLEN_END - //, 1 + , 1 #endif ) ); ptr_d = buff_d; diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 4c7f145f..8af2ee5f 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -68,38 +68,37 @@ template void gen_singvec( int64_t m, int64_t n, - std::vector &A, + T* A, int64_t k, - std::vector &S, + T* S, RandBLAS::RNGState &state ) { - std::vector U(m * k, 0.0); - std::vector V(n * k, 0.0); - std::vector tau(k, 2.0); - std::vector Gemm_buf(m * k, 0.0); - - // Data pointer predeclarations for whatever is accessed more than once - T* U_dat = U.data(); - T* V_dat = V.data(); - T* tau_dat = tau.data(); - T* Gemm_buf_dat = Gemm_buf.data(); + T* U = ( T * ) calloc( m * k, sizeof( T ) ); + T* V = ( T * ) calloc( n * k, sizeof( T ) ); + T* tau = ( T * ) calloc( k , sizeof( T ) ); + T* Gemm_buf = ( T * ) calloc( m * k, sizeof( T ) ); RandBLAS::DenseDist DU(m, k); RandBLAS::DenseDist DV(n, k); - state = RandBLAS::fill_dense(DU, U_dat, state).second; - state = RandBLAS::fill_dense(DV, V_dat, state).second; + state = RandBLAS::fill_dense(DU, U, state).second; + state = RandBLAS::fill_dense(DV, V, state).second; - lapack::geqrf(m, k, U_dat, m, tau_dat); - lapack::ungqr(m, k, k, U_dat, m, tau_dat); + lapack::geqrf(m, k, U, m, tau); + lapack::ungqr(m, k, k, U, m, tau); - lapack::geqrf(n, k, V_dat, n, tau_dat); - lapack::ungqr(n, k, k, V_dat, n, tau_dat); + lapack::geqrf(n, k, V, n, tau); + lapack::ungqr(n, k, k, V, n, tau); - blas::copy(m * k, U_dat, 1, Gemm_buf_dat, 1); + blas::copy(m * k, U, 1, Gemm_buf, 1); for(int i = 0; i < k; ++i) - blas::scal(m, S[i + k * i], &Gemm_buf_dat[i * m], 1); + blas::scal(m, S[i + k * i], &Gemm_buf[i * m], 1); + + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, m, n, k, 1.0, Gemm_buf, m, V, n, 0.0, A, m); - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, m, n, k, 1.0, Gemm_buf_dat, m, V_dat, n, 0.0, A.data(), m); + free(U); + free(V); + free(tau); + free(Gemm_buf); } /// Generates a matrix with polynomially-decaying spectrum of the following form: @@ -112,7 +111,7 @@ template void gen_poly_mat( int64_t &m, int64_t &n, - std::vector &A, + T* A, int64_t k, T cond, T p, @@ -121,8 +120,8 @@ void gen_poly_mat( ) { // Predeclare to all nonzero constants, start decay where needed - std::vector s(k, 1.0); - std::vector S(k * k, 0.0); + T* s = ( T * ) calloc( k, sizeof( T ) ); + T* S = ( T * ) calloc( k * k, sizeof( T ) ); // The first 10% of the singular values will be equal to one int offset = (int) floor(k * 0.1); @@ -131,7 +130,8 @@ void gen_poly_mat( T a = std::pow((std::pow(last_entry, -1 / p) - std::pow(first_entry, -1 / p)) / (k - offset), p); T b = std::pow(a * first_entry, -1 / p) - offset; // apply lambda function to every entry of s - std::for_each(s.begin() + offset, s.end(), + std::fill(s, s + offset, 1.0); + std::for_each(s + offset, s + k, // Lambda expression begins [&p, &offset, &a, &b](T &entry) { entry = 1 / (a * std::pow(offset + b, p)); @@ -143,15 +143,13 @@ void gen_poly_mat( RandLAPACK::util::diag(k, k, s, k, S); if (diagon) { - if (!(m == k || n == k)) { - m = k; - n = k; - A.resize(k * k); - } - lapack::lacpy(MatrixType::General, k, k, S.data(), k, A.data(), k); + lapack::lacpy(MatrixType::General, k, k, S, k, A, k); } else { RandLAPACK::gen::gen_singvec(m, n, A, k, S, state); } + + free(s); + free(S); } /// Generates a matrix with exponentially-decaying spectrum of the following form: @@ -163,15 +161,14 @@ template void gen_exp_mat( int64_t &m, int64_t &n, - std::vector &A, + T* A, int64_t k, T cond, bool diagon, RandBLAS::RNGState &state ) { - - std::vector s(k, 1.0); - std::vector S(k * k, 0.0); + T* s = ( T * ) calloc( k, sizeof( T ) ); + T* S = ( T * ) calloc( k * k, sizeof( T ) ); // The first 10% of the singular values will be =1 int offset = (int) floor(k * 0.1); @@ -181,7 +178,8 @@ void gen_exp_mat( T cnt = 0.0; // apply lambda function to every entry of s // Please make sure that the first singular value is always 1 - std::for_each(s.begin() + offset, s.end(), + std::fill(s, s + offset, 1.0); + std::for_each(s + offset, s + k, // Lambda expression begins [&t, &cnt](T &entry) { entry = (std::exp(++cnt * -t)); @@ -191,15 +189,13 @@ void gen_exp_mat( // form a diagonal S RandLAPACK::util::diag(k, k, s, k, S); if (diagon) { - if (!(m == k || n == k)) { - m = k; - n = k; - A.resize(k * k); - } - lapack::lacpy(MatrixType::General, k, k, S.data(), k, A.data(), k); + lapack::lacpy(MatrixType::General, k, k, S, k, A, k); } else { RandLAPACK::gen::gen_singvec(m, n, A, k, S, state); } + + free(s); + free(S); } /// Generates matrix with a staircase spectrum with 4 steps. @@ -211,7 +207,7 @@ template void gen_step_mat( int64_t &m, int64_t &n, - std::vector &A, + T* A, int64_t k, T cond, bool diagon, @@ -219,30 +215,28 @@ void gen_step_mat( ) { // Predeclare to all nonzero constants, start decay where needed - std::vector s(k, 1.0); - std::vector S(k * k, 0.0); + T* s = ( T * ) calloc( k, sizeof( T ) ); + T* S = ( T * ) calloc( k * k, sizeof( T ) ); // We will have 4 steps controlled by the condition number size and starting with 1 int offset = (int) (k / 4); - std::fill(s.begin(), s.begin() + offset, 1); - std::fill(s.begin() + offset + 1, s.begin() + 2 * offset, 8.0 / cond); - std::fill(s.begin() + 2 * offset + 1, s.begin() + 3 * offset, 4.0 / cond); - std::fill(s.begin() + 3 * offset + 1, s.end(), 1.0 / cond); + std::fill(s, s + offset, 1.0); + std::fill(s + offset + 1, s + 2 * offset, 8.0 / cond); + std::fill(s + 2 * offset + 1, s + 3 * offset, 4.0 / cond); + std::fill(s + 3 * offset + 1, s + k, 1.0 / cond); // form a diagonal S RandLAPACK::util::diag(k, k, s, k, S); if (diagon) { - if (!(m == k || n == k)) { - m = k; - n = k; - A.resize(k * k); - } - lapack::lacpy(MatrixType::General, k, k, S.data(), k, A.data(), k); + lapack::lacpy(MatrixType::General, k, k, S, k, A, k); } else { - gen_singvec(m, n, A, k, S, state); + RandLAPACK::gen::gen_singvec(m, n, A, k, S, state); } + + free(s); + free(S); } /// Generates a matrix with high coherence between the left singular vectors. @@ -253,7 +247,7 @@ template void gen_spiked_mat( int64_t &m, int64_t &n, - std::vector &A, + T* A, T spike_scale, RandBLAS::RNGState &state ) { @@ -264,14 +258,14 @@ void gen_spiked_mat( RandBLAS::SparseSkOp S(DS, state); state = RandBLAS::fill_sparse(S); - std::vector V(n * n, 0.0); - std::vector tau(n, 0.0); + T* V = ( T * ) calloc( n * n, sizeof( T ) ); + T* tau = ( T * ) calloc( n, sizeof( T ) ); RandBLAS::DenseDist DV(n, n); - state = RandBLAS::fill_dense(DV, V.data(), state).second; + state = RandBLAS::fill_dense(DV, V, state).second; - lapack::geqrf(n, n, V.data(), n, tau.data()); - lapack::ungqr(n, n, n, V.data(), n, tau.data()); + lapack::geqrf(n, n, V, n, tau); + lapack::ungqr(n, n, n, V, n, tau); // Fill A with stacked copies of V int start = 0; @@ -289,6 +283,9 @@ void gen_spiked_mat( } start += m; } + + free(V); + free(tau); } /// Generates a numerically rank-deficient matrix. @@ -303,7 +300,7 @@ template void gen_oleg_adversarial_mat( int64_t &m, int64_t &n, - std::vector &A, + T* A, T sigma, RandBLAS::RNGState &state ) { @@ -311,39 +308,42 @@ void gen_oleg_adversarial_mat( T scaling_factor_U = sigma; T scaling_factor_V = 10e-3; - std::vector U(m * n, 0.0); - std::vector V(n * n, 0.0); - std::vector tau1(n, 0.0); - std::vector tau2(n, 0.0); + T* U = ( T * ) calloc( m * n, sizeof( T ) ); + T* V = ( T * ) calloc( n * n, sizeof( T ) ); + T* tau1 = ( T * ) calloc( n, sizeof( T ) ); + T* tau2 = ( T * ) calloc( n, sizeof( T ) ); RandBLAS::DenseDist DU(m, n); - state = RandBLAS::fill_dense(DU, U.data(), state).second; + state = RandBLAS::fill_dense(DU, U, state).second; RandBLAS::DenseDist DV(n, n); - state = RandBLAS::fill_dense(DV, V.data(), state).second; + state = RandBLAS::fill_dense(DV, V, state).second; - T* U_dat = U.data(); for(int i = 0; i < n; ++i) { //U_dat[m * i + 1] *= scaling_factor_U; for(int j = 0; j < 10; ++j) { - U_dat[m * i + j] *= scaling_factor_U; + U[m * i + j] *= scaling_factor_U; } } - lapack::geqrf(m, n, U.data(), m, tau1.data()); - lapack::ungqr(m, n, n, U.data(), m, tau1.data()); + lapack::geqrf(m, n, U, m, tau1); + lapack::ungqr(m, n, n, U, m, tau1); - lapack::geqrf(n, n, V.data(), n, tau2.data()); - lapack::ungqr(n, n, n, V.data(), n, tau2.data()); + lapack::geqrf(n, n, V, n, tau2); + lapack::ungqr(n, n, n, V, n, tau2); // Grab an upper-triangular portion of V - RandLAPACK::util::get_U(n, n, V.data(), n); + RandLAPACK::util::get_U(n, n, V, n); - T* V_dat = V.data(); for(int i = 11; i < n; ++i) - V_dat[n * i + i] *= scaling_factor_V; + V[n * i + i] *= scaling_factor_V; - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, n, 1.0, U.data(), m, V.data(), n, 0.0, A.data(), m); + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, n, 1.0, U, m, V, n, 0.0, A, m); + + free(U); + free(V); + free(tau1); + free(tau2); } /// Per Oleg Balabanov's suggestion, this matrix is supposed to break QB with Cholesky QR. @@ -356,18 +356,18 @@ template void gen_bad_cholqr_mat( int64_t &m, int64_t &n, - std::vector &A, + T* A, int64_t k, T cond, bool diagon, RandBLAS::RNGState &state ) { - - std::vector s(n, 1.0); - std::vector S(n * n, 0.0); + T* s = ( T * ) calloc( n, sizeof( T ) ); + T* S = ( T * ) calloc( n * n, sizeof( T ) ); // The first k singular values will be =1 int offset = k; + std::fill(s, s + offset, 1.0); // Then, we start with 10^-8 and decrease exponentially T t = log(std::pow(10, 8) / cond) / (1 - (n - offset)); @@ -375,7 +375,7 @@ void gen_bad_cholqr_mat( T cnt = 0.0; // apply lambda function to every entry of s // Please make sure that the first singular value is always 1 - std::for_each(s.begin() + offset, s.end(), + std::for_each(s + offset, s + k, // Lambda expression begins [&t, &cnt](T &entry) { entry = (std::exp(t) / std::pow(10, 8)) * (std::exp(++cnt * -t)); @@ -385,15 +385,13 @@ void gen_bad_cholqr_mat( // form a diagonal S RandLAPACK::util::diag(k, k, s, k, S); if (diagon) { - if (!(m == k || n == k)) { - m = k; - n = k; - A.resize(k * k); - } - lapack::lacpy(MatrixType::General, k, k, S.data(), k, A.data(), k); + lapack::lacpy(MatrixType::General, k, k, S, k, A, k); } else { - gen_singvec(m, n, A, k, S, state); + RandLAPACK::gen::gen_singvec(m, n, A, k, S, state); } + + free(s); + free(S); } /// Generates Kahan matrix @@ -401,12 +399,12 @@ template void gen_kahan_mat( int64_t m, int64_t n, - std::vector &A, + T* A, T theta, T perturb ) { - std::vector S(m * m, 0.0); - std::vector C(m * m, 0.0); + T* S = ( T * ) calloc( m * m, sizeof( T ) ); + T* C = ( T * ) calloc( m * m, sizeof( T ) ); for (int i = 0; i < n; ++i) { A[(m + 1) * i] = perturb * std::numeric_limits::epsilon() * (m - i); @@ -416,7 +414,10 @@ void gen_kahan_mat( C[m * i + i] = 1.0; } - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, m, m, 1.0, S.data(), m, C.data(), m, 1.0, A.data(), m); + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, m, m, 1.0, S, m, C, m, 1.0, A, m); + + free(S); + free(C); } /// Generates Kahan matrix @@ -424,7 +425,7 @@ template void process_input_mat( int64_t &m, int64_t &n, - std::vector &A, + T* A, char* filename, int& workspace_query_mod ) { @@ -455,8 +456,9 @@ void process_input_mat( std::ifstream inputMat(filename); // Place the contents of a file into the matrix space. + int i = -1; while (inputMat >> value) - A.push_back(value); + A[++i] = value; } } @@ -465,10 +467,10 @@ void process_input_mat( template void mat_gen( mat_gen_info &info, - std::vector &A, + std::vector &A_mat, RandBLAS::RNGState &state ) { - T* A_dat = RandLAPACK::util::upsize(info.rows * info.cols, A); + T* A = A_mat.data(); switch(info.m_type) { case polynomial: @@ -483,7 +485,7 @@ void mat_gen( case gaussian: { // Gaussian random matrix RandBLAS::DenseDist D(info.rows, info.cols); - state = RandBLAS::fill_dense(D, A_dat, state).second; + state = RandBLAS::fill_dense(D, A, state).second; } break; case step: { @@ -495,14 +497,14 @@ void mat_gen( // This matrix may be numerically rank deficient RandLAPACK::gen::gen_spiked_mat(info.rows, info.cols, A, info.scaling, state); if(info.check_true_rank) - info.rank = RandLAPACK::util::rank_check(info.rows, info.cols, A); + info.rank = RandLAPACK::util::rank_check(info.rows, info.cols, A_mat); } break; case adverserial: { // This matrix may be numerically rank deficient RandLAPACK::gen::gen_oleg_adversarial_mat(info.rows, info.cols, A, info.scaling, state); if(info.check_true_rank) - info.rank = RandLAPACK::util::rank_check(info.rows, info.cols, A); + info.rank = RandLAPACK::util::rank_check(info.rows, info.cols, A_mat); } break; case bad_cholqr: { diff --git a/RandLAPACK/misc/rl_util.hh b/RandLAPACK/misc/rl_util.hh index 2fbcdf5c..065975b6 100644 --- a/RandLAPACK/misc/rl_util.hh +++ b/RandLAPACK/misc/rl_util.hh @@ -42,15 +42,15 @@ template void diag( int64_t m, int64_t n, - const std::vector &s, + T* s, int64_t k, // size of s, < min(m, n) - std::vector &S // Assuming S is m by n + T* S // Assuming S is m by n ) { if(k > std::min(m, n)) throw std::runtime_error("Invalid rank parameter."); // size of s - blas::copy(k, s.data(), 1, S.data(), m + 1); + blas::copy(k, s, 1, S, m + 1); } /// Zeros-out the upper-triangular portion of A diff --git a/test/comps/test_qb.cc b/test/comps/test_qb.cc index 2230dec7..1a1f5b70 100644 --- a/test/comps/test_qb.cc +++ b/test/comps/test_qb.cc @@ -150,7 +150,7 @@ class TestQB : public ::testing::Test // zero out the trailing singular values std::fill(s_dat + k, s_dat + n, 0.0); - RandLAPACK::util::diag(n, n, all_data.s, n, all_data.S); + RandLAPACK::util::diag(n, n, all_data.s.data(), n, all_data.S.data()); // TEST 4: Below is A_k - A_hat = A_k - QB blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, n, 1.0, U_dat, m, S_dat, n, 1.0, A_k_dat, m); diff --git a/test/drivers/test_revd2.cc b/test/drivers/test_revd2.cc index 4ab9e376..f8e2a9e3 100644 --- a/test/drivers/test_revd2.cc +++ b/test/drivers/test_revd2.cc @@ -161,7 +161,7 @@ class TestREVD2 : public ::testing::Test // Construnct A_hat = U1 * S1 * VT1 // Turn vector into diagonal matrix - RandLAPACK::util::diag(k, k, all_data.eigvals, k, all_data.E); + RandLAPACK::util::diag(k, k, all_data.eigvals.data(), k, all_data.E.data()); // V * E = Buf blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, k, 1.0, V_dat, m, E_dat, k, 0.0, Buf_dat, m); // A - Buf * V' - should be close to 0 @@ -199,8 +199,8 @@ class TestREVD2 : public ::testing::Test T* work_l_dat = all_data.A_l.data(); T* A_approx_dat = all_data.work.data(); - RandLAPACK::util::diag(k, k, all_data.eigvals_u, k, all_data.E_u); - RandLAPACK::util::diag(k, k, all_data.eigvals_l, k, all_data.E_l); + RandLAPACK::util::diag(k, k, all_data.eigvals_u.data(), k, all_data.E_u.data()); + RandLAPACK::util::diag(k, k, all_data.eigvals_l.data(), k, all_data.E_l.data()); // Reconstruct factorizations, compare the result // V_u * E_u = work_u diff --git a/test/drivers/test_rsvd.cc b/test/drivers/test_rsvd.cc index d92d5398..a95e7ec8 100644 --- a/test/drivers/test_rsvd.cc +++ b/test/drivers/test_rsvd.cc @@ -136,7 +136,7 @@ class TestRSVD : public ::testing::Test // Construnct A_approx_determ = U1 * S1 * VT1 // Turn vector into diagonal matrix - RandLAPACK::util::diag(k, k, all_data.s1, k, all_data.S1); + RandLAPACK::util::diag(k, k, all_data.s1.data(), k, all_data.S1.data()); // U1 * S1 = A_approx_determ_duf blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, k, 1.0, U1_dat, m, S1_dat, k, 1.0, A_approx_determ_duf_dat, m); // A_approx_determ_duf * VT1 = A_approx_determ @@ -147,7 +147,7 @@ class TestRSVD : public ::testing::Test // zero out the trailing singular values std::fill(s_dat + k, s_dat + n, 0.0); - RandLAPACK::util::diag(n, n, all_data.s, n, all_data.S); + RandLAPACK::util::diag(n, n, all_data.s.data(), n, all_data.S.data()); // TEST 4: Below is A_k - A_approx_determ = A_k - QB blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, n, 1.0, U_dat, m, S_dat, n, 1.0, A_k_dat, m); From cc56005e1777de2227995d2e59da46f9e397699c Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 20 Nov 2023 11:09:07 -0800 Subject: [PATCH 13/56] All that is left is to change mat_gen signature --- RandLAPACK/comps/rl_orth.hh | 4 ++-- RandLAPACK/comps/rl_rf.hh | 2 +- RandLAPACK/comps/rl_rs.hh | 4 ++-- RandLAPACK/comps/rl_syrf.hh | 2 +- RandLAPACK/misc/rl_gen.hh | 4 ++-- RandLAPACK/misc/rl_util.hh | 35 +++++++++++++++++------------------ 6 files changed, 25 insertions(+), 26 deletions(-) diff --git a/RandLAPACK/comps/rl_orth.hh b/RandLAPACK/comps/rl_orth.hh index 573f9a6b..136698d3 100644 --- a/RandLAPACK/comps/rl_orth.hh +++ b/RandLAPACK/comps/rl_orth.hh @@ -95,8 +95,8 @@ int CholQRQ::call( // Scheme may succeed, but output garbage if(this->cond_check) { - if(util::cond_num_check(k, k, Q_gram, this->Q_gram_cpy, this->s, this->verbosity) > (1 / std::sqrt(std::numeric_limits::epsilon()))){ - // return 1; + if(util::cond_num_check(k, k, Q_gram.data(), (this->Q_gram_cpy).data(), (this->s).data(), this->verbosity) > (1 / std::sqrt(std::numeric_limits::epsilon()))){ + //return 1; } } diff --git a/RandLAPACK/comps/rl_rf.hh b/RandLAPACK/comps/rl_rf.hh index 46c85ad3..e4e0e5b8 100644 --- a/RandLAPACK/comps/rl_rf.hh +++ b/RandLAPACK/comps/rl_rf.hh @@ -129,7 +129,7 @@ int RF::call( if(this->cond_check) // Writes into this->cond_nums - this->cond_nums.push_back(util::cond_num_check(m, k, Q, this->Q_cpy, this->s, this->verbosity)); + this->cond_nums.push_back(util::cond_num_check(m, k, Q.data(), (this->Q_cpy).data(), (this->s).data(), this->verbosity)); if(this->Orth_Obj.call(m, k, Q)) return 2; // Orthogonalization failed diff --git a/RandLAPACK/comps/rl_rs.hh b/RandLAPACK/comps/rl_rs.hh index 9918dcc5..cf930733 100644 --- a/RandLAPACK/comps/rl_rs.hh +++ b/RandLAPACK/comps/rl_rs.hh @@ -160,7 +160,7 @@ int RS::call( ++ p_done; if(this->cond_check) - this->cond_nums.push_back(util::cond_num_check(m, k, Omega_1, this->Omega_1_cpy, this->s, this->verbosity)); + this->cond_nums.push_back(util::cond_num_check(m, k, Omega_1.data(), (this->Omega_1_cpy).data(), (this->s).data(), this->verbosity)); if ((p_done % q == 0) && (this->Stab_Obj.call(m, k, Omega_1))) return 1; @@ -170,7 +170,7 @@ int RS::call( ++ p_done; if (this->cond_check) - this->cond_nums.push_back(util::cond_num_check(n, k, Omega, this->Omega_cpy, this->s, this->verbosity)); + this->cond_nums.push_back(util::cond_num_check(n, k, Omega.data(), (this->Omega_cpy).data(), (this->s).data(), this->verbosity)); if ((p_done % q == 0) && (this->Stab_Obj.call(n, k, Omega))) return 1; diff --git a/RandLAPACK/comps/rl_syrf.hh b/RandLAPACK/comps/rl_syrf.hh index 63cef59d..4a83d18b 100644 --- a/RandLAPACK/comps/rl_syrf.hh +++ b/RandLAPACK/comps/rl_syrf.hh @@ -139,7 +139,7 @@ int SYRF::call( util::upsize(m * k, this->cond_work_mat); util::upsize(k, this->cond_work_vec); this->cond_nums.push_back( - util::cond_num_check(m, k, Q, this->cond_work_mat, this->cond_work_vec, this->verbose) + util::cond_num_check(m, k, Q.data(), (this->cond_work_mat).data(), (this->cond_work_vec).data(), this->verbose) ); } if(this->Orth_Obj.call(m, k, Q)) diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 8af2ee5f..f2c019b4 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -497,14 +497,14 @@ void mat_gen( // This matrix may be numerically rank deficient RandLAPACK::gen::gen_spiked_mat(info.rows, info.cols, A, info.scaling, state); if(info.check_true_rank) - info.rank = RandLAPACK::util::rank_check(info.rows, info.cols, A_mat); + info.rank = RandLAPACK::util::rank_check(info.rows, info.cols, A); } break; case adverserial: { // This matrix may be numerically rank deficient RandLAPACK::gen::gen_oleg_adversarial_mat(info.rows, info.cols, A, info.scaling, state); if(info.check_true_rank) - info.rank = RandLAPACK::util::rank_check(info.rows, info.cols, A_mat); + info.rank = RandLAPACK::util::rank_check(info.rows, info.cols, A); } break; case bad_cholqr: { diff --git a/RandLAPACK/misc/rl_util.hh b/RandLAPACK/misc/rl_util.hh index 065975b6..911c8177 100644 --- a/RandLAPACK/misc/rl_util.hh +++ b/RandLAPACK/misc/rl_util.hh @@ -187,26 +187,20 @@ template T cond_num_check( int64_t m, int64_t n, - const std::vector &A, - std::vector &A_cpy, - std::vector &s, + T* A, + T* A_cpy, + T* s, bool verbose ) { - // Copy to avoid any changes - T* A_cpy_dat = upsize(m * n, A_cpy); - T* s_dat = upsize(n, s); + // TODO: GET RID OF THE INTERNAL ALLOCATIONS + A_cpy = ( T * ) calloc( m * n, sizeof( T ) ); + s = ( T * ) calloc( n, sizeof( T ) ); - // Packed storage check - if (A.size() < A_cpy.size()) { - // Convert to normal format - lapack::tfttr(Op::NoTrans, Uplo::Upper, n, A.data(), A_cpy_dat, m); - } else { - lapack::lacpy(MatrixType::General, m, n, A.data(), m, A_cpy_dat, m); - } - lapack::gesdd(Job::NoVec, m, n, A_cpy_dat, m, s_dat, NULL, m, NULL, n); + lapack::lacpy(MatrixType::General, m, n, A, m, A_cpy, m); + lapack::gesdd(Job::NoVec, m, n, A_cpy, m, s, NULL, m, NULL, n); - T cond_num = s_dat[0] / s_dat[n - 1]; + T cond_num = s[0] / s[n - 1]; if (verbose) printf("CONDITION NUMBER: %f\n", cond_num); @@ -219,16 +213,21 @@ template int64_t rank_check( int64_t m, int64_t n, - const std::vector &A + T* A ) { - std::vector A_pre_cpy; - std::vector s; + T* A_pre_cpy = ( T * ) calloc( m * n, sizeof( T ) ); + T* s = ( T * ) calloc( n, sizeof( T ) ); + RandLAPACK::util::cond_num_check(m, n, A, A_pre_cpy, s, false); for(int i = 0; i < n; ++i) { if (s[i] / s[0] <= 5 * std::numeric_limits::epsilon()) return i - 1; } + + free(A_pre_cpy); + free(s); + return n; } From 96df7951e456052e49bb8a1fadb89640646c4799 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 20 Nov 2023 13:23:29 -0800 Subject: [PATCH 14/56] Need to fix matrix read order. --- RandLAPACK/misc/rl_gen.hh | 6 +++--- benchmark/bench_CQRRP/CQRRP_pivot_quality.cc | 4 ++-- .../bench_CQRRP/CQRRP_runtime_breakdown.cc | 4 ++-- .../bench_CQRRP/CQRRP_speed_comparisons.cc | 4 ++-- .../bench_CQRRPT/CQRRPT_pivot_quality.cc | 4 ++-- .../bench_CQRRPT/CQRRPT_runtime_breakdown.cc | 4 ++-- .../bench_CQRRPT/CQRRPT_speed_comparisons.cc | 4 ++-- .../bench_RBKI/RBKI_speed_comparisons.cc | 21 ++++++++++++------- benchmark/bench_general/Chol_check.cc | 2 +- benchmark/bench_general/GEMM_flop_count.cc | 4 ++-- test/comps/test_orth.cc | 2 +- test/comps/test_preconditioners.cc | 2 +- test/comps/test_qb.cc | 8 +++---- test/comps/test_rf.cc | 4 ++-- test/comps/test_syrf.cc | 4 ++-- test/comps/test_util.cc | 8 +++---- test/drivers/test_cqrrp.cc | 14 ++++++------- test/drivers/test_cqrrpt.cc | 6 +++--- test/drivers/test_hqrrp.cc | 2 +- test/drivers/test_rbki.cc | 2 +- test/drivers/test_revd2.cc | 12 +++++------ test/drivers/test_rsvd.cc | 2 +- 22 files changed, 64 insertions(+), 59 deletions(-) diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index f2c019b4..494609b4 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -457,8 +457,9 @@ void process_input_mat( // Place the contents of a file into the matrix space. int i = -1; - while (inputMat >> value) + while (inputMat >> value){ A[++i] = value; + } } } @@ -467,10 +468,9 @@ void process_input_mat( template void mat_gen( mat_gen_info &info, - std::vector &A_mat, + T* A, RandBLAS::RNGState &state ) { - T* A = A_mat.data(); switch(info.m_type) { case polynomial: diff --git a/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc b/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc index 4efdee23..f3429cd9 100644 --- a/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc +++ b/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc @@ -36,7 +36,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, QR_speed_benchmark_data &all_data, RandBLAS::RNGState &state) { - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); std::fill(all_data.J.begin(), all_data.J.end(), 0); } @@ -180,7 +180,7 @@ int main() { //m_info.cond_num = std::pow(10, 10); //m_info.rank = n; //m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); #if !defined(__APPLE__) R_norm_ratio(m_info, b_sz, all_data, state_constant1); diff --git a/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc b/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc index 2cd56aa2..c6f44e84 100644 --- a/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc +++ b/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc @@ -34,7 +34,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, QR_speed_benchmark_data &all_data, RandBLAS::RNGState &state) { - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); std::fill(all_data.J.begin(), all_data.J.end(), 0); } @@ -106,7 +106,7 @@ int main() { QR_speed_benchmark_data all_data(m, n, tol, d_factor); // Generate the input matrix - gaussian suffices for performance tests. RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); // Declare a data file std::fstream file("CQRRP_inner_speed_" + std::to_string(m) diff --git a/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc b/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc index 0f2d2578..87a62cb3 100644 --- a/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc +++ b/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc @@ -34,7 +34,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, QR_speed_benchmark_data &all_data, RandBLAS::RNGState &state, int apply_itoa) { - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); if (apply_itoa) { std::iota(all_data.J.begin(), all_data.J.end(), 1); @@ -179,7 +179,7 @@ int main() { QR_speed_benchmark_data all_data(m, n, tol, d_factor); // Generate the input matrix - gaussian suffices for performance tests. RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); // Declare a data file std::fstream file("QR_time_raw_rows_" + std::to_string(m) diff --git a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc index ac091fbb..c0ae5b16 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc @@ -38,7 +38,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, QR_benchmark_data &all_data, RandBLAS::RNGState &state) { - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); std::fill(all_data.R.begin(), all_data.R.end(), 0.0); std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); std::fill(all_data.J.begin(), all_data.J.end(), 0); @@ -171,7 +171,7 @@ int main() { m_info.cond_num = std::pow(10, 10); m_info.rank = n; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); R_norm_ratio(m_info, all_data, state_constant1); printf("R done\n"); diff --git a/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc b/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc index de694a88..953d88e2 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc @@ -36,7 +36,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, QR_benchmark_data &all_data, RandBLAS::RNGState &state) { - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); std::fill(all_data.R.begin(), all_data.R.end(), 0.0); std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); std::fill(all_data.J.begin(), all_data.J.end(), 0); @@ -104,7 +104,7 @@ int main() { QR_benchmark_data all_data(m, n_stop, tol, d_factor); // Generate the input matrix - gaussian suffices for performance tests. RandLAPACK::gen::mat_gen_info m_info(m, n_stop, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); // Declare a data file std::fstream file("CQRRPT_inner_speed_" + std::to_string(m) diff --git a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc index 8791074a..421ef94c 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc @@ -36,7 +36,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, QR_benchmark_data &all_data, RandBLAS::RNGState &state) { - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); std::fill(all_data.R.begin(), all_data.R.end(), 0.0); std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); std::fill(all_data.J.begin(), all_data.J.end(), 0); @@ -186,7 +186,7 @@ int main() { QR_benchmark_data all_data(m, n_stop, tol, d_factor); // Generate the input matrix - gaussian suffices for performance tests. RandLAPACK::gen::mat_gen_info m_info(m, n_stop, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); // Declare a data file std::fstream file("CQRRPT_speed_comp_" + std::to_string(m) diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index fcaf525a..c1a3b9b5 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -193,12 +193,7 @@ int main(int argc, char *argv[]) { m_info.filename = argv[1]; m_info.workspace_query_mod = 1; // Workspace query; - std::vector buf; - RandLAPACK::gen::mat_gen(m_info, buf, state); - // Allocate basic workspace. - RBKI_benchmark_data all_data(m, n, k_stop, tol); - // Fill the data matrix; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, NULL, state); // Update basic params. m = m_info.rows; @@ -206,8 +201,18 @@ int main(int argc, char *argv[]) { k_start = std::max((int64_t) 1, n / 100); k_stop = n; - printf("rows %ld, cols %ld\n", m_info.rows, m_info.cols); + // Allocate basic workspace. + RBKI_benchmark_data all_data(m, n, k_stop, tol); + + + printf("%d\n", all_data.A.size()); + + // Fill the data matrix; + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + printf("rows %ld, cols %ld\n", m_info.rows, m_info.cols); + printf("%e\n", *(all_data.A.data() + 1)); +/* // Declare a data file std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) + "_n_" + std::to_string(n) @@ -215,7 +220,7 @@ int main(int argc, char *argv[]) { + "_k_stop_" + std::to_string(k_stop) + ".dat", std::fstream::app); -/* + for (;k_start <= k_stop; k_start *= 2) { res = call_all_algs(m_info, numruns, k_start, all_data, state_constant); file << res[0] << ", " << res[1] << ",\n"; diff --git a/benchmark/bench_general/Chol_check.cc b/benchmark/bench_general/Chol_check.cc index f3cbb8cd..22da8a4e 100644 --- a/benchmark/bench_general/Chol_check.cc +++ b/benchmark/bench_general/Chol_check.cc @@ -15,7 +15,7 @@ chol_check(int64_t m, int64_t k, RandBLAS::RNGState state) { RandLAPACK::gen::mat_gen_info m_info(m, m, RandLAPACK::gen::polynomial); m_info.cond_num = std::pow(10, 8); - RandLAPACK::gen::mat_gen(m_info, A, state); + RandLAPACK::gen::mat_gen(m_info, A.data(), state); T* A_dat = A.data(); T* A_leading_submat_symm_dat = A_leading_submat_symm.data(); diff --git a/benchmark/bench_general/GEMM_flop_count.cc b/benchmark/bench_general/GEMM_flop_count.cc index c2b6d7b7..64b0a8a8 100644 --- a/benchmark/bench_general/GEMM_flop_count.cc +++ b/benchmark/bench_general/GEMM_flop_count.cc @@ -34,8 +34,8 @@ test_flops(int64_t k, RandBLAS::RNGState state) { T* C_dat = C.data(); RandLAPACK::gen::mat_gen_info m_info(k, k, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, A, state); - RandLAPACK::gen::mat_gen(m_info, B, state); + RandLAPACK::gen::mat_gen(m_info, A.data(), state); + RandLAPACK::gen::mat_gen(m_info, B.data(), state); // Get the timing auto start = high_resolution_clock::now(); diff --git a/test/comps/test_orth.cc b/test/comps/test_orth.cc index 255f2f14..9d4c29bd 100644 --- a/test/comps/test_orth.cc +++ b/test/comps/test_orth.cc @@ -113,7 +113,7 @@ TEST_F(TestOrth, Test_CholQRQ) RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::polynomial); m_info.cond_num = 2; m_info.rank = k; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); sketch_and_copy_computational_helper(state, all_data); test_orth_sketch(all_data, CholQRQ); diff --git a/test/comps/test_preconditioners.cc b/test/comps/test_preconditioners.cc index 4f193d2b..f9d03901 100644 --- a/test/comps/test_preconditioners.cc +++ b/test/comps/test_preconditioners.cc @@ -278,7 +278,7 @@ TEST_F(TestNystromPrecond, basictest) { mat_info.exponent = 2.0; std::vector A(m * m, 0.0); RandBLAS::RNGState data_state(0); - RandLAPACK::gen::mat_gen(mat_info, A, data_state); + RandLAPACK::gen::mat_gen(mat_info, A.data(), data_state); std::vector G(m * m, 0.0); blas::syrk(Layout::ColMajor, Uplo::Lower, Op::NoTrans, m, m, 1.0, A.data(), m, 0.0, G.data(), m diff --git a/test/comps/test_qb.cc b/test/comps/test_qb.cc index 1a1f5b70..90bdcea9 100644 --- a/test/comps/test_qb.cc +++ b/test/comps/test_qb.cc @@ -251,7 +251,7 @@ TEST_F(TestQB, Polynomial_Decay_general1) m_info.cond_num = 2025; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, (*all_data).A, state); + RandLAPACK::gen::mat_gen(m_info, (*all_data).A.data(), state); svd_and_copy_computational_helper(*all_data); test_QB2_low_exact_rank>(block_sz, tol, *all_data, *all_algs, state); @@ -283,7 +283,7 @@ TEST_F(TestQB, Polynomial_Decay_general2) m_info.cond_num = 6.7; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, (*all_data).A, state); + RandLAPACK::gen::mat_gen(m_info, (*all_data).A.data(), state); svd_and_copy_computational_helper(*all_data); test_QB2_low_exact_rank>(block_sz, tol, *all_data, *all_algs, state); @@ -315,7 +315,7 @@ TEST_F(TestQB, Polynomial_Decay_zero_tol1) m_info.cond_num = 2025; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, (*all_data).A, state); + RandLAPACK::gen::mat_gen(m_info, (*all_data).A.data(), state); double norm_A = lapack::lange(Norm::Fro, m, n, (*all_data).A.data(), m); test_QB2_k_eq_min(block_sz, tol, norm_A, *all_data, *all_algs, state); @@ -347,7 +347,7 @@ TEST_F(TestQB, Polynomial_Decay_zero_tol2) m_info.cond_num = 2025; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, (*all_data).A, state); + RandLAPACK::gen::mat_gen(m_info, (*all_data).A.data(), state); double norm_A = lapack::lange(Norm::Fro, m, n, (*all_data).A.data(), m); test_QB2_k_eq_min(block_sz, tol, norm_A, *all_data, *all_algs, state); diff --git a/test/comps/test_rf.cc b/test/comps/test_rf.cc index 754ed51c..2729ce5d 100644 --- a/test/comps/test_rf.cc +++ b/test/comps/test_rf.cc @@ -160,7 +160,7 @@ TEST_F(TestRF, Polynomial_Decay_general1) m_info.cond_num = 2025; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, (*all_data).A, state); + RandLAPACK::gen::mat_gen(m_info, (*all_data).A.data(), state); orth_and_copy_computational_helper(*all_data); @@ -190,7 +190,7 @@ TEST_F(TestRF, Polynomial_Decay_general2) m_info.cond_num = 2025; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, (*all_data).A, state); + RandLAPACK::gen::mat_gen(m_info, (*all_data).A.data(), state); orth_and_copy_computational_helper(*all_data); diff --git a/test/comps/test_syrf.cc b/test/comps/test_syrf.cc index 6fb3aa0b..e897f603 100644 --- a/test/comps/test_syrf.cc +++ b/test/comps/test_syrf.cc @@ -159,7 +159,7 @@ TEST_F(TestSYRF, Polynomial_Decay_general1) m_info.cond_num = 2025; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); algorithm_objects all_algs(verbosity, cond_check, p, passes_per_iteration); orth_and_copy_computational_helper(all_data); @@ -184,7 +184,7 @@ TEST_F(TestSYRF, Polynomial_Decay_general2) m_info.cond_num = 2025; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); algorithm_objects all_algs(verbosity, cond_check, p, passes_per_iteration); orth_and_copy_computational_helper(all_data); diff --git a/test/comps/test_util.cc b/test/comps/test_util.cc index 492387d5..6e440b6b 100644 --- a/test/comps/test_util.cc +++ b/test/comps/test_util.cc @@ -116,7 +116,7 @@ TEST_F(TestUtil, test_spectral_norm_polynomial_decay_double_precision) { m_info.cond_num = 2025; m_info.rank = n; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy.data(), m); test_spectral_norm(state, all_data); @@ -132,7 +132,7 @@ TEST_F(TestUtil, test_spectral_norm_rank_def_mat_double_precision) { RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::adverserial); m_info.scaling = std::pow(10, 15); m_info.rank = n; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy.data(), m); test_spectral_norm(state, all_data); @@ -149,7 +149,7 @@ TEST_F(TestUtil, test_spectral_norm_polynomial_decay_single_precision) { m_info.cond_num = 2; m_info.rank = n; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy.data(), m); test_spectral_norm(state, all_data); @@ -165,7 +165,7 @@ TEST_F(TestUtil, test_spectral_norm_rank_def_mat_single_precision) { RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::adverserial); m_info.scaling = std::pow(10, 7); m_info.rank = n; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy.data(), m); test_spectral_norm(state, all_data); diff --git a/test/drivers/test_cqrrp.cc b/test/drivers/test_cqrrp.cc index 2129f1c9..f928e008 100644 --- a/test/drivers/test_cqrrp.cc +++ b/test/drivers/test_cqrrp.cc @@ -160,7 +160,7 @@ TEST_F(TestCQRRP, CQRRP_blocked_full_rank_basic) { //m_info.cond_num = 2; //m_info.rank = k; //m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); norm_and_copy_computational_helper(norm_A, all_data); #if !defined(__APPLE__) @@ -189,7 +189,7 @@ TEST_F(TestCQRRP, CQRRP_blocked_full_rank_block_change) { //m_info.cond_num = 2; //m_info.rank = k; //m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); norm_and_copy_computational_helper(norm_A, all_data); #if !defined(__APPLE__) @@ -219,7 +219,7 @@ TEST_F(TestCQRRP, CQRRP_blocked_low_rank) { //m_info.cond_num = 2; //m_info.rank = k; //m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); norm_and_copy_computational_helper(norm_A, all_data); #if !defined(__APPLE__) @@ -245,9 +245,9 @@ TEST_F(TestCQRRP, something) { std::vector tau(n * 2, 0.0); RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, A, state); - RandLAPACK::gen::mat_gen(m_info, B, state); - RandLAPACK::gen::mat_gen(m_info, D, state); + RandLAPACK::gen::mat_gen(m_info, A.data(), state); + RandLAPACK::gen::mat_gen(m_info, B.data(), state); + RandLAPACK::gen::mat_gen(m_info, D.data(), state); lapack::lacpy(MatrixType::General, m, n, D.data(), m, D_cpy.data(), m); lapack::geqrf(m, n, A.data(), m, tau.data()); @@ -285,7 +285,7 @@ TEST_F(TestCQRRP, something2) { std::vector tau(n * 2, 0.0); RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, A, state); + RandLAPACK::gen::mat_gen(m_info, A.data(), state); lapack::geqr(m, n, A.data(), m, tau.data(), -1); int64_t tsize = (int64_t) t_3[0]; diff --git a/test/drivers/test_cqrrpt.cc b/test/drivers/test_cqrrpt.cc index 9a130555..1e99bb54 100644 --- a/test/drivers/test_cqrrpt.cc +++ b/test/drivers/test_cqrrpt.cc @@ -148,7 +148,7 @@ TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp) { m_info.cond_num = 2; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); norm_and_copy_computational_helper(norm_A, all_data); test_CQRRPT_general>(d_factor, norm_A, all_data, CQRRPT, state); @@ -173,7 +173,7 @@ TEST_F(TestCQRRPT, CQRRPT_low_rank_with_hqrrp) { m_info.cond_num = 2; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); norm_and_copy_computational_helper(norm_A, all_data); test_CQRRPT_general>(d_factor, norm_A, all_data, CQRRPT, state); @@ -198,7 +198,7 @@ TEST_F(TestCQRRPT, CQRRPT_bad_orth) { RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::adverserial); m_info.scaling = 1e7; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); norm_and_copy_computational_helper(norm_A, all_data); test_CQRRPT_general>(d_factor, norm_A, all_data, CQRRPT, state); diff --git a/test/drivers/test_hqrrp.cc b/test/drivers/test_hqrrp.cc index c32913d9..f08e6ee2 100644 --- a/test/drivers/test_hqrrp.cc +++ b/test/drivers/test_hqrrp.cc @@ -159,7 +159,7 @@ TEST_F(TestHQRRP, HQRRP_full_rank_cholqr) { m_info.cond_num = 2; m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); norm_and_copy_computational_helper(norm_A, all_data); // This test uses orhr_col diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc index 536db2f6..80e2358d 100644 --- a/test/drivers/test_rbki.cc +++ b/test/drivers/test_rbki.cc @@ -98,7 +98,7 @@ TEST_F(TestRBKI, RBKI_basic) { RandLAPACK::RBKI RBKI(false, false, tol); RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); norm_and_copy_computational_helper(norm_A, all_data); test_RBKI_general>(norm_A, all_data, RBKI, state); diff --git a/test/drivers/test_revd2.cc b/test/drivers/test_revd2.cc index f8e2a9e3..c0c184a6 100644 --- a/test/drivers/test_revd2.cc +++ b/test/drivers/test_revd2.cc @@ -249,7 +249,7 @@ TEST_F(TestREVD2, Underestimation1) { m_info.cond_num = std::pow(10, 8); m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A_cpy, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A_cpy.data(), state); symm_mat_and_copy_computational_helper(norm_A, all_data); test_REVD2_general( @@ -287,7 +287,7 @@ TEST_F(TestREVD2, Underestimation2) { m_info.cond_num = std::pow(10, 8); m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A_cpy, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A_cpy.data(), state); symm_mat_and_copy_computational_helper(norm_A, all_data); test_REVD2_general( @@ -325,7 +325,7 @@ TEST_F(TestREVD2, Overestimation1) { m_info.cond_num = std::pow(10, 2); m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A_cpy, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A_cpy.data(), state); symm_mat_and_copy_computational_helper(norm_A, all_data); test_REVD2_general( @@ -363,7 +363,7 @@ TEST_F(TestREVD2, Oversetimation2) { m_info.cond_num = std::pow(10, 2); m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A_cpy, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A_cpy.data(), state); symm_mat_and_copy_computational_helper(norm_A, all_data); test_REVD2_general( @@ -401,7 +401,7 @@ TEST_F(TestREVD2, Exactness) { m_info.cond_num = std::pow(10, 2); m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.A_cpy, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A_cpy.data(), state); symm_mat_and_copy_computational_helper(norm_A, all_data); test_REVD2_general( @@ -437,7 +437,7 @@ TEST_F(TestREVD2, Uplo) { m_info.cond_num = std::pow(10, 2); m_info.rank = k; m_info.exponent = 2.0; - RandLAPACK::gen::mat_gen(m_info, all_data.work, state); + RandLAPACK::gen::mat_gen(m_info, all_data.work.data(), state); uplo_computational_helper(all_data); diff --git a/test/drivers/test_rsvd.cc b/test/drivers/test_rsvd.cc index a95e7ec8..39ba1609 100644 --- a/test/drivers/test_rsvd.cc +++ b/test/drivers/test_rsvd.cc @@ -182,7 +182,7 @@ TEST_F(TestRSVD, SimpleTest) RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::polynomial); m_info.cond_num = 2; m_info.rank = k; - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); computational_helper(all_data); test_RSVD1_general(tol, all_data, all_algs, state); From 583d8a6fa24bab8df1b7c1ac1829b1d92080acc8 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 20 Nov 2023 13:32:21 -0800 Subject: [PATCH 15/56] Ready for RBKI benchmarking --- RandLAPACK/misc/rl_gen.hh | 11 ++++++++--- benchmark/bench_RBKI/RBKI_speed_comparisons.cc | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 494609b4..4f866b08 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -452,13 +452,18 @@ void process_input_mat( workspace_query_mod = 0; } else { double value; + int i, j; // Read input file std::ifstream inputMat(filename); // Place the contents of a file into the matrix space. - int i = -1; - while (inputMat >> value){ - A[++i] = value; + // Matrix is input in a row-major order, we process data in column-major. + // Reads here are, unfortunately, sequential; + for(j = 0; j < m; ++j) { + for(i = 0; i < n; ++i) { + inputMat >> value; + A[m * i + j] = value; + } } } } diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index c1a3b9b5..ffdeeadc 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -211,7 +211,7 @@ int main(int argc, char *argv[]) { RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); printf("rows %ld, cols %ld\n", m_info.rows, m_info.cols); - printf("%e\n", *(all_data.A.data() + 1)); + printf("%e\n", *(all_data.A.data() + m)); /* // Declare a data file std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) From 3f4430e4c3958249ae684c5dc59f883715feff41 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 21 Nov 2023 13:58:14 -0800 Subject: [PATCH 16/56] Faced an issue with accuracy. Need to check Rob's implementation. --- RandLAPACK/drivers/rl_rbki.hh | 5 +- .../bench_RBKI/RBKI_speed_comparisons.cc | 168 ++++++++---------- test/drivers/test_cqrrp.cc | 21 --- 3 files changed, 79 insertions(+), 115 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 71f8bd53..759ff69d 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -79,7 +79,6 @@ int RBKI::call( int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0; T norm_R = 0; int64_t space_rows = k * std::ceil(m / (T) k); - printf("%ld\n", space_rows); // We need a full copy of X and Y all the way through the algorithm // due to an operation with X_odd and Y_odd happening at the end. @@ -160,7 +159,7 @@ int RBKI::call( // Early termination // if (abs(R(end)) <= sqrt(eps('double'))) if(std::abs(R_ii[(n + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { - printf("TERMINATION 1 at iteration %ld\n", iter_ev); + //printf("TERMINATION 1 at iteration %ld\n", iter_ev); break; } @@ -195,7 +194,7 @@ int RBKI::call( // Early termination // if (abs(S(end)) <= sqrt(eps('double'))) if(std::abs(S_ii[((n + k) + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { - printf("TERMINATION 2 at iteration %ld\n", iter_od); + //printf("TERMINATION 2 at iteration %ld\n", iter_od); break; } diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index ffdeeadc..00f9061d 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -16,16 +16,16 @@ struct RBKI_benchmark_data { std::vector U; std::vector V; std::vector Sigma; - std::vector A_cpy; - std::vector Sigma_exact; + std::vector Sigma_cpy_1; + std::vector Sigma_cpy_2; RBKI_benchmark_data(int64_t m, int64_t n, int64_t k, T tol) : A(m * n, 0.0), U(m * n, 0.0), V(n * n, 0.0), Sigma(n, 0.0), - A_cpy(m * n, 0.0), - Sigma_exact(n, 0.0) + Sigma_cpy_1(n, 0.0), + Sigma_cpy_2(n, 0.0) { row = m; col = n; @@ -40,20 +40,35 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, RBKI_benchmark_data &all_data, RandBLAS::RNGState &state) { - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); std::fill(all_data.U.begin(), all_data.U.end(), 0.0); std::fill(all_data.V.begin(), all_data.V.end(), 0.0); std::fill(all_data.Sigma.begin(), all_data.Sigma.end(), 0.0); } +template +static void update_best_time(int iter, long &t_best, long &t_curr, int accuracy_check, T* S1, T* S2, int64_t k) +{ + // Can also do this is one line + // i == 0 ? (void) (t_rbki_best = dur_rbki, accuracy_check ? blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_1.data(), 1): (void) NULL) : (dur_rbki < t_rbki_best) ? ((void) (t_rbki_best = dur_rbki), accuracy_check ? blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_1.data(), 1): (void) NULL) : (void) NULL; + if (iter == 0 || t_curr < t_best) { + t_best = t_curr; + if (accuracy_check) + blas::copy(k, S1, 1, S2, 1); + } +} + + template static std::vector call_all_algs( RandLAPACK::gen::mat_gen_info m_info, int64_t numruns, int64_t k, RBKI_benchmark_data &all_data, - RandBLAS::RNGState &state) { + RandBLAS::RNGState &state, + int accuracy_check) { + int i, j; auto m = all_data.row; auto n = all_data.col; auto tol = all_data.tolerance; @@ -69,9 +84,9 @@ static std::vector call_all_algs( // Making sure the states are unchanged auto state_gen = state; - auto state_alg = state; + //auto state_alg = state; - for (int i = 0; i < numruns; ++i) { + for (i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); // Testing RBKI @@ -80,20 +95,33 @@ static std::vector call_all_algs( auto stop_rbki = high_resolution_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); + // Update best timing and save the singular values. + update_best_time(i, t_rbki_best, dur_rbki, accuracy_check, all_data.Sigma.data(), all_data.Sigma_cpy_1.data(), k); + state_gen = state; data_regen(m_info, all_data, state_gen); - // Testing Other + // Testing Other - SVD auto start_other = high_resolution_clock::now(); - /// RIVAL ALGORITHM CALL + lapack::gesdd(Job::NoVec, m, n, all_data.A.data(), m, all_data.Sigma.data(), all_data.U.data(), m, all_data.V.data(), n); auto stop_other = high_resolution_clock::now(); dur_other = duration_cast(stop_other - start_other).count(); + if (accuracy_check) + blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_2.data(), 1); + + // Update best timing and save the singular values. + update_best_time(i, t_other_best, dur_other, accuracy_check, all_data.Sigma.data(), all_data.Sigma_cpy_2.data(), k); + state_gen = state; data_regen(m_info, all_data, state_gen); - - i == 0 ? t_rbki_best = dur_rbki : (dur_rbki < t_rbki_best) ? t_rbki_best = dur_rbki : NULL; - i == 0 ? t_other_best = dur_other : (dur_other < t_other_best) ? t_other_best = dur_other : NULL; + } + + if (accuracy_check) { + printf("%.16e\n", all_data.Sigma_cpy_1[0]); + for(j = 0; j < k; ++j) {all_data.Sigma_cpy_1[j] -= all_data.Sigma_cpy_2[j];} + T nrm_err_sigma = blas::nrm2(k, all_data.Sigma_cpy_1.data(), 1); + printf("||A_hat_rbki - A_hat_svd||_F: %.16e\n", nrm_err_sigma); } std::vector res{t_rbki_best, t_other_best}; @@ -101,45 +129,57 @@ static std::vector call_all_algs( return res; } -/* -int main() { - // Declare parameters - int64_t m = std::pow(10, 3); - int64_t n = std::pow(10, 3); - int64_t k_start = 100; - int64_t k_stop = 100; +int main(int argc, char *argv[]) { + + if(argc <= 1) + // No input + return 0; + + int64_t m = 0; + int64_t n = 0; + int64_t k_start = 0; + int64_t k_stop = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; - // Timing results + int numruns = 1; + int accuracy_check = 1; std::vector res; - // Number of algorithm runs. We only record best times. - int64_t numruns = 5; - // Allocate basic workspace - RBKI_benchmark_data all_data(m, n, k_stop, tol); + // Generate the input matrix. + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::custom_input); + m_info.filename = argv[1]; + m_info.workspace_query_mod = 1; + // Workspace query; + RandLAPACK::gen::mat_gen(m_info, NULL, state); - // Generate the input matrix - gaussian suffices for performance tests. - RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + // Update basic params. + m = m_info.rows; + n = m_info.cols; + k_start = std::max((int64_t) 1, n / 100); + k_stop = std::max((int64_t) 1, n / 100); + + // Allocate basic workspace. + RBKI_benchmark_data all_data(m, n, k_stop, tol); + + // Fill the data matrix; + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); // Declare a data file std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) + "_n_" + std::to_string(n) + "_k_start_" + std::to_string(k_start) + "_k_stop_" + std::to_string(k_stop) - + ".dat", std::fstream::app); + + ".dat", std::fstream::app); for (;k_start <= k_stop; k_start *= 2) { - res = call_all_algs(m_info, numruns, k_start, all_data, state_constant); + res = call_all_algs(m_info, numruns, k_start, all_data, state_constant, accuracy_check); file << res[0] << ", " << res[1] << ",\n"; } } -*/ /* - -int main(int argc, char *argv[]) { +int main() { // Declare parameters int64_t m = std::pow(10, 3); int64_t n = std::pow(10, 3); @@ -157,73 +197,19 @@ int main(int argc, char *argv[]) { RBKI_benchmark_data all_data(m, n, k_stop, tol); // Generate the input matrix - gaussian suffices for performance tests. - RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::custom_input); - - m_info.filename = argv[1]; - + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); RandLAPACK::gen::mat_gen(m_info, all_data.A, state); - printf("rows %ld, cols %ld\n", m_info.rows, m_info.cols); - // Declare a data file std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) + "_n_" + std::to_string(n) + "_k_start_" + std::to_string(k_start) + "_k_stop_" + std::to_string(k_stop) - + ".dat", std::fstream::app); - for (;k_start <= k_stop; k_start *= 2) { - res = call_all_algs(m_info, numruns, k_start, all_data, state_constant); - file << res[0] << ", " << res[1] << ",\n"; - } -} -*/ - - -int main(int argc, char *argv[]) { - - int64_t m = 0; - int64_t n = 0; - int64_t k_start = 0; - int64_t k_stop = 0; - double tol = std::pow(std::numeric_limits::epsilon(), 0.85); - auto state = RandBLAS::RNGState(); - - // Generate the input matrix. - RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::custom_input); - m_info.filename = argv[1]; - m_info.workspace_query_mod = 1; - // Workspace query; - RandLAPACK::gen::mat_gen(m_info, NULL, state); - - // Update basic params. - m = m_info.rows; - n = m_info.cols; - k_start = std::max((int64_t) 1, n / 100); - k_stop = n; - - // Allocate basic workspace. - RBKI_benchmark_data all_data(m, n, k_stop, tol); - - - printf("%d\n", all_data.A.size()); - - // Fill the data matrix; - RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); - - printf("rows %ld, cols %ld\n", m_info.rows, m_info.cols); - printf("%e\n", *(all_data.A.data() + m)); -/* - // Declare a data file - std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) - + "_n_" + std::to_string(n) - + "_k_start_" + std::to_string(k_start) - + "_k_stop_" + std::to_string(k_stop) - + ".dat", std::fstream::app); - + + ".dat", std::fstream::app); for (;k_start <= k_stop; k_start *= 2) { res = call_all_algs(m_info, numruns, k_start, all_data, state_constant); file << res[0] << ", " << res[1] << ",\n"; } -*/ -} \ No newline at end of file +} +*/ \ No newline at end of file diff --git a/test/drivers/test_cqrrp.cc b/test/drivers/test_cqrrp.cc index f928e008..88a019c7 100644 --- a/test/drivers/test_cqrrp.cc +++ b/test/drivers/test_cqrrp.cc @@ -273,24 +273,3 @@ TEST_F(TestCQRRP, something) { char name1 [] = "D through gemm"; RandBLAS::util::print_colmaj(m, n, D_space.data(), name1); } - -/* -// Note: If Subprocess killed exception -> reload vscode -TEST_F(TestCQRRP, something2) { - int64_t m = 10; - int64_t n = 5; - auto state = RandBLAS::RNGState(); - - std::vector A(m * n, 0.0); - std::vector tau(n * 2, 0.0); - - RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, A.data(), state); - - lapack::geqr(m, n, A.data(), m, tau.data(), -1); - int64_t tsize = (int64_t) t_3[0]; - t_3.resize(tsize); - auto sart_geqr = high_resolution_clock::now(); - lapack::geqr(m, n, A_1.data(), m, t_3.data(), tsize); -} -*/ \ No newline at end of file From de2ca8dd6476bf9caf08c4c08afd7fb242efdb6b Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 28 Nov 2023 13:06:06 -0800 Subject: [PATCH 17/56] Tuned the benchmark for dataset 1 --- RandLAPACK/drivers/rl_cqrrp.hh | 35 ++--- RandLAPACK/drivers/rl_rbki.hh | 62 ++++++++- .../bench_RBKI/RBKI_speed_comparisons.cc | 130 ++++++++++++------ 3 files changed, 156 insertions(+), 71 deletions(-) diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh index 765df493..330b1ca7 100644 --- a/RandLAPACK/drivers/rl_cqrrp.hh +++ b/RandLAPACK/drivers/rl_cqrrp.hh @@ -117,7 +117,7 @@ class CQRRP_blocked : public CQRRPalg { int64_t rank; int64_t block_size; - // 11 entries - logs time for different portions of the algorithm + // 12 entries - logs time for different portions of the algorithm std::vector times; // Times each iteration of the algorithm, divides size of a processed matrix by the time it took to process. // At each iteration, the algorithm will process rows by b_sz matrix; rows -= b_sz. @@ -145,50 +145,39 @@ int CQRRP_blocked::call( //-------TIMING VARS--------/ high_resolution_clock::time_point preallocation_t_stop; high_resolution_clock::time_point preallocation_t_start; - long preallocation_t_dur = 0; - high_resolution_clock::time_point saso_t_stop; high_resolution_clock::time_point saso_t_start; - long saso_t_dur = 0; - high_resolution_clock::time_point qrcp_t_start; high_resolution_clock::time_point qrcp_t_stop; - long qrcp_t_dur = 0; - high_resolution_clock::time_point cholqr_t_start; high_resolution_clock::time_point cholqr_t_stop; - long cholqr_t_dur = 0; - high_resolution_clock::time_point reconstruction_t_start; high_resolution_clock::time_point reconstruction_t_stop; - long reconstruction_t_dur = 0; - high_resolution_clock::time_point preconditioning_t_start; high_resolution_clock::time_point preconditioning_t_stop; - long preconditioning_t_dur = 0; - high_resolution_clock::time_point r_piv_t_start; high_resolution_clock::time_point r_piv_t_stop; - long r_piv_t_dur = 0; - high_resolution_clock::time_point updating1_t_start; high_resolution_clock::time_point updating1_t_stop; - long updating1_t_dur = 0; - high_resolution_clock::time_point updating2_t_start; high_resolution_clock::time_point updating2_t_stop; - long updating2_t_dur = 0; - high_resolution_clock::time_point updating3_t_start; high_resolution_clock::time_point updating3_t_stop; - long updating3_t_dur = 0; - high_resolution_clock::time_point total_t_start; high_resolution_clock::time_point total_t_stop; - long total_t_dur = 0; - high_resolution_clock::time_point iter_t_start; high_resolution_clock::time_point iter_t_stop; + long preallocation_t_dur = 0; + long saso_t_dur = 0; + long qrcp_t_dur = 0; + long cholqr_t_dur = 0; + long reconstruction_t_dur = 0; + long preconditioning_t_dur = 0; + long r_piv_t_dur = 0; + long updating1_t_dur = 0; + long updating2_t_dur = 0; + long updating3_t_dur = 0; + long total_t_dur = 0; if(this -> timing) { total_t_start = high_resolution_clock::now(); diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 759ff69d..2edfe1ca 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -11,6 +11,7 @@ #include #include #include +#include using namespace std::chrono; @@ -44,6 +45,7 @@ class RBKI : public RBKIalg { verbosity = verb; timing = time_subroutines; tol = ep; + max_krylov_iters = INT_MAX; } int call( int64_t m, @@ -61,6 +63,9 @@ class RBKI : public RBKIalg { bool timing; T tol; int num_krylov_iters; + int max_krylov_iters; + std::vector times; + T norm_R_end; }; // ----------------------------------------------------------------------------- @@ -76,6 +81,20 @@ int RBKI::call( T* Sigma, RandBLAS::RNGState &state ){ + + high_resolution_clock::time_point preallocation_t_start; + high_resolution_clock::time_point preallocation_t_stop; + high_resolution_clock::time_point total_t_start; + high_resolution_clock::time_point total_t_stop; + + long preallocation_t_dur = 0; + long total_t_dur = 0; + + if(this -> timing) { + total_t_start = high_resolution_clock::now(); + preallocation_t_start = high_resolution_clock::now(); + } + int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0; T norm_R = 0; int64_t space_rows = k * std::ceil(m / (T) k); @@ -83,17 +102,20 @@ int RBKI::call( // We need a full copy of X and Y all the way through the algorithm // due to an operation with X_odd and Y_odd happening at the end. // Space for Y_i and Y_odd. - T* Y = ( T * ) calloc( n * m, sizeof( T ) ); + T* Y = ( T * ) calloc( n * m, sizeof( T ) ); // Space for X_i and X_ev. (maybe needs to be m by m + k) T* X = ( T * ) calloc( m * (m + k), sizeof( T ) ); // tau space for QR - T* tau = ( T * ) calloc( k, sizeof( T ) ); + T* tau = ( T * ) calloc( k, sizeof( T ) ); // While R and S matrices are structured (both band), we cannot make use of this structure through // BLAS-level functions. // Note also that we store a transposed version of R. - T* R = ( T * ) calloc( n * n, sizeof( T ) ); + T* R = ( T * ) calloc( n * n, sizeof( T ) ); T* S = ( T * ) calloc( (n + k) * n, sizeof( T ) ); + T* Y_orth_buf = ( T * ) calloc( k * n, sizeof( T ) ); + T* X_orth_buf = ( T * ) calloc( k * (n + k), sizeof( T ) ); + // Pointers allocation // Below pointers will be offset by (n or m) * k at every even iteration. T* Y_i = Y; @@ -110,6 +132,11 @@ int RBKI::call( T* U_hat = NULL; T* VT_hat = NULL; + if(this -> timing) { + preallocation_t_stop = high_resolution_clock::now(); + preallocation_t_dur = duration_cast(preallocation_t_stop - preallocation_t_start).count(); + } + // Pre-conpute Fro norm of an input matrix. T norm_A = lapack::lange(Norm::Fro, m, n, A, lda); T sq_tol = std::pow(this->tol, 2); @@ -129,7 +156,7 @@ int RBKI::call( ++iter_od; // Iterate until in-loop termination criteria is met. - while(1) { + while((iter_ev + iter_od) < max_krylov_iters) { if (iter % 2 == 0) { // Y_i = A' * X_i blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, k, m, 1.0, A, m, X_i, m, 0.0, Y_i, n); @@ -142,6 +169,10 @@ int RBKI::call( blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, k, iter_ev * k, n, 1.0, Y_i, n, Y_od, n, 0.0, R_i, n); // Y_i = Y_i - Y_od * R_i blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, R_i, n, 1.0, Y_i, n); + + // Reorthogonalization + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, k, iter_ev * k, n, 1.0, Y_i, n, Y_od, n, 0.0, Y_orth_buf, k); + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, Y_orth_buf, k, 1.0, Y_i, n); } // [Y_i, R_ii] = qr(Y_i, 0) @@ -181,6 +212,10 @@ int RBKI::call( blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, S_i, n + k); //X_i = X_i - X_ev * S_i; blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, S_i, n + k, 1.0, X_i, m); + + // Reorthogonalization + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, X_orth_buf, n + k); + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, X_orth_buf, n + k, 1.0, X_i, m); // [X_i, S_ii] = qr(X_i, 0); std::fill(&tau[0], &tau[k], 0.0); @@ -206,13 +241,14 @@ int RBKI::call( } ++iter; norm_R = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, n, n, R, n); - + //norm(R, 'fro') > sqrt(1 - sq_tol) * norm_A if(norm_R > threshold) { break; } } + this -> norm_R_end = norm_R; this->num_krylov_iters = iter; iter % 2 == 0 ? end_rows = k * (iter_ev + 1), end_cols = k * iter_ev : end_rows = k * (iter_od + 1), end_cols = k * iter_od; @@ -241,6 +277,22 @@ int RBKI::call( free(S); free(U_hat); free(VT_hat); + free(Y_orth_buf); + free(X_orth_buf); + + if(this -> timing) { + total_t_stop = high_resolution_clock::now(); + total_t_dur = duration_cast(total_t_stop - total_t_start).count(); + long t_rest = total_t_dur - (preallocation_t_dur); + this -> times.resize(3); + this -> times = {preallocation_t_dur, t_rest, total_t_dur}; + + printf("\n\n/------------CQRRP TIMING RESULTS BEGIN------------/\n"); + printf("Preallocation time: %25ld μs,\n", preallocation_t_dur); + + printf("\nPreallocation takes %22.2f%% of runtime.\n", 100 * ((T) preallocation_t_dur / (T) total_t_dur)); + printf("/-------------CQRRP TIMING RESULTS END-------------/\n\n"); + } return 0; } diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 00f9061d..cfec8ccc 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -18,6 +18,7 @@ struct RBKI_benchmark_data { std::vector Sigma; std::vector Sigma_cpy_1; std::vector Sigma_cpy_2; + std::vector Sigma_cpy_3; RBKI_benchmark_data(int64_t m, int64_t n, int64_t k, T tol) : A(m * n, 0.0), @@ -25,7 +26,8 @@ struct RBKI_benchmark_data { V(n * n, 0.0), Sigma(n, 0.0), Sigma_cpy_1(n, 0.0), - Sigma_cpy_2(n, 0.0) + Sigma_cpy_2(n, 0.0), + Sigma_cpy_3(n, 0.0) { row = m; col = n; @@ -47,40 +49,58 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, } template -static void update_best_time(int iter, long &t_best, long &t_curr, int accuracy_check, T* S1, T* S2, int64_t k) +static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, int64_t k) { // Can also do this is one line // i == 0 ? (void) (t_rbki_best = dur_rbki, accuracy_check ? blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_1.data(), 1): (void) NULL) : (dur_rbki < t_rbki_best) ? ((void) (t_rbki_best = dur_rbki), accuracy_check ? blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_1.data(), 1): (void) NULL) : (void) NULL; if (iter == 0 || t_curr < t_best) { t_best = t_curr; - if (accuracy_check) - blas::copy(k, S1, 1, S2, 1); + blas::copy(k, S1, 1, S2, 1); } } - +/* +template +static void svd_error(T* U1, T* S1, T* VT1, T* U2, T* S2, T* VT2) +{ + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); + +} +*/ template -static std::vector call_all_algs( +static void call_all_algs( RandLAPACK::gen::mat_gen_info m_info, int64_t numruns, int64_t k, + int64_t num_krylov_iters, RBKI_benchmark_data &all_data, RandBLAS::RNGState &state, - int accuracy_check) { + std::string output_filename) { int i, j; auto m = all_data.row; auto n = all_data.col; auto tol = all_data.tolerance; + T norm_svd; + T err_rbki; + T err_lan; + + // Set the threshold for Lanchosz + // Setting up Lanchosz - RBKI with k = 1. + RandLAPACK::RBKI Lanchosz(false, false, tol); + //Lanchosz.max_krylov_iters = 1500; // Additional params setup. RandLAPACK::RBKI RBKI(false, false, tol); + RBKI.max_krylov_iters = num_krylov_iters; // timing vars - long dur_rbki = 0; - long dur_other = 0; - long t_rbki_best = 0; - long t_other_best = 0; + long dur_rbki = 0; + long dur_other = 0; + long dur_lanchosz = 0; + long t_rbki_best = 0; + long t_other_best = 0; + long t_lanchosz_best = 0; // Making sure the states are unchanged auto state_gen = state; @@ -88,6 +108,18 @@ static std::vector call_all_algs( for (i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); + + // Testing Lanchosz + auto start_lanchosz = high_resolution_clock::now(); + Lanchosz.call(m, n, all_data.A.data(), m, 1, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); + auto stop_lanchosz = high_resolution_clock::now(); + dur_lanchosz = duration_cast(stop_lanchosz - start_lanchosz).count(); + + // Update best timing and save the singular values. + update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_3.data(), 1); + + state_gen = state; + data_regen(m_info, all_data, state_gen); // Testing RBKI auto start_rbki = high_resolution_clock::now(); @@ -96,7 +128,7 @@ static std::vector call_all_algs( dur_rbki = duration_cast(stop_rbki - start_rbki).count(); // Update best timing and save the singular values. - update_best_time(i, t_rbki_best, dur_rbki, accuracy_check, all_data.Sigma.data(), all_data.Sigma_cpy_1.data(), k); + update_best_time(i, t_rbki_best, dur_rbki, all_data.Sigma.data(), all_data.Sigma_cpy_1.data(), k); state_gen = state; data_regen(m_info, all_data, state_gen); @@ -107,26 +139,32 @@ static std::vector call_all_algs( auto stop_other = high_resolution_clock::now(); dur_other = duration_cast(stop_other - start_other).count(); - if (accuracy_check) - blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_2.data(), 1); + blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_2.data(), 1); // Update best timing and save the singular values. - update_best_time(i, t_other_best, dur_other, accuracy_check, all_data.Sigma.data(), all_data.Sigma_cpy_2.data(), k); + update_best_time(i, t_other_best, dur_other, all_data.Sigma.data(), all_data.Sigma_cpy_2.data(), k); state_gen = state; data_regen(m_info, all_data, state_gen); } - if (accuracy_check) { - printf("%.16e\n", all_data.Sigma_cpy_1[0]); - for(j = 0; j < k; ++j) {all_data.Sigma_cpy_1[j] -= all_data.Sigma_cpy_2[j];} - T nrm_err_sigma = blas::nrm2(k, all_data.Sigma_cpy_1.data(), 1); - printf("||A_hat_rbki - A_hat_svd||_F: %.16e\n", nrm_err_sigma); - } + for(j = 0; j < k; ++j) { + all_data.Sigma_cpy_1[j] -= all_data.Sigma_cpy_2[j]; + all_data.Sigma_cpy_3[j] -= all_data.Sigma_cpy_2[j]; + } + norm_svd = blas::nrm2(k, all_data.Sigma_cpy_2.data(), 1); + err_rbki = blas::nrm2(k, all_data.Sigma_cpy_1.data(), 1) / norm_svd; + err_lan = blas::nrm2(k, all_data.Sigma_cpy_3.data(), 1) / norm_svd; + + // Print accuracy info + printf("||Sigma_ksvd - Sigma_rbki||_F / ||Sigma_ksvd||_F: %.16e\n", err_rbki); + printf("||Sigma_ksvd - Sigma_lanc||_F / ||Sigma_lanc||_F: %.16e\n", err_lan); - std::vector res{t_rbki_best, t_other_best}; + printf("RBKI is %f times faster that SVD.\n", (T) t_other_best / t_rbki_best); + printf("Lanchosz is %f times faster that SVD.\n", (T) t_other_best / t_lanchosz_best); - return res; + std::ofstream file(output_filename, std::ios::app); + file << k << ", " << num_krylov_iters << ", " << err_rbki << ", " << err_lan << ", " << t_rbki_best << ", " << t_other_best << ", " << t_lanchosz_best << ",\n"; } int main(int argc, char *argv[]) { @@ -135,15 +173,17 @@ int main(int argc, char *argv[]) { // No input return 0; - int64_t m = 0; - int64_t n = 0; - int64_t k_start = 0; - int64_t k_stop = 0; - double tol = std::pow(std::numeric_limits::epsilon(), 0.85); - auto state = RandBLAS::RNGState(); - auto state_constant = state; - int numruns = 1; - int accuracy_check = 1; + int64_t m = 0; + int64_t n = 0; + int64_t k_start = 0; + int64_t k_stop = 0; + int64_t num_krylov_iters_start = 2; + int64_t num_krylov_iters_curr = num_krylov_iters_start; + int64_t num_krylov_iters_stop = 2048; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + auto state_constant = state; + int numruns = 1; std::vector res; // Generate the input matrix. @@ -156,8 +196,8 @@ int main(int argc, char *argv[]) { // Update basic params. m = m_info.rows; n = m_info.cols; - k_start = std::max((int64_t) 1, n / 100); - k_stop = std::max((int64_t) 1, n / 100); + k_start = 2;//std::max((int64_t) 1, n / 10); + k_stop = n;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. RBKI_benchmark_data all_data(m, n, k_stop, tol); @@ -166,15 +206,19 @@ int main(int argc, char *argv[]) { RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); // Declare a data file - std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) - + "_n_" + std::to_string(n) - + "_k_start_" + std::to_string(k_start) - + "_k_stop_" + std::to_string(k_stop) - + ".dat", std::fstream::app); - - for (;k_start <= k_stop; k_start *= 2) { - res = call_all_algs(m_info, numruns, k_start, all_data, state_constant, accuracy_check); - file << res[0] << ", " << res[1] << ",\n"; + std::string output_filename = "RBKI_speed_comp_m_" + std::to_string(m) + + "_n_" + std::to_string(n) + + "_k_start_" + std::to_string(k_start) + + "_k_stop_" + std::to_string(k_stop) + + "_num_krylov_iters_start_" + std::to_string(num_krylov_iters_start) + + "_num_krylov_iters_stop_" + std::to_string(num_krylov_iters_stop) + + ".dat"; + + for (;k_start <= k_stop; k_start *=2) { + for (;num_krylov_iters_curr <= num_krylov_iters_stop; num_krylov_iters_curr *=2) { + call_all_algs(m_info, numruns, k_start, num_krylov_iters_curr, all_data, state_constant, output_filename); + } + num_krylov_iters_curr = num_krylov_iters_start; } } From 5ee730fa6a7a137a3307e2983f0b9e249283cf84 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Thu, 7 Dec 2023 07:44:51 -0800 Subject: [PATCH 18/56] Benchmark fux --- RandLAPACK/drivers/rl_hqrrp.hh | 10 ++++----- .../bench_RBKI/RBKI_speed_comparisons.cc | 22 ++++++++++++------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/RandLAPACK/drivers/rl_hqrrp.hh b/RandLAPACK/drivers/rl_hqrrp.hh index 1dd6f2d3..8a568376 100644 --- a/RandLAPACK/drivers/rl_hqrrp.hh +++ b/RandLAPACK/drivers/rl_hqrrp.hh @@ -98,7 +98,7 @@ void _LAPACK_lafrb( & m_, & n_, & k_, (double *) buff_U, & ldim_U, (double *) buff_T, & ldim_T, (double *) buff_B, & ldim_B, (double *) buff_W, & ldim_W #ifdef LAPACK_FORTRAN_STRLEN_END - , 1, 1, 1, 1 + //, 1, 1, 1, 1 #endif ); } else if (typeid(T) == typeid(float)) { @@ -106,7 +106,7 @@ void _LAPACK_lafrb( & m_, & n_, & k_, (float *) buff_U, & ldim_U, (float *) buff_T, & ldim_T, (float *) buff_B, & ldim_B, (float *) buff_W, & ldim_W #ifdef LAPACK_FORTRAN_STRLEN_END - , 1, 1, 1, 1 + //, 1, 1, 1, 1 #endif ); } else { @@ -136,7 +136,7 @@ void _LAPACK_larf( (double *) C, & ldc_, (double *) work #ifdef LAPACK_FORTRAN_STRLEN_END - , 1 + //, 1 #endif ); } else if (typeid(T) == typeid(float)) { @@ -146,7 +146,7 @@ void _LAPACK_larf( (float *) C, & ldc_, (float *) work #ifdef LAPACK_FORTRAN_STRLEN_END - , 1 + //, 1 #endif ); } else { @@ -398,7 +398,7 @@ static int64_t NoFLA_QRP_downdate_partial_norms( char dlmach_param = 'E'; tol3z = sqrt( LAPACK_dlamch( & dlmach_param #ifdef LAPACK_FORTRAN_STRLEN_END - , 1 + //, 1 #endif ) ); ptr_d = buff_d; diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index cfec8ccc..568f42e0 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -81,9 +81,11 @@ static void call_all_algs( auto m = all_data.row; auto n = all_data.col; auto tol = all_data.tolerance; - T norm_svd; + T norm_svd_k; + T norm_svd_lanc; T err_rbki; T err_lan; + int64_t k_lanc = std::min((int64_t) (num_krylov_iters / (T) 2), k); // Set the threshold for Lanchosz // Setting up Lanchosz - RBKI with k = 1. @@ -116,7 +118,7 @@ static void call_all_algs( dur_lanchosz = duration_cast(stop_lanchosz - start_lanchosz).count(); // Update best timing and save the singular values. - update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_3.data(), 1); + update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_3.data(), k_lanc); state_gen = state; data_regen(m_info, all_data, state_gen); @@ -148,13 +150,17 @@ static void call_all_algs( data_regen(m_info, all_data, state_gen); } - for(j = 0; j < k; ++j) { + for(j = 0; j < k; ++j) all_data.Sigma_cpy_1[j] -= all_data.Sigma_cpy_2[j]; + + for(j = 0; j < k_lanc; ++j) all_data.Sigma_cpy_3[j] -= all_data.Sigma_cpy_2[j]; - } - norm_svd = blas::nrm2(k, all_data.Sigma_cpy_2.data(), 1); - err_rbki = blas::nrm2(k, all_data.Sigma_cpy_1.data(), 1) / norm_svd; - err_lan = blas::nrm2(k, all_data.Sigma_cpy_3.data(), 1) / norm_svd; + + norm_svd_k = blas::nrm2(k, all_data.Sigma_cpy_2.data(), 1); + norm_svd_lanc = blas::nrm2(k_lanc, all_data.Sigma_cpy_2.data(), 1); + + err_rbki = blas::nrm2(k, all_data.Sigma_cpy_1.data(), 1) / norm_svd_k; + err_lan = blas::nrm2(k_lanc, all_data.Sigma_cpy_3.data(), 1) / norm_svd_lanc; // Print accuracy info printf("||Sigma_ksvd - Sigma_rbki||_F / ||Sigma_ksvd||_F: %.16e\n", err_rbki); @@ -179,7 +185,7 @@ int main(int argc, char *argv[]) { int64_t k_stop = 0; int64_t num_krylov_iters_start = 2; int64_t num_krylov_iters_curr = num_krylov_iters_start; - int64_t num_krylov_iters_stop = 2048; + int64_t num_krylov_iters_stop = 64; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; From 77a44bcde6622da18cd9e2a2a6b33860663b21d7 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Thu, 7 Dec 2023 11:58:17 -0800 Subject: [PATCH 19/56] Update --- benchmark/bench_RBKI/RBKI_speed_comparisons.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 568f42e0..9e2152ff 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -91,6 +91,7 @@ static void call_all_algs( // Setting up Lanchosz - RBKI with k = 1. RandLAPACK::RBKI Lanchosz(false, false, tol); //Lanchosz.max_krylov_iters = 1500; + Lanchosz.max_krylov_iters = num_krylov_iters; // Additional params setup. RandLAPACK::RBKI RBKI(false, false, tol); From ccb097f19adaa3d787d693e66badb5299cc17031 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Thu, 4 Jan 2024 02:44:41 -0800 Subject: [PATCH 20/56] Small RBKI bug fix --- RandLAPACK/drivers/rl_rbki.hh | 13 +++++++++---- benchmark/bench_RBKI/RBKI_speed_comparisons.cc | 7 +++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 2edfe1ca..5de61729 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -152,19 +152,21 @@ int RBKI::call( // Convert X_i into an explicit form. It is now stored in X_ev as it should be. lapack::ungqr(m, k, k, X_i, m, tau); - // Advance odd iteration count; + // Advance odd iteration count. ++iter_od; + // Advance iteration count. + ++iter; // Iterate until in-loop termination criteria is met. while((iter_ev + iter_od) < max_krylov_iters) { - if (iter % 2 == 0) { + if (iter % 2 != 0) { // Y_i = A' * X_i blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, k, m, 1.0, A, m, X_i, m, 0.0, Y_i, n); // Move the X_i pointer; X_i = &X_i[m * k]; - if (iter != 0) { + if (iter != 1) { // R_i' = Y_i' * Y_od blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, k, iter_ev * k, n, 1.0, Y_i, n, Y_od, n, 0.0, R_i, n); // Y_i = Y_i - Y_od * R_i @@ -195,7 +197,7 @@ int RBKI::call( } // Advance R pointers - iter == 0 ? R_i = &R_ii[k] : R_i = &R_i[k]; + iter == 1 ? R_i = &R_ii[k] : R_i = &R_i[k]; R_ii = &R_ii[(n + 1) * k]; // Advance even iteration count; @@ -270,6 +272,9 @@ int RBKI::call( // We actually perform VT = V_hat' * Y_odd' blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); + printf("%e\n", *Sigma); + printf("%e\n", *(Sigma+1)); + free(Y); free(X); free(tau); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 9e2152ff..f6c1ed81 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -90,7 +90,6 @@ static void call_all_algs( // Set the threshold for Lanchosz // Setting up Lanchosz - RBKI with k = 1. RandLAPACK::RBKI Lanchosz(false, false, tol); - //Lanchosz.max_krylov_iters = 1500; Lanchosz.max_krylov_iters = num_krylov_iters; // Additional params setup. @@ -114,7 +113,7 @@ static void call_all_algs( // Testing Lanchosz auto start_lanchosz = high_resolution_clock::now(); - Lanchosz.call(m, n, all_data.A.data(), m, 1, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); + //Lanchosz.call(m, n, all_data.A.data(), m, 1, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); auto stop_lanchosz = high_resolution_clock::now(); dur_lanchosz = duration_cast(stop_lanchosz - start_lanchosz).count(); @@ -190,7 +189,7 @@ int main(int argc, char *argv[]) { double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; - int numruns = 1; + int numruns = 10; std::vector res; // Generate the input matrix. @@ -204,7 +203,7 @@ int main(int argc, char *argv[]) { m = m_info.rows; n = m_info.cols; k_start = 2;//std::max((int64_t) 1, n / 10); - k_stop = n;//std::max((int64_t) 1, n / 10); + k_stop = 2048;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. RBKI_benchmark_data all_data(m, n, k_stop, tol); From 96c727a5a518f22924f96f188653461acd564bc3 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 16 Jan 2024 01:41:17 -0800 Subject: [PATCH 21/56] Debugging --- RandLAPACK/drivers/rl_rbki.hh | 177 ++++++++++++++++-- .../bench_RBKI/RBKI_speed_comparisons.cc | 44 ++++- 2 files changed, 194 insertions(+), 27 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 5de61729..80c92126 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -82,17 +82,32 @@ int RBKI::call( RandBLAS::RNGState &state ){ - high_resolution_clock::time_point preallocation_t_start; - high_resolution_clock::time_point preallocation_t_stop; + high_resolution_clock::time_point allocation_t_start; + high_resolution_clock::time_point allocation_t_stop; + high_resolution_clock::time_point get_factors_t_start; + high_resolution_clock::time_point get_factors_t_stop; + high_resolution_clock::time_point ungqr_t_start; + high_resolution_clock::time_point ungqr_t_stop; + high_resolution_clock::time_point reorth_t_start; + high_resolution_clock::time_point reorth_t_stop; + high_resolution_clock::time_point qr_t_start; + high_resolution_clock::time_point qr_t_stop; + high_resolution_clock::time_point gemm_A_t_start; + high_resolution_clock::time_point gemm_A_t_stop; high_resolution_clock::time_point total_t_start; high_resolution_clock::time_point total_t_stop; - long preallocation_t_dur = 0; - long total_t_dur = 0; + long allocation_t_dur = 0; + long get_factors_t_dur = 0; + long ungqr_t_dur = 0; + long reorth_t_dur = 0; + long qr_t_dur = 0; + long gemm_A_t_dur = 0; + long total_t_dur = 0; if(this -> timing) { total_t_start = high_resolution_clock::now(); - preallocation_t_start = high_resolution_clock::now(); + allocation_t_start = high_resolution_clock::now(); } int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0; @@ -133,8 +148,8 @@ int RBKI::call( T* VT_hat = NULL; if(this -> timing) { - preallocation_t_stop = high_resolution_clock::now(); - preallocation_t_dur = duration_cast(preallocation_t_stop - preallocation_t_start).count(); + allocation_t_stop = high_resolution_clock::now(); + allocation_t_dur = duration_cast(allocation_t_stop - allocation_t_start).count(); } // Pre-conpute Fro norm of an input matrix. @@ -146,12 +161,36 @@ int RBKI::call( RandBLAS::DenseDist D(n, k); state = RandBLAS::fill_dense(D, Y_i, state).second; + if(this -> timing) + gemm_A_t_start = high_resolution_clock::now(); + // [X_ev, ~] = qr(A * Y_i, 0) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); + + if(this -> timing) { + gemm_A_t_stop = high_resolution_clock::now(); + gemm_A_t_dur = duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); + } + + if(this -> timing) + qr_t_start = high_resolution_clock::now(); + lapack::geqrf(m, k, X_i, m, tau); + + if(this -> timing) { + qr_t_stop = high_resolution_clock::now(); + qr_t_dur = duration_cast(qr_t_stop - qr_t_start).count(); + ungqr_t_start = high_resolution_clock::now(); + } + // Convert X_i into an explicit form. It is now stored in X_ev as it should be. lapack::ungqr(m, k, k, X_i, m, tau); + if(this -> timing) { + ungqr_t_stop = high_resolution_clock::now(); + ungqr_t_dur += duration_cast(ungqr_t_stop - ungqr_t_start).count(); + } + // Advance odd iteration count. ++iter_od; // Advance iteration count. @@ -160,9 +199,18 @@ int RBKI::call( // Iterate until in-loop termination criteria is met. while((iter_ev + iter_od) < max_krylov_iters) { if (iter % 2 != 0) { + + if(this -> timing) + gemm_A_t_start = high_resolution_clock::now(); + // Y_i = A' * X_i blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, k, m, 1.0, A, m, X_i, m, 0.0, Y_i, n); + if(this -> timing) { + gemm_A_t_stop = high_resolution_clock::now(); + gemm_A_t_dur += duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); + } + // Move the X_i pointer; X_i = &X_i[m * k]; @@ -172,23 +220,48 @@ int RBKI::call( // Y_i = Y_i - Y_od * R_i blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, R_i, n, 1.0, Y_i, n); + if(this -> timing) + reorth_t_start = high_resolution_clock::now(); + // Reorthogonalization blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, k, iter_ev * k, n, 1.0, Y_i, n, Y_od, n, 0.0, Y_orth_buf, k); blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, Y_orth_buf, k, 1.0, Y_i, n); + + if(this -> timing) { + reorth_t_stop = high_resolution_clock::now(); + reorth_t_dur += duration_cast(reorth_t_stop - reorth_t_start).count(); + } } // [Y_i, R_ii] = qr(Y_i, 0) std::fill(&tau[0], &tau[k], 0.0); + + if(this -> timing) + qr_t_start = high_resolution_clock::now(); + lapack::geqrf(n, k, Y_i, n, tau); + if(this -> timing) { + qr_t_stop = high_resolution_clock::now(); + qr_t_dur += duration_cast(qr_t_stop - qr_t_start).count(); + } + // Copy R_ii over to R's (in transposed format). #pragma omp parallel for for(i = 0; i < k; ++i) blas::copy(i + 1, &Y_i[i * n], 1, &R_ii[i], n); + if(this -> timing) + ungqr_t_start = high_resolution_clock::now(); + // Convert Y_i into an explicit form. It is now stored in Y_odd as it should be. lapack::ungqr(n, k, k, Y_i, n, tau); + if(this -> timing) { + ungqr_t_stop = high_resolution_clock::now(); + ungqr_t_dur += duration_cast(ungqr_t_stop - ungqr_t_start).count(); + } + // Early termination // if (abs(R(end)) <= sqrt(eps('double'))) if(std::abs(R_ii[(n + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { @@ -204,9 +277,18 @@ int RBKI::call( ++iter_ev; } else { + + if(this -> timing) + gemm_A_t_start = high_resolution_clock::now(); + // X_i = A * Y_i blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); + if(this -> timing) { + gemm_A_t_stop = high_resolution_clock::now(); + gemm_A_t_dur =+ duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); + } + // Move the X_i pointer; Y_i = &Y_i[n * k]; @@ -214,20 +296,46 @@ int RBKI::call( blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, S_i, n + k); //X_i = X_i - X_ev * S_i; blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, S_i, n + k, 1.0, X_i, m); + + if(this -> timing) + reorth_t_start = high_resolution_clock::now(); // Reorthogonalization blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, X_orth_buf, n + k); blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, X_orth_buf, n + k, 1.0, X_i, m); + if(this -> timing) { + reorth_t_stop = high_resolution_clock::now(); + reorth_t_dur += duration_cast(reorth_t_stop - reorth_t_start).count(); + } + // [X_i, S_ii] = qr(X_i, 0); std::fill(&tau[0], &tau[k], 0.0); + + if(this -> timing) + qr_t_start = high_resolution_clock::now(); + lapack::geqrf(m, k, X_i, m, tau); + if(this -> timing) { + qr_t_stop = high_resolution_clock::now(); + qr_t_dur += duration_cast(qr_t_stop - qr_t_start).count(); + } + // Copy S_ii over to S's space under S_i (offset down by iter_od * k) lapack::lacpy(MatrixType::Upper, k, k, X_i, m, S_ii, n + k); + + if(this -> timing) + ungqr_t_start = high_resolution_clock::now(); + // Convert X_i into an explicit form. It is now stored in X_ev as it should be lapack::ungqr(m, k, k, X_i, m, tau); + if(this -> timing) { + ungqr_t_stop = high_resolution_clock::now(); + ungqr_t_dur += duration_cast(ungqr_t_stop - ungqr_t_start).count(); + } + // Early termination // if (abs(S(end)) <= sqrt(eps('double'))) if(std::abs(S_ii[((n + k) + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { @@ -254,10 +362,18 @@ int RBKI::call( this->num_krylov_iters = iter; iter % 2 == 0 ? end_rows = k * (iter_ev + 1), end_cols = k * iter_ev : end_rows = k * (iter_od + 1), end_cols = k * iter_od; + if(this -> timing) { + allocation_t_start = high_resolution_clock::now(); + } + U_hat = ( T * ) calloc( end_rows * end_cols, sizeof( T ) ); VT_hat = ( T * ) calloc( end_cols * end_cols, sizeof( T ) ); - //printf("rows: %ld, cols: %ld\n", end_rows, end_cols); + if(this -> timing) { + allocation_t_stop = high_resolution_clock::now(); + allocation_t_dur += duration_cast(allocation_t_stop - allocation_t_start).count(); + get_factors_t_start = high_resolution_clock::now(); + } if (iter % 2 == 0) { // [U_hat, Sigma, V_hat] = svd(R') @@ -272,8 +388,11 @@ int RBKI::call( // We actually perform VT = V_hat' * Y_odd' blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); - printf("%e\n", *Sigma); - printf("%e\n", *(Sigma+1)); + if(this -> timing) { + get_factors_t_stop = high_resolution_clock::now(); + get_factors_t_dur = duration_cast(get_factors_t_stop - get_factors_t_start).count(); + allocation_t_start = high_resolution_clock::now(); + } free(Y); free(X); @@ -285,18 +404,38 @@ int RBKI::call( free(Y_orth_buf); free(X_orth_buf); + if(this -> timing) { + allocation_t_stop = high_resolution_clock::now(); + allocation_t_dur += duration_cast(allocation_t_stop - allocation_t_start).count(); + } + if(this -> timing) { total_t_stop = high_resolution_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); - long t_rest = total_t_dur - (preallocation_t_dur); - this -> times.resize(3); - this -> times = {preallocation_t_dur, t_rest, total_t_dur}; - - printf("\n\n/------------CQRRP TIMING RESULTS BEGIN------------/\n"); - printf("Preallocation time: %25ld μs,\n", preallocation_t_dur); - - printf("\nPreallocation takes %22.2f%% of runtime.\n", 100 * ((T) preallocation_t_dur / (T) total_t_dur)); - printf("/-------------CQRRP TIMING RESULTS END-------------/\n\n"); + long t_rest = total_t_dur - (allocation_t_dur + get_factors_t_dur + ungqr_t_dur + reorth_t_dur + qr_t_dur + gemm_A_t_dur); + this -> times.resize(8); + this -> times = {allocation_t_dur, get_factors_t_dur, ungqr_t_dur, reorth_t_dur, qr_t_dur, gemm_A_t_dur, t_rest, total_t_dur}; + + if (this -> verbosity) { + printf("\n\n/------------RBKI TIMING RESULTS BEGIN------------/\n"); + printf("Basic info: b_sz=%ld krylov_iters=%ld\n", k, num_krylov_iters); + + printf("Allocate and free time: %25ld μs,\n", allocation_t_dur); + printf("Time to acquire the SVD factors: %25ld μs,\n", get_factors_t_dur); + printf("UNGQR time: %25ld μs,\n", ungqr_t_dur); + printf("Reorthogonalization time: %25ld μs,\n", reorth_t_dur); + printf("QR time: %25ld μs,\n", qr_t_dur); + printf("GEMM A time: %25ld μs,\n", gemm_A_t_dur); + + printf("\nAllocation takes %22.2f%% of runtime.\n", 100 * ((T) allocation_t_dur / (T) total_t_dur)); + printf("Factors takes %22.2f%% of runtime.\n", 100 * ((T) get_factors_t_dur / (T) total_t_dur)); + printf("Ungqr takes %22.2f%% of runtime.\n", 100 * ((T) ungqr_t_dur / (T) total_t_dur)); + printf("Reorth takes %22.2f%% of runtime.\n", 100 * ((T) reorth_t_dur / (T) total_t_dur)); + printf("QR takes %22.2f%% of runtime.\n", 100 * ((T) qr_t_dur / (T) total_t_dur)); + printf("GEMM A takes %22.2f%% of runtime.\n", 100 * ((T) gemm_A_t_dur / (T) total_t_dur)); + printf("Rest takes %22.2f%% of runtime.\n", 100 * ((T) t_rest / (T) total_t_dur)); + printf("/-------------RBKI TIMING RESULTS END-------------/\n\n"); + } } return 0; diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index f6c1ed81..f6d4968d 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -49,7 +49,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, } template -static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, int64_t k) +static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, int64_t k, long* break_in, long* break_out, int timing) { // Can also do this is one line // i == 0 ? (void) (t_rbki_best = dur_rbki, accuracy_check ? blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_1.data(), 1): (void) NULL) : (dur_rbki < t_rbki_best) ? ((void) (t_rbki_best = dur_rbki), accuracy_check ? blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_1.data(), 1): (void) NULL) : (void) NULL; @@ -57,6 +57,8 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, t_best = t_curr; blas::copy(k, S1, 1, S2, 1); } + if (timing) + blas::copy(8, break_out, 1, break_in, 1); } /* template @@ -86,6 +88,7 @@ static void call_all_algs( T err_rbki; T err_lan; int64_t k_lanc = std::min((int64_t) (num_krylov_iters / (T) 2), k); + bool time_subroutines = true; // Set the threshold for Lanchosz // Setting up Lanchosz - RBKI with k = 1. @@ -93,7 +96,7 @@ static void call_all_algs( Lanchosz.max_krylov_iters = num_krylov_iters; // Additional params setup. - RandLAPACK::RBKI RBKI(false, false, tol); + RandLAPACK::RBKI RBKI(false, time_subroutines, tol); RBKI.max_krylov_iters = num_krylov_iters; // timing vars @@ -108,6 +111,10 @@ static void call_all_algs( auto state_gen = state; //auto state_alg = state; + // Timing breakdown vectors; + std::vector Lanc_timing_breakdown (8, 0.0); + std::vector RBKI_timing_breakdown (8, 0.0); + for (i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); @@ -116,9 +123,9 @@ static void call_all_algs( //Lanchosz.call(m, n, all_data.A.data(), m, 1, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); auto stop_lanchosz = high_resolution_clock::now(); dur_lanchosz = duration_cast(stop_lanchosz - start_lanchosz).count(); - + // Update best timing and save the singular values. - update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_3.data(), k_lanc); + update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_3.data(), k_lanc, Lanc_timing_breakdown.data(), Lanchosz.times.data(), false); state_gen = state; data_regen(m_info, all_data, state_gen); @@ -130,7 +137,7 @@ static void call_all_algs( dur_rbki = duration_cast(stop_rbki - start_rbki).count(); // Update best timing and save the singular values. - update_best_time(i, t_rbki_best, dur_rbki, all_data.Sigma.data(), all_data.Sigma_cpy_1.data(), k); + update_best_time(i, t_rbki_best, dur_rbki, all_data.Sigma.data(), all_data.Sigma_cpy_1.data(), k, RBKI_timing_breakdown.data(), RBKI.times.data(), time_subroutines); state_gen = state; data_regen(m_info, all_data, state_gen); @@ -144,12 +151,12 @@ static void call_all_algs( blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_2.data(), 1); // Update best timing and save the singular values. - update_best_time(i, t_other_best, dur_other, all_data.Sigma.data(), all_data.Sigma_cpy_2.data(), k); + update_best_time(i, t_other_best, dur_other, all_data.Sigma.data(), all_data.Sigma_cpy_2.data(), k, NULL, NULL, 0); state_gen = state; data_regen(m_info, all_data, state_gen); } - + for(j = 0; j < k; ++j) all_data.Sigma_cpy_1[j] -= all_data.Sigma_cpy_2[j]; @@ -162,6 +169,27 @@ static void call_all_algs( err_rbki = blas::nrm2(k, all_data.Sigma_cpy_1.data(), 1) / norm_svd_k; err_lan = blas::nrm2(k_lanc, all_data.Sigma_cpy_3.data(), 1) / norm_svd_lanc; + if (time_subroutines) { + printf("\n\n/------------RBKI TIMING RESULTS BEGIN------------/\n"); + printf("Basic info: b_sz=%ld krylov_iters=%ld\n", k, num_krylov_iters); + + printf("Allocate and free time: %25ld μs,\n", RBKI_timing_breakdown[0]); + printf("Time to acquire the SVD factors: %25ld μs,\n", RBKI_timing_breakdown[1]); + printf("UNGQR time: %25ld μs,\n", RBKI_timing_breakdown[2]); + printf("Reorthogonalization time: %25ld μs,\n", RBKI_timing_breakdown[3]); + printf("QR time: %25ld μs,\n", RBKI_timing_breakdown[4]); + printf("GEMM A time: %25ld μs,\n", RBKI_timing_breakdown[5]); + + printf("\nAllocation takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[0] / (T) RBKI_timing_breakdown[7])); + printf("Factors takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[1] / (T) RBKI_timing_breakdown[7])); + printf("Ungqr takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[2] / (T) RBKI_timing_breakdown[7])); + printf("Reorth takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[3] / (T) RBKI_timing_breakdown[7])); + printf("QR takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[4] / (T) RBKI_timing_breakdown[7])); + printf("GEMM A takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[5] / (T) RBKI_timing_breakdown[7])); + printf("Rest takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[6] / (T) RBKI_timing_breakdown[7])); + printf("/-------------RBKI TIMING RESULTS END-------------/\n\n"); + } + // Print accuracy info printf("||Sigma_ksvd - Sigma_rbki||_F / ||Sigma_ksvd||_F: %.16e\n", err_rbki); printf("||Sigma_ksvd - Sigma_lanc||_F / ||Sigma_lanc||_F: %.16e\n", err_lan); @@ -203,7 +231,7 @@ int main(int argc, char *argv[]) { m = m_info.rows; n = m_info.cols; k_start = 2;//std::max((int64_t) 1, n / 10); - k_stop = 2048;//std::max((int64_t) 1, n / 10); + k_stop = 128;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. RBKI_benchmark_data all_data(m, n, k_stop, tol); From dd5f190918b365ed41daff38af445d8a60f8cf6e Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 16 Jan 2024 03:08:49 -0800 Subject: [PATCH 22/56] Bug fixed --- RandLAPACK/drivers/rl_rbki.hh | 4 ++-- benchmark/bench_RBKI/RBKI_speed_comparisons.cc | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 80c92126..8eddd144 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -81,7 +81,6 @@ int RBKI::call( T* Sigma, RandBLAS::RNGState &state ){ - high_resolution_clock::time_point allocation_t_start; high_resolution_clock::time_point allocation_t_stop; high_resolution_clock::time_point get_factors_t_start; @@ -113,6 +112,7 @@ int RBKI::call( int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0; T norm_R = 0; int64_t space_rows = k * std::ceil(m / (T) k); + int max_iters = std::min(this->max_krylov_iters, (int) (n / (T) k)); // We need a full copy of X and Y all the way through the algorithm // due to an operation with X_odd and Y_odd happening at the end. @@ -197,7 +197,7 @@ int RBKI::call( ++iter; // Iterate until in-loop termination criteria is met. - while((iter_ev + iter_od) < max_krylov_iters) { + while((iter_ev + iter_od) < max_iters) { if (iter % 2 != 0) { if(this -> timing) diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index f6d4968d..821d8966 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -123,7 +123,7 @@ static void call_all_algs( //Lanchosz.call(m, n, all_data.A.data(), m, 1, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); auto stop_lanchosz = high_resolution_clock::now(); dur_lanchosz = duration_cast(stop_lanchosz - start_lanchosz).count(); - + // Update best timing and save the singular values. update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_3.data(), k_lanc, Lanc_timing_breakdown.data(), Lanchosz.times.data(), false); @@ -133,6 +133,7 @@ static void call_all_algs( // Testing RBKI auto start_rbki = high_resolution_clock::now(); RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); + auto stop_rbki = high_resolution_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); @@ -231,7 +232,7 @@ int main(int argc, char *argv[]) { m = m_info.rows; n = m_info.cols; k_start = 2;//std::max((int64_t) 1, n / 10); - k_stop = 128;//std::max((int64_t) 1, n / 10); + k_stop = 256;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. RBKI_benchmark_data all_data(m, n, k_stop, tol); From 963fa0e247636ca9598db044860fbcaa2f1409cc Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 16 Jan 2024 08:23:37 -0800 Subject: [PATCH 23/56] Added detailed (maybe too much) time profiling in RBKI, fixed openmp bug, fixed norm bug. --- RandLAPACK/drivers/rl_rbki.hh | 95 +++++++++++++++---- .../bench_RBKI/RBKI_speed_comparisons.cc | 34 ++++--- 2 files changed, 98 insertions(+), 31 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 8eddd144..b7f5b9c3 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -93,6 +93,16 @@ int RBKI::call( high_resolution_clock::time_point qr_t_stop; high_resolution_clock::time_point gemm_A_t_start; high_resolution_clock::time_point gemm_A_t_stop; + high_resolution_clock::time_point main_loop_t_start; + high_resolution_clock::time_point main_loop_t_stop; + high_resolution_clock::time_point sketching_t_start; + high_resolution_clock::time_point sketching_t_stop; + high_resolution_clock::time_point r_cpy_t_start; + high_resolution_clock::time_point r_cpy_t_stop; + high_resolution_clock::time_point s_cpy_t_start; + high_resolution_clock::time_point s_cpy_t_stop; + high_resolution_clock::time_point norm_t_start; + high_resolution_clock::time_point norm_t_stop; high_resolution_clock::time_point total_t_start; high_resolution_clock::time_point total_t_stop; @@ -102,6 +112,11 @@ int RBKI::call( long reorth_t_dur = 0; long qr_t_dur = 0; long gemm_A_t_dur = 0; + long main_loop_t_dur = 0; + long sketching_t_dur = 0; + long r_cpy_t_dur = 0; + long s_cpy_t_dur = 0; + long norm_t_dur = 0; long total_t_dur = 0; if(this -> timing) { @@ -157,12 +172,18 @@ int RBKI::call( T sq_tol = std::pow(this->tol, 2); T threshold = std::sqrt(1 - sq_tol) * norm_A; + if(this -> timing) + sketching_t_start = high_resolution_clock::now(); + // Generate a dense Gaussian random matrx. RandBLAS::DenseDist D(n, k); state = RandBLAS::fill_dense(D, Y_i, state).second; - if(this -> timing) + if(this -> timing) { + sketching_t_stop = high_resolution_clock::now(); + sketching_t_dur = duration_cast(sketching_t_stop - sketching_t_start).count(); gemm_A_t_start = high_resolution_clock::now(); + } // [X_ev, ~] = qr(A * Y_i, 0) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); @@ -197,7 +218,11 @@ int RBKI::call( ++iter; // Iterate until in-loop termination criteria is met. + while((iter_ev + iter_od) < max_iters) { + if(this -> timing) + main_loop_t_start = high_resolution_clock::now(); + if (iter % 2 != 0) { if(this -> timing) @@ -217,12 +242,13 @@ int RBKI::call( if (iter != 1) { // R_i' = Y_i' * Y_od blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, k, iter_ev * k, n, 1.0, Y_i, n, Y_od, n, 0.0, R_i, n); + + if(this -> timing) + reorth_t_start = high_resolution_clock::now(); + // Y_i = Y_i - Y_od * R_i blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, R_i, n, 1.0, Y_i, n); - if(this -> timing) - reorth_t_start = high_resolution_clock::now(); - // Reorthogonalization blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, k, iter_ev * k, n, 1.0, Y_i, n, Y_od, n, 0.0, Y_orth_buf, k); blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, Y_orth_buf, k, 1.0, Y_i, n); @@ -244,15 +270,18 @@ int RBKI::call( if(this -> timing) { qr_t_stop = high_resolution_clock::now(); qr_t_dur += duration_cast(qr_t_stop - qr_t_start).count(); + r_cpy_t_start = high_resolution_clock::now(); } // Copy R_ii over to R's (in transposed format). - #pragma omp parallel for for(i = 0; i < k; ++i) blas::copy(i + 1, &Y_i[i * n], 1, &R_ii[i], n); - if(this -> timing) - ungqr_t_start = high_resolution_clock::now(); + if(this -> timing) { + r_cpy_t_stop = high_resolution_clock::now(); + r_cpy_t_dur += duration_cast(r_cpy_t_stop - r_cpy_t_start).count(); + ungqr_t_start = high_resolution_clock::now(); + } // Convert Y_i into an explicit form. It is now stored in Y_odd as it should be. lapack::ungqr(n, k, k, Y_i, n, tau); @@ -286,7 +315,7 @@ int RBKI::call( if(this -> timing) { gemm_A_t_stop = high_resolution_clock::now(); - gemm_A_t_dur =+ duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); + gemm_A_t_dur += duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); } // Move the X_i pointer; @@ -294,11 +323,12 @@ int RBKI::call( // S_i = X_ev' * X_i blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, S_i, n + k); - //X_i = X_i - X_ev * S_i; - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, S_i, n + k, 1.0, X_i, m); - + if(this -> timing) reorth_t_start = high_resolution_clock::now(); + + //X_i = X_i - X_ev * S_i; + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, S_i, n + k, 1.0, X_i, m); // Reorthogonalization blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, X_orth_buf, n + k); @@ -320,13 +350,17 @@ int RBKI::call( if(this -> timing) { qr_t_stop = high_resolution_clock::now(); qr_t_dur += duration_cast(qr_t_stop - qr_t_start).count(); + s_cpy_t_start = high_resolution_clock::now(); } // Copy S_ii over to S's space under S_i (offset down by iter_od * k) lapack::lacpy(MatrixType::Upper, k, k, X_i, m, S_ii, n + k); - if(this -> timing) - ungqr_t_start = high_resolution_clock::now(); + if(this -> timing) { + s_cpy_t_stop = high_resolution_clock::now(); + s_cpy_t_dur += duration_cast(s_cpy_t_stop - s_cpy_t_start).count(); + ungqr_t_start = high_resolution_clock::now(); + } // Convert X_i into an explicit form. It is now stored in X_ev as it should be lapack::ungqr(m, k, k, X_i, m, tau); @@ -349,9 +383,22 @@ int RBKI::call( // Advance odd iteration count; ++iter_od; } + + if(this -> timing) + norm_t_start = high_resolution_clock::now(); + + // This is only changed on odd iters + if (iter % 2 != 0) + norm_R = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, iter_ev * k, iter_ev * k, R, n); + + if(this -> timing) { + norm_t_stop = high_resolution_clock::now(); + norm_t_dur += duration_cast(norm_t_stop - norm_t_start).count(); + main_loop_t_stop = high_resolution_clock::now(); + main_loop_t_dur += duration_cast(main_loop_t_stop - main_loop_t_start).count(); + } + ++iter; - norm_R = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, n, n, R, n); - //norm(R, 'fro') > sqrt(1 - sq_tol) * norm_A if(norm_R > threshold) { break; @@ -412,9 +459,9 @@ int RBKI::call( if(this -> timing) { total_t_stop = high_resolution_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); - long t_rest = total_t_dur - (allocation_t_dur + get_factors_t_dur + ungqr_t_dur + reorth_t_dur + qr_t_dur + gemm_A_t_dur); - this -> times.resize(8); - this -> times = {allocation_t_dur, get_factors_t_dur, ungqr_t_dur, reorth_t_dur, qr_t_dur, gemm_A_t_dur, t_rest, total_t_dur}; + long t_rest = total_t_dur - (allocation_t_dur + get_factors_t_dur + ungqr_t_dur + reorth_t_dur + qr_t_dur + gemm_A_t_dur + sketching_t_dur + r_cpy_t_dur + s_cpy_t_dur + norm_t_dur); + this -> times.resize(11); + this -> times = {allocation_t_dur, get_factors_t_dur, ungqr_t_dur, reorth_t_dur, qr_t_dur, gemm_A_t_dur, main_loop_t_dur, sketching_t_dur, r_cpy_t_dur, s_cpy_t_dur, norm_t_dur, t_rest, total_t_dur}; if (this -> verbosity) { printf("\n\n/------------RBKI TIMING RESULTS BEGIN------------/\n"); @@ -426,14 +473,24 @@ int RBKI::call( printf("Reorthogonalization time: %25ld μs,\n", reorth_t_dur); printf("QR time: %25ld μs,\n", qr_t_dur); printf("GEMM A time: %25ld μs,\n", gemm_A_t_dur); + printf("Sketching time: %25ld μs,\n", sketching_t_dur); + printf("R_ii cpy time: %25ld μs,\n", r_cpy_t_dur); + printf("S_ii cpy time: %25ld μs,\n", s_cpy_t_dur); + printf("Norm R time: %25ld μs,\n", norm_t_dur); - printf("\nAllocation takes %22.2f%% of runtime.\n", 100 * ((T) allocation_t_dur / (T) total_t_dur)); + printf("\nAllocation takes %22.2f%% of runtime.\n", 100 * ((T) allocation_t_dur / (T) total_t_dur)); printf("Factors takes %22.2f%% of runtime.\n", 100 * ((T) get_factors_t_dur / (T) total_t_dur)); printf("Ungqr takes %22.2f%% of runtime.\n", 100 * ((T) ungqr_t_dur / (T) total_t_dur)); printf("Reorth takes %22.2f%% of runtime.\n", 100 * ((T) reorth_t_dur / (T) total_t_dur)); printf("QR takes %22.2f%% of runtime.\n", 100 * ((T) qr_t_dur / (T) total_t_dur)); printf("GEMM A takes %22.2f%% of runtime.\n", 100 * ((T) gemm_A_t_dur / (T) total_t_dur)); + printf("Sketching takes %22.2f%% of runtime.\n", 100 * ((T) sketching_t_dur / (T) total_t_dur)); + printf("R_ii cpy takes %22.2f%% of runtime.\n", 100 * ((T) r_cpy_t_dur / (T) total_t_dur)); + printf("S_ii cpy takes %22.2f%% of runtime.\n", 100 * ((T) s_cpy_t_dur / (T) total_t_dur)); + printf("Norm R takes %22.2f%% of runtime.\n", 100 * ((T) norm_t_dur / (T) total_t_dur)); printf("Rest takes %22.2f%% of runtime.\n", 100 * ((T) t_rest / (T) total_t_dur)); + + printf("\nMain loop takes %22.2f%% of runtime.\n", 100 * ((T) main_loop_t_dur / (T) total_t_dur)); printf("/-------------RBKI TIMING RESULTS END-------------/\n\n"); } } diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 821d8966..4ac1217d 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -58,7 +58,7 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, blas::copy(k, S1, 1, S2, 1); } if (timing) - blas::copy(8, break_out, 1, break_in, 1); + blas::copy(13, break_out, 1, break_in, 1); } /* template @@ -88,7 +88,7 @@ static void call_all_algs( T err_rbki; T err_lan; int64_t k_lanc = std::min((int64_t) (num_krylov_iters / (T) 2), k); - bool time_subroutines = true; + bool time_subroutines = false; // Set the threshold for Lanchosz // Setting up Lanchosz - RBKI with k = 1. @@ -112,8 +112,8 @@ static void call_all_algs( //auto state_alg = state; // Timing breakdown vectors; - std::vector Lanc_timing_breakdown (8, 0.0); - std::vector RBKI_timing_breakdown (8, 0.0); + std::vector Lanc_timing_breakdown (13, 0.0); + std::vector RBKI_timing_breakdown (13, 0.0); for (i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); @@ -180,14 +180,24 @@ static void call_all_algs( printf("Reorthogonalization time: %25ld μs,\n", RBKI_timing_breakdown[3]); printf("QR time: %25ld μs,\n", RBKI_timing_breakdown[4]); printf("GEMM A time: %25ld μs,\n", RBKI_timing_breakdown[5]); - - printf("\nAllocation takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[0] / (T) RBKI_timing_breakdown[7])); - printf("Factors takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[1] / (T) RBKI_timing_breakdown[7])); - printf("Ungqr takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[2] / (T) RBKI_timing_breakdown[7])); - printf("Reorth takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[3] / (T) RBKI_timing_breakdown[7])); - printf("QR takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[4] / (T) RBKI_timing_breakdown[7])); - printf("GEMM A takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[5] / (T) RBKI_timing_breakdown[7])); - printf("Rest takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[6] / (T) RBKI_timing_breakdown[7])); + printf("Sketching time: %25ld μs,\n", RBKI_timing_breakdown[7]); + printf("R_ii cpy time: %25ld μs,\n", RBKI_timing_breakdown[8]); + printf("S_ii cpy time: %25ld μs,\n", RBKI_timing_breakdown[9]); + printf("Norm time: %25ld μs,\n", RBKI_timing_breakdown[10]); + + printf("\nAllocation takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[0] / (T) RBKI_timing_breakdown[12])); + printf("Factors takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[1] / (T) RBKI_timing_breakdown[12])); + printf("Ungqr takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[2] / (T) RBKI_timing_breakdown[12])); + printf("Reorth takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[3] / (T) RBKI_timing_breakdown[12])); + printf("QR takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[4] / (T) RBKI_timing_breakdown[12])); + printf("GEMM A takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[5] / (T) RBKI_timing_breakdown[12])); + printf("Sketching takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[7] / (T) RBKI_timing_breakdown[12])); + printf("R_ii cpy takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[8] / (T) RBKI_timing_breakdown[12])); + printf("S_ii cpy takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[9] / (T) RBKI_timing_breakdown[12])); + printf("Norm R takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[10] / (T) RBKI_timing_breakdown[12])); + printf("Rest takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[11] / (T) RBKI_timing_breakdown[12])); + + printf("\nMain loop takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[6] / (T) RBKI_timing_breakdown[12])); printf("/-------------RBKI TIMING RESULTS END-------------/\n\n"); } From 93c072f37c196e08fcf809b6b54f86b43ef77c67 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Thu, 18 Jan 2024 01:46:07 -0800 Subject: [PATCH 24/56] Update --- RandLAPACK/drivers/rl_rbki.hh | 5 +++++ benchmark/bench_RBKI/RBKI_speed_comparisons.cc | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index b7f5b9c3..be062358 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -176,8 +176,11 @@ int RBKI::call( sketching_t_start = high_resolution_clock::now(); // Generate a dense Gaussian random matrx. + // OMP_NUM_THREADS=4 seems to be the best option for dense sketch generation. + omp_set_num_threads(4); RandBLAS::DenseDist D(n, k); state = RandBLAS::fill_dense(D, Y_i, state).second; + omp_set_num_threads(48); if(this -> timing) { sketching_t_stop = high_resolution_clock::now(); @@ -274,8 +277,10 @@ int RBKI::call( } // Copy R_ii over to R's (in transposed format). + omp_set_num_threads(4); for(i = 0; i < k; ++i) blas::copy(i + 1, &Y_i[i * n], 1, &R_ii[i], n); + omp_set_num_threads(48); if(this -> timing) { r_cpy_t_stop = high_resolution_clock::now(); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 4ac1217d..79a138ea 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -214,9 +214,12 @@ static void call_all_algs( int main(int argc, char *argv[]) { - if(argc <= 1) - // No input + printf("Function begin\n"); + + if(argc <= 1) { + printf("No input provided\n"); return 0; + } int64_t m = 0; int64_t n = 0; @@ -250,6 +253,8 @@ int main(int argc, char *argv[]) { // Fill the data matrix; RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + printf("Finished data preparation\n"); + // Declare a data file std::string output_filename = "RBKI_speed_comp_m_" + std::to_string(m) + "_n_" + std::to_string(n) From 76e40844e4e1a420bf0b1c576576314611ee559b Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Fri, 19 Jan 2024 07:55:24 -0800 Subject: [PATCH 25/56] Isolating GEQR in CQRRPT speed benchmark --- .../bench_CQRRPT/CQRRPT_speed_comparisons.cc | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc index 421ef94c..e9a2ebf4 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc @@ -167,7 +167,7 @@ static std::vector call_all_algs( return res; } - +/* int main() { // Declare parameters int64_t m = std::pow(2, 17); @@ -199,4 +199,25 @@ int main() { res = call_all_algs(m_info, numruns, n_start, all_data, state_constant); file << res[0] << ", " << res[1] << ", " << res[2] << ", " << res[3] << ", " << res[4] << ", " << res[5] << ",\n"; } +} +*/ + +int main() { + // Declare parameters + int64_t m = 1000000; + int64_t n = std::pow(2, 12); + auto state = RandBLAS::RNGState(); + + std::vector A (m * n, 0.0); + std::vector tau (n, 0.0); + + omp_set_num_threads(4); + RandBLAS::DenseDist D(m, n); + state = RandBLAS::fill_dense(D, A.data(), state).second; + omp_set_num_threads(48); + + lapack::geqr(m, n, A.data(), m, tau.data(), -1); + int64_t tsize = (int64_t) tau[0]; + tau.resize(tsize); + lapack::geqr(m, n, A.data(), m, tau.data(), tsize); } \ No newline at end of file From c99a49a76e7d4c999aa1ed5847d247aace921214 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 29 Jan 2024 01:19:46 -0800 Subject: [PATCH 26/56] Update --- .../bench_RBKI/RBKI_speed_comparisons.cc | 137 +++++++----------- 1 file changed, 53 insertions(+), 84 deletions(-) diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 79a138ea..9aee8c70 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -16,18 +16,18 @@ struct RBKI_benchmark_data { std::vector U; std::vector V; std::vector Sigma; - std::vector Sigma_cpy_1; - std::vector Sigma_cpy_2; - std::vector Sigma_cpy_3; + std::vector Sigma_cpy_RBKI; + std::vector Sigma_cpy_SVD; + std::vector Sigma_cpy_Other; RBKI_benchmark_data(int64_t m, int64_t n, int64_t k, T tol) : A(m * n, 0.0), U(m * n, 0.0), V(n * n, 0.0), Sigma(n, 0.0), - Sigma_cpy_1(n, 0.0), - Sigma_cpy_2(n, 0.0), - Sigma_cpy_3(n, 0.0) + Sigma_cpy_RBKI(n, 0.0), + Sigma_cpy_SVD(n, 0.0), + Sigma_cpy_Other(n, 0.0) { row = m; col = n; @@ -40,9 +40,10 @@ struct RBKI_benchmark_data { template static void data_regen(RandLAPACK::gen::mat_gen_info m_info, RBKI_benchmark_data &all_data, - RandBLAS::RNGState &state) { + RandBLAS::RNGState &state, int overwrite_A) { - RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + if (overwrite_A) + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); std::fill(all_data.U.begin(), all_data.U.end(), 0.0); std::fill(all_data.V.begin(), all_data.V.end(), 0.0); std::fill(all_data.Sigma.begin(), all_data.Sigma.end(), 0.0); @@ -51,8 +52,6 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, template static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, int64_t k, long* break_in, long* break_out, int timing) { - // Can also do this is one line - // i == 0 ? (void) (t_rbki_best = dur_rbki, accuracy_check ? blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_1.data(), 1): (void) NULL) : (dur_rbki < t_rbki_best) ? ((void) (t_rbki_best = dur_rbki), accuracy_check ? blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_1.data(), 1): (void) NULL) : (void) NULL; if (iter == 0 || t_curr < t_best) { t_best = t_curr; blas::copy(k, S1, 1, S2, 1); @@ -60,14 +59,32 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, if (timing) blas::copy(13, break_out, 1, break_in, 1); } -/* -template -static void svd_error(T* U1, T* S1, T* VT1, T* U2, T* S2, T* VT2) + +template +static long run_svd( + RandLAPACK::gen::mat_gen_info m_info, + RBKI_benchmark_data &all_data, + RandBLAS::RNGState &state) { - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); + + auto m = all_data.row; + auto n = all_data.col; + auto tol = all_data.tolerance; + + // Testing Other - SVD + auto start_svd = high_resolution_clock::now(); + lapack::gesdd(Job::NoVec, m, n, all_data.A.data(), m, all_data.Sigma.data(), all_data.U.data(), m, all_data.V.data(), n); + auto stop_svd = high_resolution_clock::now(); + long dur_svd = duration_cast(stop_svd - start_svd).count(); + + blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_SVD.data(), 1); + + auto state_gen = state; + data_regen(m_info, all_data, state_gen, 1); + return dur_svd; } -*/ + template static void call_all_algs( @@ -77,7 +94,8 @@ static void call_all_algs( int64_t num_krylov_iters, RBKI_benchmark_data &all_data, RandBLAS::RNGState &state, - std::string output_filename) { + std::string output_filename, + long dur_svd) { int i, j; auto m = all_data.row; @@ -101,10 +119,8 @@ static void call_all_algs( // timing vars long dur_rbki = 0; - long dur_other = 0; long dur_lanchosz = 0; long t_rbki_best = 0; - long t_other_best = 0; long t_lanchosz_best = 0; // Making sure the states are unchanged @@ -125,10 +141,10 @@ static void call_all_algs( dur_lanchosz = duration_cast(stop_lanchosz - start_lanchosz).count(); // Update best timing and save the singular values. - update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_3.data(), k_lanc, Lanc_timing_breakdown.data(), Lanchosz.times.data(), false); + //update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_Other.data(), k_lanc, Lanc_timing_breakdown.data(), Lanchosz.times.data(), false); - state_gen = state; - data_regen(m_info, all_data, state_gen); + //state_gen = state; + //data_regen(m_info, all_data, state_gen, 0); // Testing RBKI auto start_rbki = high_resolution_clock::now(); @@ -138,37 +154,23 @@ static void call_all_algs( dur_rbki = duration_cast(stop_rbki - start_rbki).count(); // Update best timing and save the singular values. - update_best_time(i, t_rbki_best, dur_rbki, all_data.Sigma.data(), all_data.Sigma_cpy_1.data(), k, RBKI_timing_breakdown.data(), RBKI.times.data(), time_subroutines); + update_best_time(i, t_rbki_best, dur_rbki, all_data.Sigma.data(), all_data.Sigma_cpy_RBKI.data(), k, RBKI_timing_breakdown.data(), RBKI.times.data(), time_subroutines); state_gen = state; - data_regen(m_info, all_data, state_gen); - - // Testing Other - SVD - auto start_other = high_resolution_clock::now(); - lapack::gesdd(Job::NoVec, m, n, all_data.A.data(), m, all_data.Sigma.data(), all_data.U.data(), m, all_data.V.data(), n); - auto stop_other = high_resolution_clock::now(); - dur_other = duration_cast(stop_other - start_other).count(); - - blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_2.data(), 1); - - // Update best timing and save the singular values. - update_best_time(i, t_other_best, dur_other, all_data.Sigma.data(), all_data.Sigma_cpy_2.data(), k, NULL, NULL, 0); - - state_gen = state; - data_regen(m_info, all_data, state_gen); + data_regen(m_info, all_data, state_gen, 0); } for(j = 0; j < k; ++j) - all_data.Sigma_cpy_1[j] -= all_data.Sigma_cpy_2[j]; + all_data.Sigma_cpy_RBKI[j] -= all_data.Sigma_cpy_SVD[j]; for(j = 0; j < k_lanc; ++j) - all_data.Sigma_cpy_3[j] -= all_data.Sigma_cpy_2[j]; + all_data.Sigma_cpy_Other[j] -= all_data.Sigma_cpy_SVD[j]; - norm_svd_k = blas::nrm2(k, all_data.Sigma_cpy_2.data(), 1); - norm_svd_lanc = blas::nrm2(k_lanc, all_data.Sigma_cpy_2.data(), 1); + norm_svd_k = blas::nrm2(k, all_data.Sigma_cpy_SVD.data(), 1); + norm_svd_lanc = blas::nrm2(k_lanc, all_data.Sigma_cpy_SVD.data(), 1); - err_rbki = blas::nrm2(k, all_data.Sigma_cpy_1.data(), 1) / norm_svd_k; - err_lan = blas::nrm2(k_lanc, all_data.Sigma_cpy_3.data(), 1) / norm_svd_lanc; + err_rbki = blas::nrm2(k, all_data.Sigma_cpy_RBKI.data(), 1) / norm_svd_k; + err_lan = blas::nrm2(k_lanc, all_data.Sigma_cpy_Other.data(), 1) / norm_svd_lanc; if (time_subroutines) { printf("\n\n/------------RBKI TIMING RESULTS BEGIN------------/\n"); @@ -205,11 +207,11 @@ static void call_all_algs( printf("||Sigma_ksvd - Sigma_rbki||_F / ||Sigma_ksvd||_F: %.16e\n", err_rbki); printf("||Sigma_ksvd - Sigma_lanc||_F / ||Sigma_lanc||_F: %.16e\n", err_lan); - printf("RBKI is %f times faster that SVD.\n", (T) t_other_best / t_rbki_best); - printf("Lanchosz is %f times faster that SVD.\n", (T) t_other_best / t_lanchosz_best); + printf("RBKI is %f times faster that SVD.\n", (T) dur_svd / t_rbki_best); + printf("Lanchosz is %f times faster that SVD.\n", (T) dur_svd / t_lanchosz_best); std::ofstream file(output_filename, std::ios::app); - file << k << ", " << num_krylov_iters << ", " << err_rbki << ", " << err_lan << ", " << t_rbki_best << ", " << t_other_best << ", " << t_lanchosz_best << ",\n"; + file << k << ", " << num_krylov_iters << ", " << err_rbki << ", " << err_lan << ", " << t_rbki_best << ", " << dur_svd << ", " << t_lanchosz_best << ",\n"; } int main(int argc, char *argv[]) { @@ -264,46 +266,13 @@ int main(int argc, char *argv[]) { + "_num_krylov_iters_stop_" + std::to_string(num_krylov_iters_stop) + ".dat"; + // SVD run takes very long & is only needed once for all sizes + long dur_svd = run_svd(m_info, all_data, state); + for (;k_start <= k_stop; k_start *=2) { for (;num_krylov_iters_curr <= num_krylov_iters_stop; num_krylov_iters_curr *=2) { - call_all_algs(m_info, numruns, k_start, num_krylov_iters_curr, all_data, state_constant, output_filename); + call_all_algs(m_info, numruns, k_start, num_krylov_iters_curr, all_data, state_constant, output_filename, dur_svd); } num_krylov_iters_curr = num_krylov_iters_start; } -} - -/* -int main() { - // Declare parameters - int64_t m = std::pow(10, 3); - int64_t n = std::pow(10, 3); - int64_t k_start = 100; - int64_t k_stop = 100; - double tol = std::pow(std::numeric_limits::epsilon(), 0.85); - auto state = RandBLAS::RNGState(); - auto state_constant = state; - // Timing results - std::vector res; - // Number of algorithm runs. We only record best times. - int64_t numruns = 5; - - // Allocate basic workspace - RBKI_benchmark_data all_data(m, n, k_stop, tol); - - // Generate the input matrix - gaussian suffices for performance tests. - RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); - - // Declare a data file - std::fstream file("RBKI_speed_comp_m_" + std::to_string(m) - + "_n_" + std::to_string(n) - + "_k_start_" + std::to_string(k_start) - + "_k_stop_" + std::to_string(k_stop) - + ".dat", std::fstream::app); - - for (;k_start <= k_stop; k_start *= 2) { - res = call_all_algs(m_info, numruns, k_start, all_data, state_constant); - file << res[0] << ", " << res[1] << ",\n"; - } -} -*/ \ No newline at end of file +} \ No newline at end of file From a477e3f65f3a77e4b98df1e82b452c692ff3f679 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 30 Jan 2024 07:34:23 -0800 Subject: [PATCH 27/56] Update --- benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc | 9 +++++---- benchmark/bench_RBKI/RBKI_speed_comparisons.cc | 8 ++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc index e9a2ebf4..c8ad5e5a 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc @@ -167,7 +167,7 @@ static std::vector call_all_algs( return res; } -/* + int main() { // Declare parameters int64_t m = std::pow(2, 17); @@ -180,7 +180,7 @@ int main() { // Timing results std::vector res; // Number of algorithm runs. We only record best times. - int64_t numruns = 1; + int64_t numruns = 75; // Allocate basic workspace QR_benchmark_data all_data(m, n_stop, tol, d_factor); @@ -200,8 +200,8 @@ int main() { file << res[0] << ", " << res[1] << ", " << res[2] << ", " << res[3] << ", " << res[4] << ", " << res[5] << ",\n"; } } -*/ +/* int main() { // Declare parameters int64_t m = 1000000; @@ -220,4 +220,5 @@ int main() { int64_t tsize = (int64_t) tau[0]; tau.resize(tsize); lapack::geqr(m, n, A.data(), m, tau.data(), tsize); -} \ No newline at end of file +} +*/ \ No newline at end of file diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 9aee8c70..6061f290 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -66,11 +66,15 @@ static long run_svd( RBKI_benchmark_data &all_data, RandBLAS::RNGState &state) { - auto m = all_data.row; auto n = all_data.col; auto tol = all_data.tolerance; + // Using this call for BLAS/LAPACK warmup + lapack::gesdd(Job::NoVec, 10, 10, all_data.A.data(), 10, all_data.Sigma.data(), all_data.U.data(), 10, all_data.V.data(), 10); + auto state_gen = state; + data_regen(m_info, all_data, state_gen, 1); + // Testing Other - SVD auto start_svd = high_resolution_clock::now(); lapack::gesdd(Job::NoVec, m, n, all_data.A.data(), m, all_data.Sigma.data(), all_data.U.data(), m, all_data.V.data(), n); @@ -79,7 +83,7 @@ static long run_svd( blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_SVD.data(), 1); - auto state_gen = state; + state_gen = state; data_regen(m_info, all_data, state_gen, 1); return dur_svd; From 0fd485e7f04ba3efe01cb348ad90a08dffb31f2f Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 30 Jan 2024 07:34:52 -0800 Subject: [PATCH 28/56] Update --- benchmark/bench_RBKI/RBKI_speed_comparisons.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 6061f290..85f4a9e0 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -140,15 +140,15 @@ static void call_all_algs( // Testing Lanchosz auto start_lanchosz = high_resolution_clock::now(); - //Lanchosz.call(m, n, all_data.A.data(), m, 1, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); + Lanchosz.call(m, n, all_data.A.data(), m, 1, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); auto stop_lanchosz = high_resolution_clock::now(); dur_lanchosz = duration_cast(stop_lanchosz - start_lanchosz).count(); // Update best timing and save the singular values. - //update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_Other.data(), k_lanc, Lanc_timing_breakdown.data(), Lanchosz.times.data(), false); + update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_Other.data(), k_lanc, Lanc_timing_breakdown.data(), Lanchosz.times.data(), false); - //state_gen = state; - //data_regen(m_info, all_data, state_gen, 0); + state_gen = state; + data_regen(m_info, all_data, state_gen, 0); // Testing RBKI auto start_rbki = high_resolution_clock::now(); From a14855023b6cab29121f9bd105e707c7097a6314 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Thu, 1 Feb 2024 04:54:58 -0800 Subject: [PATCH 29/56] Update --- RandLAPACK/drivers/rl_nysbki.hh | 117 ++++++++++++++++++ .../bench_RBKI/RBKI_speed_comparisons.cc | 2 +- 2 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 RandLAPACK/drivers/rl_nysbki.hh diff --git a/RandLAPACK/drivers/rl_nysbki.hh b/RandLAPACK/drivers/rl_nysbki.hh new file mode 100644 index 00000000..439c4c02 --- /dev/null +++ b/RandLAPACK/drivers/rl_nysbki.hh @@ -0,0 +1,117 @@ +#ifndef randlapack_NysBKI_h +#define randlapack_NysBKI_h + +#include "rl_util.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_hqrrp.hh" + +#include +#include +#include +#include +#include +#include + +using namespace std::chrono; + +namespace RandLAPACK { + +template +class NysBKIalg { + public: + virtual ~NysBKIalg() {} + virtual int call( + int64_t m, + T* A, + int64_t lda, + int64_t k, + T* V, + T* Lambda, + RandBLAS::RNGState &state + ) = 0; +}; + +template +class NysBKI : public NysBKIalg { + public: + NysBKI( + bool verb, + bool time_subroutines, + T ep + ) { + verbosity = verb; + timing = time_subroutines; + tol = ep; + max_krylov_iters = INT_MAX; + } + int call( + int64_t m, + T* A, + int64_t lda, + int64_t k, + T* V, + T* Lambda, + RandBLAS::RNGState &state + ) override; + public: + bool verbosity; + bool timing; + T tol; + int num_krylov_iters; + int max_krylov_iters; + std::vector times; + T norm_R_end; +}; + +// ----------------------------------------------------------------------------- +template +int NysBKI::call( + int64_t m, + T* A, + int64_t lda, + int64_t k, + T* V, + T* Lambda, + RandBLAS::RNGState &state +){ + int iter = 0; + + T* X = ( T * ) calloc( m * (m + k), sizeof( T ) ); + T* X_i = X; + T* Y = ( T * ) calloc( m * (m + k), sizeof( T ) ); + T* Y_i = Y; + + // tau space for QR + T* tau = ( T * ) calloc( k, sizeof( T ) ); + + + // Generate a dense Gaussian random matrx. + RandBLAS::DenseDist D(m, k); + state = RandBLAS::fill_dense(D, X_i, state).second; + // [X_i, ~] = qr(randn(m, m), 0) + lapack::geqrf(m, k, X_i, m, tau); + // Y_i = A * X_i + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, m, 1.0, A, m, X_i, m, 0.0, Y_i, m); + + while(iter < max_krylov_iters) { + // Advance X_i pointer + X_i = X_i + (m * k); + lapack::lacpy(MatrixType::Upper, m, k, X, m, X_i, m); + + if (!iter) { + // X_i+1 = Y_i + tol * X_i; + blas::scal(m * k, this->tol, X_i, 1); + blas::axpy(m * k, 1.0, Y_i, 1, X_i, 1); + } else { + + } + + + + } + + return 0; +} +} // end namespace RandLAPACK +#endif \ No newline at end of file diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 85f4a9e0..9cf9150f 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -110,7 +110,7 @@ static void call_all_algs( T err_rbki; T err_lan; int64_t k_lanc = std::min((int64_t) (num_krylov_iters / (T) 2), k); - bool time_subroutines = false; + bool time_subroutines = true; // Set the threshold for Lanchosz // Setting up Lanchosz - RBKI with k = 1. From ef1bfadb2150880a94f3a8d7e944e23b18e63537 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Fri, 9 Feb 2024 01:18:05 -0800 Subject: [PATCH 30/56] Adding a benchmark for apple project --- RandLAPACK/drivers/rl_cqrrp.hh | 16 ++++++++-------- benchmark/CMakeLists.txt | 7 ++++--- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh index 330b1ca7..54998f78 100644 --- a/RandLAPACK/drivers/rl_cqrrp.hh +++ b/RandLAPACK/drivers/rl_cqrrp.hh @@ -287,7 +287,7 @@ int CQRRP_blocked::call( RandBLAS::sketch_general( Layout::ColMajor, Op::NoTrans, Op::NoTrans, - d, n, m, 1.0, S, 0, 0, A, lda, 0.0, A_sk, d + d, n, m, (T) 1.0, S, 0, 0, A, lda, (T) 0.0, A_sk, d ); if(this -> timing) { @@ -306,7 +306,7 @@ int CQRRP_blocked::call( // Zero-out data - may not be necessary std::fill(&J_buffer[0], &J_buffer[n], 0); std::fill(&J_buffer_lu[0], &J_buffer_lu[std::min(d, n)], 0); - std::fill(&Work2[0], &Work2[n], 0.0); + std::fill(&Work2[0], &Work2[n], (T) 0.0); if(this -> timing) qrcp_t_start = high_resolution_clock::now(); @@ -360,7 +360,7 @@ int CQRRP_blocked::call( // A_pre = AJ(:, 1:b_sz) * inv(R_sk) // Performing preconditioning of the current matrix A. - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, rows, b_sz, 1.0, R_sk, d, A_work, lda); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, rows, b_sz, (T) 1.0, R_sk, d, A_work, lda); if(this -> timing) { preconditioning_t_stop = high_resolution_clock::now(); @@ -371,11 +371,11 @@ int CQRRP_blocked::call( cholqr_t_start = high_resolution_clock::now(); // Performing Cholesky QR - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, b_sz, rows, 1.0, A_work, lda, 0.0, R_cholqr, b_sz_const); + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, b_sz, rows, (T) 1.0, A_work, lda, (T) 0.0, R_cholqr, b_sz_const); lapack::potrf(Uplo::Upper, b_sz, R_cholqr, b_sz_const); // Compute Q_econ from Cholesky QR - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, rows, b_sz, 1.0, R_cholqr, b_sz_const, A_work, lda); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, rows, b_sz, (T) 1.0, R_cholqr, b_sz_const, A_work, lda); if(this -> timing) { cholqr_t_stop = high_resolution_clock::now(); @@ -437,7 +437,7 @@ int CQRRP_blocked::call( // Alternatively, instead of trmm + copy, we could perform a single gemm. // Compute R11 = R11_full(1:b_sz, :) * R_sk // R11_full is stored in R_cholqr space, R_sk is stored in A_sk space. - blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, b_sz, b_sz, 1.0, R_sk, d, R_cholqr, b_sz_const); + blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, b_sz, b_sz, (T) 1.0, R_sk, d, R_cholqr, b_sz_const); // Need to copy R11 over form R_cholqr into the appropriate space in A. // We cannot avoid this copy, since trmm() assumes R_cholqr is a square matrix. // In a global sense, this is identical to: @@ -536,11 +536,11 @@ int CQRRP_blocked::call( // trsm (R_sk, R11) -> R_sk // Clearing the lower-triangular portion here is necessary, if there is a more elegant way, need to use that. RandLAPACK::util::get_U(b_sz, b_sz, R_sk, d); - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, b_sz, b_sz, 1.0, R11, lda, R_sk, d); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, b_sz, b_sz, (T) 1.0, R11, lda, R_sk, d); // R_sk_12 - R_sk_11 * inv(R_11) * R_12 // Side note: might need to be careful when d = b_sz. // Cannot perform trmm here as an alternative, since matrix difference is involved. - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, b_sz, cols - b_sz, b_sz, -1.0, R_sk, d, R12, lda, 1.0, &R_sk[d * b_sz], d); + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, b_sz, cols - b_sz, b_sz, (T) -1.0, R_sk, d, R12, lda, (T) 1.0, &R_sk[d * b_sz], d); // Changing the sampling dimension parameter sampling_dimension = std::min(sampling_dimension, cols); diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 99dfe76a..c6802754 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -58,8 +58,9 @@ add_benchmark(NAME CQRRPT_runtime_breakdown CXX_SOURCES bench_CQRRPT/CQRRPT_runt add_benchmark(NAME CQRRPT_pivot_quality CXX_SOURCES bench_CQRRPT/CQRRPT_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) # CQRRP benchmarks -add_benchmark(NAME CQRRP_speed_comparisons CXX_SOURCES bench_CQRRP/CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRP_speed_comparisons CXX_SOURCES bench_CQRRP/CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRP_Apple_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) add_benchmark(NAME RBKI_speed_comparisons CXX_SOURCES bench_RBKI/RBKI_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) From c354fd791c233dfbb2a224c841d10da4f0431983 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Fri, 9 Feb 2024 02:18:02 -0800 Subject: [PATCH 31/56] Update --- .../CQRRP_Apple_runtime_breakdown.cc | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 benchmark/bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc diff --git a/benchmark/bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc b/benchmark/bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc new file mode 100644 index 00000000..08400e66 --- /dev/null +++ b/benchmark/bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc @@ -0,0 +1,168 @@ +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_gen.hh" + +#include +#include + +template +struct QR_speed_benchmark_data { + int64_t row; + int64_t col; + T tolerance; + T sampling_factor; + std::vector A; + std::vector tau; + std::vector J; + + QR_speed_benchmark_data(int64_t m, int64_t n, T tol, T d_factor) : + A(m * n, 0.0), + tau(n, 0.0), + J(n, 0) + { + row = m; + col = n; + tolerance = tol; + sampling_factor = d_factor; + } +}; + +// Re-generate and clear data +template +static void data_regen(RandLAPACK::gen::mat_gen_info m_info, + QR_speed_benchmark_data &all_data, + RandBLAS::RNGState &state, int apply_itoa) { + + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); + if (apply_itoa) { + std::iota(all_data.J.begin(), all_data.J.end(), 1); + } else { + std::fill(all_data.J.begin(), all_data.J.end(), 0); + } +} + +template +static std::vector call_all_algs( + RandLAPACK::gen::mat_gen_info m_info_cqrrp, + RandLAPACK::gen::mat_gen_info m_info_rest, + int64_t numruns, + int64_t b_sz, + QR_speed_benchmark_data &all_data_cqrrp, + QR_speed_benchmark_data &all_data_rest, + RandBLAS::RNGState &state) { + + auto m = all_data_cqrrp.row; + auto n = all_data_cqrrp.col; + auto tol = all_data_cqrrp.tolerance; + auto d_factor = all_data_cqrrp.sampling_factor; + + // Additional params setup. + RandLAPACK::CQRRP_blocked CQRRP_blocked(false, tol, b_sz); + CQRRP_blocked.nnz = 2; + CQRRP_blocked.num_threads = 48; + // We are nbot using panel pivoting in performance testing. + int panel_pivoting = 0; + + // timing vars + long dur_cqrrp = 0; + long dur_geqrf = 0; + long dur_getrf = 0; + long t_cqrrp_best = 0; + long t_geqrf_best = 0; + long t_getrf_best = 0; + + // Making sure the states are unchanged + auto state_gen = state; + auto state_alg = state; + + for (int i = 0; i < numruns; ++i) { + printf("ITERATION\n"); + // Testing GEQRF + auto start_getrf = high_resolution_clock::now(); + lapack::getrf(m, n, all_data_rest.A.data(), m, all_data_rest.J.data()); + auto stop_getrf = high_resolution_clock::now(); + auto dur_getrf = duration_cast(stop_getrf - start_getrf).count(); + printf("TOTAL TIME FOR GETRF %ld\n", dur_getrf); + // Update best timing + i == 0 ? t_getrf_best = dur_getrf : (dur_getrf < t_getrf_best) ? t_getrf_best = dur_getrf : NULL; + + data_regen(m_info_rest, all_data_rest, state_gen, 0); + state_gen = state; + + // Testing GEQRF + auto start_geqrf = high_resolution_clock::now(); + lapack::geqrf(m, n, all_data_rest.A.data(), m, all_data_rest.tau.data()); + auto stop_geqrf = high_resolution_clock::now(); + dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); + printf("TOTAL TIME FOR GEQRF %ld\n", dur_geqrf); + // Update best timing + i == 0 ? t_geqrf_best = dur_geqrf : (dur_geqrf < t_geqrf_best) ? t_geqrf_best = dur_geqrf : NULL; + + // Clear and re-generate data + data_regen(m_info_rest, all_data_rest, state_gen, 0); + state_gen = state; + + // Testing CQRRP - best setup + auto start_cqrrp = high_resolution_clock::now(); + CQRRP_blocked.call(m, n, all_data_cqrrp.A.data(), m, d_factor, all_data_cqrrp.tau.data(), all_data_cqrrp.J.data(), state_alg); + auto stop_cqrrp = high_resolution_clock::now(); + dur_cqrrp = duration_cast(stop_cqrrp - start_cqrrp).count(); + printf("TOTAL TIME FOR CQRRP %ld\n", dur_cqrrp); + // Update best timing + i == 0 ? t_cqrrp_best = dur_cqrrp : (dur_cqrrp < t_cqrrp_best) ? t_cqrrp_best = dur_cqrrp : NULL; + + // Clear and re-generate data + data_regen(m_info_cqrrp, all_data_cqrrp, state_gen, 1); + state_gen = state; + state_alg = state; + } + + std::vector res{t_cqrrp_best, t_geqrf_best, t_getrf_best}; + + return res; +} + +int main() { + // Declare parameters + int64_t m = std::pow(2, 14); + int64_t n = std::pow(2, 14); + double d_factor = 1.25; + int64_t b_sz_start = 256; + int64_t b_sz_end = 2048; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + auto state_cpy = state; + auto state_constant = state; + // Timing results + std::vector res; + // Number of algorithm runs. We only record best times. + int64_t numruns = 5; + + // Allocate basic workspace - double + QR_speed_benchmark_data all_data_d(m, n, tol, d_factor); + // Generate the input matrix - gaussian suffices for performance tests. + RandLAPACK::gen::mat_gen_info m_info_d(m, n, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info_d, all_data_d.A.data(), state); + + // Allocate basic workspace - float + QR_speed_benchmark_data all_data_f(m, n, (float) tol, (float) d_factor); + // Generate the input matrix - gaussian suffices for performance tests. + RandLAPACK::gen::mat_gen_info m_info_f(m, n, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info_f, all_data_f.A.data(), state_cpy); + + // Declare a data file + std::fstream file("Apple_QR_time_raw_rows_" + std::to_string(m) + + "_cols_" + std::to_string(n) + + "_b_sz_start_" + std::to_string(b_sz_start) + + "_b_sz_end_" + std::to_string(b_sz_end) + + "_d_factor_" + std::to_string(d_factor) + + ".dat", std::fstream::app); +#if !defined(__APPLE__) + for (;b_sz_start <= b_sz_end; b_sz_start *= 2) { + res = call_all_algs(m_info_f, m_info_d, numruns, b_sz_start, all_data_f, all_data_d, state_constant); + file << res[0] << ", " << res[1] << ", " << res[2] << ",\n"; + } +#endif +} From 958fb6b22b3c681f08c85ee135da3453dd0dc0d4 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Sat, 10 Feb 2024 00:36:51 -0800 Subject: [PATCH 32/56] Added RBKI runtime benchmark --- benchmark/CMakeLists.txt | 1 + .../bench_RBKI/RBKI_runtime_benchmark.cc | 160 ++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 benchmark/bench_RBKI/RBKI_runtime_benchmark.cc diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index c6802754..ea306c55 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -64,3 +64,4 @@ add_benchmark(NAME CQRRP_Apple_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_A add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) add_benchmark(NAME RBKI_speed_comparisons CXX_SOURCES bench_RBKI/RBKI_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME RBKI_runtime_benchmark CXX_SOURCES bench_RBKI/RBKI_runtime_benchmark.cc LINK_LIBS ${Benchmark_libs}) \ No newline at end of file diff --git a/benchmark/bench_RBKI/RBKI_runtime_benchmark.cc b/benchmark/bench_RBKI/RBKI_runtime_benchmark.cc new file mode 100644 index 00000000..e4947051 --- /dev/null +++ b/benchmark/bench_RBKI/RBKI_runtime_benchmark.cc @@ -0,0 +1,160 @@ +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_gen.hh" + +#include +#include + +template +struct RBKI_benchmark_data { + int64_t row; + int64_t col; + int64_t rank; // has to be modifiable + T tolerance; + std::vector A; + std::vector U; + std::vector V; + std::vector Sigma; + std::vector Sigma_cpy_RBKI; + std::vector Sigma_cpy_SVD; + std::vector Sigma_cpy_Other; + + RBKI_benchmark_data(int64_t m, int64_t n, int64_t k, T tol) : + A(m * n, 0.0), + U(m * n, 0.0), + V(n * n, 0.0), + Sigma(n, 0.0), + Sigma_cpy_RBKI(n, 0.0), + Sigma_cpy_SVD(n, 0.0), + Sigma_cpy_Other(n, 0.0) + { + row = m; + col = n; + rank = k; + tolerance = tol; + } +}; + +// Re-generate and clear data +template +static void data_regen(RandLAPACK::gen::mat_gen_info m_info, + RBKI_benchmark_data &all_data, + RandBLAS::RNGState &state, int overwrite_A) { + + if (overwrite_A) + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + std::fill(all_data.U.begin(), all_data.U.end(), 0.0); + std::fill(all_data.V.begin(), all_data.V.end(), 0.0); + std::fill(all_data.Sigma.begin(), all_data.Sigma.end(), 0.0); +} + +template +static void call_all_algs( + RandLAPACK::gen::mat_gen_info m_info, + int64_t numruns, + int64_t k, + int64_t num_krylov_iters, + RBKI_benchmark_data &all_data, + RandBLAS::RNGState &state, + std::string output_filename) { + + auto m = all_data.row; + auto n = all_data. col; + auto tol = all_data.tolerance; + bool time_subroutines = true; + + // Additional params setup. + RandLAPACK::RBKI RBKI(false, time_subroutines, tol); + RBKI.max_krylov_iters = num_krylov_iters; + + + // Making sure the states are unchanged + auto state_gen = state; + + // Timing vars + long dur_rbki = 0; + long t_rbki_best = 0; + std::vector inner_timing_best; + + for (int i = 0; i < numruns; ++i) { + printf("Iteration %d start.\n", i); + auto start_rbki = high_resolution_clock::now(); + RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); + auto stop_rbki = high_resolution_clock::now(); + dur_rbki = duration_cast(stop_rbki - start_rbki).count(); + // Update best timing + if (!i || dur_rbki < t_rbki_best) {t_rbki_best = dur_rbki; inner_timing_best = RBKI.times;} + // Clear and re-generate data + data_regen(m_info, all_data, state_gen, 0); + state_gen = state; + } + + // Add info about the run + inner_timing_best.insert (inner_timing_best.begin(), k); + inner_timing_best.insert (inner_timing_best.begin(), num_krylov_iters); + + std::ofstream file(output_filename, std::ios::app); + std::copy(inner_timing_best.begin(), inner_timing_best.end(), std::ostream_iterator(file, ", ")); + file << "\n"; +} + +int main(int argc, char *argv[]) { + + printf("Function begin\n"); + + if(argc <= 1) { + printf("No input provided\n"); + return 0; + } + + int64_t m = 0; + int64_t n = 0; + int64_t k_start = 0; + int64_t k_stop = 0; + int64_t num_krylov_iters_start = 2; + int64_t num_krylov_iters_curr = num_krylov_iters_start; + int64_t num_krylov_iters_stop = 64; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + auto state_constant = state; + int numruns = 5; + std::vector res; + + // Generate the input matrix. + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::custom_input); + m_info.filename = argv[1]; + m_info.workspace_query_mod = 1; + // Workspace query; + RandLAPACK::gen::mat_gen(m_info, NULL, state); + + // Update basic params. + m = m_info.rows; + n = m_info.cols; + k_start = 2;//std::max((int64_t) 1, n / 10); + k_stop = 256;//std::max((int64_t) 1, n / 10); + + // Allocate basic workspace. + RBKI_benchmark_data all_data(m, n, k_stop, tol); + + // Fill the data matrix; + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + + printf("Finished data preparation\n"); + + // Declare a data file + std::string output_filename = "RBKI_runtime_breakdown_m_" + std::to_string(m) + + "_n_" + std::to_string(n) + + "_k_start_" + std::to_string(k_start) + + "_k_stop_" + std::to_string(k_stop) + + "_num_krylov_iters_start_" + std::to_string(num_krylov_iters_start) + + "_num_krylov_iters_stop_" + std::to_string(num_krylov_iters_stop) + + ".dat"; + + for (;k_start <= k_stop; k_start *=2) { + for (;num_krylov_iters_curr <= num_krylov_iters_stop; num_krylov_iters_curr *=2) { + call_all_algs(m_info, numruns, k_start, num_krylov_iters_curr, all_data, state_constant, output_filename); + } + num_krylov_iters_curr = num_krylov_iters_start; + } +} \ No newline at end of file From 1089aa4f71e6ce2fd45b8edbcb15c1b67d4f154b Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 12 Feb 2024 07:48:35 -0800 Subject: [PATCH 33/56] Reworking of RBKI speed benchmark. --- RandLAPACK/misc/rl_gen.hh | 2 +- .../bench_RBKI/RBKI_speed_comparisons.cc | 190 +++++------------- 2 files changed, 47 insertions(+), 145 deletions(-) diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 4f866b08..2490782b 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -420,7 +420,7 @@ void gen_kahan_mat( free(C); } -/// Generates Kahan matrix +/// Reads a matrix from a file template void process_input_mat( int64_t &m, diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 9cf9150f..f41daf2c 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -10,28 +10,24 @@ template struct RBKI_benchmark_data { int64_t row; int64_t col; - int64_t rank; // has to be modifiable T tolerance; std::vector A; std::vector U; std::vector V; std::vector Sigma; std::vector Sigma_cpy_RBKI; - std::vector Sigma_cpy_SVD; - std::vector Sigma_cpy_Other; + std::vector Sigma_SVD; - RBKI_benchmark_data(int64_t m, int64_t n, int64_t k, T tol) : + RBKI_benchmark_data(int64_t m, int64_t n, T tol) : A(m * n, 0.0), U(m * n, 0.0), V(n * n, 0.0), Sigma(n, 0.0), Sigma_cpy_RBKI(n, 0.0), - Sigma_cpy_SVD(n, 0.0), - Sigma_cpy_Other(n, 0.0) + Sigma_SVD(n, 0.0) { row = m; col = n; - rank = k; tolerance = tol; } }; @@ -50,172 +46,76 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, } template -static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, int64_t k, long* break_in, long* break_out, int timing) +static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, int64_t target_rank) { if (iter == 0 || t_curr < t_best) { t_best = t_curr; - blas::copy(k, S1, 1, S2, 1); + blas::copy(target_rank, S1, 1, S2, 1); } - if (timing) - blas::copy(13, break_out, 1, break_in, 1); } -template -static long run_svd( - RandLAPACK::gen::mat_gen_info m_info, - RBKI_benchmark_data &all_data, - RandBLAS::RNGState &state) -{ - auto m = all_data.row; - auto n = all_data.col; - auto tol = all_data.tolerance; - - // Using this call for BLAS/LAPACK warmup - lapack::gesdd(Job::NoVec, 10, 10, all_data.A.data(), 10, all_data.Sigma.data(), all_data.U.data(), 10, all_data.V.data(), 10); - auto state_gen = state; - data_regen(m_info, all_data, state_gen, 1); - - // Testing Other - SVD - auto start_svd = high_resolution_clock::now(); - lapack::gesdd(Job::NoVec, m, n, all_data.A.data(), m, all_data.Sigma.data(), all_data.U.data(), m, all_data.V.data(), n); - auto stop_svd = high_resolution_clock::now(); - long dur_svd = duration_cast(stop_svd - start_svd).count(); - - blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_SVD.data(), 1); - - state_gen = state; - data_regen(m_info, all_data, state_gen, 1); - - return dur_svd; -} - - template static void call_all_algs( RandLAPACK::gen::mat_gen_info m_info, int64_t numruns, - int64_t k, - int64_t num_krylov_iters, + int64_t b_sz, + int64_t target_rank, RBKI_benchmark_data &all_data, RandBLAS::RNGState &state, std::string output_filename, long dur_svd) { + printf("\nBlock size %ld, target rank %ld\n", b_sz, target_rank); int i, j; auto m = all_data.row; auto n = all_data.col; auto tol = all_data.tolerance; T norm_svd_k; - T norm_svd_lanc; T err_rbki; - T err_lan; - int64_t k_lanc = std::min((int64_t) (num_krylov_iters / (T) 2), k); - bool time_subroutines = true; - - // Set the threshold for Lanchosz - // Setting up Lanchosz - RBKI with k = 1. - RandLAPACK::RBKI Lanchosz(false, false, tol); - Lanchosz.max_krylov_iters = num_krylov_iters; + bool time_subroutines = false; // Additional params setup. RandLAPACK::RBKI RBKI(false, time_subroutines, tol); - RBKI.max_krylov_iters = num_krylov_iters; + // Matrices R or S that give us the singular value spectrum returned by RBKI will be of size b_sz * num_krylov_iters / 2. + // These matrices will be full-rank. + // Hence, target_rank = b_sz * num_krylov_iters / 2 + RBKI.max_krylov_iters = (int) ((target_rank * 2) / b_sz); // timing vars - long dur_rbki = 0; - long dur_lanchosz = 0; - long t_rbki_best = 0; - long t_lanchosz_best = 0; + long dur_rbki = 0; + long t_rbki_best = 0; // Making sure the states are unchanged auto state_gen = state; - //auto state_alg = state; - - // Timing breakdown vectors; - std::vector Lanc_timing_breakdown (13, 0.0); - std::vector RBKI_timing_breakdown (13, 0.0); for (i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); - - // Testing Lanchosz - auto start_lanchosz = high_resolution_clock::now(); - Lanchosz.call(m, n, all_data.A.data(), m, 1, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); - auto stop_lanchosz = high_resolution_clock::now(); - dur_lanchosz = duration_cast(stop_lanchosz - start_lanchosz).count(); - - // Update best timing and save the singular values. - update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_Other.data(), k_lanc, Lanc_timing_breakdown.data(), Lanchosz.times.data(), false); - - state_gen = state; - data_regen(m_info, all_data, state_gen, 0); // Testing RBKI auto start_rbki = high_resolution_clock::now(); - RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); - + RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); auto stop_rbki = high_resolution_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); // Update best timing and save the singular values. - update_best_time(i, t_rbki_best, dur_rbki, all_data.Sigma.data(), all_data.Sigma_cpy_RBKI.data(), k, RBKI_timing_breakdown.data(), RBKI.times.data(), time_subroutines); + update_best_time(i, t_rbki_best, dur_rbki, all_data.Sigma.data(), all_data.Sigma_cpy_RBKI.data(), target_rank); state_gen = state; data_regen(m_info, all_data, state_gen, 0); } - for(j = 0; j < k; ++j) - all_data.Sigma_cpy_RBKI[j] -= all_data.Sigma_cpy_SVD[j]; + for(j = 0; j < target_rank; ++j) + all_data.Sigma_cpy_RBKI[j] -= all_data.Sigma_SVD[j]; - for(j = 0; j < k_lanc; ++j) - all_data.Sigma_cpy_Other[j] -= all_data.Sigma_cpy_SVD[j]; - - norm_svd_k = blas::nrm2(k, all_data.Sigma_cpy_SVD.data(), 1); - norm_svd_lanc = blas::nrm2(k_lanc, all_data.Sigma_cpy_SVD.data(), 1); - - err_rbki = blas::nrm2(k, all_data.Sigma_cpy_RBKI.data(), 1) / norm_svd_k; - err_lan = blas::nrm2(k_lanc, all_data.Sigma_cpy_Other.data(), 1) / norm_svd_lanc; - - if (time_subroutines) { - printf("\n\n/------------RBKI TIMING RESULTS BEGIN------------/\n"); - printf("Basic info: b_sz=%ld krylov_iters=%ld\n", k, num_krylov_iters); - - printf("Allocate and free time: %25ld μs,\n", RBKI_timing_breakdown[0]); - printf("Time to acquire the SVD factors: %25ld μs,\n", RBKI_timing_breakdown[1]); - printf("UNGQR time: %25ld μs,\n", RBKI_timing_breakdown[2]); - printf("Reorthogonalization time: %25ld μs,\n", RBKI_timing_breakdown[3]); - printf("QR time: %25ld μs,\n", RBKI_timing_breakdown[4]); - printf("GEMM A time: %25ld μs,\n", RBKI_timing_breakdown[5]); - printf("Sketching time: %25ld μs,\n", RBKI_timing_breakdown[7]); - printf("R_ii cpy time: %25ld μs,\n", RBKI_timing_breakdown[8]); - printf("S_ii cpy time: %25ld μs,\n", RBKI_timing_breakdown[9]); - printf("Norm time: %25ld μs,\n", RBKI_timing_breakdown[10]); - - printf("\nAllocation takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[0] / (T) RBKI_timing_breakdown[12])); - printf("Factors takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[1] / (T) RBKI_timing_breakdown[12])); - printf("Ungqr takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[2] / (T) RBKI_timing_breakdown[12])); - printf("Reorth takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[3] / (T) RBKI_timing_breakdown[12])); - printf("QR takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[4] / (T) RBKI_timing_breakdown[12])); - printf("GEMM A takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[5] / (T) RBKI_timing_breakdown[12])); - printf("Sketching takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[7] / (T) RBKI_timing_breakdown[12])); - printf("R_ii cpy takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[8] / (T) RBKI_timing_breakdown[12])); - printf("S_ii cpy takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[9] / (T) RBKI_timing_breakdown[12])); - printf("Norm R takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[10] / (T) RBKI_timing_breakdown[12])); - printf("Rest takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[11] / (T) RBKI_timing_breakdown[12])); - - printf("\nMain loop takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[6] / (T) RBKI_timing_breakdown[12])); - printf("/-------------RBKI TIMING RESULTS END-------------/\n\n"); - } + norm_svd_k = blas::nrm2(target_rank, all_data.Sigma_SVD.data(), 1); + err_rbki = blas::nrm2(target_rank, all_data.Sigma_cpy_RBKI.data(), 1) / norm_svd_k; // Print accuracy info printf("||Sigma_ksvd - Sigma_rbki||_F / ||Sigma_ksvd||_F: %.16e\n", err_rbki); - printf("||Sigma_ksvd - Sigma_lanc||_F / ||Sigma_lanc||_F: %.16e\n", err_lan); - - printf("RBKI is %f times faster that SVD.\n", (T) dur_svd / t_rbki_best); - printf("Lanchosz is %f times faster that SVD.\n", (T) dur_svd / t_lanchosz_best); + printf("RBKI is %f times faster that SVD.\n", (T) dur_svd / t_rbki_best); std::ofstream file(output_filename, std::ios::app); - file << k << ", " << num_krylov_iters << ", " << err_rbki << ", " << err_lan << ", " << t_rbki_best << ", " << dur_svd << ", " << t_lanchosz_best << ",\n"; + file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << err_rbki << ", " << t_rbki_best << ", " << dur_svd << ",\n"; } int main(int argc, char *argv[]) { @@ -229,15 +129,16 @@ int main(int argc, char *argv[]) { int64_t m = 0; int64_t n = 0; - int64_t k_start = 0; - int64_t k_stop = 0; - int64_t num_krylov_iters_start = 2; - int64_t num_krylov_iters_curr = num_krylov_iters_start; - int64_t num_krylov_iters_stop = 64; + int64_t b_sz_start = 0; + int64_t b_sz_stop = 0; + int64_t target_rank_start = 256; + int64_t target_rank_curr = target_rank_start; + int64_t target_rank_stop = 4096; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; - int numruns = 10; + int numruns = 5; + long dur_svd = 0; std::vector res; // Generate the input matrix. @@ -250,33 +151,34 @@ int main(int argc, char *argv[]) { // Update basic params. m = m_info.rows; n = m_info.cols; - k_start = 2;//std::max((int64_t) 1, n / 10); - k_stop = 256;//std::max((int64_t) 1, n / 10); + b_sz_start = 2;//std::max((int64_t) 1, n / 10); + b_sz_stop = 128;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. - RBKI_benchmark_data all_data(m, n, k_stop, tol); + RBKI_benchmark_data all_data(m, n, tol); // Fill the data matrix; RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + // Read the singular vectors from argv2 + int64_t buf1 = 1; + int buf2 = 0; + RandLAPACK::gen::process_input_mat(m, buf1, all_data.Sigma_SVD.data(), argv[2], buf2); printf("Finished data preparation\n"); // Declare a data file - std::string output_filename = "RBKI_speed_comp_m_" + std::to_string(m) + std::string output_filename = "RBKI_speed_comp_m_" + std::to_string(m) + "_n_" + std::to_string(n) - + "_k_start_" + std::to_string(k_start) - + "_k_stop_" + std::to_string(k_stop) - + "_num_krylov_iters_start_" + std::to_string(num_krylov_iters_start) - + "_num_krylov_iters_stop_" + std::to_string(num_krylov_iters_stop) + + "_b_sz_start_" + std::to_string(b_sz_start) + + "_b_sz_stop_" + std::to_string(b_sz_stop) + + "_num_krylov_iters_start_" + std::to_string(target_rank_start) + + "_num_krylov_iters_stop_" + std::to_string(target_rank_stop) + ".dat"; - // SVD run takes very long & is only needed once for all sizes - long dur_svd = run_svd(m_info, all_data, state); - - for (;k_start <= k_stop; k_start *=2) { - for (;num_krylov_iters_curr <= num_krylov_iters_stop; num_krylov_iters_curr *=2) { - call_all_algs(m_info, numruns, k_start, num_krylov_iters_curr, all_data, state_constant, output_filename, dur_svd); + for (;b_sz_start <= b_sz_stop; b_sz_start *=2) { + for (;target_rank_curr <= target_rank_stop; target_rank_curr *=2) { + call_all_algs(m_info, numruns, b_sz_start, target_rank_curr, all_data, state_constant, output_filename, dur_svd); } - num_krylov_iters_curr = num_krylov_iters_start; + target_rank_curr = target_rank_start; } } \ No newline at end of file From 28ee6cad0126498cab4bb8b4cea071fc944e2d67 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 12 Feb 2024 08:39:39 -0800 Subject: [PATCH 34/56] Update --- .../bench_RBKI/RBKI_speed_comparisons.cc | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index f41daf2c..91bcc231 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -88,6 +88,9 @@ static void call_all_algs( // Making sure the states are unchanged auto state_gen = state; + // Pre-compute the 2-norm of the Sigma vector from Direct SVD + norm_svd_k = blas::nrm2(target_rank, all_data.Sigma_SVD.data(), 1); + for (i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); @@ -96,26 +99,22 @@ static void call_all_algs( RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); auto stop_rbki = high_resolution_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); + + for(j = 0; j < target_rank; ++j) + all_data.Sigma[j] -= all_data.Sigma_SVD[j]; - // Update best timing and save the singular values. - update_best_time(i, t_rbki_best, dur_rbki, all_data.Sigma.data(), all_data.Sigma_cpy_RBKI.data(), target_rank); + err_rbki = blas::nrm2(target_rank, all_data.Sigma.data(), 1) / norm_svd_k; + // Print accuracy info + printf("||Sigma_ksvd - Sigma_rbki||_F / ||Sigma_ksvd||_F: %.16e\n", err_rbki); + printf("RBKI is %f times faster that SVD.\n", (T) dur_svd / t_rbki_best); + + std::ofstream file(output_filename, std::ios::app); + file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << err_rbki << ", " << dur_rbki << ", " << dur_svd << ",\n"; + state_gen = state; data_regen(m_info, all_data, state_gen, 0); } - - for(j = 0; j < target_rank; ++j) - all_data.Sigma_cpy_RBKI[j] -= all_data.Sigma_SVD[j]; - - norm_svd_k = blas::nrm2(target_rank, all_data.Sigma_SVD.data(), 1); - err_rbki = blas::nrm2(target_rank, all_data.Sigma_cpy_RBKI.data(), 1) / norm_svd_k; - - // Print accuracy info - printf("||Sigma_ksvd - Sigma_rbki||_F / ||Sigma_ksvd||_F: %.16e\n", err_rbki); - printf("RBKI is %f times faster that SVD.\n", (T) dur_svd / t_rbki_best); - - std::ofstream file(output_filename, std::ios::app); - file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << err_rbki << ", " << t_rbki_best << ", " << dur_svd << ",\n"; } int main(int argc, char *argv[]) { @@ -131,7 +130,7 @@ int main(int argc, char *argv[]) { int64_t n = 0; int64_t b_sz_start = 0; int64_t b_sz_stop = 0; - int64_t target_rank_start = 256; + int64_t target_rank_start = 512; int64_t target_rank_curr = target_rank_start; int64_t target_rank_stop = 4096; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); @@ -151,7 +150,7 @@ int main(int argc, char *argv[]) { // Update basic params. m = m_info.rows; n = m_info.cols; - b_sz_start = 2;//std::max((int64_t) 1, n / 10); + b_sz_start = 8;//std::max((int64_t) 1, n / 10); b_sz_stop = 128;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. From d8a1fd24f4bf50d423e2d72e857f584779c1da49 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Wed, 28 Feb 2024 01:18:21 -0800 Subject: [PATCH 35/56] RBKI benchmark update, print statements in --- RandLAPACK/drivers/rl_rbki.hh | 50 ++++++-- .../bench_RBKI/RBKI_speed_comparisons.cc | 114 +++++++++++++----- 2 files changed, 122 insertions(+), 42 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index be062358..0721ad13 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -127,7 +127,7 @@ int RBKI::call( int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0; T norm_R = 0; int64_t space_rows = k * std::ceil(m / (T) k); - int max_iters = std::min(this->max_krylov_iters, (int) (n / (T) k)); + int max_iters = this->max_krylov_iters;//std::min(this->max_krylov_iters, (int) (n / (T) k)); // We need a full copy of X and Y all the way through the algorithm // due to an operation with X_odd and Y_odd happening at the end. @@ -140,7 +140,12 @@ int RBKI::call( // While R and S matrices are structured (both band), we cannot make use of this structure through // BLAS-level functions. // Note also that we store a transposed version of R. - T* R = ( T * ) calloc( n * n, sizeof( T ) ); + // + // At each iterations, matrices R and S grow by b_sz. + // At the end, size of R would by d x d and size of S would + // be (d + 1) x d, where d = numiters_complete * b_sz, d <= n. + // Note that the total amount of iterations will always be numiters <= n * 2 / block_size + T* R = ( T * ) calloc( n * n, sizeof( T ) ); T* S = ( T * ) calloc( (n + k) * n, sizeof( T ) ); T* Y_orth_buf = ( T * ) calloc( k * n, sizeof( T ) ); @@ -177,16 +182,19 @@ int RBKI::call( // Generate a dense Gaussian random matrx. // OMP_NUM_THREADS=4 seems to be the best option for dense sketch generation. - omp_set_num_threads(4); + //omp_set_num_threads(4); RandBLAS::DenseDist D(n, k); state = RandBLAS::fill_dense(D, Y_i, state).second; - omp_set_num_threads(48); + //omp_set_num_threads(48); if(this -> timing) { sketching_t_stop = high_resolution_clock::now(); sketching_t_dur = duration_cast(sketching_t_stop - sketching_t_start).count(); gemm_A_t_start = high_resolution_clock::now(); } + printf("m %d, n %d, k %d\n", m, n, k); + char name[] = "A"; + //RandBLAS::util::print_colmaj(n, k, Y_i, name); // [X_ev, ~] = qr(A * Y_i, 0) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); @@ -222,12 +230,12 @@ int RBKI::call( // Iterate until in-loop termination criteria is met. - while((iter_ev + iter_od) < max_iters) { + while(1) { if(this -> timing) main_loop_t_start = high_resolution_clock::now(); if (iter % 2 != 0) { - + printf("First\n"); if(this -> timing) gemm_A_t_start = high_resolution_clock::now(); @@ -299,7 +307,7 @@ int RBKI::call( // Early termination // if (abs(R(end)) <= sqrt(eps('double'))) if(std::abs(R_ii[(n + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { - //printf("TERMINATION 1 at iteration %ld\n", iter_ev); + printf("TERMINATION 1 at iteration %ld\n", iter); break; } @@ -311,7 +319,7 @@ int RBKI::call( ++iter_ev; } else { - + printf("Second\n"); if(this -> timing) gemm_A_t_start = high_resolution_clock::now(); @@ -378,7 +386,7 @@ int RBKI::call( // Early termination // if (abs(S(end)) <= sqrt(eps('double'))) if(std::abs(S_ii[((n + k) + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { - //printf("TERMINATION 2 at iteration %ld\n", iter_od); + printf("TERMINATION 2 at iteration %ld\n", iter); break; } @@ -403,17 +411,26 @@ int RBKI::call( main_loop_t_dur += duration_cast(main_loop_t_stop - main_loop_t_start).count(); } + if (iter >= max_iters) { + break; + } + ++iter; //norm(R, 'fro') > sqrt(1 - sq_tol) * norm_A if(norm_R > threshold) { + printf("Threshold termination\n"); break; } + printf("Iter_ev + iter_od %d\n", iter_ev + iter_od); } + printf("Total iters %d\n", iter); this -> norm_R_end = norm_R; this->num_krylov_iters = iter; iter % 2 == 0 ? end_rows = k * (iter_ev + 1), end_cols = k * iter_ev : end_rows = k * (iter_od + 1), end_cols = k * iter_od; + printf("End rows & cols %d, %d\n", end_rows, end_cols); + if(this -> timing) { allocation_t_start = high_resolution_clock::now(); } @@ -427,12 +444,24 @@ int RBKI::call( get_factors_t_start = high_resolution_clock::now(); } - if (iter % 2 == 0) { + if (iter % 2 != 0) { + printf("First option\n"); // [U_hat, Sigma, V_hat] = svd(R') lapack::gesdd(Job::SomeVec, end_rows, end_cols, R, n, Sigma, U_hat, end_rows, VT_hat, end_cols); } else { + printf("Second option\n"); // [U_hat, Sigma, V_hat] = svd(S) lapack::gesdd(Job::SomeVec, end_rows, end_cols, S, n + k, Sigma, U_hat, end_rows, VT_hat, end_cols); + /* + char name1[] = "U_hat"; + RandBLAS::util::print_colmaj(end_rows, end_cols, U_hat, name1); + + char name3[] = "Sigma"; + RandBLAS::util::print_colmaj(n, 1, Sigma, name3); + + char name2[] = "V_hat"; + RandBLAS::util::print_colmaj(end_cols, end_cols, VT_hat, name2); + */ } // U = X_ev * U_hat blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); @@ -499,7 +528,6 @@ int RBKI::call( printf("/-------------RBKI TIMING RESULTS END-------------/\n\n"); } } - return 0; } } // end namespace RandLAPACK diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 91bcc231..2712a08e 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -13,18 +13,16 @@ struct RBKI_benchmark_data { T tolerance; std::vector A; std::vector U; - std::vector V; + std::vector VT; // RBKI returns V' std::vector Sigma; - std::vector Sigma_cpy_RBKI; - std::vector Sigma_SVD; + std::vector U_cpy; + std::vector VT_cpy; RBKI_benchmark_data(int64_t m, int64_t n, T tol) : A(m * n, 0.0), U(m * n, 0.0), - V(n * n, 0.0), - Sigma(n, 0.0), - Sigma_cpy_RBKI(n, 0.0), - Sigma_SVD(n, 0.0) + VT(n * n, 0.0), + Sigma(n, 0.0) { row = m; col = n; @@ -41,7 +39,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, if (overwrite_A) RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); std::fill(all_data.U.begin(), all_data.U.end(), 0.0); - std::fill(all_data.V.begin(), all_data.V.end(), 0.0); + std::fill(all_data.VT.begin(), all_data.VT.end(), 0.0); std::fill(all_data.Sigma.begin(), all_data.Sigma.end(), 0.0); } @@ -54,12 +52,62 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, } } + // This routine computes the residual norm error, consisting of two parts (one of which) vanishes + // in exact precision. Target_rank defines size of U, V as returned by RBKI; custom_rank <= target_rank. + template + static T + residual_error_comp(RBKI_benchmark_data &all_data, int64_t target_rank, int64_t custom_rank) { + + auto m = all_data.row; + auto n = all_data.col; + + T* U_cpy_dat = RandLAPACK::util::upsize(m * target_rank, all_data.U_cpy); + T* VT_cpy_dat = RandLAPACK::util::upsize(n * target_rank, all_data.VT_cpy); + + lapack::lacpy(MatrixType::General, m, target_rank, all_data.U.data(), m, U_cpy_dat, m); + lapack::lacpy(MatrixType::General, n, target_rank, all_data.VT.data(), n, VT_cpy_dat, n); + + // AV - US + // Scale columns of U by S + for (int i = 0; i < target_rank; ++i) + blas::scal(n, all_data.Sigma[i], &U_cpy_dat[m * i], 1); + + // Compute AV(:, 1:custom_rank) - SU(1:custom_rank) + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, m, custom_rank, n, 1.0, all_data.A.data(), m, all_data.VT.data(), n, -1.0, U_cpy_dat, m); + + // A'U - VS + // Scale columns of V by S + // Since we have VT, we will be scaling its rows + + //char name[] = "V_cpy_pre"; + //RandBLAS::util::print_colmaj(n, n, VT_cpy_dat, name); + + for (int i = 0; i < n; ++i) + blas::scal(custom_rank, all_data.Sigma[i], &VT_cpy_dat[i], n); + + //char name1[] = "V_cpy_post"; + //RandBLAS::util::print_colmaj(n, n, VT_cpy_dat, name1); + + // Compute A'U(:, 1:custom_rank) - VS(1:custom_rank). + // We will actually have to perform U' * A - Sigma * VT. + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, target_rank, custom_rank, m, 1.0, all_data.U.data(), m, all_data.A.data(), m, -1.0, VT_cpy_dat, n); + + //char name3[] = "A'U"; + //RandBLAS::util::print_colmaj(n, n, VT_cpy_dat, name3); + + T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m) / std::sqrt(custom_rank); + T nrm2 = lapack::lange(Norm::Fro, target_rank, custom_rank, VT_cpy_dat, n) / std::sqrt(custom_rank); + + return std::sqrt( std::pow(nrm2, 2) ); + } + template static void call_all_algs( RandLAPACK::gen::mat_gen_info m_info, int64_t numruns, int64_t b_sz, int64_t target_rank, + int64_t custom_rank, RBKI_benchmark_data &all_data, RandBLAS::RNGState &state, std::string output_filename, @@ -67,9 +115,9 @@ static void call_all_algs( printf("\nBlock size %ld, target rank %ld\n", b_sz, target_rank); int i, j; - auto m = all_data.row; - auto n = all_data.col; - auto tol = all_data.tolerance; + auto m = all_data.row; + auto n = all_data.col; + auto tol = all_data.tolerance; T norm_svd_k; T err_rbki; bool time_subroutines = false; @@ -80,6 +128,7 @@ static void call_all_algs( // These matrices will be full-rank. // Hence, target_rank = b_sz * num_krylov_iters / 2 RBKI.max_krylov_iters = (int) ((target_rank * 2) / b_sz); + printf("Max Krylov iters %d\n", RBKI.max_krylov_iters); // timing vars long dur_rbki = 0; @@ -88,29 +137,35 @@ static void call_all_algs( // Making sure the states are unchanged auto state_gen = state; - // Pre-compute the 2-norm of the Sigma vector from Direct SVD - norm_svd_k = blas::nrm2(target_rank, all_data.Sigma_SVD.data(), 1); - for (i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); // Testing RBKI auto start_rbki = high_resolution_clock::now(); - RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); + RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.VT.data(), all_data.Sigma.data(), state); auto stop_rbki = high_resolution_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); + + char name[] = "A"; + //RandBLAS::util::print_colmaj(m, n, all_data.A.data(), name); + + char name1[] = "U"; + //RandBLAS::util::print_colmaj(m, target_rank, all_data.U.data(), name1); + + char name3[] = "Sigma"; + //RandBLAS::util::print_colmaj(target_rank, 1, all_data.Sigma.data(), name3); + + char name2[] = "VT"; + //RandBLAS::util::print_colmaj(n, n, all_data.VT.data(), name2); - for(j = 0; j < target_rank; ++j) - all_data.Sigma[j] -= all_data.Sigma_SVD[j]; - err_rbki = blas::nrm2(target_rank, all_data.Sigma.data(), 1) / norm_svd_k; + T residual_err = residual_error_comp(all_data, target_rank, custom_rank); // Print accuracy info - printf("||Sigma_ksvd - Sigma_rbki||_F / ||Sigma_ksvd||_F: %.16e\n", err_rbki); - printf("RBKI is %f times faster that SVD.\n", (T) dur_svd / t_rbki_best); - + printf("sqrt(||AV - SU||^2_F + ||A'U - VS||^2_F) / sqrt(traget_rank): %.16e\n", residual_err); + std::ofstream file(output_filename, std::ios::app); - file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << err_rbki << ", " << dur_rbki << ", " << dur_svd << ",\n"; + file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << custom_rank << ", " << residual_err << ", " << dur_rbki << ", " << dur_svd << ",\n"; state_gen = state; data_regen(m_info, all_data, state_gen, 0); @@ -132,11 +187,12 @@ int main(int argc, char *argv[]) { int64_t b_sz_stop = 0; int64_t target_rank_start = 512; int64_t target_rank_curr = target_rank_start; - int64_t target_rank_stop = 4096; + int64_t target_rank_stop = 512; + int64_t custom_rank = 32; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; - int numruns = 5; + int numruns = 1; long dur_svd = 0; std::vector res; @@ -150,18 +206,14 @@ int main(int argc, char *argv[]) { // Update basic params. m = m_info.rows; n = m_info.cols; - b_sz_start = 8;//std::max((int64_t) 1, n / 10); - b_sz_stop = 128;//std::max((int64_t) 1, n / 10); + b_sz_start = 16;//std::max((int64_t) 1, n / 10); + b_sz_stop = 16;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. RBKI_benchmark_data all_data(m, n, tol); // Fill the data matrix; RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); - // Read the singular vectors from argv2 - int64_t buf1 = 1; - int buf2 = 0; - RandLAPACK::gen::process_input_mat(m, buf1, all_data.Sigma_SVD.data(), argv[2], buf2); printf("Finished data preparation\n"); @@ -176,7 +228,7 @@ int main(int argc, char *argv[]) { for (;b_sz_start <= b_sz_stop; b_sz_start *=2) { for (;target_rank_curr <= target_rank_stop; target_rank_curr *=2) { - call_all_algs(m_info, numruns, b_sz_start, target_rank_curr, all_data, state_constant, output_filename, dur_svd); + call_all_algs(m_info, numruns, b_sz_start, target_rank_curr, custom_rank, all_data, state_constant, output_filename, dur_svd); } target_rank_curr = target_rank_start; } From 01b1f8a19286768b7b697253ed730baec47f3dab Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Wed, 28 Feb 2024 01:39:31 -0800 Subject: [PATCH 36/56] Ready to benchmark on large matrices --- RandLAPACK/drivers/rl_rbki.hh | 27 ++---------- .../bench_RBKI/RBKI_speed_comparisons.cc | 42 +++++-------------- 2 files changed, 13 insertions(+), 56 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 0721ad13..045cdc6e 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -192,9 +192,6 @@ int RBKI::call( sketching_t_dur = duration_cast(sketching_t_stop - sketching_t_start).count(); gemm_A_t_start = high_resolution_clock::now(); } - printf("m %d, n %d, k %d\n", m, n, k); - char name[] = "A"; - //RandBLAS::util::print_colmaj(n, k, Y_i, name); // [X_ev, ~] = qr(A * Y_i, 0) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); @@ -235,7 +232,6 @@ int RBKI::call( main_loop_t_start = high_resolution_clock::now(); if (iter % 2 != 0) { - printf("First\n"); if(this -> timing) gemm_A_t_start = high_resolution_clock::now(); @@ -307,7 +303,7 @@ int RBKI::call( // Early termination // if (abs(R(end)) <= sqrt(eps('double'))) if(std::abs(R_ii[(n + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { - printf("TERMINATION 1 at iteration %ld\n", iter); + //printf("TERMINATION 1 at iteration %ld\n", iter); break; } @@ -319,7 +315,6 @@ int RBKI::call( ++iter_ev; } else { - printf("Second\n"); if(this -> timing) gemm_A_t_start = high_resolution_clock::now(); @@ -386,7 +381,7 @@ int RBKI::call( // Early termination // if (abs(S(end)) <= sqrt(eps('double'))) if(std::abs(S_ii[((n + k) + 1) * (k - 1)]) < std::sqrt(std::numeric_limits::epsilon())) { - printf("TERMINATION 2 at iteration %ld\n", iter); + //printf("TERMINATION 2 at iteration %ld\n", iter); break; } @@ -418,19 +413,15 @@ int RBKI::call( ++iter; //norm(R, 'fro') > sqrt(1 - sq_tol) * norm_A if(norm_R > threshold) { - printf("Threshold termination\n"); + //printf("Threshold termination\n"); break; } - printf("Iter_ev + iter_od %d\n", iter_ev + iter_od); } - printf("Total iters %d\n", iter); this -> norm_R_end = norm_R; this->num_krylov_iters = iter; iter % 2 == 0 ? end_rows = k * (iter_ev + 1), end_cols = k * iter_ev : end_rows = k * (iter_od + 1), end_cols = k * iter_od; - printf("End rows & cols %d, %d\n", end_rows, end_cols); - if(this -> timing) { allocation_t_start = high_resolution_clock::now(); } @@ -445,23 +436,11 @@ int RBKI::call( } if (iter % 2 != 0) { - printf("First option\n"); // [U_hat, Sigma, V_hat] = svd(R') lapack::gesdd(Job::SomeVec, end_rows, end_cols, R, n, Sigma, U_hat, end_rows, VT_hat, end_cols); } else { - printf("Second option\n"); // [U_hat, Sigma, V_hat] = svd(S) lapack::gesdd(Job::SomeVec, end_rows, end_cols, S, n + k, Sigma, U_hat, end_rows, VT_hat, end_cols); - /* - char name1[] = "U_hat"; - RandBLAS::util::print_colmaj(end_rows, end_cols, U_hat, name1); - - char name3[] = "Sigma"; - RandBLAS::util::print_colmaj(n, 1, Sigma, name3); - - char name2[] = "V_hat"; - RandBLAS::util::print_colmaj(end_cols, end_cols, VT_hat, name2); - */ } // U = X_ev * U_hat blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 2712a08e..3df38609 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -71,30 +71,18 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, // Scale columns of U by S for (int i = 0; i < target_rank; ++i) blas::scal(n, all_data.Sigma[i], &U_cpy_dat[m * i], 1); - // Compute AV(:, 1:custom_rank) - SU(1:custom_rank) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, m, custom_rank, n, 1.0, all_data.A.data(), m, all_data.VT.data(), n, -1.0, U_cpy_dat, m); // A'U - VS // Scale columns of V by S // Since we have VT, we will be scaling its rows - - //char name[] = "V_cpy_pre"; - //RandBLAS::util::print_colmaj(n, n, VT_cpy_dat, name); - for (int i = 0; i < n; ++i) blas::scal(custom_rank, all_data.Sigma[i], &VT_cpy_dat[i], n); - - //char name1[] = "V_cpy_post"; - //RandBLAS::util::print_colmaj(n, n, VT_cpy_dat, name1); - // Compute A'U(:, 1:custom_rank) - VS(1:custom_rank). // We will actually have to perform U' * A - Sigma * VT. blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, target_rank, custom_rank, m, 1.0, all_data.U.data(), m, all_data.A.data(), m, -1.0, VT_cpy_dat, n); - //char name3[] = "A'U"; - //RandBLAS::util::print_colmaj(n, n, VT_cpy_dat, name3); - T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m) / std::sqrt(custom_rank); T nrm2 = lapack::lange(Norm::Fro, target_rank, custom_rank, VT_cpy_dat, n) / std::sqrt(custom_rank); @@ -145,27 +133,17 @@ static void call_all_algs( RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.VT.data(), all_data.Sigma.data(), state); auto stop_rbki = high_resolution_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); - - char name[] = "A"; - //RandBLAS::util::print_colmaj(m, n, all_data.A.data(), name); - - char name1[] = "U"; - //RandBLAS::util::print_colmaj(m, target_rank, all_data.U.data(), name1); - - char name3[] = "Sigma"; - //RandBLAS::util::print_colmaj(target_rank, 1, all_data.Sigma.data(), name3); - - char name2[] = "VT"; - //RandBLAS::util::print_colmaj(n, n, all_data.VT.data(), name2); - T residual_err = residual_error_comp(all_data, target_rank, custom_rank); + T residual_err_custom = residual_error_comp(all_data, target_rank, custom_rank); + T residual_err_target = residual_error_comp(all_data, target_rank, target_rank); // Print accuracy info - printf("sqrt(||AV - SU||^2_F + ||A'U - VS||^2_F) / sqrt(traget_rank): %.16e\n", residual_err); + printf("sqrt(||AV - SU||^2_F + ||A'U - VS||^2_F) / sqrt(custom_rank): %.16e\n", residual_err_custom); + printf("sqrt(||AV - SU||^2_F + ||A'U - VS||^2_F) / sqrt(traget_rank): %.16e\n", residual_err_target); std::ofstream file(output_filename, std::ios::app); - file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << custom_rank << ", " << residual_err << ", " << dur_rbki << ", " << dur_svd << ",\n"; + file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << custom_rank << ", " << residual_err_target << ", " << residual_err_custom << ", " << dur_rbki << ", " << dur_svd << ",\n"; state_gen = state; data_regen(m_info, all_data, state_gen, 0); @@ -187,12 +165,12 @@ int main(int argc, char *argv[]) { int64_t b_sz_stop = 0; int64_t target_rank_start = 512; int64_t target_rank_curr = target_rank_start; - int64_t target_rank_stop = 512; - int64_t custom_rank = 32; + int64_t target_rank_stop = 4096; + int64_t custom_rank = 10; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; - int numruns = 1; + int numruns = 3; long dur_svd = 0; std::vector res; @@ -206,8 +184,8 @@ int main(int argc, char *argv[]) { // Update basic params. m = m_info.rows; n = m_info.cols; - b_sz_start = 16;//std::max((int64_t) 1, n / 10); - b_sz_stop = 16;//std::max((int64_t) 1, n / 10); + b_sz_start = 8;//std::max((int64_t) 1, n / 10); + b_sz_stop = 128;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. RBKI_benchmark_data all_data(m, n, tol); From 7b81e1d77751a8e9af5921522d9dd015d70b507c Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 5 Mar 2024 00:38:06 -0800 Subject: [PATCH 37/56] Update --- benchmark/bench_RBKI/RBKI_speed_comparisons.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 3df38609..92f725f6 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -83,10 +83,10 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, // We will actually have to perform U' * A - Sigma * VT. blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, target_rank, custom_rank, m, 1.0, all_data.U.data(), m, all_data.A.data(), m, -1.0, VT_cpy_dat, n); - T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m) / std::sqrt(custom_rank); - T nrm2 = lapack::lange(Norm::Fro, target_rank, custom_rank, VT_cpy_dat, n) / std::sqrt(custom_rank); + T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m); + T nrm2 = lapack::lange(Norm::Fro, target_rank, custom_rank, VT_cpy_dat, n); - return std::sqrt( std::pow(nrm2, 2) ); + return std::sqrt(std::pow(nrm1, 2) + std::pow(nrm2, 2)); } template @@ -165,7 +165,7 @@ int main(int argc, char *argv[]) { int64_t b_sz_stop = 0; int64_t target_rank_start = 512; int64_t target_rank_curr = target_rank_start; - int64_t target_rank_stop = 4096; + int64_t target_rank_stop = 512; int64_t custom_rank = 10; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); @@ -184,8 +184,8 @@ int main(int argc, char *argv[]) { // Update basic params. m = m_info.rows; n = m_info.cols; - b_sz_start = 8;//std::max((int64_t) 1, n / 10); - b_sz_stop = 128;//std::max((int64_t) 1, n / 10); + b_sz_start = 16;//std::max((int64_t) 1, n / 10); + b_sz_stop = 16;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. RBKI_benchmark_data all_data(m, n, tol); From e15cfd55734a04ecb16ae3447439202c4cce3a0a Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 5 Mar 2024 23:35:40 -0800 Subject: [PATCH 38/56] Benchmarking ICQRRP with QP3 --- RandLAPACK/drivers/rl_cqrrp.hh | 6 ++-- RandLAPACK/drivers/rl_rbki.hh | 21 ++++++++++++ .../bench_CQRRP/CQRRP_speed_comparisons.cc | 20 +++++------ .../bench_RBKI/RBKI_speed_comparisons.cc | 34 +++++++++++++++---- 4 files changed, 63 insertions(+), 18 deletions(-) diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh index 54998f78..998c0b63 100644 --- a/RandLAPACK/drivers/rl_cqrrp.hh +++ b/RandLAPACK/drivers/rl_cqrrp.hh @@ -310,7 +310,9 @@ int CQRRP_blocked::call( if(this -> timing) qrcp_t_start = high_resolution_clock::now(); - + + lapack::geqp3(sampling_dimension, cols, A_sk, d, J_buffer, Work2); + /* // Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk. // Get a transpose of A_sk #pragma omp parallel for @@ -329,7 +331,7 @@ int CQRRP_blocked::call( util::col_swap(sampling_dimension, cols, cols, A_sk, d, J_buf); // Perform an unpivoted QR on A_sk lapack::geqrf(sampling_dimension, cols, A_sk, d, Work2); - + */ if(this -> timing) { qrcp_t_stop = high_resolution_clock::now(); qrcp_t_dur += duration_cast(qrcp_t_stop - qrcp_t_start).count(); diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 045cdc6e..1369cc86 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -187,6 +187,17 @@ int RBKI::call( state = RandBLAS::fill_dense(D, Y_i, state).second; //omp_set_num_threads(48); + +/***********************************************************************************/ + std::ofstream file("SKETCHING_OPERATOR.txt", std::ios::app); + for (int i = 0; i < k; ++i) { + for (int j = 0; j < n; ++j) { + file << *(Y_i + i * n + j) << " "; + } + file << "\n"; // Move to the next line after each row + } +/***********************************************************************************/ + if(this -> timing) { sketching_t_stop = high_resolution_clock::now(); sketching_t_dur = duration_cast(sketching_t_stop - sketching_t_start).count(); @@ -417,6 +428,7 @@ int RBKI::call( break; } } + printf("Total iters %d\n", iter); this -> norm_R_end = norm_R; this->num_krylov_iters = iter; @@ -435,6 +447,15 @@ int RBKI::call( get_factors_t_start = high_resolution_clock::now(); } + printf("%ld, %ld\n", end_rows, end_cols); + std::ofstream file2("S_TO_DECOMPOSE.txt", std::ios::app); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < (n+k); ++j) { + file2 << *(S + i * (n + k) + j) << " "; + } + file2 << "\n"; // Move to the next line after each row + } + if (iter % 2 != 0) { // [U_hat, Sigma, V_hat] = svd(R') lapack::gesdd(Job::SomeVec, end_rows, end_cols, R, n, Sigma, U_hat, end_rows, VT_hat, end_cols); diff --git a/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc b/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc index 87a62cb3..f259c426 100644 --- a/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc +++ b/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc @@ -83,7 +83,7 @@ static std::vector call_all_algs( printf("ITERATION\n"); // Testing GEQRF auto start_geqp3 = high_resolution_clock::now(); - lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); + //lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); auto stop_geqp3 = high_resolution_clock::now(); auto dur_geqp3 = duration_cast(stop_geqp3 - start_geqp3).count(); printf("TOTAL TIME FOR GEQP3 %ld\n", dur_geqp3); @@ -92,7 +92,7 @@ static std::vector call_all_algs( // Testing GEQRF auto start_geqrf = high_resolution_clock::now(); - lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); + //lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); auto stop_geqrf = high_resolution_clock::now(); dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); printf("TOTAL TIME FOR GEQRF %ld\n", dur_geqrf); @@ -121,7 +121,7 @@ static std::vector call_all_algs( // Testing HQRRP with GEQRF auto start_hqrrp_geqrf = high_resolution_clock::now(); - RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 0, state_alg_1, (T*) nullptr); + //RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 0, state_alg_1, (T*) nullptr); auto stop_hqrrp_geqrf = high_resolution_clock::now(); dur_hqrrp_geqrf = duration_cast(stop_hqrrp_geqrf - start_hqrrp_geqrf).count(); printf("TOTAL TIME FOR HQRRP WITH GEQRF %ld\n", dur_hqrrp_geqrf); @@ -136,7 +136,7 @@ static std::vector call_all_algs( // Testing HQRRP with Cholqr auto start_hqrrp_cholqr = high_resolution_clock::now(); - RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 1, state_alg_3, (T*) nullptr); + //RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 1, state_alg_3, (T*) nullptr); auto stop_hqrrp_cholqr = high_resolution_clock::now(); dur_hqrrp_cholqr = duration_cast(stop_hqrrp_cholqr - start_hqrrp_cholqr).count(); printf("TOTAL TIME FOR HQRRP WITH CHOLQRQ %ld\n", dur_hqrrp_cholqr); @@ -162,18 +162,18 @@ static std::vector call_all_algs( int main() { // Declare parameters - int64_t m = 10000;//std::pow(2, 14); - int64_t n = 10000;//std::pow(2, 14); - double d_factor = 1.0; + int64_t m = std::pow(2, 16); + int64_t n = std::pow(2, 16); + double d_factor = 1.25; int64_t b_sz_start = 256; - int64_t b_sz_end = 256; + int64_t b_sz_end = 2048; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; // Timing results std::vector res; // Number of algorithm runs. We only record best times. - int64_t numruns = 5; + int64_t numruns = 2; // Allocate basic workspace QR_speed_benchmark_data all_data(m, n, tol, d_factor); @@ -182,7 +182,7 @@ int main() { RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); // Declare a data file - std::fstream file("QR_time_raw_rows_" + std::to_string(m) + std::fstream file("ICQRRP_QP3_QR_time_raw_rows_" + std::to_string(m) + "_cols_" + std::to_string(n) + "_b_sz_start_" + std::to_string(b_sz_start) + "_b_sz_end_" + std::to_string(b_sz_end) diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 92f725f6..f05d3bed 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -57,7 +57,7 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, template static T residual_error_comp(RBKI_benchmark_data &all_data, int64_t target_rank, int64_t custom_rank) { - + printf("%ld\n", custom_rank); auto m = all_data.row; auto n = all_data.col; @@ -86,7 +86,9 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m); T nrm2 = lapack::lange(Norm::Fro, target_rank, custom_rank, VT_cpy_dat, n); - return std::sqrt(std::pow(nrm1, 2) + std::pow(nrm2, 2)); + printf("%e %e\n", nrm1, nrm2); + + return std::sqrt(std::pow(nrm1, 2) + std::pow(nrm2, 2)); } template @@ -133,6 +135,28 @@ static void call_all_algs( RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.VT.data(), all_data.Sigma.data(), state); auto stop_rbki = high_resolution_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); + + std::ofstream file1("U.txt", std::ios::app); + for (int i = 0; i < target_rank; ++i) { + for (int j = 0; j < m; ++j) { + file1 << *(all_data.U.data() + i * m + j) << " "; + } + file1 << "\n"; // Move to the next line after each row + } + + std::ofstream file2("VT.txt", std::ios::app); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < target_rank; ++j) { + file2 << *(all_data.VT.data() + i * n + j) << " "; + } + file2 << "\n"; // Move to the next line after each row + } + + std::ofstream file3("S.txt", std::ios::app); + for (int i = 0; i < target_rank; ++i) { + file3 << *(all_data.Sigma.data() + i) << " "; + file3 << "\n"; // Move to the next line after each row + } T residual_err_custom = residual_error_comp(all_data, target_rank, custom_rank); @@ -142,9 +166,7 @@ static void call_all_algs( printf("sqrt(||AV - SU||^2_F + ||A'U - VS||^2_F) / sqrt(custom_rank): %.16e\n", residual_err_custom); printf("sqrt(||AV - SU||^2_F + ||A'U - VS||^2_F) / sqrt(traget_rank): %.16e\n", residual_err_target); - std::ofstream file(output_filename, std::ios::app); - file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << custom_rank << ", " << residual_err_target << ", " << residual_err_custom << ", " << dur_rbki << ", " << dur_svd << ",\n"; - +std::ofstream file(output_filename, std::ios::app); state_gen = state; data_regen(m_info, all_data, state_gen, 0); } @@ -170,7 +192,7 @@ int main(int argc, char *argv[]) { double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; - int numruns = 3; + int numruns = 1; long dur_svd = 0; std::vector res; From e93299d5cfdeb08236072619d44b5eb3997aefc7 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 11 Mar 2024 09:57:53 -0700 Subject: [PATCH 39/56] Finished yet another RBKI debug. prints in --- RandLAPACK/drivers/rl_rbki.hh | 85 ++++++++++++++++--- .../bench_RBKI/RBKI_speed_comparisons.cc | 46 ++++++---- 2 files changed, 101 insertions(+), 30 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 1369cc86..243c5829 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -12,6 +12,7 @@ #include #include #include +#include using namespace std::chrono; @@ -187,12 +188,11 @@ int RBKI::call( state = RandBLAS::fill_dense(D, Y_i, state).second; //omp_set_num_threads(48); - /***********************************************************************************/ - std::ofstream file("SKETCHING_OPERATOR.txt", std::ios::app); + std::ofstream file("run_out/SKETCHING_OPERATOR.txt", std::ios::trunc); for (int i = 0; i < k; ++i) { for (int j = 0; j < n; ++j) { - file << *(Y_i + i * n + j) << " "; + file << std::setprecision(20) << *(Y_i + i * n + j) << " "; } file << "\n"; // Move to the next line after each row } @@ -237,7 +237,8 @@ int RBKI::call( ++iter; // Iterate until in-loop termination criteria is met. - + char name [] = "S_TO_DECOMPOSE"; + char name1 [] = "Y_i"; while(1) { if(this -> timing) main_loop_t_start = high_resolution_clock::now(); @@ -245,7 +246,6 @@ int RBKI::call( if (iter % 2 != 0) { if(this -> timing) gemm_A_t_start = high_resolution_clock::now(); - // Y_i = A' * X_i blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, k, m, 1.0, A, m, X_i, m, 0.0, Y_i, n); @@ -276,13 +276,12 @@ int RBKI::call( reorth_t_dur += duration_cast(reorth_t_stop - reorth_t_start).count(); } } - + /****************************************ISSUE ABOVE****************************************************************************/ // [Y_i, R_ii] = qr(Y_i, 0) std::fill(&tau[0], &tau[k], 0.0); if(this -> timing) qr_t_start = high_resolution_clock::now(); - lapack::geqrf(n, k, Y_i, n, tau); if(this -> timing) { @@ -305,7 +304,7 @@ int RBKI::call( // Convert Y_i into an explicit form. It is now stored in Y_odd as it should be. lapack::ungqr(n, k, k, Y_i, n, tau); - + if(this -> timing) { ungqr_t_stop = high_resolution_clock::now(); ungqr_t_dur += duration_cast(ungqr_t_stop - ungqr_t_start).count(); @@ -331,7 +330,7 @@ int RBKI::call( // X_i = A * Y_i blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); - + if(this -> timing) { gemm_A_t_stop = high_resolution_clock::now(); gemm_A_t_dur += duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); @@ -363,7 +362,7 @@ int RBKI::call( if(this -> timing) qr_t_start = high_resolution_clock::now(); - + lapack::geqrf(m, k, X_i, m, tau); if(this -> timing) { @@ -401,6 +400,7 @@ int RBKI::call( S_ii = &S_ii[((n + k) + 1) * k]; // Advance odd iteration count; ++iter_od; + //RandBLAS::util::print_colmaj(n+k, n, S, name); } if(this -> timing) @@ -432,7 +432,10 @@ int RBKI::call( this -> norm_R_end = norm_R; this->num_krylov_iters = iter; - iter % 2 == 0 ? end_rows = k * (iter_ev + 1), end_cols = k * iter_ev : end_rows = k * (iter_od + 1), end_cols = k * iter_od; + //iter % 2 == 0 ? end_rows = k * (iter_ev + 1), end_cols = k * iter_ev : end_rows = k * (iter_od + 1), end_cols = k * iter_od; + end_cols = num_krylov_iters * k / 2; + iter % 2 == 0 ? end_rows = end_cols + k : end_rows = end_cols; + if(this -> timing) { allocation_t_start = high_resolution_clock::now(); @@ -448,27 +451,83 @@ int RBKI::call( } printf("%ld, %ld\n", end_rows, end_cols); - std::ofstream file2("S_TO_DECOMPOSE.txt", std::ios::app); + std::ofstream file2("run_out/S_TO_DECOMPOSE.txt", std::ios::trunc); for (int i = 0; i < n; ++i) { for (int j = 0; j < (n+k); ++j) { - file2 << *(S + i * (n + k) + j) << " "; + file2 << std::setprecision(20) << *(S + i * (n + k) + j) << " "; } file2 << "\n"; // Move to the next line after each row } + char name_A [] = "A"; + //RandBLAS::util::print_colmaj(m, n, A, name_A); + + char name0 [] = "S TO DECOMPOSE"; + //RandBLAS::util::print_colmaj(n + k, n, S, name0); + + + //RandBLAS::util::print_colmaj(n+k, n, S, name); if (iter % 2 != 0) { + printf("Decomposing R\n"); // [U_hat, Sigma, V_hat] = svd(R') lapack::gesdd(Job::SomeVec, end_rows, end_cols, R, n, Sigma, U_hat, end_rows, VT_hat, end_cols); } else { + printf("Decomposing S\n"); // [U_hat, Sigma, V_hat] = svd(S) lapack::gesdd(Job::SomeVec, end_rows, end_cols, S, n + k, Sigma, U_hat, end_rows, VT_hat, end_cols); } + + char name2 [] = "U_hat"; + char name3 [] = "VT_hat"; + char name4 [] = "Sigma"; + + //RandBLAS::util::print_colmaj(end_rows, end_cols, U_hat, name2); + //RandBLAS::util::print_colmaj(end_cols, end_cols, VT_hat, name3); + //RandBLAS::util::print_colmaj(end_cols, 1, Sigma, name4); + + std::ofstream file5("run_out/U_hat.txt", std::ios::trunc); + for (int i = 0; i < end_cols; ++i) { + for (int j = 0; j < end_rows; ++j) { + file5 << std::setprecision(20) << *(U_hat + i * end_rows + j) << " "; + } + file5 << "\n"; // Move to the next line after each row + } + + + std::ofstream file6("run_out/VT_hat.txt", std::ios::trunc); + for (int i = 0; i < end_cols; ++i) { + for (int j = 0; j < end_cols; ++j) { + file6 << std::setprecision(20) << *(VT_hat + i * end_cols + j) << " "; + } + file6 << "\n"; // Move to the next line after each row + } + + + // U = X_ev * U_hat blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); // V = Y_od * V_hat // We actually perform VT = V_hat' * Y_odd' blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); + + std::ofstream file3("run_out/X_ev.txt", std::ios::trunc); + for (int i = 0; i < end_rows; ++i) { + for (int j = 0; j < m; ++j) { + file3 << std::setprecision(20) << *(X_ev + i * m + j) << " "; + } + file3 << "\n"; // Move to the next line after each row + } + + + std::ofstream file4("run_out/Y_od.txt", std::ios::trunc); + for (int i = 0; i < end_cols; ++i) { + for (int j = 0; j < n; ++j) { + file4 << std::setprecision(20) << *(Y_od + i * n + j) << " "; + } + file4 << "\n"; // Move to the next line after each row + } + if(this -> timing) { get_factors_t_stop = high_resolution_clock::now(); get_factors_t_dur = duration_cast(get_factors_t_stop - get_factors_t_start).count(); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index f05d3bed..7f1b7ad9 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -5,6 +5,7 @@ #include #include +#include template struct RBKI_benchmark_data { @@ -61,30 +62,37 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, auto m = all_data.row; auto n = all_data.col; - T* U_cpy_dat = RandLAPACK::util::upsize(m * target_rank, all_data.U_cpy); - T* VT_cpy_dat = RandLAPACK::util::upsize(n * target_rank, all_data.VT_cpy); + T* U_cpy_dat = RandLAPACK::util::upsize(m * n, all_data.U_cpy); + T* VT_cpy_dat = RandLAPACK::util::upsize(n * n, all_data.VT_cpy); - lapack::lacpy(MatrixType::General, m, target_rank, all_data.U.data(), m, U_cpy_dat, m); - lapack::lacpy(MatrixType::General, n, target_rank, all_data.VT.data(), n, VT_cpy_dat, n); + lapack::lacpy(MatrixType::General, m, n, all_data.U.data(), m, U_cpy_dat, m); + lapack::lacpy(MatrixType::General, n, n, all_data.VT.data(), n, VT_cpy_dat, n); // AV - US // Scale columns of U by S - for (int i = 0; i < target_rank; ++i) - blas::scal(n, all_data.Sigma[i], &U_cpy_dat[m * i], 1); + for (int i = 0; i < custom_rank; ++i) + blas::scal(m, all_data.Sigma[i], &U_cpy_dat[m * i], 1); + + // Compute AV(:, 1:custom_rank) - SU(1:custom_rank) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, m, custom_rank, n, 1.0, all_data.A.data(), m, all_data.VT.data(), n, -1.0, U_cpy_dat, m); + // A'U - VS // Scale columns of V by S // Since we have VT, we will be scaling its rows - for (int i = 0; i < n; ++i) - blas::scal(custom_rank, all_data.Sigma[i], &VT_cpy_dat[i], n); + // The data is, however, stored in a column-major format, so it is a bit weird. + //for (int i = 0; i < n; ++i) + // blas::scal(custom_rank, all_data.Sigma[i], &VT_cpy_dat[i], n); + for (int i = 0; i < custom_rank; ++i) + blas::scal(n, all_data.Sigma[i], &VT_cpy_dat[i], n); // Compute A'U(:, 1:custom_rank) - VS(1:custom_rank). // We will actually have to perform U' * A - Sigma * VT. - blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, target_rank, custom_rank, m, 1.0, all_data.U.data(), m, all_data.A.data(), m, -1.0, VT_cpy_dat, n); + + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, custom_rank, n, m, 1.0, all_data.U.data(), m, all_data.A.data(), m, -1.0, VT_cpy_dat, n); T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m); - T nrm2 = lapack::lange(Norm::Fro, target_rank, custom_rank, VT_cpy_dat, n); + T nrm2 = lapack::lange(Norm::Fro, custom_rank, n, VT_cpy_dat, n); printf("%e %e\n", nrm1, nrm2); @@ -136,25 +144,28 @@ static void call_all_algs( auto stop_rbki = high_resolution_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); - std::ofstream file1("U.txt", std::ios::app); + std::ofstream file1("run_out/U.txt", std::ios::trunc); for (int i = 0; i < target_rank; ++i) { for (int j = 0; j < m; ++j) { - file1 << *(all_data.U.data() + i * m + j) << " "; + file1 << std::setprecision(20) << *(all_data.U.data() + i * m + j) << " "; } file1 << "\n"; // Move to the next line after each row } - std::ofstream file2("VT.txt", std::ios::app); + std::ofstream file2("run_out/VT.txt", std::ios::trunc); for (int i = 0; i < n; ++i) { for (int j = 0; j < target_rank; ++j) { - file2 << *(all_data.VT.data() + i * n + j) << " "; + file2 << std::setprecision(20) << *(all_data.VT.data() + i * n + j) << " "; } file2 << "\n"; // Move to the next line after each row } - std::ofstream file3("S.txt", std::ios::app); + char name [] = "VT"; + //RandBLAS::util::print_colmaj(n, n, all_data.VT.data(), name); + + std::ofstream file3("run_out/S.txt", std::ios::trunc); for (int i = 0; i < target_rank; ++i) { - file3 << *(all_data.Sigma.data() + i) << " "; + file3 << std::setprecision(20) << *(all_data.Sigma.data() + i) << " "; file3 << "\n"; // Move to the next line after each row } @@ -166,7 +177,8 @@ static void call_all_algs( printf("sqrt(||AV - SU||^2_F + ||A'U - VS||^2_F) / sqrt(custom_rank): %.16e\n", residual_err_custom); printf("sqrt(||AV - SU||^2_F + ||A'U - VS||^2_F) / sqrt(traget_rank): %.16e\n", residual_err_target); -std::ofstream file(output_filename, std::ios::app); + std::ofstream file(output_filename, std::ios::app); + file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << custom_rank << ", " << residual_err_target << ", " << residual_err_custom << ", " << dur_rbki << ", " << dur_svd << ",\n"; state_gen = state; data_regen(m_info, all_data, state_gen, 0); } From 60f50b52bf1d1db27ceac4a019c039acdb9ac65b Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 11 Mar 2024 10:19:00 -0700 Subject: [PATCH 40/56] Ready for benchmarking --- RandLAPACK/drivers/rl_rbki.hh | 65 ------------------- .../bench_RBKI/RBKI_speed_comparisons.cc | 38 ++--------- 2 files changed, 4 insertions(+), 99 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 243c5829..69261b1f 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -428,7 +428,6 @@ int RBKI::call( break; } } - printf("Total iters %d\n", iter); this -> norm_R_end = norm_R; this->num_krylov_iters = iter; @@ -450,84 +449,20 @@ int RBKI::call( get_factors_t_start = high_resolution_clock::now(); } - printf("%ld, %ld\n", end_rows, end_cols); - std::ofstream file2("run_out/S_TO_DECOMPOSE.txt", std::ios::trunc); - for (int i = 0; i < n; ++i) { - for (int j = 0; j < (n+k); ++j) { - file2 << std::setprecision(20) << *(S + i * (n + k) + j) << " "; - } - file2 << "\n"; // Move to the next line after each row - } - - char name_A [] = "A"; - //RandBLAS::util::print_colmaj(m, n, A, name_A); - - char name0 [] = "S TO DECOMPOSE"; - //RandBLAS::util::print_colmaj(n + k, n, S, name0); - - - //RandBLAS::util::print_colmaj(n+k, n, S, name); if (iter % 2 != 0) { - printf("Decomposing R\n"); // [U_hat, Sigma, V_hat] = svd(R') lapack::gesdd(Job::SomeVec, end_rows, end_cols, R, n, Sigma, U_hat, end_rows, VT_hat, end_cols); } else { - printf("Decomposing S\n"); // [U_hat, Sigma, V_hat] = svd(S) lapack::gesdd(Job::SomeVec, end_rows, end_cols, S, n + k, Sigma, U_hat, end_rows, VT_hat, end_cols); } - char name2 [] = "U_hat"; - char name3 [] = "VT_hat"; - char name4 [] = "Sigma"; - - //RandBLAS::util::print_colmaj(end_rows, end_cols, U_hat, name2); - //RandBLAS::util::print_colmaj(end_cols, end_cols, VT_hat, name3); - //RandBLAS::util::print_colmaj(end_cols, 1, Sigma, name4); - - std::ofstream file5("run_out/U_hat.txt", std::ios::trunc); - for (int i = 0; i < end_cols; ++i) { - for (int j = 0; j < end_rows; ++j) { - file5 << std::setprecision(20) << *(U_hat + i * end_rows + j) << " "; - } - file5 << "\n"; // Move to the next line after each row - } - - - std::ofstream file6("run_out/VT_hat.txt", std::ios::trunc); - for (int i = 0; i < end_cols; ++i) { - for (int j = 0; j < end_cols; ++j) { - file6 << std::setprecision(20) << *(VT_hat + i * end_cols + j) << " "; - } - file6 << "\n"; // Move to the next line after each row - } - - - // U = X_ev * U_hat blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m); // V = Y_od * V_hat // We actually perform VT = V_hat' * Y_odd' blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); - - std::ofstream file3("run_out/X_ev.txt", std::ios::trunc); - for (int i = 0; i < end_rows; ++i) { - for (int j = 0; j < m; ++j) { - file3 << std::setprecision(20) << *(X_ev + i * m + j) << " "; - } - file3 << "\n"; // Move to the next line after each row - } - - - std::ofstream file4("run_out/Y_od.txt", std::ios::trunc); - for (int i = 0; i < end_cols; ++i) { - for (int j = 0; j < n; ++j) { - file4 << std::setprecision(20) << *(Y_od + i * n + j) << " "; - } - file4 << "\n"; // Move to the next line after each row - } - if(this -> timing) { get_factors_t_stop = high_resolution_clock::now(); get_factors_t_dur = duration_cast(get_factors_t_stop - get_factors_t_start).count(); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 7f1b7ad9..7ee2903f 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -58,7 +58,6 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, template static T residual_error_comp(RBKI_benchmark_data &all_data, int64_t target_rank, int64_t custom_rank) { - printf("%ld\n", custom_rank); auto m = all_data.row; auto n = all_data.col; @@ -94,8 +93,6 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m); T nrm2 = lapack::lange(Norm::Fro, custom_rank, n, VT_cpy_dat, n); - printf("%e %e\n", nrm1, nrm2); - return std::sqrt(std::pow(nrm1, 2) + std::pow(nrm2, 2)); } @@ -126,7 +123,6 @@ static void call_all_algs( // These matrices will be full-rank. // Hence, target_rank = b_sz * num_krylov_iters / 2 RBKI.max_krylov_iters = (int) ((target_rank * 2) / b_sz); - printf("Max Krylov iters %d\n", RBKI.max_krylov_iters); // timing vars long dur_rbki = 0; @@ -144,32 +140,6 @@ static void call_all_algs( auto stop_rbki = high_resolution_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); - std::ofstream file1("run_out/U.txt", std::ios::trunc); - for (int i = 0; i < target_rank; ++i) { - for (int j = 0; j < m; ++j) { - file1 << std::setprecision(20) << *(all_data.U.data() + i * m + j) << " "; - } - file1 << "\n"; // Move to the next line after each row - } - - std::ofstream file2("run_out/VT.txt", std::ios::trunc); - for (int i = 0; i < n; ++i) { - for (int j = 0; j < target_rank; ++j) { - file2 << std::setprecision(20) << *(all_data.VT.data() + i * n + j) << " "; - } - file2 << "\n"; // Move to the next line after each row - } - - char name [] = "VT"; - //RandBLAS::util::print_colmaj(n, n, all_data.VT.data(), name); - - std::ofstream file3("run_out/S.txt", std::ios::trunc); - for (int i = 0; i < target_rank; ++i) { - file3 << std::setprecision(20) << *(all_data.Sigma.data() + i) << " "; - file3 << "\n"; // Move to the next line after each row - } - - T residual_err_custom = residual_error_comp(all_data, target_rank, custom_rank); T residual_err_target = residual_error_comp(all_data, target_rank, target_rank); @@ -199,7 +169,7 @@ int main(int argc, char *argv[]) { int64_t b_sz_stop = 0; int64_t target_rank_start = 512; int64_t target_rank_curr = target_rank_start; - int64_t target_rank_stop = 512; + int64_t target_rank_stop = 4096; int64_t custom_rank = 10; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); @@ -211,15 +181,15 @@ int main(int argc, char *argv[]) { // Generate the input matrix. RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::custom_input); m_info.filename = argv[1]; - m_info.workspace_query_mod = 1; + m_info.workspace_query_mod = 3; // Workspace query; RandLAPACK::gen::mat_gen(m_info, NULL, state); // Update basic params. m = m_info.rows; n = m_info.cols; - b_sz_start = 16;//std::max((int64_t) 1, n / 10); - b_sz_stop = 16;//std::max((int64_t) 1, n / 10); + b_sz_start = 8;//std::max((int64_t) 1, n / 10); + b_sz_stop = 256;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. RBKI_benchmark_data all_data(m, n, tol); From 63904698bdc1e1fbee0fae111fb992cc3955dd6d Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Wed, 13 Mar 2024 01:28:14 -0700 Subject: [PATCH 41/56] Reworked RBKI benchmark to be based on num matmuls rather than target rank. --- .../bench_RBKI/RBKI_speed_comparisons.cc | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 7ee2903f..c896b861 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -101,13 +101,13 @@ static void call_all_algs( RandLAPACK::gen::mat_gen_info m_info, int64_t numruns, int64_t b_sz, - int64_t target_rank, + int64_t num_matmuls, int64_t custom_rank, RBKI_benchmark_data &all_data, RandBLAS::RNGState &state, std::string output_filename, long dur_svd) { - printf("\nBlock size %ld, target rank %ld\n", b_sz, target_rank); + printf("\nBlock size %ld, num matmuls %ld\n", b_sz, num_matmuls); int i, j; auto m = all_data.row; @@ -122,7 +122,11 @@ static void call_all_algs( // Matrices R or S that give us the singular value spectrum returned by RBKI will be of size b_sz * num_krylov_iters / 2. // These matrices will be full-rank. // Hence, target_rank = b_sz * num_krylov_iters / 2 - RBKI.max_krylov_iters = (int) ((target_rank * 2) / b_sz); + // RBKI.max_krylov_iters = (int) ((target_rank * 2) / b_sz); + // + // Instead of the above approach, we now pre-specify the maximum number of Krylov iters that we allow for in num_matmuls. + RBKI.max_krylov_iters = (int) num_matmuls; + int64_t target_rank = b_sz * num_matmuls / 2; // timing vars long dur_rbki = 0; @@ -167,29 +171,29 @@ int main(int argc, char *argv[]) { int64_t n = 0; int64_t b_sz_start = 0; int64_t b_sz_stop = 0; - int64_t target_rank_start = 512; - int64_t target_rank_curr = target_rank_start; - int64_t target_rank_stop = 4096; + int64_t num_matmuls_start = 2; + int64_t num_matmuls_curr = num_matmuls_start; + int64_t num_matmuls_stop = 20; int64_t custom_rank = 10; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; - int numruns = 1; + int numruns = 5; long dur_svd = 0; std::vector res; // Generate the input matrix. RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::custom_input); m_info.filename = argv[1]; - m_info.workspace_query_mod = 3; + m_info.workspace_query_mod = 1; // Workspace query; RandLAPACK::gen::mat_gen(m_info, NULL, state); // Update basic params. m = m_info.rows; n = m_info.cols; - b_sz_start = 8;//std::max((int64_t) 1, n / 10); - b_sz_stop = 256;//std::max((int64_t) 1, n / 10); + b_sz_start = 16;//std::max((int64_t) 1, n / 10); + b_sz_stop = 128;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. RBKI_benchmark_data all_data(m, n, tol); @@ -204,14 +208,14 @@ int main(int argc, char *argv[]) { + "_n_" + std::to_string(n) + "_b_sz_start_" + std::to_string(b_sz_start) + "_b_sz_stop_" + std::to_string(b_sz_stop) - + "_num_krylov_iters_start_" + std::to_string(target_rank_start) - + "_num_krylov_iters_stop_" + std::to_string(target_rank_stop) + + "_num_matmuls_start_" + std::to_string(num_matmuls_start) + + "_num_matmuls_stop_" + std::to_string(num_matmuls_stop) + ".dat"; for (;b_sz_start <= b_sz_stop; b_sz_start *=2) { - for (;target_rank_curr <= target_rank_stop; target_rank_curr *=2) { - call_all_algs(m_info, numruns, b_sz_start, target_rank_curr, custom_rank, all_data, state_constant, output_filename, dur_svd); + for (;num_matmuls_curr <= num_matmuls_stop; ++num_matmuls_curr) { + call_all_algs(m_info, numruns, b_sz_start, num_matmuls_curr, custom_rank, all_data, state_constant, output_filename, dur_svd); } - target_rank_curr = target_rank_start; + num_matmuls_curr = num_matmuls_start; } } \ No newline at end of file From 2a3694ffeff9bdd00ba2f4f8c9b7f84a2a3c54b7 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Wed, 20 Mar 2024 07:20:41 -0700 Subject: [PATCH 42/56] Update before reworking RBKI --- RandLAPACK/drivers/rl_cqrrp.hh | 3 + RandLAPACK/drivers/rl_cqrrpt.hh | 3 + RandLAPACK/drivers/rl_nysbki.hh | 117 -------------------------------- RandLAPACK/drivers/rl_rbki.hh | 60 ++++++++++++++++ 4 files changed, 66 insertions(+), 117 deletions(-) delete mode 100644 RandLAPACK/drivers/rl_nysbki.hh diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh index 998c0b63..564325e1 100644 --- a/RandLAPACK/drivers/rl_cqrrp.hh +++ b/RandLAPACK/drivers/rl_cqrrp.hh @@ -85,6 +85,9 @@ class CQRRP_blocked : public CQRRPalg { /// @param[in] tau /// Pointer to a vector of size n. On entry, is empty. /// + /// @param[in] state + /// RNG state parameter, required for sketching operator generation. + /// /// @param[out] A /// Overwritten by Implicit Q and explicit R factors. /// diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh index 77406edd..70aa3743 100644 --- a/RandLAPACK/drivers/rl_cqrrpt.hh +++ b/RandLAPACK/drivers/rl_cqrrpt.hh @@ -92,6 +92,9 @@ class CQRRPT : public CQRRPTalg { /// Represents the upper-triangular R factor of QR factorization. /// On entry, is empty and may not have any space allocated for it. /// + /// @param[in] state + /// RNG state parameter, required for sketching operator generation. + /// /// @param[out] A /// Overwritten by an m-by-k orthogonal Q factor. /// Matrix is stored explicitly. diff --git a/RandLAPACK/drivers/rl_nysbki.hh b/RandLAPACK/drivers/rl_nysbki.hh deleted file mode 100644 index 439c4c02..00000000 --- a/RandLAPACK/drivers/rl_nysbki.hh +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef randlapack_NysBKI_h -#define randlapack_NysBKI_h - -#include "rl_util.hh" -#include "rl_blaspp.hh" -#include "rl_lapackpp.hh" -#include "rl_hqrrp.hh" - -#include -#include -#include -#include -#include -#include - -using namespace std::chrono; - -namespace RandLAPACK { - -template -class NysBKIalg { - public: - virtual ~NysBKIalg() {} - virtual int call( - int64_t m, - T* A, - int64_t lda, - int64_t k, - T* V, - T* Lambda, - RandBLAS::RNGState &state - ) = 0; -}; - -template -class NysBKI : public NysBKIalg { - public: - NysBKI( - bool verb, - bool time_subroutines, - T ep - ) { - verbosity = verb; - timing = time_subroutines; - tol = ep; - max_krylov_iters = INT_MAX; - } - int call( - int64_t m, - T* A, - int64_t lda, - int64_t k, - T* V, - T* Lambda, - RandBLAS::RNGState &state - ) override; - public: - bool verbosity; - bool timing; - T tol; - int num_krylov_iters; - int max_krylov_iters; - std::vector times; - T norm_R_end; -}; - -// ----------------------------------------------------------------------------- -template -int NysBKI::call( - int64_t m, - T* A, - int64_t lda, - int64_t k, - T* V, - T* Lambda, - RandBLAS::RNGState &state -){ - int iter = 0; - - T* X = ( T * ) calloc( m * (m + k), sizeof( T ) ); - T* X_i = X; - T* Y = ( T * ) calloc( m * (m + k), sizeof( T ) ); - T* Y_i = Y; - - // tau space for QR - T* tau = ( T * ) calloc( k, sizeof( T ) ); - - - // Generate a dense Gaussian random matrx. - RandBLAS::DenseDist D(m, k); - state = RandBLAS::fill_dense(D, X_i, state).second; - // [X_i, ~] = qr(randn(m, m), 0) - lapack::geqrf(m, k, X_i, m, tau); - // Y_i = A * X_i - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, m, 1.0, A, m, X_i, m, 0.0, Y_i, m); - - while(iter < max_krylov_iters) { - // Advance X_i pointer - X_i = X_i + (m * k); - lapack::lacpy(MatrixType::Upper, m, k, X, m, X_i, m); - - if (!iter) { - // X_i+1 = Y_i + tol * X_i; - blas::scal(m * k, this->tol, X_i, 1); - blas::axpy(m * k, 1.0, Y_i, 1, X_i, 1); - } else { - - } - - - - } - - return 0; -} -} // end namespace RandLAPACK -#endif \ No newline at end of file diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 69261b1f..71ed4c4f 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -21,6 +21,21 @@ namespace RandLAPACK { template class RBKIalg { public: + + /// RBKI algorithm is a method for finding truncated SVD based on block Krylov iterations. + /// This algorithm is a version of Algroithm A.1 from https://arxiv.org/pdf/2306.12418.pdf + /// + /// The main difference is in the fact that an economy SVD is performed only once at the very end + /// of the algorithm run and that the termination criteria is not based on singular vectir residual evaluation. + /// Instead, the scheme terminates if: + /// 1. ||R||_F > sqrt(1 - eps^2) ||A||_F, which ensures that we've exhausted all vectors and doing more + /// iterations would bring no benefit or that ||A - hat(A)||_F < eps * ||A||_F. + /// 2. Stop if the bottom right entry of R or S is numerically close to zero (up to square root of machine eps). + /// + /// The main cos of this algorithm comes from large GEMMs with the input matrix A. + /// + /// The algorithm optionally times all of its subcomponents through a user-defined 'timing' parameter. + virtual ~RBKIalg() {} virtual int call( int64_t m, @@ -48,6 +63,51 @@ class RBKI : public RBKIalg { tol = ep; max_krylov_iters = INT_MAX; } + + /// Computes a QR factorization with column pivots of the form: + /// A[:, J] = QR, + /// where Q and R are of size m-by-k and k-by-n, with rank(A) = k. + /// Stores implict Q factor and explicit R factor in A's space (output formatted exactly like GEQP3). + /// + /// @param[in] m + /// The number of rows in the matrix A. + /// + /// @param[in] n + /// The number of columns in the matrix A. + /// + /// @param[in] A + /// Pointer to the m-by-n matrix A, stored in a column-major format. + /// + /// @param[in] lda + /// Leading dimension of A. + /// + /// @param[in] k + /// Sampling dimension of a sketching operator, m >= (k * n) >= n. + /// + /// @param[in] U + /// On output, an empty matrix. + /// + /// @param[in] VT + /// On output, an empty matrix. + /// + /// @param[in] Sigma + /// On output, an empty matrix. + /// + /// @param[in] state + /// RNG state parameter, required for sketching operator generation. + /// + /// @param[out] U + /// Stores m by ((num_iters / 2) * k) orthonormal matrix of left singular vectors. + /// + /// @param[out] VT + /// Stores ((num_iters / 2) * k) * n orthonormal matrix of right singular vectors. + /// + /// @param[out] Sigma + /// Stores ((num_iters / 2) * k) singular values. + /// + /// @return = 0: successful exit + /// + int call( int64_t m, int64_t n, From b0e000190e6ad74f25f9cf2850baddbb5527f798 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Wed, 20 Mar 2024 11:53:32 -0700 Subject: [PATCH 43/56] Fix --- RandLAPACK/drivers/rl_cqrrp.hh | 6 +- RandLAPACK/drivers/rl_rbki.hh | 15 +-- .../bench_RBKI/RBKI_speed_comparisons.cc | 70 ++++++------ test/drivers/test_rbki.cc | 102 +++++++++++------- 4 files changed, 100 insertions(+), 93 deletions(-) diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh index 564325e1..be436cef 100644 --- a/RandLAPACK/drivers/rl_cqrrp.hh +++ b/RandLAPACK/drivers/rl_cqrrp.hh @@ -314,8 +314,8 @@ int CQRRP_blocked::call( if(this -> timing) qrcp_t_start = high_resolution_clock::now(); - lapack::geqp3(sampling_dimension, cols, A_sk, d, J_buffer, Work2); - /* + //lapack::geqp3(sampling_dimension, cols, A_sk, d, J_buffer, Work2); + // Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk. // Get a transpose of A_sk #pragma omp parallel for @@ -334,7 +334,7 @@ int CQRRP_blocked::call( util::col_swap(sampling_dimension, cols, cols, A_sk, d, J_buf); // Perform an unpivoted QR on A_sk lapack::geqrf(sampling_dimension, cols, A_sk, d, Work2); - */ + if(this -> timing) { qrcp_t_stop = high_resolution_clock::now(); qrcp_t_dur += duration_cast(qrcp_t_stop - qrcp_t_start).count(); diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 71ed4c4f..f8e94b94 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -187,7 +187,6 @@ int RBKI::call( int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0; T norm_R = 0; - int64_t space_rows = k * std::ceil(m / (T) k); int max_iters = this->max_krylov_iters;//std::min(this->max_krylov_iters, (int) (n / (T) k)); // We need a full copy of X and Y all the way through the algorithm @@ -248,16 +247,6 @@ int RBKI::call( state = RandBLAS::fill_dense(D, Y_i, state).second; //omp_set_num_threads(48); -/***********************************************************************************/ - std::ofstream file("run_out/SKETCHING_OPERATOR.txt", std::ios::trunc); - for (int i = 0; i < k; ++i) { - for (int j = 0; j < n; ++j) { - file << std::setprecision(20) << *(Y_i + i * n + j) << " "; - } - file << "\n"; // Move to the next line after each row - } -/***********************************************************************************/ - if(this -> timing) { sketching_t_stop = high_resolution_clock::now(); sketching_t_dur = duration_cast(sketching_t_stop - sketching_t_start).count(); @@ -297,8 +286,6 @@ int RBKI::call( ++iter; // Iterate until in-loop termination criteria is met. - char name [] = "S_TO_DECOMPOSE"; - char name1 [] = "Y_i"; while(1) { if(this -> timing) main_loop_t_start = high_resolution_clock::now(); @@ -553,7 +540,7 @@ int RBKI::call( if (this -> verbosity) { printf("\n\n/------------RBKI TIMING RESULTS BEGIN------------/\n"); - printf("Basic info: b_sz=%ld krylov_iters=%ld\n", k, num_krylov_iters); + printf("Basic info: b_sz=%ld krylov_iters=%d\n", k, num_krylov_iters); printf("Allocate and free time: %25ld μs,\n", allocation_t_dur); printf("Time to acquire the SVD factors: %25ld μs,\n", get_factors_t_dur); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index c896b861..08fa2b41 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -53,48 +53,48 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, } } - // This routine computes the residual norm error, consisting of two parts (one of which) vanishes - // in exact precision. Target_rank defines size of U, V as returned by RBKI; custom_rank <= target_rank. - template - static T - residual_error_comp(RBKI_benchmark_data &all_data, int64_t target_rank, int64_t custom_rank) { - auto m = all_data.row; - auto n = all_data.col; +// This routine computes the residual norm error, consisting of two parts (one of which) vanishes +// in exact precision. Target_rank defines size of U, V as returned by RBKI; custom_rank <= target_rank. +template +static T +residual_error_comp(RBKI_benchmark_data &all_data, int64_t target_rank, int64_t custom_rank) { + auto m = all_data.row; + auto n = all_data.col; - T* U_cpy_dat = RandLAPACK::util::upsize(m * n, all_data.U_cpy); - T* VT_cpy_dat = RandLAPACK::util::upsize(n * n, all_data.VT_cpy); + T* U_cpy_dat = RandLAPACK::util::upsize(m * n, all_data.U_cpy); + T* VT_cpy_dat = RandLAPACK::util::upsize(n * n, all_data.VT_cpy); - lapack::lacpy(MatrixType::General, m, n, all_data.U.data(), m, U_cpy_dat, m); - lapack::lacpy(MatrixType::General, n, n, all_data.VT.data(), n, VT_cpy_dat, n); + lapack::lacpy(MatrixType::General, m, n, all_data.U.data(), m, U_cpy_dat, m); + lapack::lacpy(MatrixType::General, n, n, all_data.VT.data(), n, VT_cpy_dat, n); - // AV - US - // Scale columns of U by S - for (int i = 0; i < custom_rank; ++i) - blas::scal(m, all_data.Sigma[i], &U_cpy_dat[m * i], 1); + // AV - US + // Scale columns of U by S + for (int i = 0; i < custom_rank; ++i) + blas::scal(m, all_data.Sigma[i], &U_cpy_dat[m * i], 1); - // Compute AV(:, 1:custom_rank) - SU(1:custom_rank) - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, m, custom_rank, n, 1.0, all_data.A.data(), m, all_data.VT.data(), n, -1.0, U_cpy_dat, m); + // Compute AV(:, 1:custom_rank) - SU(1:custom_rank) + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, m, custom_rank, n, 1.0, all_data.A.data(), m, all_data.VT.data(), n, -1.0, U_cpy_dat, m); - // A'U - VS - // Scale columns of V by S - // Since we have VT, we will be scaling its rows - // The data is, however, stored in a column-major format, so it is a bit weird. - //for (int i = 0; i < n; ++i) - // blas::scal(custom_rank, all_data.Sigma[i], &VT_cpy_dat[i], n); - for (int i = 0; i < custom_rank; ++i) - blas::scal(n, all_data.Sigma[i], &VT_cpy_dat[i], n); - // Compute A'U(:, 1:custom_rank) - VS(1:custom_rank). - // We will actually have to perform U' * A - Sigma * VT. + // A'U - VS + // Scale columns of V by S + // Since we have VT, we will be scaling its rows + // The data is, however, stored in a column-major format, so it is a bit weird. + //for (int i = 0; i < n; ++i) + // blas::scal(custom_rank, all_data.Sigma[i], &VT_cpy_dat[i], n); + for (int i = 0; i < custom_rank; ++i) + blas::scal(n, all_data.Sigma[i], &VT_cpy_dat[i], n); + // Compute A'U(:, 1:custom_rank) - VS(1:custom_rank). + // We will actually have to perform U' * A - Sigma * VT. - blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, custom_rank, n, m, 1.0, all_data.U.data(), m, all_data.A.data(), m, -1.0, VT_cpy_dat, n); + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, custom_rank, n, m, 1.0, all_data.U.data(), m, all_data.A.data(), m, -1.0, VT_cpy_dat, n); - T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m); - T nrm2 = lapack::lange(Norm::Fro, custom_rank, n, VT_cpy_dat, n); + T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m); + T nrm2 = lapack::lange(Norm::Fro, custom_rank, n, VT_cpy_dat, n); - return std::sqrt(std::pow(nrm1, 2) + std::pow(nrm2, 2)); - } + return std::sqrt(std::pow(nrm1, 2) + std::pow(nrm2, 2)); +} template static void call_all_algs( @@ -173,12 +173,12 @@ int main(int argc, char *argv[]) { int64_t b_sz_stop = 0; int64_t num_matmuls_start = 2; int64_t num_matmuls_curr = num_matmuls_start; - int64_t num_matmuls_stop = 20; + int64_t num_matmuls_stop = 50; int64_t custom_rank = 10; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; - int numruns = 5; + int numruns = 3; long dur_svd = 0; std::vector res; @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) { printf("Finished data preparation\n"); // Declare a data file - std::string output_filename = "RBKI_speed_comp_m_" + std::to_string(m) + std::string output_filename = "COMBINED_1_2_3_4_5_RBKI_speed_comp_m_" + std::to_string(m) + "_n_" + std::to_string(n) + "_b_sz_start_" + std::to_string(b_sz_start) + "_b_sz_stop_" + std::to_string(b_sz_stop) diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc index 80e2358d..9821c2ad 100644 --- a/test/drivers/test_rbki.cc +++ b/test/drivers/test_rbki.cc @@ -20,86 +20,106 @@ class TestRBKI : public ::testing::Test struct RBKITestData { int64_t row; int64_t col; - int64_t rank; // has to be modifiable std::vector A; std::vector U; - std::vector V; + std::vector VT; // RBKI returns V' std::vector Sigma; - std::vector A_cpy; - std::vector Sigma_exact; + std::vector U_cpy; + std::vector VT_cpy; - RBKITestData(int64_t m, int64_t n, int64_t k) : + RBKITestData(int64_t m, int64_t n) : A(m * n, 0.0), U(m * n, 0.0), - V(n * n, 0.0), - Sigma(n, 0.0), - A_cpy(m * n, 0.0), - Sigma_exact(n, 0.0) + VT(n * n, 0.0), + Sigma(n, 0.0) { row = m; col = n; - rank = k; } }; - template - static void norm_and_copy_computational_helper(T &norm_A, RBKITestData &all_data) { + + // This routine computes the residual norm error, consisting of two parts (one of which) vanishes + // in exact precision. Target_rank defines size of U, V as returned by RBKI; custom_rank <= target_rank. + template + static T + residual_error_comp(RBKITestData &all_data, int64_t custom_rank) { auto m = all_data.row; auto n = all_data.col; - lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy.data(), m); - norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); + T* U_cpy_dat = RandLAPACK::util::upsize(m * n, all_data.U_cpy); + T* VT_cpy_dat = RandLAPACK::util::upsize(n * n, all_data.VT_cpy); + + lapack::lacpy(MatrixType::General, m, n, all_data.U.data(), m, U_cpy_dat, m); + lapack::lacpy(MatrixType::General, n, n, all_data.VT.data(), n, VT_cpy_dat, n); + + // AV - US + // Scale columns of U by S + for (int i = 0; i < custom_rank; ++i) + blas::scal(m, all_data.Sigma[i], &U_cpy_dat[m * i], 1); + + + // Compute AV(:, 1:custom_rank) - SU(1:custom_rank) + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, m, custom_rank, n, 1.0, all_data.A.data(), m, all_data.VT.data(), n, -1.0, U_cpy_dat, m); + + + // A'U - VS + // Scale columns of V by S + // Since we have VT, we will be scaling its rows + // The data is, however, stored in a column-major format, so it is a bit weird. + //for (int i = 0; i < n; ++i) + // blas::scal(custom_rank, all_data.Sigma[i], &VT_cpy_dat[i], n); + for (int i = 0; i < custom_rank; ++i) + blas::scal(n, all_data.Sigma[i], &VT_cpy_dat[i], n); + // Compute A'U(:, 1:custom_rank) - VS(1:custom_rank). + // We will actually have to perform U' * A - Sigma * VT. + + blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, custom_rank, n, m, 1.0, all_data.U.data(), m, all_data.A.data(), m, -1.0, VT_cpy_dat, n); + + T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m); + T nrm2 = lapack::lange(Norm::Fro, custom_rank, n, VT_cpy_dat, n); + + return std::sqrt(std::pow(nrm1, 2) + std::pow(nrm2, 2)); } + template static void test_RBKI_general( - T norm_A, + int64_t b_sz, + int64_t target_rank, + int64_t custom_rank, RBKITestData &all_data, alg_type &RBKI, RandBLAS::RNGState &state) { auto m = all_data.row; auto n = all_data.col; - auto k = all_data.rank; + RBKI.max_krylov_iters = (int) ((target_rank * 2) / b_sz); - RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); + RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.VT.data(), all_data.Sigma.data(), state); // Compute singular values via a deterministic method - lapack::gesdd(Job::NoVec, m, n, all_data.A_cpy.data(), m, all_data.Sigma_exact.data(), NULL, m, NULL, n); - - // Find diff between singular values computed by two methods - int cnt = -1; - - for(int i = 0; i < n; ++i) { - //printf("%e, %e\n", all_data.Sigma[i], all_data.Sigma_exact[i]); - } - - std::for_each(all_data.Sigma.data(), all_data.Sigma.data() + k, - // Lambda expression begins - [&cnt, &all_data](T &entry) { - entry -= all_data.Sigma_exact[++cnt]; - } - ); - T norm = blas::nrm2(k, all_data.Sigma.data(), 1); - printf("||A_svd - A_rbki||_F: %e\n", norm); + T residual_err_custom = residual_error_comp(all_data, custom_rank); + printf("residual_err_custom %e\n", residual_err_custom); + ASSERT_NEAR(residual_err_custom, 8.039386e-13, std::pow(std::numeric_limits::epsilon(), 0.825)); } }; // Note: If Subprocess killed exception -> reload vscode TEST_F(TestRBKI, RBKI_basic) { - int64_t m = 4000; - int64_t n = 200; - int64_t k = 100; - double norm_A = 0; + int64_t m = 4000; + int64_t n = 200; + int64_t b_sz = 10; + int64_t target_rank = 200; + int64_t custom_rank = 10; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); - RBKITestData all_data(m, n, k); + RBKITestData all_data(m, n); RandLAPACK::RBKI RBKI(false, false, tol); RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); - norm_and_copy_computational_helper(norm_A, all_data); - test_RBKI_general>(norm_A, all_data, RBKI, state); + test_RBKI_general>(b_sz, target_rank, custom_rank, all_data, RBKI, state); } From b115bda94eb44eadd0e7767c810d408ab84cda12 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Wed, 20 Mar 2024 12:44:04 -0700 Subject: [PATCH 44/56] Update before RBKI rewwork --- RandLAPACK/drivers/rl_rbki.hh | 43 +++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index f8e94b94..156cf066 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -191,12 +191,16 @@ int RBKI::call( // We need a full copy of X and Y all the way through the algorithm // due to an operation with X_odd and Y_odd happening at the end. + // Below pointers stay the same throughout the alg; the space will be alloacted iteratively // Space for Y_i and Y_odd. - T* Y = ( T * ) calloc( n * m, sizeof( T ) ); - // Space for X_i and X_ev. (maybe needs to be m by m + k) - T* X = ( T * ) calloc( m * (m + k), sizeof( T ) ); - // tau space for QR - T* tau = ( T * ) calloc( k, sizeof( T ) ); + T* Y_od = ( T * ) calloc( n * m, sizeof( T ) ); + //T* Y_od = ( T * ) calloc( n * k, sizeof( T ) ); + int64_t curr_Y_cols = k; + // Space for X_i and X_ev. + T* X_ev = ( T * ) calloc( m * (n + k), sizeof( T ) ); + //T* X_ev = ( T * ) calloc( m * k, sizeof( T ) ); + int64_t curr_X_cols = k; + // While R and S matrices are structured (both band), we cannot make use of this structure through // BLAS-level functions. // Note also that we store a transposed version of R. @@ -213,11 +217,8 @@ int RBKI::call( // Pointers allocation // Below pointers will be offset by (n or m) * k at every even iteration. - T* Y_i = Y; - T* X_i = X; - // Below pointers stay the same throughout the alg. - T* Y_od = Y; - T* X_ev = X; + T* Y_i = Y_od; + T* X_i = X_ev; // S and S pointers are offset at every step. T* R_i = NULL; T* R_ii = R; @@ -226,6 +227,8 @@ int RBKI::call( // Pre-decloration of SVD-related buffers. T* U_hat = NULL; T* VT_hat = NULL; + // tau space for QR + T* tau = ( T * ) calloc( k, sizeof( T ) ); if(this -> timing) { allocation_t_stop = high_resolution_clock::now(); @@ -300,8 +303,13 @@ int RBKI::call( gemm_A_t_stop = high_resolution_clock::now(); gemm_A_t_dur += duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); } - +/* + // Allocate more spece for Y_od + curr_X_cols += k; + realloc(X_ev, m * curr_X_cols * sizeof( T )); // Move the X_i pointer; + X_i = &X_ev[m * (curr_X_cols * k)]; +*/ X_i = &X_i[m * k]; if (iter != 1) { @@ -382,10 +390,15 @@ int RBKI::call( gemm_A_t_stop = high_resolution_clock::now(); gemm_A_t_dur += duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); } - +/* + // Allocate more spece for Y_od + curr_Y_cols += k; + realloc(Y_od, n * curr_Y_cols * sizeof( T )); // Move the X_i pointer; + Y_i = &Y_od[n * (curr_Y_cols - k)]; +*/ Y_i = &Y_i[n * k]; - + // S_i = X_ev' * X_i blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, S_i, n + k); @@ -516,8 +529,8 @@ int RBKI::call( allocation_t_start = high_resolution_clock::now(); } - free(Y); - free(X); + free(Y_od); + free(X_ev); free(tau); free(R); free(S); From fc50a553b6e9b43ef878a0c55ad19927c78a7019 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Thu, 21 Mar 2024 13:45:56 -0700 Subject: [PATCH 45/56] Reworkd allocation logic in RBKI. Old logic commented out --- RandLAPACK/drivers/rl_rbki.hh | 64 ++++++++++++++++++++++------------- test/drivers/test_rbki.cc | 4 +-- 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 156cf066..49fd50fe 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -193,12 +193,10 @@ int RBKI::call( // due to an operation with X_odd and Y_odd happening at the end. // Below pointers stay the same throughout the alg; the space will be alloacted iteratively // Space for Y_i and Y_odd. - T* Y_od = ( T * ) calloc( n * m, sizeof( T ) ); - //T* Y_od = ( T * ) calloc( n * k, sizeof( T ) ); + T* Y_od = ( T * ) calloc( n * k, sizeof( T ) ); int64_t curr_Y_cols = k; // Space for X_i and X_ev. - T* X_ev = ( T * ) calloc( m * (n + k), sizeof( T ) ); - //T* X_ev = ( T * ) calloc( m * k, sizeof( T ) ); + T* X_ev = ( T * ) calloc( m * k, sizeof( T ) ); int64_t curr_X_cols = k; // While R and S matrices are structured (both band), we cannot make use of this structure through @@ -209,9 +207,12 @@ int RBKI::call( // At the end, size of R would by d x d and size of S would // be (d + 1) x d, where d = numiters_complete * b_sz, d <= n. // Note that the total amount of iterations will always be numiters <= n * 2 / block_size - T* R = ( T * ) calloc( n * n, sizeof( T ) ); - T* S = ( T * ) calloc( (n + k) * n, sizeof( T ) ); + //T* R = ( T * ) calloc( n * n, sizeof( T ) ); + T* R = ( T * ) calloc( n * k, sizeof( T ) ); + //T* S = ( T * ) calloc( (n + k) * n, sizeof( T ) ); + T* S = ( T * ) calloc( (n + k) * k, sizeof( T ) ); + // These buffers are of constant size T* Y_orth_buf = ( T * ) calloc( k * n, sizeof( T ) ); T* X_orth_buf = ( T * ) calloc( k * (n + k), sizeof( T ) ); @@ -303,14 +304,14 @@ int RBKI::call( gemm_A_t_stop = high_resolution_clock::now(); gemm_A_t_dur += duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); } -/* + // Allocate more spece for Y_od curr_X_cols += k; - realloc(X_ev, m * curr_X_cols * sizeof( T )); + X_ev = ( T * ) realloc(X_ev, m * curr_X_cols * sizeof( T )); // Move the X_i pointer; - X_i = &X_ev[m * (curr_X_cols * k)]; -*/ - X_i = &X_i[m * k]; + X_i = &X_ev[m * (curr_X_cols - k)]; + + //X_i = &X_i[m * k]; if (iter != 1) { // R_i' = Y_i' * Y_od @@ -331,7 +332,7 @@ int RBKI::call( reorth_t_dur += duration_cast(reorth_t_stop - reorth_t_start).count(); } } - /****************************************ISSUE ABOVE****************************************************************************/ + // [Y_i, R_ii] = qr(Y_i, 0) std::fill(&tau[0], &tau[k], 0.0); @@ -372,9 +373,16 @@ int RBKI::call( break; } + // Allocate more space for R + R = ( T * ) realloc(R, n * curr_X_cols * sizeof( T )); + // Need to make sure the newly-allocated space is empty + memset(&R[n * (curr_X_cols - k)], 0.0, n * k * sizeof( T )); + // Advance R pointers - iter == 1 ? R_i = &R_ii[k] : R_i = &R_i[k]; - R_ii = &R_ii[(n + 1) * k]; + //iter == 1 ? R_i = &R_ii[k] : R_i = &R_i[k]; + //R_ii = &R_ii[(n + 1) * k]; + R_i = &R[(iter_ev + 1) * k]; + R_ii = &R[(n * k * (iter_ev + 1)) + k + (k * (iter_ev))]; // Advance even iteration count; ++iter_ev; @@ -390,14 +398,14 @@ int RBKI::call( gemm_A_t_stop = high_resolution_clock::now(); gemm_A_t_dur += duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); } -/* + // Allocate more spece for Y_od curr_Y_cols += k; - realloc(Y_od, n * curr_Y_cols * sizeof( T )); + Y_od = ( T * ) realloc(Y_od, n * curr_Y_cols * sizeof( T )); // Move the X_i pointer; Y_i = &Y_od[n * (curr_Y_cols - k)]; -*/ - Y_i = &Y_i[n * k]; + + //Y_i = &Y_i[n * k]; // S_i = X_ev' * X_i blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, S_i, n + k); @@ -455,12 +463,23 @@ int RBKI::call( break; } - // Advance R pointers - S_i = &S_i[(n + k) * k]; - S_ii = &S_ii[((n + k) + 1) * k]; + // Allocate more space for S + S = ( T * ) realloc(S, (n + k) * curr_Y_cols * sizeof( T )); + // Need to make sure the newly-allocated space is empty + memset(&S[(n + k)* (curr_Y_cols - k)], 0.0, (n + k) * k * sizeof( T )); + + //char name [] = "S"; + //RandBLAS::util::print_colmaj(n + k, curr_Y_cols, S, name); + + // Advance S pointers + //S_i = &S_i[(n + k) * k]; + //S_ii = &S_ii[((n + k) + 1) * k]; + S_i = &S[(n + k) * k * iter_od]; + S_ii = &S[(n + k) * k * iter_od + k + (iter_od * k)]; + // Advance odd iteration count; ++iter_od; - //RandBLAS::util::print_colmaj(n+k, n, S, name); + } if(this -> timing) @@ -528,7 +547,6 @@ int RBKI::call( get_factors_t_dur = duration_cast(get_factors_t_stop - get_factors_t_start).count(); allocation_t_start = high_resolution_clock::now(); } - free(Y_od); free(X_ev); free(tau); diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc index 9821c2ad..a6f588ae 100644 --- a/test/drivers/test_rbki.cc +++ b/test/drivers/test_rbki.cc @@ -107,11 +107,11 @@ class TestRBKI : public ::testing::Test // Note: If Subprocess killed exception -> reload vscode TEST_F(TestRBKI, RBKI_basic) { - int64_t m = 4000; + int64_t m = 400; int64_t n = 200; int64_t b_sz = 10; int64_t target_rank = 200; - int64_t custom_rank = 10; + int64_t custom_rank = 100; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); From 8e6211e6947c93af8cd85ea14cec689e568e79b9 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Thu, 21 Mar 2024 13:47:01 -0700 Subject: [PATCH 46/56] Removed commented out logic --- RandLAPACK/drivers/rl_rbki.hh | 9 --------- 1 file changed, 9 deletions(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 49fd50fe..fa518c34 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -207,9 +207,7 @@ int RBKI::call( // At the end, size of R would by d x d and size of S would // be (d + 1) x d, where d = numiters_complete * b_sz, d <= n. // Note that the total amount of iterations will always be numiters <= n * 2 / block_size - //T* R = ( T * ) calloc( n * n, sizeof( T ) ); T* R = ( T * ) calloc( n * k, sizeof( T ) ); - //T* S = ( T * ) calloc( (n + k) * n, sizeof( T ) ); T* S = ( T * ) calloc( (n + k) * k, sizeof( T ) ); // These buffers are of constant size @@ -379,8 +377,6 @@ int RBKI::call( memset(&R[n * (curr_X_cols - k)], 0.0, n * k * sizeof( T )); // Advance R pointers - //iter == 1 ? R_i = &R_ii[k] : R_i = &R_i[k]; - //R_ii = &R_ii[(n + 1) * k]; R_i = &R[(iter_ev + 1) * k]; R_ii = &R[(n * k * (iter_ev + 1)) + k + (k * (iter_ev))]; @@ -468,12 +464,7 @@ int RBKI::call( // Need to make sure the newly-allocated space is empty memset(&S[(n + k)* (curr_Y_cols - k)], 0.0, (n + k) * k * sizeof( T )); - //char name [] = "S"; - //RandBLAS::util::print_colmaj(n + k, curr_Y_cols, S, name); - // Advance S pointers - //S_i = &S_i[(n + k) * k]; - //S_ii = &S_ii[((n + k) + 1) * k]; S_i = &S[(n + k) * k * iter_od]; S_ii = &S[(n + k) * k * iter_od + k + (iter_od * k)]; From b824aa09035a2ffc402aa6dffd48ce81fb37ffa8 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Thu, 21 Mar 2024 13:47:25 -0700 Subject: [PATCH 47/56] Removed commented out logic --- RandLAPACK/drivers/rl_rbki.hh | 1 - 1 file changed, 1 deletion(-) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index fa518c34..225519c8 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -501,7 +501,6 @@ int RBKI::call( this -> norm_R_end = norm_R; this->num_krylov_iters = iter; - //iter % 2 == 0 ? end_rows = k * (iter_ev + 1), end_cols = k * iter_ev : end_rows = k * (iter_od + 1), end_cols = k * iter_od; end_cols = num_krylov_iters * k / 2; iter % 2 == 0 ? end_rows = end_cols + k : end_rows = end_cols; From 109edae21be6d3e8782e712d2071cfe02559c08c Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 1 Apr 2024 11:42:50 -0700 Subject: [PATCH 48/56] Update per Riley's comments --- RandLAPACK/comps/rl_orth.hh | 2 +- RandLAPACK/drivers/rl_cqrrp.hh | 5 +- RandLAPACK/drivers/rl_rbki.hh | 20 ++++---- RandLAPACK/misc/rl_gen.hh | 21 ++++----- RandLAPACK/misc/rl_util.hh | 29 +++++++++++- benchmark/CMakeLists.txt | 4 +- benchmark/Gemm_vs_ormqr.cc | 2 +- benchmark/bench_CQRRP/CQRRP_pivot_quality.cc | 4 ++ .../bench_CQRRP/CQRRP_runtime_breakdown.cc | 14 ++++++ ...breakdown.cc => CQRRP_single_precision.cc} | 4 ++ .../bench_CQRRP/CQRRP_speed_comparisons.cc | 11 +++++ .../bench_CQRRPT/CQRRPT_pivot_quality.cc | 4 ++ .../bench_CQRRPT/CQRRPT_runtime_breakdown.cc | 10 ++++ .../bench_CQRRPT/CQRRPT_speed_comparisons.cc | 11 +++++ ...benchmark.cc => RBKI_runtime_breakdown.cc} | 46 +++++++++++-------- .../bench_RBKI/RBKI_speed_comparisons.cc | 19 +++++--- test/drivers/test_rbki.cc | 11 ++--- 17 files changed, 156 insertions(+), 61 deletions(-) rename benchmark/bench_CQRRP/{CQRRP_Apple_runtime_breakdown.cc => CQRRP_single_precision.cc} (97%) rename benchmark/bench_RBKI/{RBKI_runtime_benchmark.cc => RBKI_runtime_breakdown.cc} (83%) diff --git a/RandLAPACK/comps/rl_orth.hh b/RandLAPACK/comps/rl_orth.hh index 136698d3..a1ea44ef 100644 --- a/RandLAPACK/comps/rl_orth.hh +++ b/RandLAPACK/comps/rl_orth.hh @@ -96,7 +96,7 @@ int CholQRQ::call( // Scheme may succeed, but output garbage if(this->cond_check) { if(util::cond_num_check(k, k, Q_gram.data(), (this->Q_gram_cpy).data(), (this->s).data(), this->verbosity) > (1 / std::sqrt(std::numeric_limits::epsilon()))){ - //return 1; + return 1; } } diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh index be436cef..080b7746 100644 --- a/RandLAPACK/drivers/rl_cqrrp.hh +++ b/RandLAPACK/drivers/rl_cqrrp.hh @@ -318,9 +318,8 @@ int CQRRP_blocked::call( // Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk. // Get a transpose of A_sk - #pragma omp parallel for - for(i = 0; i < cols; ++i) - blas::copy(sampling_dimension, &A_sk[i * d], 1, &A_sk_trans[i], n); + util::transposition(sampling_dimension, cols, A_sk, d, A_sk_trans, n, 0); + // Perform a row-pivoted LU on a transpose of A_sk lapack::getrf(cols, sampling_dimension, A_sk_trans, n, J_buffer_lu); // Fill the pivot vector, apply swaps found via lu on A_sk'. diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 225519c8..4c31191e 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -127,6 +127,9 @@ class RBKI : public RBKIalg { int max_krylov_iters; std::vector times; T norm_R_end; + + int num_threads_some; + int num_threads_rest; }; // ----------------------------------------------------------------------------- @@ -185,7 +188,7 @@ int RBKI::call( allocation_t_start = high_resolution_clock::now(); } - int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0; + int64_t iter = 0, iter_od = 0, iter_ev = 0, end_rows = 0, end_cols = 0; T norm_R = 0; int max_iters = this->max_krylov_iters;//std::min(this->max_krylov_iters, (int) (n / (T) k)); @@ -244,10 +247,10 @@ int RBKI::call( // Generate a dense Gaussian random matrx. // OMP_NUM_THREADS=4 seems to be the best option for dense sketch generation. - //omp_set_num_threads(4); + omp_set_num_threads(this->num_threads_some); RandBLAS::DenseDist D(n, k); state = RandBLAS::fill_dense(D, Y_i, state).second; - //omp_set_num_threads(48); + omp_set_num_threads(this->num_threads_rest); if(this -> timing) { sketching_t_stop = high_resolution_clock::now(); @@ -345,10 +348,9 @@ int RBKI::call( } // Copy R_ii over to R's (in transposed format). - omp_set_num_threads(4); - for(i = 0; i < k; ++i) - blas::copy(i + 1, &Y_i[i * n], 1, &R_ii[i], n); - omp_set_num_threads(48); + omp_set_num_threads(this->num_threads_some); + util::transposition(0, k, Y_i, n, R_ii, n, 1); + omp_set_num_threads(this->num_threads_rest); if(this -> timing) { r_cpy_t_stop = high_resolution_clock::now(); @@ -499,7 +501,7 @@ int RBKI::call( } } - this -> norm_R_end = norm_R; + this->norm_R_end = norm_R; this->num_krylov_iters = iter; end_cols = num_krylov_iters * k / 2; iter % 2 == 0 ? end_rows = end_cols + k : end_rows = end_cols; @@ -556,7 +558,7 @@ int RBKI::call( total_t_stop = high_resolution_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); long t_rest = total_t_dur - (allocation_t_dur + get_factors_t_dur + ungqr_t_dur + reorth_t_dur + qr_t_dur + gemm_A_t_dur + sketching_t_dur + r_cpy_t_dur + s_cpy_t_dur + norm_t_dur); - this -> times.resize(11); + this -> times.resize(13); this -> times = {allocation_t_dur, get_factors_t_dur, ungqr_t_dur, reorth_t_dur, qr_t_dur, gemm_A_t_dur, main_loop_t_dur, sketching_t_dur, r_cpy_t_dur, s_cpy_t_dur, norm_t_dur, t_rest, total_t_dur}; if (this -> verbosity) { diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 2490782b..5cbd4bb7 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -131,13 +131,10 @@ void gen_poly_mat( T b = std::pow(a * first_entry, -1 / p) - offset; // apply lambda function to every entry of s std::fill(s, s + offset, 1.0); - std::for_each(s + offset, s + k, - // Lambda expression begins - [&p, &offset, &a, &b](T &entry) { - entry = 1 / (a * std::pow(offset + b, p)); - ++offset; - } - ); + for (int i = offset; i < k; ++i) { + s[i] = 1 / (a * std::pow(offset + b, p)); + ++offset; + } // form a diagonal S RandLAPACK::util::diag(k, k, s, k, S); @@ -179,12 +176,10 @@ void gen_exp_mat( // apply lambda function to every entry of s // Please make sure that the first singular value is always 1 std::fill(s, s + offset, 1.0); - std::for_each(s + offset, s + k, - // Lambda expression begins - [&t, &cnt](T &entry) { - entry = (std::exp(++cnt * -t)); - } - ); + for (int i = offset; i < k; ++i) { + s[i] = (std::exp(++cnt * -t)); + ++offset; + } // form a diagonal S RandLAPACK::util::diag(k, k, s, k, S); diff --git a/RandLAPACK/misc/rl_util.hh b/RandLAPACK/misc/rl_util.hh index 911c8177..0368235f 100644 --- a/RandLAPACK/misc/rl_util.hh +++ b/RandLAPACK/misc/rl_util.hh @@ -187,7 +187,7 @@ template T cond_num_check( int64_t m, int64_t n, - T* A, + const T* A, T* A_cpy, T* s, bool verbose @@ -213,7 +213,7 @@ template int64_t rank_check( int64_t m, int64_t n, - T* A + const T* A ) { T* A_pre_cpy = ( T * ) calloc( m * n, sizeof( T ) ); T* s = ( T * ) calloc( n, sizeof( T ) ); @@ -389,5 +389,30 @@ void eat_lda_slack( delete [] work; } +// Perform an explicit transposition of a given matrix, +// write the transpose into a buffer. +// WARNING: OMP parallelism occasionally tanks the performance. +template +void transposition( + int64_t m, + int64_t n, + const T* A, + int64_t lda, + T* AT, + int64_t ldat, + int copy_upper_triangle +) { + if (copy_upper_triangle) { + // Only transposing the upper-triangular portion of the original + #pragma omp parallel for + for(int i = 0; i < n; ++i) + blas::copy(i + 1, &A[i * lda], 1, &AT[i], ldat); + } else { + #pragma omp parallel for + for(int i = 0; i < n; ++i) + blas::copy(m, &A[i * lda], 1, &AT[i], ldat); + } +} + } // end namespace util #endif diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index ea306c55..03106f0b 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -60,8 +60,8 @@ add_benchmark(NAME CQRRPT_pivot_quality CXX_SOURCES bench_CQRRPT/CQRRPT_pivo # CQRRP benchmarks add_benchmark(NAME CQRRP_speed_comparisons CXX_SOURCES bench_CQRRP/CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) add_benchmark(NAME CQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CQRRP_Apple_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRP_single_precision CXX_SOURCES bench_CQRRP/CQRRP_single_precision.cc LINK_LIBS ${Benchmark_libs}) add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) add_benchmark(NAME RBKI_speed_comparisons CXX_SOURCES bench_RBKI/RBKI_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME RBKI_runtime_benchmark CXX_SOURCES bench_RBKI/RBKI_runtime_benchmark.cc LINK_LIBS ${Benchmark_libs}) \ No newline at end of file +add_benchmark(NAME RBKI_runtime_breakdown CXX_SOURCES bench_RBKI/RBKI_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) \ No newline at end of file diff --git a/benchmark/Gemm_vs_ormqr.cc b/benchmark/Gemm_vs_ormqr.cc index 1f587f84..91c8a203 100644 --- a/benchmark/Gemm_vs_ormqr.cc +++ b/benchmark/Gemm_vs_ormqr.cc @@ -73,4 +73,4 @@ int main() { test_speed(std::pow(2, 14), std::pow(2, 9), 10, state); test_speed(std::pow(2, 15), std::pow(2, 10), 10, state); return 0; -} \ No newline at end of file +} diff --git a/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc b/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc index f3429cd9..4eead9e7 100644 --- a/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc +++ b/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc @@ -1,3 +1,7 @@ +/* +Performs computations in order to assess the pivot quality of ICQRRP. +The setup is described in detail in Section 4 of The CQRRPT (https://arxiv.org/pdf/2311.08316.pdf) paper. +*/ #include "RandLAPACK.hh" #include "rl_blaspp.hh" #include "rl_lapackpp.hh" diff --git a/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc b/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc index c6f44e84..cb7fae7e 100644 --- a/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc +++ b/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc @@ -1,3 +1,17 @@ +/* +ICQRRP runtime breakdown benchmark - assesses the time taken by each subcomponent of ICQRRP. +There are 9 things that we time: + 1. SASO generation and application time + 2. QRCP time. + 3. Preconditioning time. + 4. Time to perform Cholesky QR. + 5. Time to restore Householder vectors. + 6. Time to compute A_new, R12. + 7. Time to update factors Q, R. + 8. Time to update the sketch. + 9. Time to pivot trailing columns of R-factor. +*/ + #include "RandLAPACK.hh" #include "rl_blaspp.hh" #include "rl_lapackpp.hh" diff --git a/benchmark/bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc b/benchmark/bench_CQRRP/CQRRP_single_precision.cc similarity index 97% rename from benchmark/bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc rename to benchmark/bench_CQRRP/CQRRP_single_precision.cc index 08400e66..ff238ad0 100644 --- a/benchmark/bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc +++ b/benchmark/bench_CQRRP/CQRRP_single_precision.cc @@ -1,3 +1,7 @@ +/* +This benchmarks compares single-precision ICQRRP with double-precision GETRF and GEQRF. +We anticipate that single-precision ICQRRP can be used as part of the linear system solving process. +*/ #include "RandLAPACK.hh" #include "rl_blaspp.hh" #include "rl_lapackpp.hh" diff --git a/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc b/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc index f259c426..82498e37 100644 --- a/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc +++ b/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc @@ -1,3 +1,14 @@ +/* +ICQRRP speed comparison benchmark - runs: + 1. ICQRRP + 2. GEQRF + 3. GEQP3 - takes too long! + 5. HQRRP + CholQR + 6. HQRRP + GEQRF +for a matrix with fixed number of rows and columns and a varying ICQRRP block size. +Records the best timing, saves that into a file. +*/ + #include "RandLAPACK.hh" #include "rl_blaspp.hh" #include "rl_lapackpp.hh" diff --git a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc index c0ae5b16..5ccbcec7 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc @@ -1,3 +1,7 @@ +/* +Performs computations in order to assess the pivot quality of CQRRPT. +The setup is described in detail in Section 4 of The CQRRPT (https://arxiv.org/pdf/2311.08316.pdf) paper. +*/ #include "RandLAPACK.hh" #include "rl_blaspp.hh" #include "rl_lapackpp.hh" diff --git a/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc b/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc index 953d88e2..8bb7b469 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc @@ -1,3 +1,13 @@ +/* +CQRRPT runtime breakdown benchmark - assesses the time taken by each subcomponent of CQRRPT. +There are 6 things that we time: + 1. SASO generation and application time + 2. QRCP time. + 3. Time it takes to compute numerical rank k. + 4. piv(A). + 5. TRSM(A). + 6. Time to perform Cholesky QR. +*/ #include "RandLAPACK.hh" #include "rl_blaspp.hh" #include "rl_lapackpp.hh" diff --git a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc index c8ad5e5a..7f501878 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc @@ -1,3 +1,14 @@ +/* +CQRRPT speed comparison benchmark - runs: + 1. CQRRPT + 2. GEQR + 3. GEQRF + 4. GEQP3 + 5. GEQPT + 6. SCHOLQR +for a matrix with fixed number of rows and a varying number of columns. +Records the best timing, saves that into a file. +*/ #include "RandLAPACK.hh" #include "rl_blaspp.hh" #include "rl_lapackpp.hh" diff --git a/benchmark/bench_RBKI/RBKI_runtime_benchmark.cc b/benchmark/bench_RBKI/RBKI_runtime_breakdown.cc similarity index 83% rename from benchmark/bench_RBKI/RBKI_runtime_benchmark.cc rename to benchmark/bench_RBKI/RBKI_runtime_breakdown.cc index e4947051..3de93820 100644 --- a/benchmark/bench_RBKI/RBKI_runtime_benchmark.cc +++ b/benchmark/bench_RBKI/RBKI_runtime_breakdown.cc @@ -1,3 +1,18 @@ +/* +RBKI runtime breakdown benchmark - assesses the time taken by each subcomponent of RBKI. +Records all, data, not just the best. +There are 10 things that we time: + 1.Allocate and free time. + 2.Time to acquire the SVD factors. + 3.UNGQR time. + 4.Reorthogonalization time. + 5.QR time. + 6.GEMM A time. + 7.Sketching time. + 8.R_ii cpy time. + 9.S_ii cpy time. + 10.Norm R time. +*/ #include "RandLAPACK.hh" #include "rl_blaspp.hh" #include "rl_lapackpp.hh" @@ -68,35 +83,30 @@ static void call_all_algs( RandLAPACK::RBKI RBKI(false, time_subroutines, tol); RBKI.max_krylov_iters = num_krylov_iters; - // Making sure the states are unchanged auto state_gen = state; // Timing vars - long dur_rbki = 0; - long t_rbki_best = 0; - std::vector inner_timing_best; + std::vector inner_timing; for (int i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); - auto start_rbki = high_resolution_clock::now(); RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); - auto stop_rbki = high_resolution_clock::now(); - dur_rbki = duration_cast(stop_rbki - start_rbki).count(); - // Update best timing - if (!i || dur_rbki < t_rbki_best) {t_rbki_best = dur_rbki; inner_timing_best = RBKI.times;} + + // Update timing vector + inner_timing = RBKI.times; + // Add info about the run + inner_timing.insert (inner_timing.begin(), k); + inner_timing.insert (inner_timing.begin(), num_krylov_iters); + + std::ofstream file(output_filename, std::ios::app); + std::copy(inner_timing.begin(), inner_timing.end(), std::ostream_iterator(file, ", ")); + file << "\n"; + // Clear and re-generate data data_regen(m_info, all_data, state_gen, 0); state_gen = state; } - - // Add info about the run - inner_timing_best.insert (inner_timing_best.begin(), k); - inner_timing_best.insert (inner_timing_best.begin(), num_krylov_iters); - - std::ofstream file(output_filename, std::ios::app); - std::copy(inner_timing_best.begin(), inner_timing_best.end(), std::ostream_iterator(file, ", ")); - file << "\n"; } int main(int argc, char *argv[]) { @@ -157,4 +167,4 @@ int main(int argc, char *argv[]) { } num_krylov_iters_curr = num_krylov_iters_start; } -} \ No newline at end of file +} diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 08fa2b41..f88a6e7c 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -1,3 +1,11 @@ +/* +RBKI speed comparison benchmark - technically only runs RBKI, but has an option to run SVD (gesdd()) to be compared against RBKI (direct SVD is WAY slower than RBKI). +The user is required to provide a matrix file to be read, set min and max numbers of large gemms (Krylov iterations) that the algorithm is allowed to perform min and max block sizes that RBKI is to use; +furthermore, the user is to provide a 'custom rank' parameter (number of singular vectors to approximate by RBKI). +The benchmark outputs the basic data of a given run, as well as the RBKI runtime and singular vector residual error, +which is computed as "sqrt(||AV - SU||^2_F + ||A'U - VS||^2_F / sqrt(custom_rank)" (for "custom rank" singular vectors and values). +*/ + #include "RandLAPACK.hh" #include "rl_blaspp.hh" #include "rl_lapackpp.hh" @@ -93,7 +101,7 @@ residual_error_comp(RBKI_benchmark_data &all_data, int64_t target_rank, int64 T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m); T nrm2 = lapack::lange(Norm::Fro, custom_rank, n, VT_cpy_dat, n); - return std::sqrt(std::pow(nrm1, 2) + std::pow(nrm2, 2)); + return std::hypot(nrm1, nrm2); } template @@ -109,16 +117,16 @@ static void call_all_algs( long dur_svd) { printf("\nBlock size %ld, num matmuls %ld\n", b_sz, num_matmuls); - int i, j; + int i; auto m = all_data.row; auto n = all_data.col; auto tol = all_data.tolerance; - T norm_svd_k; - T err_rbki; bool time_subroutines = false; // Additional params setup. RandLAPACK::RBKI RBKI(false, time_subroutines, tol); + RBKI.num_threads_some = 4; + RBKI.num_threads_rest = 48; // Matrices R or S that give us the singular value spectrum returned by RBKI will be of size b_sz * num_krylov_iters / 2. // These matrices will be full-rank. // Hence, target_rank = b_sz * num_krylov_iters / 2 @@ -130,7 +138,6 @@ static void call_all_algs( // timing vars long dur_rbki = 0; - long t_rbki_best = 0; // Making sure the states are unchanged auto state_gen = state; @@ -218,4 +225,4 @@ int main(int argc, char *argv[]) { } num_matmuls_curr = num_matmuls_start; } -} \ No newline at end of file +} diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc index a6f588ae..d0b5a1e9 100644 --- a/test/drivers/test_rbki.cc +++ b/test/drivers/test_rbki.cc @@ -58,7 +58,6 @@ class TestRBKI : public ::testing::Test for (int i = 0; i < custom_rank; ++i) blas::scal(m, all_data.Sigma[i], &U_cpy_dat[m * i], 1); - // Compute AV(:, 1:custom_rank) - SU(1:custom_rank) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, m, custom_rank, n, 1.0, all_data.A.data(), m, all_data.VT.data(), n, -1.0, U_cpy_dat, m); @@ -67,19 +66,16 @@ class TestRBKI : public ::testing::Test // Scale columns of V by S // Since we have VT, we will be scaling its rows // The data is, however, stored in a column-major format, so it is a bit weird. - //for (int i = 0; i < n; ++i) - // blas::scal(custom_rank, all_data.Sigma[i], &VT_cpy_dat[i], n); for (int i = 0; i < custom_rank; ++i) blas::scal(n, all_data.Sigma[i], &VT_cpy_dat[i], n); // Compute A'U(:, 1:custom_rank) - VS(1:custom_rank). // We will actually have to perform U' * A - Sigma * VT. - blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, custom_rank, n, m, 1.0, all_data.U.data(), m, all_data.A.data(), m, -1.0, VT_cpy_dat, n); T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m); T nrm2 = lapack::lange(Norm::Fro, custom_rank, n, VT_cpy_dat, n); - return std::sqrt(std::pow(nrm1, 2) + std::pow(nrm2, 2)); + return std::hypot(nrm1, nrm2); } @@ -101,7 +97,8 @@ class TestRBKI : public ::testing::Test T residual_err_custom = residual_error_comp(all_data, custom_rank); printf("residual_err_custom %e\n", residual_err_custom); - ASSERT_NEAR(residual_err_custom, 8.039386e-13, std::pow(std::numeric_limits::epsilon(), 0.825)); + //ASSERT_NEAR(residual_err_custom, 8.039386e-13, std::pow(std::numeric_limits::epsilon(), 0.825)); + ASSERT_LE(residual_err_custom, 10 * std::pow(std::numeric_limits::epsilon(), 0.825)); } }; @@ -117,6 +114,8 @@ TEST_F(TestRBKI, RBKI_basic) { RBKITestData all_data(m, n); RandLAPACK::RBKI RBKI(false, false, tol); + RBKI.num_threads_some = 4; + RBKI.num_threads_rest = 16; RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); From 5cf905a4bc7dae47bfd100f9adeb470e70f6a159 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 1 Apr 2024 11:46:24 -0700 Subject: [PATCH 49/56] Git compilation update --- RandLAPACK/drivers/rl_hqrrp.hh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/RandLAPACK/drivers/rl_hqrrp.hh b/RandLAPACK/drivers/rl_hqrrp.hh index 8a568376..1dd6f2d3 100644 --- a/RandLAPACK/drivers/rl_hqrrp.hh +++ b/RandLAPACK/drivers/rl_hqrrp.hh @@ -98,7 +98,7 @@ void _LAPACK_lafrb( & m_, & n_, & k_, (double *) buff_U, & ldim_U, (double *) buff_T, & ldim_T, (double *) buff_B, & ldim_B, (double *) buff_W, & ldim_W #ifdef LAPACK_FORTRAN_STRLEN_END - //, 1, 1, 1, 1 + , 1, 1, 1, 1 #endif ); } else if (typeid(T) == typeid(float)) { @@ -106,7 +106,7 @@ void _LAPACK_lafrb( & m_, & n_, & k_, (float *) buff_U, & ldim_U, (float *) buff_T, & ldim_T, (float *) buff_B, & ldim_B, (float *) buff_W, & ldim_W #ifdef LAPACK_FORTRAN_STRLEN_END - //, 1, 1, 1, 1 + , 1, 1, 1, 1 #endif ); } else { @@ -136,7 +136,7 @@ void _LAPACK_larf( (double *) C, & ldc_, (double *) work #ifdef LAPACK_FORTRAN_STRLEN_END - //, 1 + , 1 #endif ); } else if (typeid(T) == typeid(float)) { @@ -146,7 +146,7 @@ void _LAPACK_larf( (float *) C, & ldc_, (float *) work #ifdef LAPACK_FORTRAN_STRLEN_END - //, 1 + , 1 #endif ); } else { @@ -398,7 +398,7 @@ static int64_t NoFLA_QRP_downdate_partial_norms( char dlmach_param = 'E'; tol3z = sqrt( LAPACK_dlamch( & dlmach_param #ifdef LAPACK_FORTRAN_STRLEN_END - //, 1 + , 1 #endif ) ); ptr_d = buff_d; From 41a81173020a8b6829738018d2b1db9b3d7e3a85 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 1 Apr 2024 13:12:17 -0700 Subject: [PATCH 50/56] Updae --- benchmark/bench_CQRRP/CQRRP_pivot_quality.cc | 2 +- benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc | 2 +- test/drivers/test_rbki.cc | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc b/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc index 4eead9e7..4bf840de 100644 --- a/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc +++ b/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc @@ -1,6 +1,6 @@ /* Performs computations in order to assess the pivot quality of ICQRRP. -The setup is described in detail in Section 4 of The CQRRPT (https://arxiv.org/pdf/2311.08316.pdf) paper. +The setup is described in detail in Section 4 of The arXiv version 2 CQRRPT (https://arxiv.org/pdf/2311.08316.pdf) paper. */ #include "RandLAPACK.hh" #include "rl_blaspp.hh" diff --git a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc index 5ccbcec7..034935ad 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc @@ -1,6 +1,6 @@ /* Performs computations in order to assess the pivot quality of CQRRPT. -The setup is described in detail in Section 4 of The CQRRPT (https://arxiv.org/pdf/2311.08316.pdf) paper. +The setup is described in detail in Section 4 of The arXiv version 2 CQRRPT (https://arxiv.org/pdf/2311.08316.pdf) paper. */ #include "RandLAPACK.hh" #include "rl_blaspp.hh" diff --git a/test/drivers/test_rbki.cc b/test/drivers/test_rbki.cc index d0b5a1e9..2f30faea 100644 --- a/test/drivers/test_rbki.cc +++ b/test/drivers/test_rbki.cc @@ -97,7 +97,6 @@ class TestRBKI : public ::testing::Test T residual_err_custom = residual_error_comp(all_data, custom_rank); printf("residual_err_custom %e\n", residual_err_custom); - //ASSERT_NEAR(residual_err_custom, 8.039386e-13, std::pow(std::numeric_limits::epsilon(), 0.825)); ASSERT_LE(residual_err_custom, 10 * std::pow(std::numeric_limits::epsilon(), 0.825)); } }; From 9865d287988481e8c5aa4c2b2b67bdae31e09f67 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 1 Apr 2024 13:17:47 -0700 Subject: [PATCH 51/56] Update --- RandLAPACK/drivers/rl_hqrrp.hh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/RandLAPACK/drivers/rl_hqrrp.hh b/RandLAPACK/drivers/rl_hqrrp.hh index 1dd6f2d3..8a568376 100644 --- a/RandLAPACK/drivers/rl_hqrrp.hh +++ b/RandLAPACK/drivers/rl_hqrrp.hh @@ -98,7 +98,7 @@ void _LAPACK_lafrb( & m_, & n_, & k_, (double *) buff_U, & ldim_U, (double *) buff_T, & ldim_T, (double *) buff_B, & ldim_B, (double *) buff_W, & ldim_W #ifdef LAPACK_FORTRAN_STRLEN_END - , 1, 1, 1, 1 + //, 1, 1, 1, 1 #endif ); } else if (typeid(T) == typeid(float)) { @@ -106,7 +106,7 @@ void _LAPACK_lafrb( & m_, & n_, & k_, (float *) buff_U, & ldim_U, (float *) buff_T, & ldim_T, (float *) buff_B, & ldim_B, (float *) buff_W, & ldim_W #ifdef LAPACK_FORTRAN_STRLEN_END - , 1, 1, 1, 1 + //, 1, 1, 1, 1 #endif ); } else { @@ -136,7 +136,7 @@ void _LAPACK_larf( (double *) C, & ldc_, (double *) work #ifdef LAPACK_FORTRAN_STRLEN_END - , 1 + //, 1 #endif ); } else if (typeid(T) == typeid(float)) { @@ -146,7 +146,7 @@ void _LAPACK_larf( (float *) C, & ldc_, (float *) work #ifdef LAPACK_FORTRAN_STRLEN_END - , 1 + //, 1 #endif ); } else { @@ -398,7 +398,7 @@ static int64_t NoFLA_QRP_downdate_partial_norms( char dlmach_param = 'E'; tol3z = sqrt( LAPACK_dlamch( & dlmach_param #ifdef LAPACK_FORTRAN_STRLEN_END - , 1 + //, 1 #endif ) ); ptr_d = buff_d; From 5d869fc18a3404bb43052bef0af86a1f443ebaa6 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 1 Apr 2024 13:32:43 -0700 Subject: [PATCH 52/56] Messing with the generator --- RandLAPACK/misc/rl_gen.hh | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 5cbd4bb7..53d98819 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -111,7 +111,7 @@ template void gen_poly_mat( int64_t &m, int64_t &n, - T* A, + std::vector &A, int64_t k, T cond, T p, @@ -120,8 +120,8 @@ void gen_poly_mat( ) { // Predeclare to all nonzero constants, start decay where needed - T* s = ( T * ) calloc( k, sizeof( T ) ); - T* S = ( T * ) calloc( k * k, sizeof( T ) ); + std::vector s(k, 1.0); + std::vector S(k * k, 0.0); // The first 10% of the singular values will be equal to one int offset = (int) floor(k * 0.1); @@ -130,23 +130,27 @@ void gen_poly_mat( T a = std::pow((std::pow(last_entry, -1 / p) - std::pow(first_entry, -1 / p)) / (k - offset), p); T b = std::pow(a * first_entry, -1 / p) - offset; // apply lambda function to every entry of s - std::fill(s, s + offset, 1.0); - for (int i = offset; i < k; ++i) { - s[i] = 1 / (a * std::pow(offset + b, p)); - ++offset; - } + std::for_each(s.begin() + offset, s.end(), + // Lambda expression begins + [&p, &offset, &a, &b](T &entry) { + entry = 1 / (a * std::pow(offset + b, p)); + ++offset; + } + ); // form a diagonal S RandLAPACK::util::diag(k, k, s, k, S); if (diagon) { - lapack::lacpy(MatrixType::General, k, k, S, k, A, k); + if (!(m == k || n == k)) { + m = k; + n = k; + A.resize(k * k); + } + lapack::lacpy(MatrixType::General, k, k, S.data(), k, A.data(), k); } else { RandLAPACK::gen::gen_singvec(m, n, A, k, S, state); } - - free(s); - free(S); } /// Generates a matrix with exponentially-decaying spectrum of the following form: From e5dd024856367a3ed3690ba3eb7a22d69df0b5e6 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 1 Apr 2024 13:36:50 -0700 Subject: [PATCH 53/56] Messing with the generator --- RandLAPACK/misc/rl_gen.hh | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 53d98819..b95a8e98 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -111,7 +111,7 @@ template void gen_poly_mat( int64_t &m, int64_t &n, - std::vector &A, + T* A, int64_t k, T cond, T p, @@ -120,8 +120,8 @@ void gen_poly_mat( ) { // Predeclare to all nonzero constants, start decay where needed - std::vector s(k, 1.0); - std::vector S(k * k, 0.0); + T* s = ( T * ) calloc( k, sizeof( T ) ); + T* S = ( T * ) calloc( k * k, sizeof( T ) ); // The first 10% of the singular values will be equal to one int offset = (int) floor(k * 0.1); @@ -130,27 +130,31 @@ void gen_poly_mat( T a = std::pow((std::pow(last_entry, -1 / p) - std::pow(first_entry, -1 / p)) / (k - offset), p); T b = std::pow(a * first_entry, -1 / p) - offset; // apply lambda function to every entry of s - std::for_each(s.begin() + offset, s.end(), + std::fill(s, s + offset, 1.0); + std::for_each(s + offset, s + k, // Lambda expression begins [&p, &offset, &a, &b](T &entry) { entry = 1 / (a * std::pow(offset + b, p)); ++offset; } ); - +/* + for (int i = offset; i < k; ++i) { + s[i] = 1 / (a * std::pow(offset + b, p)); + ++offset; + } +*/ // form a diagonal S RandLAPACK::util::diag(k, k, s, k, S); if (diagon) { - if (!(m == k || n == k)) { - m = k; - n = k; - A.resize(k * k); - } - lapack::lacpy(MatrixType::General, k, k, S.data(), k, A.data(), k); + lapack::lacpy(MatrixType::General, k, k, S, k, A, k); } else { RandLAPACK::gen::gen_singvec(m, n, A, k, S, state); } + + free(s); + free(S); } /// Generates a matrix with exponentially-decaying spectrum of the following form: From 9b8da934b7c9cee3d7d25d7694ed99be193d7e21 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 1 Apr 2024 13:54:26 -0700 Subject: [PATCH 54/56] Forgot to free the memory in condition number check. --- RandLAPACK/misc/rl_gen.hh | 10 +--------- RandLAPACK/misc/rl_util.hh | 3 +++ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index b95a8e98..5cbd4bb7 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -131,19 +131,11 @@ void gen_poly_mat( T b = std::pow(a * first_entry, -1 / p) - offset; // apply lambda function to every entry of s std::fill(s, s + offset, 1.0); - std::for_each(s + offset, s + k, - // Lambda expression begins - [&p, &offset, &a, &b](T &entry) { - entry = 1 / (a * std::pow(offset + b, p)); - ++offset; - } - ); -/* for (int i = offset; i < k; ++i) { s[i] = 1 / (a * std::pow(offset + b, p)); ++offset; } -*/ + // form a diagonal S RandLAPACK::util::diag(k, k, s, k, S); diff --git a/RandLAPACK/misc/rl_util.hh b/RandLAPACK/misc/rl_util.hh index 0368235f..f173a3c7 100644 --- a/RandLAPACK/misc/rl_util.hh +++ b/RandLAPACK/misc/rl_util.hh @@ -205,6 +205,9 @@ T cond_num_check( if (verbose) printf("CONDITION NUMBER: %f\n", cond_num); + free(A_cpy); + free(s); + return cond_num; } From ecf60d36a806b97fa5f957f0188e0aa3d3dab775 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 1 Apr 2024 14:04:11 -0700 Subject: [PATCH 55/56] Fixing omp mac issue --- RandLAPACK/drivers/rl_rbki.hh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 4c31191e..5f241937 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -247,10 +247,14 @@ int RBKI::call( // Generate a dense Gaussian random matrx. // OMP_NUM_THREADS=4 seems to be the best option for dense sketch generation. +#if !defined(__APPLE__) omp_set_num_threads(this->num_threads_some); +#endif RandBLAS::DenseDist D(n, k); state = RandBLAS::fill_dense(D, Y_i, state).second; +#if !defined(__APPLE__) omp_set_num_threads(this->num_threads_rest); +#endif if(this -> timing) { sketching_t_stop = high_resolution_clock::now(); From 9e1e036c50615aa921f063cf433ba2f6e8af634f Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 1 Apr 2024 14:15:33 -0700 Subject: [PATCH 56/56] Fixing omp mac issue --- RandLAPACK/drivers/rl_rbki.hh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 5f241937..f56cf1c0 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -352,9 +352,13 @@ int RBKI::call( } // Copy R_ii over to R's (in transposed format). +#if !defined(__APPLE__) omp_set_num_threads(this->num_threads_some); +#endif util::transposition(0, k, Y_i, n, R_ii, n, 1); +#if !defined(__APPLE__) omp_set_num_threads(this->num_threads_rest); +#endif if(this -> timing) { r_cpy_t_stop = high_resolution_clock::now();