Update per Riley's comments

BallisticLA · Apr 1, 2024 · 109edae · 109edae
1 parent b824aa0
commit 109edae
Show file tree

Hide file tree

Showing 17 changed files with 156 additions and 61 deletions.
diff --git a/RandLAPACK/comps/rl_orth.hh b/RandLAPACK/comps/rl_orth.hh
@@ -96,7 +96,7 @@ int CholQRQ<T>::call(
     // Scheme may succeed, but output garbage
     if(this->cond_check) {
         if(util::cond_num_check(k, k, Q_gram.data(), (this->Q_gram_cpy).data(), (this->s).data(), this->verbosity) > (1 / std::sqrt(std::numeric_limits<T>::epsilon()))){
-                //return 1;
+                return 1;
         }
     }
 

diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh
@@ -318,9 +318,8 @@ int CQRRP_blocked<T, RNG>::call(
 
         // Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk.
         // Get a transpose of A_sk 
-        #pragma omp parallel for
-        for(i = 0; i < cols; ++i)
-            blas::copy(sampling_dimension, &A_sk[i * d], 1, &A_sk_trans[i], n);
+        util::transposition(sampling_dimension, cols, A_sk, d, A_sk_trans, n, 0);
+
         // Perform a row-pivoted LU on a transpose of A_sk
         lapack::getrf(cols, sampling_dimension, A_sk_trans, n, J_buffer_lu);
         // Fill the pivot vector, apply swaps found via lu on A_sk'.

diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh
@@ -127,6 +127,9 @@ class RBKI : public RBKIalg<T, RNG> {
         int max_krylov_iters;
         std::vector<long> times;
         T norm_R_end;
+
+        int num_threads_some;
+        int num_threads_rest;
 };
 
 // -----------------------------------------------------------------------------
@@ -185,7 +188,7 @@ int RBKI<T, RNG>::call(
         allocation_t_start  = high_resolution_clock::now();
     }
 
-    int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0;
+    int64_t iter = 0, iter_od = 0, iter_ev = 0, end_rows = 0, end_cols = 0;
     T norm_R = 0;
     int max_iters = this->max_krylov_iters;//std::min(this->max_krylov_iters, (int) (n / (T) k));
 
@@ -244,10 +247,10 @@ int RBKI<T, RNG>::call(
 
     // Generate a dense Gaussian random matrx.
     // OMP_NUM_THREADS=4 seems to be the best option for dense sketch generation.
-    //omp_set_num_threads(4);
+    omp_set_num_threads(this->num_threads_some);
     RandBLAS::DenseDist D(n, k);
     state = RandBLAS::fill_dense(D, Y_i, state).second;
-    //omp_set_num_threads(48);
+    omp_set_num_threads(this->num_threads_rest);
 
     if(this -> timing) {
         sketching_t_stop  = high_resolution_clock::now();
@@ -345,10 +348,9 @@ int RBKI<T, RNG>::call(
             }
 
             // Copy R_ii over to R's (in transposed format).
-            omp_set_num_threads(4);
-            for(i = 0; i < k; ++i)
-                blas::copy(i + 1, &Y_i[i * n], 1, &R_ii[i], n);
-            omp_set_num_threads(48);
+            omp_set_num_threads(this->num_threads_some);
+            util::transposition(0, k, Y_i, n, R_ii, n, 1);
+            omp_set_num_threads(this->num_threads_rest);
 
             if(this -> timing) {
                 r_cpy_t_stop  = high_resolution_clock::now();
@@ -499,7 +501,7 @@ int RBKI<T, RNG>::call(
         }
     }
 
-    this -> norm_R_end = norm_R;
+    this->norm_R_end = norm_R;
     this->num_krylov_iters = iter;
     end_cols = num_krylov_iters * k / 2;
     iter % 2 == 0 ? end_rows = end_cols + k : end_rows = end_cols;
@@ -556,7 +558,7 @@ int RBKI<T, RNG>::call(
             total_t_stop = high_resolution_clock::now();
             total_t_dur  = duration_cast<microseconds>(total_t_stop - total_t_start).count();
             long t_rest  = total_t_dur - (allocation_t_dur + get_factors_t_dur + ungqr_t_dur + reorth_t_dur + qr_t_dur + gemm_A_t_dur + sketching_t_dur + r_cpy_t_dur + s_cpy_t_dur + norm_t_dur);
-            this -> times.resize(11);
+            this -> times.resize(13);
             this -> times = {allocation_t_dur, get_factors_t_dur, ungqr_t_dur, reorth_t_dur, qr_t_dur, gemm_A_t_dur, main_loop_t_dur, sketching_t_dur, r_cpy_t_dur, s_cpy_t_dur, norm_t_dur, t_rest, total_t_dur};
 
             if (this -> verbosity) {

diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh
@@ -131,13 +131,10 @@ void gen_poly_mat(
     T b = std::pow(a * first_entry, -1 / p) - offset;
     // apply lambda function to every entry of s
     std::fill(s, s + offset, 1.0);
-    std::for_each(s + offset, s + k,
-        // Lambda expression begins
-        [&p, &offset, &a, &b](T &entry) {
-                entry = 1 / (a * std::pow(offset + b, p));
-                ++offset;
-        }
-    );
+    for (int i = offset; i < k; ++i) {
+        s[i] = 1 / (a * std::pow(offset + b, p));
+        ++offset;
+    }
 
     // form a diagonal S
     RandLAPACK::util::diag(k, k, s, k, S);
@@ -179,12 +176,10 @@ void gen_exp_mat(
     // apply lambda function to every entry of s
     // Please make sure that the first singular value is always 1
     std::fill(s, s + offset, 1.0);
-    std::for_each(s + offset, s + k,
-        // Lambda expression begins
-        [&t, &cnt](T &entry) {
-                entry = (std::exp(++cnt * -t));
-        }
-    );
+    for (int i = offset; i < k; ++i) {
+        s[i] = (std::exp(++cnt * -t));
+        ++offset;
+    }
 
     // form a diagonal S
     RandLAPACK::util::diag(k, k, s, k, S);

diff --git a/RandLAPACK/misc/rl_util.hh b/RandLAPACK/misc/rl_util.hh
@@ -187,7 +187,7 @@ template <typename T>
 T cond_num_check(
     int64_t m,
     int64_t n,
-    T* A,
+    const T* A,
     T* A_cpy,
     T* s,
     bool verbose
@@ -213,7 +213,7 @@ template <typename T>
 int64_t rank_check(
     int64_t m,
     int64_t n,
-    T* A
+    const T* A
 ) {
     T* A_pre_cpy = ( T * ) calloc( m * n, sizeof( T ) );
     T* s     = ( T * ) calloc( n, sizeof( T ) );
@@ -389,5 +389,30 @@ void eat_lda_slack(
     delete [] work;
 }
 
+// Perform an explicit transposition of a given matrix, 
+// write the transpose into a buffer.
+// WARNING: OMP parallelism occasionally tanks the performance.
+template <typename T>
+void transposition(
+    int64_t m,
+    int64_t n,
+    const T* A,
+    int64_t lda,
+    T* AT,
+    int64_t ldat,
+    int copy_upper_triangle
+) {
+    if (copy_upper_triangle) {
+        // Only transposing the upper-triangular portion of the original
+        #pragma omp parallel for
+        for(int i = 0; i < n; ++i)
+            blas::copy(i + 1, &A[i * lda], 1, &AT[i], ldat);
+    } else {
+        #pragma omp parallel for
+        for(int i = 0; i < n; ++i)
+            blas::copy(m, &A[i * lda], 1, &AT[i], ldat);
+    }
+}
+
 } // end namespace util
 #endif
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
@@ -60,8 +60,8 @@ add_benchmark(NAME CQRRPT_pivot_quality     CXX_SOURCES bench_CQRRPT/CQRRPT_pivo
 # CQRRP benchmarks
 add_benchmark(NAME CQRRP_speed_comparisons       CXX_SOURCES bench_CQRRP/CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs})
 add_benchmark(NAME CQRRP_runtime_breakdown       CXX_SOURCES bench_CQRRP/CQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs})
-add_benchmark(NAME CQRRP_Apple_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs})
+add_benchmark(NAME CQRRP_single_precision        CXX_SOURCES bench_CQRRP/CQRRP_single_precision.cc  LINK_LIBS ${Benchmark_libs})
 add_benchmark(NAME CQRRP_pivot_quality           CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc     LINK_LIBS ${Benchmark_libs})
 
 add_benchmark(NAME RBKI_speed_comparisons CXX_SOURCES bench_RBKI/RBKI_speed_comparisons.cc LINK_LIBS ${Benchmark_libs})
-add_benchmark(NAME RBKI_runtime_benchmark CXX_SOURCES bench_RBKI/RBKI_runtime_benchmark.cc LINK_LIBS ${Benchmark_libs})
+add_benchmark(NAME RBKI_runtime_breakdown CXX_SOURCES bench_RBKI/RBKI_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs})
diff --git a/benchmark/Gemm_vs_ormqr.cc b/benchmark/Gemm_vs_ormqr.cc
@@ -73,4 +73,4 @@ int main() {
     test_speed<double, r123::Philox4x32>(std::pow(2, 14), std::pow(2, 9),  10, state);
     test_speed<double, r123::Philox4x32>(std::pow(2, 15), std::pow(2, 10), 10, state);
     return 0;
-}
+}
diff --git a/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc b/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc
@@ -1,3 +1,7 @@
+/*
+Performs computations in order to assess the pivot quality of ICQRRP.
+The setup is described in detail in Section 4 of The CQRRPT (https://arxiv.org/pdf/2311.08316.pdf) paper.
+*/
 #include "RandLAPACK.hh"
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"

diff --git a/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc b/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc
@@ -1,3 +1,17 @@
+/*
+ICQRRP runtime breakdown benchmark - assesses the time taken by each subcomponent of ICQRRP.
+There are 9 things that we time:
+                1. SASO generation and application time
+                2. QRCP time.
+                3. Preconditioning time.
+                4. Time to perform Cholesky QR.
+                5. Time to restore Householder vectors.
+                6. Time to compute A_new, R12.
+                7. Time to update factors Q, R.
+                8. Time to update the sketch.
+                9. Time to pivot trailing columns of R-factor.
+*/
+
 #include "RandLAPACK.hh"
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"

diff --git a/...ch_CQRRP/CQRRP_Apple_runtime_breakdown.cc → ...ark/bench_CQRRP/CQRRP_single_precision.cc b/...ch_CQRRP/CQRRP_Apple_runtime_breakdown.cc → ...ark/bench_CQRRP/CQRRP_single_precision.cc
@@ -1,3 +1,7 @@
+/*
+This benchmarks compares single-precision ICQRRP with double-precision GETRF and GEQRF.
+We anticipate that single-precision ICQRRP can be used as part of the linear system solving process.
+*/
 #include "RandLAPACK.hh"
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"

diff --git a/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc b/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc
@@ -1,3 +1,14 @@
+/*
+ICQRRP speed comparison benchmark - runs:
+    1. ICQRRP
+    2. GEQRF
+    3. GEQP3 - takes too long!
+    5. HQRRP + CholQR
+    6. HQRRP + GEQRF
+for a matrix with fixed number of rows and columns and a varying ICQRRP block size.
+Records the best timing, saves that into a file.
+*/
+
 #include "RandLAPACK.hh"
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"

diff --git a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc
@@ -1,3 +1,7 @@
+/*
+Performs computations in order to assess the pivot quality of CQRRPT.
+The setup is described in detail in Section 4 of The CQRRPT (https://arxiv.org/pdf/2311.08316.pdf) paper.
+*/
 #include "RandLAPACK.hh"
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"

diff --git a/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc b/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc
@@ -1,3 +1,13 @@
+/*
+CQRRPT runtime breakdown benchmark - assesses the time taken by each subcomponent of CQRRPT.
+There are 6 things that we time:
+                1. SASO generation and application time
+                2. QRCP time.
+                3. Time it takes to compute numerical rank k.
+                4. piv(A).
+                5. TRSM(A).
+                6. Time to perform Cholesky QR.
+*/
 #include "RandLAPACK.hh"
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"

diff --git a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc
@@ -1,3 +1,14 @@
+/*
+CQRRPT speed comparison benchmark - runs:
+    1. CQRRPT
+    2. GEQR
+    3. GEQRF
+    4. GEQP3
+    5. GEQPT
+    6. SCHOLQR
+for a matrix with fixed number of rows and a varying number of columns.
+Records the best timing, saves that into a file.
+*/
 #include "RandLAPACK.hh"
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"

diff --git a/...mark/bench_RBKI/RBKI_runtime_benchmark.cc → ...mark/bench_RBKI/RBKI_runtime_breakdown.cc b/...mark/bench_RBKI/RBKI_runtime_benchmark.cc → ...mark/bench_RBKI/RBKI_runtime_breakdown.cc
@@ -1,3 +1,18 @@
+/*
+RBKI runtime breakdown benchmark - assesses the time taken by each subcomponent of RBKI.
+Records all, data, not just the best.
+There are 10 things that we time:
+                1.Allocate and free time.
+                2.Time to acquire the SVD factors.
+                3.UNGQR time.
+                4.Reorthogonalization time.
+                5.QR time.
+                6.GEMM A time.
+                7.Sketching time.
+                8.R_ii cpy time.
+                9.S_ii cpy time.
+                10.Norm R time.
+*/
 #include "RandLAPACK.hh"
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"
@@ -68,35 +83,30 @@ static void call_all_algs(
     RandLAPACK::RBKI<double, r123::Philox4x32> RBKI(false, time_subroutines, tol);
     RBKI.max_krylov_iters = num_krylov_iters;
 
-
     // Making sure the states are unchanged
     auto state_gen = state;
 
     // Timing vars
-    long dur_rbki    = 0;
-    long t_rbki_best = 0;
-    std::vector<long> inner_timing_best;
+    std::vector<long> inner_timing;
 
     for (int i = 0; i < numruns; ++i) {
         printf("Iteration %d start.\n", i);
-        auto start_rbki = high_resolution_clock::now();
         RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state);
-        auto stop_rbki = high_resolution_clock::now();
-        dur_rbki = duration_cast<microseconds>(stop_rbki - start_rbki).count();
-        // Update best timing
-        if (!i || dur_rbki < t_rbki_best) {t_rbki_best = dur_rbki; inner_timing_best = RBKI.times;}
+
+        // Update timing vector
+        inner_timing = RBKI.times;
+        // Add info about the run
+        inner_timing.insert (inner_timing.begin(), k);
+        inner_timing.insert (inner_timing.begin(), num_krylov_iters);
+
+        std::ofstream file(output_filename, std::ios::app);
+        std::copy(inner_timing.begin(), inner_timing.end(), std::ostream_iterator<int>(file, ", "));
+        file << "\n";
+
         // Clear and re-generate data
         data_regen<T, RNG>(m_info, all_data, state_gen, 0);
         state_gen = state;
     }
-
-    // Add info about the run
-    inner_timing_best.insert (inner_timing_best.begin(), k);
-    inner_timing_best.insert (inner_timing_best.begin(), num_krylov_iters);
-
-    std::ofstream file(output_filename, std::ios::app);
-    std::copy(inner_timing_best.begin(), inner_timing_best.end(), std::ostream_iterator<int>(file, ", "));
-    file << "\n";
 }
 
 int main(int argc, char *argv[]) {
@@ -157,4 +167,4 @@ int main(int argc, char *argv[]) {
         }
         num_krylov_iters_curr = num_krylov_iters_start;
     }
-}
+}