Skip to content

Commit

Permalink
Update per Riley's comments
Browse files Browse the repository at this point in the history
  • Loading branch information
TeachRaccooon committed Apr 1, 2024
1 parent b824aa0 commit 109edae
Show file tree
Hide file tree
Showing 17 changed files with 156 additions and 61 deletions.
2 changes: 1 addition & 1 deletion RandLAPACK/comps/rl_orth.hh
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ int CholQRQ<T>::call(
// Scheme may succeed, but output garbage
if(this->cond_check) {
if(util::cond_num_check(k, k, Q_gram.data(), (this->Q_gram_cpy).data(), (this->s).data(), this->verbosity) > (1 / std::sqrt(std::numeric_limits<T>::epsilon()))){
//return 1;
return 1;
}
}

Expand Down
5 changes: 2 additions & 3 deletions RandLAPACK/drivers/rl_cqrrp.hh
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,8 @@ int CQRRP_blocked<T, RNG>::call(

// Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk.
// Get a transpose of A_sk
#pragma omp parallel for
for(i = 0; i < cols; ++i)
blas::copy(sampling_dimension, &A_sk[i * d], 1, &A_sk_trans[i], n);
util::transposition(sampling_dimension, cols, A_sk, d, A_sk_trans, n, 0);

// Perform a row-pivoted LU on a transpose of A_sk
lapack::getrf(cols, sampling_dimension, A_sk_trans, n, J_buffer_lu);
// Fill the pivot vector, apply swaps found via lu on A_sk'.
Expand Down
20 changes: 11 additions & 9 deletions RandLAPACK/drivers/rl_rbki.hh
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,9 @@ class RBKI : public RBKIalg<T, RNG> {
int max_krylov_iters;
std::vector<long> times;
T norm_R_end;

int num_threads_some;
int num_threads_rest;
};

// -----------------------------------------------------------------------------
Expand Down Expand Up @@ -185,7 +188,7 @@ int RBKI<T, RNG>::call(
allocation_t_start = high_resolution_clock::now();
}

int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0;
int64_t iter = 0, iter_od = 0, iter_ev = 0, end_rows = 0, end_cols = 0;
T norm_R = 0;
int max_iters = this->max_krylov_iters;//std::min(this->max_krylov_iters, (int) (n / (T) k));

Expand Down Expand Up @@ -244,10 +247,10 @@ int RBKI<T, RNG>::call(

// Generate a dense Gaussian random matrx.
// OMP_NUM_THREADS=4 seems to be the best option for dense sketch generation.
//omp_set_num_threads(4);
omp_set_num_threads(this->num_threads_some);
RandBLAS::DenseDist D(n, k);
state = RandBLAS::fill_dense(D, Y_i, state).second;
//omp_set_num_threads(48);
omp_set_num_threads(this->num_threads_rest);

if(this -> timing) {
sketching_t_stop = high_resolution_clock::now();
Expand Down Expand Up @@ -345,10 +348,9 @@ int RBKI<T, RNG>::call(
}

// Copy R_ii over to R's (in transposed format).
omp_set_num_threads(4);
for(i = 0; i < k; ++i)
blas::copy(i + 1, &Y_i[i * n], 1, &R_ii[i], n);
omp_set_num_threads(48);
omp_set_num_threads(this->num_threads_some);
util::transposition(0, k, Y_i, n, R_ii, n, 1);
omp_set_num_threads(this->num_threads_rest);

if(this -> timing) {
r_cpy_t_stop = high_resolution_clock::now();
Expand Down Expand Up @@ -499,7 +501,7 @@ int RBKI<T, RNG>::call(
}
}

this -> norm_R_end = norm_R;
this->norm_R_end = norm_R;
this->num_krylov_iters = iter;
end_cols = num_krylov_iters * k / 2;
iter % 2 == 0 ? end_rows = end_cols + k : end_rows = end_cols;
Expand Down Expand Up @@ -556,7 +558,7 @@ int RBKI<T, RNG>::call(
total_t_stop = high_resolution_clock::now();
total_t_dur = duration_cast<microseconds>(total_t_stop - total_t_start).count();
long t_rest = total_t_dur - (allocation_t_dur + get_factors_t_dur + ungqr_t_dur + reorth_t_dur + qr_t_dur + gemm_A_t_dur + sketching_t_dur + r_cpy_t_dur + s_cpy_t_dur + norm_t_dur);
this -> times.resize(11);
this -> times.resize(13);
this -> times = {allocation_t_dur, get_factors_t_dur, ungqr_t_dur, reorth_t_dur, qr_t_dur, gemm_A_t_dur, main_loop_t_dur, sketching_t_dur, r_cpy_t_dur, s_cpy_t_dur, norm_t_dur, t_rest, total_t_dur};

if (this -> verbosity) {
Expand Down
21 changes: 8 additions & 13 deletions RandLAPACK/misc/rl_gen.hh
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,10 @@ void gen_poly_mat(
T b = std::pow(a * first_entry, -1 / p) - offset;
// apply lambda function to every entry of s
std::fill(s, s + offset, 1.0);
std::for_each(s + offset, s + k,
// Lambda expression begins
[&p, &offset, &a, &b](T &entry) {
entry = 1 / (a * std::pow(offset + b, p));
++offset;
}
);
for (int i = offset; i < k; ++i) {
s[i] = 1 / (a * std::pow(offset + b, p));
++offset;
}

// form a diagonal S
RandLAPACK::util::diag(k, k, s, k, S);
Expand Down Expand Up @@ -179,12 +176,10 @@ void gen_exp_mat(
// apply lambda function to every entry of s
// Please make sure that the first singular value is always 1
std::fill(s, s + offset, 1.0);
std::for_each(s + offset, s + k,
// Lambda expression begins
[&t, &cnt](T &entry) {
entry = (std::exp(++cnt * -t));
}
);
for (int i = offset; i < k; ++i) {
s[i] = (std::exp(++cnt * -t));
++offset;
}

// form a diagonal S
RandLAPACK::util::diag(k, k, s, k, S);
Expand Down
29 changes: 27 additions & 2 deletions RandLAPACK/misc/rl_util.hh
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ template <typename T>
T cond_num_check(
int64_t m,
int64_t n,
T* A,
const T* A,
T* A_cpy,
T* s,
bool verbose
Expand All @@ -213,7 +213,7 @@ template <typename T>
int64_t rank_check(
int64_t m,
int64_t n,
T* A
const T* A
) {
T* A_pre_cpy = ( T * ) calloc( m * n, sizeof( T ) );
T* s = ( T * ) calloc( n, sizeof( T ) );
Expand Down Expand Up @@ -389,5 +389,30 @@ void eat_lda_slack(
delete [] work;
}

// Perform an explicit transposition of a given matrix,
// write the transpose into a buffer.
// WARNING: OMP parallelism occasionally tanks the performance.
template <typename T>
void transposition(
int64_t m,
int64_t n,
const T* A,
int64_t lda,
T* AT,
int64_t ldat,
int copy_upper_triangle
) {
if (copy_upper_triangle) {
// Only transposing the upper-triangular portion of the original
#pragma omp parallel for
for(int i = 0; i < n; ++i)
blas::copy(i + 1, &A[i * lda], 1, &AT[i], ldat);
} else {
#pragma omp parallel for
for(int i = 0; i < n; ++i)
blas::copy(m, &A[i * lda], 1, &AT[i], ldat);
}
}

} // end namespace util
#endif
4 changes: 2 additions & 2 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ add_benchmark(NAME CQRRPT_pivot_quality CXX_SOURCES bench_CQRRPT/CQRRPT_pivo
# CQRRP benchmarks
add_benchmark(NAME CQRRP_speed_comparisons CXX_SOURCES bench_CQRRP/CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs})
add_benchmark(NAME CQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs})
add_benchmark(NAME CQRRP_Apple_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_Apple_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs})
add_benchmark(NAME CQRRP_single_precision CXX_SOURCES bench_CQRRP/CQRRP_single_precision.cc LINK_LIBS ${Benchmark_libs})
add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs})

add_benchmark(NAME RBKI_speed_comparisons CXX_SOURCES bench_RBKI/RBKI_speed_comparisons.cc LINK_LIBS ${Benchmark_libs})
add_benchmark(NAME RBKI_runtime_benchmark CXX_SOURCES bench_RBKI/RBKI_runtime_benchmark.cc LINK_LIBS ${Benchmark_libs})
add_benchmark(NAME RBKI_runtime_breakdown CXX_SOURCES bench_RBKI/RBKI_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs})
2 changes: 1 addition & 1 deletion benchmark/Gemm_vs_ormqr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,4 @@ int main() {
test_speed<double, r123::Philox4x32>(std::pow(2, 14), std::pow(2, 9), 10, state);
test_speed<double, r123::Philox4x32>(std::pow(2, 15), std::pow(2, 10), 10, state);
return 0;
}
}
4 changes: 4 additions & 0 deletions benchmark/bench_CQRRP/CQRRP_pivot_quality.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
/*
Performs computations in order to assess the pivot quality of ICQRRP.
The setup is described in detail in Section 4 of The CQRRPT (https://arxiv.org/pdf/2311.08316.pdf) paper.
*/
#include "RandLAPACK.hh"
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
Expand Down
14 changes: 14 additions & 0 deletions benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
/*
ICQRRP runtime breakdown benchmark - assesses the time taken by each subcomponent of ICQRRP.
There are 9 things that we time:
1. SASO generation and application time
2. QRCP time.
3. Preconditioning time.
4. Time to perform Cholesky QR.
5. Time to restore Householder vectors.
6. Time to compute A_new, R12.
7. Time to update factors Q, R.
8. Time to update the sketch.
9. Time to pivot trailing columns of R-factor.
*/

#include "RandLAPACK.hh"
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
/*
This benchmarks compares single-precision ICQRRP with double-precision GETRF and GEQRF.
We anticipate that single-precision ICQRRP can be used as part of the linear system solving process.
*/
#include "RandLAPACK.hh"
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
Expand Down
11 changes: 11 additions & 0 deletions benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
/*
ICQRRP speed comparison benchmark - runs:
1. ICQRRP
2. GEQRF
3. GEQP3 - takes too long!
5. HQRRP + CholQR
6. HQRRP + GEQRF
for a matrix with fixed number of rows and columns and a varying ICQRRP block size.
Records the best timing, saves that into a file.
*/

#include "RandLAPACK.hh"
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
Expand Down
4 changes: 4 additions & 0 deletions benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
/*
Performs computations in order to assess the pivot quality of CQRRPT.
The setup is described in detail in Section 4 of The CQRRPT (https://arxiv.org/pdf/2311.08316.pdf) paper.
*/
#include "RandLAPACK.hh"
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
Expand Down
10 changes: 10 additions & 0 deletions benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
/*
CQRRPT runtime breakdown benchmark - assesses the time taken by each subcomponent of CQRRPT.
There are 6 things that we time:
1. SASO generation and application time
2. QRCP time.
3. Time it takes to compute numerical rank k.
4. piv(A).
5. TRSM(A).
6. Time to perform Cholesky QR.
*/
#include "RandLAPACK.hh"
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
Expand Down
11 changes: 11 additions & 0 deletions benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
/*
CQRRPT speed comparison benchmark - runs:
1. CQRRPT
2. GEQR
3. GEQRF
4. GEQP3
5. GEQPT
6. SCHOLQR
for a matrix with fixed number of rows and a varying number of columns.
Records the best timing, saves that into a file.
*/
#include "RandLAPACK.hh"
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
/*
RBKI runtime breakdown benchmark - assesses the time taken by each subcomponent of RBKI.
Records all, data, not just the best.
There are 10 things that we time:
1.Allocate and free time.
2.Time to acquire the SVD factors.
3.UNGQR time.
4.Reorthogonalization time.
5.QR time.
6.GEMM A time.
7.Sketching time.
8.R_ii cpy time.
9.S_ii cpy time.
10.Norm R time.
*/
#include "RandLAPACK.hh"
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
Expand Down Expand Up @@ -68,35 +83,30 @@ static void call_all_algs(
RandLAPACK::RBKI<double, r123::Philox4x32> RBKI(false, time_subroutines, tol);
RBKI.max_krylov_iters = num_krylov_iters;


// Making sure the states are unchanged
auto state_gen = state;

// Timing vars
long dur_rbki = 0;
long t_rbki_best = 0;
std::vector<long> inner_timing_best;
std::vector<long> inner_timing;

for (int i = 0; i < numruns; ++i) {
printf("Iteration %d start.\n", i);
auto start_rbki = high_resolution_clock::now();
RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state);
auto stop_rbki = high_resolution_clock::now();
dur_rbki = duration_cast<microseconds>(stop_rbki - start_rbki).count();
// Update best timing
if (!i || dur_rbki < t_rbki_best) {t_rbki_best = dur_rbki; inner_timing_best = RBKI.times;}

// Update timing vector
inner_timing = RBKI.times;
// Add info about the run
inner_timing.insert (inner_timing.begin(), k);
inner_timing.insert (inner_timing.begin(), num_krylov_iters);

std::ofstream file(output_filename, std::ios::app);
std::copy(inner_timing.begin(), inner_timing.end(), std::ostream_iterator<int>(file, ", "));
file << "\n";

// Clear and re-generate data
data_regen<T, RNG>(m_info, all_data, state_gen, 0);
state_gen = state;
}

// Add info about the run
inner_timing_best.insert (inner_timing_best.begin(), k);
inner_timing_best.insert (inner_timing_best.begin(), num_krylov_iters);

std::ofstream file(output_filename, std::ios::app);
std::copy(inner_timing_best.begin(), inner_timing_best.end(), std::ostream_iterator<int>(file, ", "));
file << "\n";
}

int main(int argc, char *argv[]) {
Expand Down Expand Up @@ -157,4 +167,4 @@ int main(int argc, char *argv[]) {
}
num_krylov_iters_curr = num_krylov_iters_start;
}
}
}
Loading

0 comments on commit 109edae

Please sign in to comment.