Skip to content

Commit

Permalink
Commits hipify results
Browse files (browse the repository at this point in the history)
  • Loading branch information
PTNobel committed Feb 11, 2025
1 parent 31363fd commit ec2f1cb
Show file tree
Hide file tree
Showing 20 changed files with 515 additions and 397 deletions.
193 changes: 97 additions & 96 deletions RandLAPACK/gpu_functions/rl_cuda_kernels.cuh

Large diffs are not rendered by default.

13 changes: 6 additions & 7 deletions benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,8 @@ int main(int argc, char *argv[]) {
int64_t m = std::stol(size);
int64_t n = std::stol(size);
double d_factor = 1.0;
std::vector<int64_t> b_sz = {250, 500, 1000, 2000, 4000, 8000};
//std::vector<int64_t> b_sz = {256, 512, 1024, 2048, 4096, 8192};
int64_t b_sz_start = 32;
int64_t b_sz_end = 128;
auto state = RandBLAS::RNGState<r123::Philox4x32>();
auto state_constant = state;
// Timing results
Expand All @@ -141,21 +141,20 @@ int main(int argc, char *argv[]) {
+ "_num_info_lines_" + std::to_string(6) +
".txt";

std::ofstream file(output_filename, std::ios::out | std::ios::app);
std::ofstream file(output_filename, std::ios::out | std::ios::trunc);

// Writing important data into file
file << "Description: Results from the BQRRP runtime breakdown benchmark, recording the time it takes to perform every subroutine in BQRRP."
"\nFile format: 10 data columns, each corresponding to a given BQRRP subroutine: skop_t_dur, preallocation_t_dur, qrcp_wide_t_dur, panel_preprocessing_t_dur, qr_tall_t_dur, q_reconstruction_t_dur, apply_transq_t_dur, sample_update_t_dur, t_other, total_t_dur"
" rows correspond to BQRRP runs with block sizes varying in powers of 2, with numruns repititions of each block size"
"\nInput type:" + std::to_string(m_info.m_type) +
"\nInput size:" + std::to_string(m) + " by " + std::to_string(n) +
"\nAdditional parameters: Tall QR subroutine " + argv[2] + " BQRRP block size start: " + std::to_string(b_sz.front()) + " BQRRP block size end: " + std::to_string(b_sz.back()) + " num runs per size " + std::to_string(numruns) + " BQRRP d factor: " + std::to_string(d_factor) +
"\nAdditional parameters: Tall QR subroutine " + argv[2] + " BQRRP block size start: " + std::to_string(b_sz_start) + " BQRRP block size end: " + std::to_string(b_sz_end) + " num runs per size " + std::to_string(numruns) + " BQRRP d factor: " + std::to_string(d_factor) +
"\n";
file.flush();

int i = 0;
for (;i < b_sz.size(); ++i) {
call_all_algs(m_info, numruns, b_sz[i], qr_tall, all_data, state_constant, output_filename);
for (;b_sz_start <= b_sz_end; b_sz_start *= 2) {
call_all_algs(m_info, numruns, b_sz_start, qr_tall, all_data, state_constant, output_filename);
}
}
#endif
74 changes: 40 additions & 34 deletions benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,26 +136,31 @@ static void call_wide_qrcp(
std::string output_filename) {

auto m = all_data.row;
auto tol = all_data.tolerance;

RandLAPACK::CQRRPT<double, r123::Philox4x32> CQRRPT(false, tol);
CQRRPT.nnz = 4;

// timing vars
long dur_geqp3 = 0;
long dur_luqr = 0;
long dur_cqrrpt = 0;

// Making sure the states are unchanged
auto state_alg = state;

int i, j = 0;
for (i = 0; i < numruns; ++i) {
printf("Wide QRCP iteration %d; m==%ld start.\n", i, n);
printf("Wide QRCP iteration %d; m==%d start.\n", i, n);
// Testing GEQP3
auto start_geqp3 = high_resolution_clock::now();
auto start_geqp3 = steady_clock::now();
lapack::geqp3(n, m, all_data.A.data(), n, all_data.J.data(), all_data.tau.data());
auto stop_geqp3 = high_resolution_clock::now();
auto stop_geqp3 = steady_clock::now();
dur_geqp3 = duration_cast<microseconds>(stop_geqp3 - start_geqp3).count();
data_regen(m_info, all_data, state, state, 1);

// Testing LUQR
auto start_luqr = high_resolution_clock::now();
auto start_luqr = steady_clock::now();
// Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk.
// Get a transpose of A_sk
RandLAPACK::util::transposition(n, m, all_data.A.data(), n, all_data.A_trans.data(), m, 0);
Expand All @@ -172,7 +177,7 @@ static void call_wide_qrcp(
RandLAPACK::util::col_swap(n, m, m, all_data.A.data(), n, all_data.J);
// Perform an unpivoted QR on A_sk
lapack::geqrf(n, m, all_data.A.data(), n, all_data.tau.data());
auto stop_luqr = high_resolution_clock::now();
auto stop_luqr = steady_clock::now();
dur_luqr = duration_cast<microseconds>(stop_luqr - start_luqr).count();
data_regen(m_info, all_data, state, state, 1);

Expand All @@ -193,6 +198,7 @@ static void call_tsqr(
std::string output_filename) {

auto m = all_data.row;
auto tol = all_data.tolerance;
int64_t tsize = 0;

// timing vars
Expand All @@ -205,11 +211,10 @@ static void call_tsqr(
long dur_cholqr_r_restore = 0;

// Imitating the QRCP on a sketch stage of BQRRP - needed to get a preconditioner
T* S = new T[n * m]();
T* A_sk = new T[n * n]();
int64_t* J = new int64_t[n]();
T* tau = new T[n]();

T* S = ( T * ) calloc( n * m, sizeof( T ) );
T* A_sk = ( T * ) calloc( n * n, sizeof( T ) );
int64_t* J = ( int64_t * ) calloc( n, sizeof( int64_t ) );
T* tau = ( T * ) calloc( n, sizeof( T ) );
RandBLAS::DenseDist D(n, m);
auto state_const = state;
RandBLAS::fill_dense(D, S, state_const);
Expand All @@ -224,45 +229,45 @@ static void call_tsqr(
for(nb = geqrt_nb_start; nb <= n; nb *=2) {
printf("TSQR iteration %d; n==%ld start.\n", i, n);

auto start_geqrt = high_resolution_clock::now();
auto start_geqrt = steady_clock::now();
lapack::geqrt( m, n, nb, all_data.A.data(), m, all_data.T_mat.data(), n );
auto stop_geqrt = high_resolution_clock::now();
auto stop_geqrt = steady_clock::now();
dur_geqrt = duration_cast<microseconds>(stop_geqrt - start_geqrt).count();

if(nb == geqrt_nb_start) {
// Testing GEQRF
auto start_geqrf = high_resolution_clock::now();
auto start_geqrf = steady_clock::now();
lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data());
auto stop_geqrf = high_resolution_clock::now();
auto stop_geqrf = steady_clock::now();
dur_geqrf = duration_cast<microseconds>(stop_geqrf - start_geqrf).count();
data_regen(m_info, all_data, state, state, 2);

// Testing GEQR
auto start_geqr = high_resolution_clock::now();
auto start_geqr = steady_clock::now();
lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), -1);
tsize = (int64_t) all_data.tau[0];
all_data.tau.resize(tsize);
lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize);
auto stop_geqr = high_resolution_clock::now();
auto stop_geqr = steady_clock::now();
dur_geqr = duration_cast<microseconds>(stop_geqr - start_geqr).count();
data_regen(m_info, all_data, state, state, 2);

// Testing CholQR
auto start_precond = high_resolution_clock::now();
auto start_precond = steady_clock::now();
blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, (T) 1.0, A_sk, n, all_data.A.data(), m);
auto stop_precond = high_resolution_clock::now();
auto stop_precond = steady_clock::now();
dur_cholqr_precond = duration_cast<microseconds>(stop_precond - start_precond).count();
auto start_cholqr = high_resolution_clock::now();
auto start_cholqr = steady_clock::now();
blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, (T) 1.0, all_data.A.data(), m, (T) 0.0, all_data.R.data(), n);
lapack::potrf(Uplo::Upper, n, all_data.R.data(), n);
blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, (T) 1.0, all_data.R.data(), n, all_data.A.data(), m);
auto stop_cholqr = high_resolution_clock::now();
auto stop_cholqr = steady_clock::now();
dur_cholqr = duration_cast<microseconds>(stop_cholqr - start_cholqr).count();
auto start_orhr_col = high_resolution_clock::now();
auto start_orhr_col = steady_clock::now();
lapack::orhr_col(m, n, n, all_data.A.data(), m, all_data.T_mat.data(), n, all_data.D.data());
auto stop_cholqr_orhr = high_resolution_clock::now();
auto stop_cholqr_orhr = steady_clock::now();
dur_cholqr_house_rest = duration_cast<microseconds>(stop_cholqr_orhr - start_orhr_col).count();
auto start_r_restore = high_resolution_clock::now();
auto start_r_restore = steady_clock::now();
// Construct the proper R-factor
for(int i = 0; i < n; ++i) {
for(int j = 0; j < (i + 1); ++j) {
Expand All @@ -271,7 +276,7 @@ static void call_tsqr(
}
blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, n, n, (T) 1.0, A_sk, n, all_data.R.data(), n);
lapack::lacpy(MatrixType::Upper, n, n, all_data.R.data(), n, all_data.A.data(), m);
auto stop_r_restore = high_resolution_clock::now();
auto stop_r_restore = steady_clock::now();
dur_cholqr_r_restore = duration_cast<microseconds>(stop_r_restore - start_r_restore).count();
data_regen(m_info, all_data, state, state, 2);

Expand All @@ -284,10 +289,10 @@ static void call_tsqr(
file << "\n";
}

delete[] A_sk;
delete[] S;
delete[] J;
delete[] tau;
free(A_sk);
free(S);
free(J);
free(tau);
}

template <typename T, typename RNG>
Expand All @@ -306,14 +311,15 @@ static void call_apply_q(
// timing vars
long dur_ormqr = 0;
long dur_gemqrt = 0;
long dur_gemm = 0;

std::ofstream file(output_filename, std::ios::app);

int i, j = 0;
int64_t nb = 0;
for (i = 0; i < numruns; ++i) {
for(nb = gemqrt_nb_start; nb <= n; nb *=2) {
printf("Apply Q iteration %d; n==%ld start.\n", i, n);
printf("Apply Q iteration %d; n==%d start.\n", i, n);
// Performing CholQR
blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, (T) 1.0, all_data.A.data(), m, (T) 0.0, all_data.R.data(), n);
lapack::potrf(Uplo::Upper, n, all_data.R.data(), n);
Expand All @@ -322,9 +328,9 @@ static void call_apply_q(
lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_gemqrt.data(), m);
lapack::orhr_col(m, n, nb, all_data.A_gemqrt.data(), m, all_data.T_gemqrt.data(), n, all_data.D.data());

auto start_gemqrt = high_resolution_clock::now();
auto start_gemqrt = steady_clock::now();
lapack::gemqrt(Side::Left, Op::Trans, m, m - n, n, nb, all_data.A_gemqrt.data(), m, all_data.T_gemqrt.data(), n, all_data.B1.data(), m);
auto stop_gemqrt = high_resolution_clock::now();
auto stop_gemqrt = steady_clock::now();
dur_gemqrt = duration_cast<microseconds>(stop_gemqrt - start_gemqrt).count();

// We do not re-run ormqr and gemm for different nbs
Expand All @@ -335,9 +341,9 @@ static void call_apply_q(
for(j = 0; j < n; ++j)
all_data.tau[j] = all_data.T_mat[(n + 1) * j];

auto start_ormqr = high_resolution_clock::now();
auto start_ormqr = steady_clock::now();
lapack::ormqr(Side::Left, Op::Trans, m, m - n, n, all_data.A.data(), m, all_data.tau.data(), all_data.B.data(), m);
auto stop_ormqr = high_resolution_clock::now();
auto stop_ormqr = steady_clock::now();
dur_ormqr = duration_cast<microseconds>(stop_ormqr - start_ormqr).count();

file << dur_ormqr << ", ";
Expand Down Expand Up @@ -404,4 +410,4 @@ int main(int argc, char *argv[]) {
for (i = n_start; i <= n_stop; i *= 2)
call_apply_q(m_info, numruns, i, nb_start, all_data, state, state_B, output_filename);
}
#endif
#endif
6 changes: 3 additions & 3 deletions benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info<T> m_info,

RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state);
std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0);
std::fill(all_data.J.begin(), all_data.J.end(), 0);
std::iota(all_data.J.begin(), all_data.J.end(), 1);
}

template <typename T, typename RNG>
Expand All @@ -71,7 +71,7 @@ static void call_all_algs(
int panel_pivoting = 0;

// Timing vars
T* times = ( T * ) calloc( 27, sizeof( T ) );
T* times = ( T * ) calloc(27, sizeof( T ) );

for (int i = 0; i < numruns; ++i) {
printf("ITERATION %d, NUMCOLS %ld\n", i, n);
Expand Down Expand Up @@ -125,7 +125,7 @@ int main(int argc, char *argv[]) {
+ "_num_info_lines_" + std::to_string(6) +
".txt";

std::ofstream file(output_filename, std::ios::out | std::ios::app);
std::ofstream file(output_filename, std::ios::out | std::ios::trunc);

// Writing important data into file
file << "Description: Results from the HQRRP runtime breakdown benchmark, recording the time it takes to perform every subroutine in HQRRP."
Expand Down
6 changes: 3 additions & 3 deletions benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info<T> m_info,

// Re-generate and clear data
template <typename T>
static std::vector<T> get_norms(int64_t n, std::vector<T> Mat, int64_t lda) {
static std::vector<T> get_norms( int64_t m, int64_t n, std::vector<T> Mat, int64_t lda) {

std::vector<T> R_norms (n, 0.0);
for (int i = 0; i < n; ++i) {
Expand Down Expand Up @@ -81,7 +81,7 @@ static void R_norm_ratio(

// Running GEQP3
lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data());
std::vector<T> R_norms_GEQP3 = get_norms(n, all_data.A, m);
std::vector<T> R_norms_GEQP3 = get_norms(m, n, all_data.A, m);
printf("\nDone with QP3\n");

// Clear and re-generate data
Expand All @@ -92,7 +92,7 @@ static void R_norm_ratio(
// Running CQRRP
state_alg = state;
CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg);
std::vector<T> R_norms_CQRRPT = get_norms(n, all_data.R, n);
std::vector<T> R_norms_CQRRPT = get_norms(n, n, all_data.R, n);

// Declare a data file
std::fstream file1("QR_R_norm_ratios_rows_" + std::to_string(m)
Expand Down
24 changes: 12 additions & 12 deletions benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,35 +85,35 @@ static void call_all_algs(
for (int i = 0; i < numruns; ++i) {
printf("Iteration %d start.\n", i);
// Testing GEQP3
auto start_geqp3 = high_resolution_clock::now();
auto start_geqp3 = steady_clock::now();
lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data());
auto stop_geqp3 = high_resolution_clock::now();
auto stop_geqp3 = steady_clock::now();
dur_geqp3 = duration_cast<microseconds>(stop_geqp3 - start_geqp3).count();

state_gen = state;
data_regen(m_info, all_data, state_gen);

// Testing GEQRF
auto start_geqrf = high_resolution_clock::now();
auto start_geqrf = steady_clock::now();
lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data());
auto stop_geqrf = high_resolution_clock::now();
auto stop_geqrf = steady_clock::now();
dur_geqrf = duration_cast<microseconds>(stop_geqrf - start_geqrf).count();

state_gen = state;
data_regen(m_info, all_data, state_gen);

// Testing CQRRPT
auto start_cqrrp = high_resolution_clock::now();
auto start_cqrrp = steady_clock::now();
CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg);
auto stop_cqrrp = high_resolution_clock::now();
auto stop_cqrrp = steady_clock::now();
dur_cqrrpt = duration_cast<microseconds>(stop_cqrrp - start_cqrrp).count();

state_gen = state;
state_alg = state;
data_regen(m_info, all_data, state_gen);

// Testing SCHOLQR3
auto start_scholqr = high_resolution_clock::now();
auto start_scholqr = steady_clock::now();
//--------------------------------------------------------------------------------------------------------------------------//
T norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m);
T shift = 11 * std::numeric_limits<T>::epsilon() * n * std::pow(norm_A, 2);
Expand All @@ -131,29 +131,29 @@ static void call_all_algs(
lapack::potrf(Uplo::Upper, n, all_data.R.data(), n);
blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m);
//--------------------------------------------------------------------------------------------------------------------------//
auto stop_scholqr = high_resolution_clock::now();
auto stop_scholqr = steady_clock::now();
dur_scholqr = duration_cast<microseconds>(stop_scholqr - start_scholqr).count();

auto state_gen = state;
data_regen(m_info, all_data, state_gen);

// Testing GEQR + GEQPT
#if !defined(__APPLE__)
auto start_geqpt = high_resolution_clock::now();
auto start_geqr = high_resolution_clock::now();
auto start_geqpt = steady_clock::now();
auto start_geqr = steady_clock::now();
// GEQR(A) part
lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), -1);
int64_t tsize = (int64_t) all_data.tau[0];
all_data.tau.resize(tsize);
lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize);

auto stop_geqr = high_resolution_clock::now();
auto stop_geqr = steady_clock::now();
dur_geqr = duration_cast<microseconds>(stop_geqr - start_geqr).count();

// GEQP3(R) part
lapack::lacpy(MatrixType::Upper, n, n, all_data.A.data(), m, all_data.R.data(), n);
lapack::geqp3(n, n, all_data.R.data(), n, all_data.J.data(), all_data.tau.data());
auto stop_geqpt = high_resolution_clock::now();
auto stop_geqpt = steady_clock::now();
dur_geqpt = duration_cast<microseconds>(stop_geqpt - start_geqpt).count();
state_gen = state;
data_regen(m_info, all_data, state_gen);
Expand Down
Loading

0 comments on commit ec2f1cb

Please sign in to comment.