From f1b679a2d907dcc5f1a8bda46234fa4c678f4a2f Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Tue, 4 Feb 2025 09:53:53 -0800 Subject: [PATCH] Replacing high_resolution_clock with steady_clock --- RandLAPACK/drivers/rl_bqrrp.hh | 80 +++++------ RandLAPACK/drivers/rl_bqrrp_gpu.hh | 120 ++++++++-------- RandLAPACK/drivers/rl_cqrrpt.hh | 56 ++++---- RandLAPACK/drivers/rl_cqrrpt_gpu.hh | 56 ++++---- RandLAPACK/drivers/rl_hqrrp.hh | 132 +++++++++--------- RandLAPACK/drivers/rl_rbki.hh | 132 +++++++++--------- .../bench_BQRRP/BQRRP_speed_comparisons.cc | 28 ++-- .../bench_BQRRP/BQRRP_subroutines_speed.cc | 44 +++--- .../bench_CQRRPT/CQRRPT_speed_comparisons.cc | 24 ++-- .../bench_RBKI/RBKI_speed_comparisons.cc | 16 +-- .../RBKI_speed_comparisons_just_RBKI.cc | 4 +- benchmark/bench_general/GEMM_flop_count.cc | 4 +- benchmark/bench_general/Gemm_vs_ormqr.cc | 8 +- benchmark/bench_general/basic_blas_speed.cc | 12 +- test/misc/test_util.cc | 8 +- 15 files changed, 362 insertions(+), 362 deletions(-) diff --git a/RandLAPACK/drivers/rl_bqrrp.hh b/RandLAPACK/drivers/rl_bqrrp.hh index 5695e415..16dd2a99 100644 --- a/RandLAPACK/drivers/rl_bqrrp.hh +++ b/RandLAPACK/drivers/rl_bqrrp.hh @@ -161,24 +161,24 @@ int BQRRP::call( throw std::runtime_error("BQRRP is not supported when BLAS is linked against Apple Accelerate."); #else //-------TIMING VARS--------/ - high_resolution_clock::time_point preallocation_t_start; - high_resolution_clock::time_point preallocation_t_stop; - high_resolution_clock::time_point skop_t_start; - high_resolution_clock::time_point skop_t_stop; - high_resolution_clock::time_point qrcp_wide_t_start; - high_resolution_clock::time_point qrcp_wide_t_stop; - high_resolution_clock::time_point panel_preprocessing_t_start; - high_resolution_clock::time_point panel_preprocessing_t_stop; - high_resolution_clock::time_point qr_tall_t_start; - high_resolution_clock::time_point qr_tall_t_stop; - high_resolution_clock::time_point q_reconstruction_t_start; - 
high_resolution_clock::time_point q_reconstruction_t_stop; - high_resolution_clock::time_point apply_transq_t_start; - high_resolution_clock::time_point apply_transq_t_stop; - high_resolution_clock::time_point sample_update_t_start; - high_resolution_clock::time_point sample_update_t_stop; - high_resolution_clock::time_point total_t_start; - high_resolution_clock::time_point total_t_stop; + steady_clock::time_point preallocation_t_start; + steady_clock::time_point preallocation_t_stop; + steady_clock::time_point skop_t_start; + steady_clock::time_point skop_t_stop; + steady_clock::time_point qrcp_wide_t_start; + steady_clock::time_point qrcp_wide_t_stop; + steady_clock::time_point panel_preprocessing_t_start; + steady_clock::time_point panel_preprocessing_t_stop; + steady_clock::time_point qr_tall_t_start; + steady_clock::time_point qr_tall_t_stop; + steady_clock::time_point q_reconstruction_t_start; + steady_clock::time_point q_reconstruction_t_stop; + steady_clock::time_point apply_transq_t_start; + steady_clock::time_point apply_transq_t_stop; + steady_clock::time_point sample_update_t_start; + steady_clock::time_point sample_update_t_stop; + steady_clock::time_point total_t_start; + steady_clock::time_point total_t_stop; long preallocation_t_dur = 0; long skop_t_dur = 0; long qrcp_wide_t_dur = 0; @@ -190,8 +190,8 @@ int BQRRP::call( long total_t_dur = 0; if(this -> timing) { - total_t_start = high_resolution_clock::now(); - preallocation_t_start = high_resolution_clock::now(); + total_t_start = steady_clock::now(); + preallocation_t_start = steady_clock::now(); } int iter, i, j; int64_t tmp; @@ -281,9 +281,9 @@ int BQRRP::call( //*******************POINTERS TO DATA REQUIRING ADDITIONAL STORAGE END******************* if(this -> timing) { - preallocation_t_stop = high_resolution_clock::now(); + preallocation_t_stop = steady_clock::now(); preallocation_t_dur = duration_cast(preallocation_t_stop - preallocation_t_start).count(); - skop_t_start = 
high_resolution_clock::now(); + skop_t_start = steady_clock::now(); } // Using Gaussian matrix as a sketching operator. @@ -296,7 +296,7 @@ int BQRRP::call( free(S); if(this -> timing) { - skop_t_stop = high_resolution_clock::now(); + skop_t_stop = steady_clock::now(); skop_t_dur = duration_cast(skop_t_stop - skop_t_start).count(); } @@ -312,7 +312,7 @@ int BQRRP::call( std::fill(&Work2[0], &Work2[n], (T) 0.0); if(this -> timing) - qrcp_wide_t_start = high_resolution_clock::now(); + qrcp_wide_t_start = steady_clock::now(); // Performing qrcp_wide below if (this -> qrcp_wide == Subroutines::QRCPWide::geqp3) { @@ -340,9 +340,9 @@ int BQRRP::call( } if(this -> timing) { - qrcp_wide_t_stop = high_resolution_clock::now(); + qrcp_wide_t_stop = steady_clock::now(); qrcp_wide_t_dur += duration_cast(qrcp_wide_t_stop - qrcp_wide_t_start).count(); - panel_preprocessing_t_start = high_resolution_clock::now(); + panel_preprocessing_t_start = steady_clock::now(); } // Need to premute trailing columns of the full R-factor. @@ -409,9 +409,9 @@ int BQRRP::call( } if(this -> timing) { - panel_preprocessing_t_stop = high_resolution_clock::now(); + panel_preprocessing_t_stop = steady_clock::now(); panel_preprocessing_t_dur += duration_cast(panel_preprocessing_t_stop - panel_preprocessing_t_start).count(); - qr_tall_t_start = high_resolution_clock::now(); + qr_tall_t_start = steady_clock::now(); } // Define a pointer to the current subportion of tau vector. 
@@ -428,9 +428,9 @@ int BQRRP::call( R11 = A_work; if(this -> timing) { - qr_tall_t_stop = high_resolution_clock::now(); + qr_tall_t_stop = steady_clock::now(); qr_tall_t_dur += duration_cast(qr_tall_t_stop - qr_tall_t_start).count(); - apply_transq_t_start = high_resolution_clock::now(); + apply_transq_t_start = steady_clock::now(); } } else if (this -> qr_tall == Subroutines::QRTall::cholqr) { @@ -445,9 +445,9 @@ int BQRRP::call( blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, rows, block_rank, (T) 1.0, R_tall_qr, b_sz_const, A_work, lda); if(this -> timing) { - qr_tall_t_stop = high_resolution_clock::now(); + qr_tall_t_stop = steady_clock::now(); qr_tall_t_dur += duration_cast(qr_tall_t_stop - qr_tall_t_start).count(); - q_reconstruction_t_start = high_resolution_clock::now(); + q_reconstruction_t_start = steady_clock::now(); } // Find Q (stored in A) using Householder reconstruction. @@ -485,9 +485,9 @@ int BQRRP::call( lapack::lacpy(MatrixType::Upper, block_rank, b_sz, R_tall_qr, b_sz_const, A_work, lda); if(this -> timing) { - q_reconstruction_t_stop = high_resolution_clock::now(); + q_reconstruction_t_stop = steady_clock::now(); q_reconstruction_t_dur += duration_cast(q_reconstruction_t_stop - q_reconstruction_t_start).count(); - apply_transq_t_start = high_resolution_clock::now(); + apply_transq_t_start = steady_clock::now(); } } else { // Perform QRF by default @@ -497,9 +497,9 @@ int BQRRP::call( // R11 is computed and placed in the appropriate space R11 = A_work; if(this -> timing) { - qr_tall_t_stop = high_resolution_clock::now(); + qr_tall_t_stop = steady_clock::now(); qr_tall_t_dur += duration_cast(qr_tall_t_stop - qr_tall_t_start).count(); - apply_transq_t_start = high_resolution_clock::now(); + apply_transq_t_start = steady_clock::now(); } } @@ -528,7 +528,7 @@ int BQRRP::call( } if(this -> timing) { - apply_transq_t_stop = high_resolution_clock::now(); + apply_transq_t_stop = steady_clock::now(); 
apply_transq_t_dur += duration_cast(apply_transq_t_stop - apply_transq_t_start).count(); } @@ -550,7 +550,7 @@ int BQRRP::call( this -> rank = curr_sz; if(this -> timing) { - total_t_stop = high_resolution_clock::now(); + total_t_stop = steady_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); long t_other = total_t_dur - (skop_t_dur + preallocation_t_dur + qrcp_wide_t_dur + panel_preprocessing_t_dur + qr_tall_t_dur + q_reconstruction_t_dur + apply_transq_t_dur + sample_update_t_dur); this -> times.resize(10); @@ -591,7 +591,7 @@ int BQRRP::call( } if(this -> timing) - sample_update_t_start = high_resolution_clock::now(); + sample_update_t_start = steady_clock::now(); // Updating the pointer to "Current A." // In a global sense, below is identical to: @@ -624,7 +624,7 @@ int BQRRP::call( A_sk = &A_sk[d * b_sz]; if(this -> timing) { - sample_update_t_stop = high_resolution_clock::now(); + sample_update_t_stop = steady_clock::now(); sample_update_t_dur += duration_cast(sample_update_t_stop - sample_update_t_start).count(); } diff --git a/RandLAPACK/drivers/rl_bqrrp_gpu.hh b/RandLAPACK/drivers/rl_bqrrp_gpu.hh index 908b2247..6ebd7390 100644 --- a/RandLAPACK/drivers/rl_bqrrp_gpu.hh +++ b/RandLAPACK/drivers/rl_bqrrp_gpu.hh @@ -159,34 +159,34 @@ int BQRRP_GPU::call( T* tau, int64_t* J ){ - high_resolution_clock::time_point preallocation_t_start; - high_resolution_clock::time_point preallocation_t_stop; - high_resolution_clock::time_point qrcp_wide_t_start; - high_resolution_clock::time_point qrcp_wide_t_stop; - high_resolution_clock::time_point copy_A_sk_t_start; - high_resolution_clock::time_point copy_A_sk_t_stop; - high_resolution_clock::time_point qrcp_piv_t_start; - high_resolution_clock::time_point qrcp_piv_t_stop; - high_resolution_clock::time_point copy_A_t_start; - high_resolution_clock::time_point copy_A_t_stop; - high_resolution_clock::time_point piv_A_t_start; - high_resolution_clock::time_point piv_A_t_stop; - 
high_resolution_clock::time_point updating_J_t_start; - high_resolution_clock::time_point updating_J_t_stop; - high_resolution_clock::time_point copy_J_t_start; - high_resolution_clock::time_point copy_J_t_stop; - high_resolution_clock::time_point preconditioning_t_start; - high_resolution_clock::time_point preconditioning_t_stop; - high_resolution_clock::time_point qr_tall_t_start; - high_resolution_clock::time_point qr_tall_t_stop; - high_resolution_clock::time_point q_reconstruction_t_start; - high_resolution_clock::time_point q_reconstruction_t_stop; - high_resolution_clock::time_point apply_transq_t_start; - high_resolution_clock::time_point apply_transq_t_stop; - high_resolution_clock::time_point sample_update_t_start; - high_resolution_clock::time_point sample_update_t_stop; - high_resolution_clock::time_point total_t_start; - high_resolution_clock::time_point total_t_stop; + steady_clock::time_point preallocation_t_start; + steady_clock::time_point preallocation_t_stop; + steady_clock::time_point qrcp_wide_t_start; + steady_clock::time_point qrcp_wide_t_stop; + steady_clock::time_point copy_A_sk_t_start; + steady_clock::time_point copy_A_sk_t_stop; + steady_clock::time_point qrcp_piv_t_start; + steady_clock::time_point qrcp_piv_t_stop; + steady_clock::time_point copy_A_t_start; + steady_clock::time_point copy_A_t_stop; + steady_clock::time_point piv_A_t_start; + steady_clock::time_point piv_A_t_stop; + steady_clock::time_point updating_J_t_start; + steady_clock::time_point updating_J_t_stop; + steady_clock::time_point copy_J_t_start; + steady_clock::time_point copy_J_t_stop; + steady_clock::time_point preconditioning_t_start; + steady_clock::time_point preconditioning_t_stop; + steady_clock::time_point qr_tall_t_start; + steady_clock::time_point qr_tall_t_stop; + steady_clock::time_point q_reconstruction_t_start; + steady_clock::time_point q_reconstruction_t_stop; + steady_clock::time_point apply_transq_t_start; + steady_clock::time_point 
apply_transq_t_stop; + steady_clock::time_point sample_update_t_start; + steady_clock::time_point sample_update_t_stop; + steady_clock::time_point total_t_start; + steady_clock::time_point total_t_stop; long preallocation_t_dur = 0; long qrcp_wide_t_dur = 0; long copy_A_sk_t_dur = 0; @@ -203,8 +203,8 @@ int BQRRP_GPU::call( long total_t_dur = 0; if(this -> timing) { - total_t_start = high_resolution_clock::now(); - preallocation_t_start = high_resolution_clock::now(); + total_t_start = steady_clock::now(); + preallocation_t_start = steady_clock::now(); } int iter; @@ -326,7 +326,7 @@ int BQRRP_GPU::call( cudaStreamSynchronize(strm); if(this -> timing) { lapack_queue.sync(); - preallocation_t_stop = high_resolution_clock::now(); + preallocation_t_stop = steady_clock::now(); preallocation_t_dur = duration_cast(preallocation_t_stop - preallocation_t_start).count(); } @@ -345,7 +345,7 @@ int BQRRP_GPU::call( if(this -> timing) { nvtxRangePushA("qrcp_wide"); - qrcp_wide_t_start = high_resolution_clock::now(); + qrcp_wide_t_start = steady_clock::now(); } // qrcp_wide through LUQR below // Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk. @@ -368,7 +368,7 @@ int BQRRP_GPU::call( if(this -> timing) { cudaStreamSynchronize(strm); nvtxRangePushA("copy_A_sk"); - copy_A_sk_t_start = high_resolution_clock::now(); + copy_A_sk_t_start = steady_clock::now(); } // Instead of copying A_sk_work into A_sk_copy_col_swap, we ``swap'' the pointers. // This is safe, as A_sk is not needed outside of BQRRP. 
@@ -377,10 +377,10 @@ int BQRRP_GPU::call( if(this -> timing) { cudaStreamSynchronize(strm); nvtxRangePop(); - copy_A_sk_t_stop = high_resolution_clock::now(); + copy_A_sk_t_stop = steady_clock::now(); copy_A_sk_t_dur += duration_cast(copy_A_sk_t_stop - copy_A_sk_t_start).count(); nvtxRangePushA("piv_A_sk"); - qrcp_piv_t_start = high_resolution_clock::now(); + qrcp_piv_t_start = steady_clock::now(); } // Apply pivots to A_sk @@ -389,7 +389,7 @@ int BQRRP_GPU::call( if(this -> timing) { cudaStreamSynchronize(strm); nvtxRangePop(); - qrcp_piv_t_stop = high_resolution_clock::now(); + qrcp_piv_t_stop = steady_clock::now(); qrcp_piv_t_dur += duration_cast(qrcp_piv_t_stop - qrcp_piv_t_start).count(); } @@ -405,10 +405,10 @@ int BQRRP_GPU::call( if(this -> timing) { lapack_queue.sync(); nvtxRangePop(); - qrcp_wide_t_stop = high_resolution_clock::now(); + qrcp_wide_t_stop = steady_clock::now(); qrcp_wide_t_dur += duration_cast(qrcp_wide_t_stop - qrcp_wide_t_start).count(); nvtxRangePushA("copy_A"); - copy_A_t_start = high_resolution_clock::now(); + copy_A_t_start = steady_clock::now(); } // Need to premute trailing columns of the full R-factor. // Remember that the R-factor is stored the upper-triangular portion of A. @@ -417,10 +417,10 @@ int BQRRP_GPU::call( if(this -> timing) { nvtxRangePop(); - copy_A_t_stop = high_resolution_clock::now(); + copy_A_t_stop = steady_clock::now(); copy_A_t_dur += duration_cast(copy_A_t_stop - copy_A_t_start).count(); nvtxRangePushA("piv_A"); - piv_A_t_start = high_resolution_clock::now(); + piv_A_t_start = steady_clock::now(); } // Instead of copying A into A_copy_col_swap, we ``swap'' the pointers. 
@@ -521,7 +521,7 @@ int BQRRP_GPU::call( if(this -> timing) { cudaStreamSynchronize(strm); nvtxRangePop(); - piv_A_t_stop = high_resolution_clock::now(); + piv_A_t_stop = steady_clock::now(); piv_A_t_dur += duration_cast(piv_A_t_stop - piv_A_t_start).count(); } @@ -529,20 +529,20 @@ int BQRRP_GPU::call( if(iter == 0) { if(this -> timing) { nvtxRangePushA("update_J"); - updating_J_t_start = high_resolution_clock::now(); + updating_J_t_start = steady_clock::now(); } RandLAPACK::cuda_kernels::copy_gpu(strm, n, J_buffer, 1, J, 1); if(this -> timing) { cudaStreamSynchronize(strm); nvtxRangePop(); - updating_J_t_stop = high_resolution_clock::now(); + updating_J_t_stop = steady_clock::now(); updating_J_t_dur += duration_cast(updating_J_t_stop - updating_J_t_start).count(); nvtxRangePushA("update_R"); } } else { if(this -> timing) { nvtxRangePushA("copy_J"); - copy_J_t_start = high_resolution_clock::now(); + copy_J_t_start = steady_clock::now(); } // Instead of copying J into J_copy_col_swap, we ``swap'' the pointers. // We have to take some precautions when BQRRP main loop terminates. 
@@ -565,17 +565,17 @@ int BQRRP_GPU::call( if(this -> timing) { nvtxRangePop(); - copy_J_t_stop = high_resolution_clock::now(); + copy_J_t_stop = steady_clock::now(); copy_J_t_dur += duration_cast(copy_J_t_stop - copy_J_t_start).count(); nvtxRangePushA("update_J"); - updating_J_t_start = high_resolution_clock::now(); + updating_J_t_start = steady_clock::now(); } RandLAPACK::cuda_kernels::col_swap_gpu(strm, cols, cols, J_work, J_copy_col_swap_work, J_buffer); if(this -> timing) { cudaStreamSynchronize(strm); nvtxRangePop(); - updating_J_t_stop = high_resolution_clock::now(); + updating_J_t_stop = steady_clock::now(); updating_J_t_dur += duration_cast(updating_J_t_stop - updating_J_t_start).count(); } } @@ -584,7 +584,7 @@ int BQRRP_GPU::call( if(this -> qr_tall == GPUSubroutine::QRTall::cholqr) { if(this -> timing) { nvtxRangePushA("precond_A"); - preconditioning_t_start = high_resolution_clock::now(); + preconditioning_t_start = steady_clock::now(); } // A_pre = AJ(:, 1:b_sz) * inv(R_sk) @@ -594,10 +594,10 @@ int BQRRP_GPU::call( if(this -> timing) { lapack_queue.sync(); nvtxRangePop(); - preconditioning_t_stop = high_resolution_clock::now(); + preconditioning_t_stop = steady_clock::now(); preconditioning_t_dur += duration_cast(preconditioning_t_stop - preconditioning_t_start).count(); nvtxRangePushA("qr_tall"); - qr_tall_t_start = high_resolution_clock::now(); + qr_tall_t_start = steady_clock::now(); } // Performing tall QR @@ -609,10 +609,10 @@ int BQRRP_GPU::call( if(this -> timing) { lapack_queue.sync(); nvtxRangePop(); - qr_tall_t_stop = high_resolution_clock::now(); + qr_tall_t_stop = steady_clock::now(); qr_tall_t_dur += duration_cast(qr_tall_t_stop - qr_tall_t_start).count(); nvtxRangePushA("orhr_col"); - q_reconstruction_t_start = high_resolution_clock::now(); + q_reconstruction_t_start = steady_clock::now(); } // Find Q (stored in A) using Householder reconstruction. 
@@ -627,7 +627,7 @@ int BQRRP_GPU::call( if(this -> timing) { cudaStreamSynchronize(strm); nvtxRangePop(); - q_reconstruction_t_stop = high_resolution_clock::now(); + q_reconstruction_t_stop = steady_clock::now(); q_reconstruction_t_dur += duration_cast(q_reconstruction_t_stop - q_reconstruction_t_start).count(); } @@ -653,7 +653,7 @@ int BQRRP_GPU::call( // Default case - performing QR_tall via QRF if(this -> timing) { nvtxRangePushA("qr_tall"); - qr_tall_t_start = high_resolution_clock::now(); + qr_tall_t_start = steady_clock::now(); } // Perform an unpivoted QR instead of CholQR if(iter == 0) { @@ -666,7 +666,7 @@ int BQRRP_GPU::call( if(this -> timing) { cudaStreamSynchronize(strm); nvtxRangePop(); - qr_tall_t_stop = high_resolution_clock::now(); + qr_tall_t_stop = steady_clock::now(); qr_tall_t_dur += duration_cast(qr_tall_t_stop - qr_tall_t_start).count(); } // R11 is computed and placed in the appropriate space @@ -683,7 +683,7 @@ int BQRRP_GPU::call( // Q is defined with block_rank elementary reflectors. 
if(this -> timing) { nvtxRangePushA("update_A"); - apply_transq_t_start = high_resolution_clock::now(); + apply_transq_t_start = steady_clock::now(); } if (block_rank != b_sz_const) { @@ -711,7 +711,7 @@ int BQRRP_GPU::call( if(this -> timing) { cudaStreamSynchronize(strm); nvtxRangePop(); - apply_transq_t_stop = high_resolution_clock::now(); + apply_transq_t_stop = steady_clock::now(); apply_transq_t_dur += duration_cast(apply_transq_t_stop - apply_transq_t_start).count(); } @@ -766,7 +766,7 @@ int BQRRP_GPU::call( lapack_queue.sync(); if(this -> timing) { - total_t_stop = high_resolution_clock::now(); + total_t_stop = steady_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); long t_rest = total_t_dur - (preallocation_t_dur + qrcp_wide_t_dur + copy_A_t_dur + piv_A_t_dur + copy_J_t_dur + updating_J_t_dur + preconditioning_t_dur + qr_tall_t_dur + q_reconstruction_t_dur + apply_transq_t_dur + sample_update_t_dur); this -> times.resize(15); @@ -821,7 +821,7 @@ int BQRRP_GPU::call( } if(this -> timing) { nvtxRangePushA("update_Sk"); - sample_update_t_start = high_resolution_clock::now(); + sample_update_t_start = steady_clock::now(); } // Updating the pointer to "Current A." 
// In a global sense, below is identical to: @@ -863,7 +863,7 @@ int BQRRP_GPU::call( if(this -> timing) { cudaStreamSynchronize(strm); nvtxRangePop(); - sample_update_t_stop = high_resolution_clock::now(); + sample_update_t_stop = steady_clock::now(); sample_update_t_dur += duration_cast(sample_update_t_stop - sample_update_t_start).count(); } nvtxRangePop(); diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh index be1b2caa..cd0f86f1 100644 --- a/RandLAPACK/drivers/rl_cqrrpt.hh +++ b/RandLAPACK/drivers/rl_cqrrpt.hh @@ -152,20 +152,20 @@ int CQRRPT::call( RandBLAS::RNGState &state ){ ///--------------------TIMING VARS--------------------/ - high_resolution_clock::time_point saso_t_stop; - high_resolution_clock::time_point saso_t_start; - high_resolution_clock::time_point qrcp_t_start; - high_resolution_clock::time_point qrcp_t_stop; - high_resolution_clock::time_point rank_reveal_t_start; - high_resolution_clock::time_point rank_reveal_t_stop; - high_resolution_clock::time_point cholqr_t_start; - high_resolution_clock::time_point cholqr_t_stop; - high_resolution_clock::time_point a_mod_piv_t_start; - high_resolution_clock::time_point a_mod_piv_t_stop; - high_resolution_clock::time_point a_mod_trsm_t_start; - high_resolution_clock::time_point a_mod_trsm_t_stop; - high_resolution_clock::time_point total_t_start; - high_resolution_clock::time_point total_t_stop; + steady_clock::time_point saso_t_stop; + steady_clock::time_point saso_t_start; + steady_clock::time_point qrcp_t_start; + steady_clock::time_point qrcp_t_stop; + steady_clock::time_point rank_reveal_t_start; + steady_clock::time_point rank_reveal_t_stop; + steady_clock::time_point cholqr_t_start; + steady_clock::time_point cholqr_t_stop; + steady_clock::time_point a_mod_piv_t_start; + steady_clock::time_point a_mod_piv_t_stop; + steady_clock::time_point a_mod_trsm_t_start; + steady_clock::time_point a_mod_trsm_t_stop; + steady_clock::time_point total_t_start; + 
steady_clock::time_point total_t_stop; long saso_t_dur = 0; long qrcp_t_dur = 0; long rank_reveal_t_dur = 0; @@ -175,7 +175,7 @@ int CQRRPT::call( long total_t_dur = 0; if(this -> timing) - total_t_start = high_resolution_clock::now(); + total_t_start = steady_clock::now(); int i; int64_t k = n; @@ -192,7 +192,7 @@ int CQRRPT::call( std::vector J_buf(n, 0); if(this -> timing) - saso_t_start = high_resolution_clock::now(); + saso_t_start = steady_clock::now(); /// Generating a SASO RandBLAS::SparseDist DS(d, m, this->nnz); @@ -207,8 +207,8 @@ int CQRRPT::call( ); if(this -> timing) { - saso_t_stop = high_resolution_clock::now(); - qrcp_t_start = high_resolution_clock::now(); + saso_t_stop = steady_clock::now(); + qrcp_t_start = steady_clock::now(); } /// Performing QRCP on a sketch @@ -220,8 +220,8 @@ int CQRRPT::call( } if(this -> timing) { - qrcp_t_stop = high_resolution_clock::now(); - rank_reveal_t_start = high_resolution_clock::now(); + qrcp_t_stop = steady_clock::now(); + rank_reveal_t_start = steady_clock::now(); } /// Using naive rank estimation to ensure that R used for preconditioning is invertible. 
@@ -237,30 +237,30 @@ int CQRRPT::call( this->rank = k; if(this -> timing) - rank_reveal_t_stop = high_resolution_clock::now(); + rank_reveal_t_stop = steady_clock::now(); /// Extracting a k by k R representation T* R_sp = R; lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, ldr); if(this -> timing) - a_mod_piv_t_start = high_resolution_clock::now(); + a_mod_piv_t_start = steady_clock::now(); // Swap k columns of A with pivots from J blas::copy(n, J, 1, J_buf.data(), 1); util::col_swap(m, n, k, A, lda, J_buf); if(this -> timing) { - a_mod_piv_t_stop = high_resolution_clock::now(); - a_mod_trsm_t_start = high_resolution_clock::now(); + a_mod_piv_t_stop = steady_clock::now(); + a_mod_trsm_t_start = steady_clock::now(); } // A_pre * R_sp = AP blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp, ldr, A, lda); if(this -> timing) { - a_mod_trsm_t_stop = high_resolution_clock::now(); - cholqr_t_start = high_resolution_clock::now(); + a_mod_trsm_t_stop = steady_clock::now(); + cholqr_t_start = steady_clock::now(); } // Do Cholesky QR @@ -289,7 +289,7 @@ int CQRRPT::call( blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp, ldr, A, lda); if(this -> timing) - cholqr_t_stop = high_resolution_clock::now(); + cholqr_t_stop = steady_clock::now(); // Get the final R-factor -- undoing the preconditioning blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, A_hat, d, R_sp, ldr); @@ -305,7 +305,7 @@ int CQRRPT::call( a_mod_trsm_t_dur = duration_cast(a_mod_trsm_t_stop - a_mod_trsm_t_start).count(); cholqr_t_dur = duration_cast(cholqr_t_stop - cholqr_t_start).count(); - total_t_stop = high_resolution_clock::now(); + total_t_stop = steady_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); long t_rest = total_t_dur - (saso_t_dur + qrcp_t_dur + rank_reveal_t_dur + cholqr_t_dur + a_mod_piv_t_dur + 
a_mod_trsm_t_dur); diff --git a/RandLAPACK/drivers/rl_cqrrpt_gpu.hh b/RandLAPACK/drivers/rl_cqrrpt_gpu.hh index edf16b42..8d39dfe5 100644 --- a/RandLAPACK/drivers/rl_cqrrpt_gpu.hh +++ b/RandLAPACK/drivers/rl_cqrrpt_gpu.hh @@ -159,20 +159,20 @@ int CQRRPT_GPU::call( RandBLAS::RNGState &state ){ ///--------------------TIMING VARS--------------------/ - high_resolution_clock::time_point saso_t_stop; - high_resolution_clock::time_point saso_t_start; - high_resolution_clock::time_point qrcp_t_start; - high_resolution_clock::time_point qrcp_t_stop; - high_resolution_clock::time_point rank_reveal_t_start; - high_resolution_clock::time_point rank_reveal_t_stop; - high_resolution_clock::time_point cholqr_t_start; - high_resolution_clock::time_point cholqr_t_stop; - high_resolution_clock::time_point a_mod_piv_t_start; - high_resolution_clock::time_point a_mod_piv_t_stop; - high_resolution_clock::time_point a_mod_trsm_t_start; - high_resolution_clock::time_point a_mod_trsm_t_stop; - high_resolution_clock::time_point total_t_start; - high_resolution_clock::time_point total_t_stop; + steady_clock::time_point saso_t_stop; + steady_clock::time_point saso_t_start; + steady_clock::time_point qrcp_t_start; + steady_clock::time_point qrcp_t_stop; + steady_clock::time_point rank_reveal_t_start; + steady_clock::time_point rank_reveal_t_stop; + steady_clock::time_point cholqr_t_start; + steady_clock::time_point cholqr_t_stop; + steady_clock::time_point a_mod_piv_t_start; + steady_clock::time_point a_mod_piv_t_stop; + steady_clock::time_point a_mod_trsm_t_start; + steady_clock::time_point a_mod_trsm_t_stop; + steady_clock::time_point total_t_start; + steady_clock::time_point total_t_stop; long saso_t_dur = 0; long qrcp_t_dur = 0; long rank_reveal_t_dur = 0; @@ -182,7 +182,7 @@ int CQRRPT_GPU::call( long total_t_dur = 0; if(this -> timing) - total_t_start = high_resolution_clock::now(); + total_t_start = steady_clock::now(); int i; int64_t k = n; @@ -196,7 +196,7 @@ int CQRRPT_GPU::call( 
std::vector J_buf(n, 0); if(this -> timing) - saso_t_start = high_resolution_clock::now(); + saso_t_start = steady_clock::now(); /***********************************************************************************/ // I will avoid performing skething on a GPU for now @@ -212,8 +212,8 @@ int CQRRPT_GPU::call( ); if(this -> timing) { - saso_t_stop = high_resolution_clock::now(); - qrcp_t_start = high_resolution_clock::now(); + saso_t_stop = steady_clock::now(); + qrcp_t_start = steady_clock::now(); } /// Performing QRCP on a sketch @@ -225,8 +225,8 @@ int CQRRPT_GPU::call( } if(this -> timing) { - qrcp_t_stop = high_resolution_clock::now(); - rank_reveal_t_start = high_resolution_clock::now(); + qrcp_t_stop = steady_clock::now(); + rank_reveal_t_start = steady_clock::now(); } /// Using naive rank estimation to ensure that R used for preconditioning is invertible. @@ -242,7 +242,7 @@ int CQRRPT_GPU::call( this->rank = k; if(this -> timing) - rank_reveal_t_stop = high_resolution_clock::now(); + rank_reveal_t_stop = steady_clock::now(); // Allocating space for a preconditioner buffer. 
T* R_sp = ( T * ) calloc( k * k, sizeof( T ) ); @@ -253,15 +253,15 @@ int CQRRPT_GPU::call( lapack::lacpy(MatrixType::General, k, n - k, &A_hat[d * k], d, &R[n * k], ldr); if(this -> timing) - a_mod_piv_t_start = high_resolution_clock::now(); + a_mod_piv_t_start = steady_clock::now(); // Swap k columns of A with pivots from J blas::copy(n, J, 1, J_buf.data(), 1); util::col_swap(m, n, k, A, lda, J_buf); if(this -> timing) { - a_mod_piv_t_stop = high_resolution_clock::now(); - a_mod_trsm_t_start = high_resolution_clock::now(); + a_mod_piv_t_stop = steady_clock::now(); + a_mod_trsm_t_start = steady_clock::now(); } /******************************GPU REGION BEGIN*********************************/ @@ -303,8 +303,8 @@ int CQRRPT_GPU::call( blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp_device, k, A_device, lda, blas_queue); if(this -> timing) { - a_mod_trsm_t_stop = high_resolution_clock::now(); - cholqr_t_start = high_resolution_clock::now(); + a_mod_trsm_t_stop = steady_clock::now(); + cholqr_t_start = steady_clock::now(); } // Do Cholesky QR @@ -347,7 +347,7 @@ int CQRRPT_GPU::call( blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp_device, k, A_device, lda, blas_queue); if(this -> timing) - cholqr_t_stop = high_resolution_clock::now(); + cholqr_t_stop = steady_clock::now(); // Get the final R-factor. 
blas::trmm(Layout::ColMajor, Side::Left, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, R_sp_device, k, R_device, ldr, blas_queue); @@ -363,7 +363,7 @@ int CQRRPT_GPU::call( a_mod_trsm_t_dur = duration_cast(a_mod_trsm_t_stop - a_mod_trsm_t_start).count(); cholqr_t_dur = duration_cast(cholqr_t_stop - cholqr_t_start).count(); - total_t_stop = high_resolution_clock::now(); + total_t_stop = steady_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); long t_rest = total_t_dur - (saso_t_dur + qrcp_t_dur + rank_reveal_t_dur + cholqr_t_dur + a_mod_piv_t_dur + a_mod_trsm_t_dur); diff --git a/RandLAPACK/drivers/rl_hqrrp.hh b/RandLAPACK/drivers/rl_hqrrp.hh index 2547eaf3..7ea36043 100644 --- a/RandLAPACK/drivers/rl_hqrrp.hh +++ b/RandLAPACK/drivers/rl_hqrrp.hh @@ -597,22 +597,22 @@ int64_t NoFLA_QRPmod_WY_unb_var4( return CHOLQR_mod_WY(num_stages, m_A, n_A, buff_A, ldim_A, buff_t, buff_T, ldim_T, buff_R, ldim_R, buff_D); } - high_resolution_clock::time_point preallocation_t_start; - high_resolution_clock::time_point preallocation_t_stop; - high_resolution_clock::time_point norms_t_start; - high_resolution_clock::time_point norms_t_stop; - high_resolution_clock::time_point pivoting_t_start; - high_resolution_clock::time_point pivoting_t_stop; - high_resolution_clock::time_point gen_reflector_1_t_start; - high_resolution_clock::time_point gen_reflector_1_t_stop; - high_resolution_clock::time_point gen_reflector_2_t_start; - high_resolution_clock::time_point gen_reflector_2_t_stop; - high_resolution_clock::time_point downdating_t_start; - high_resolution_clock::time_point downdating_t_stop; - high_resolution_clock::time_point gen_T_t_start; - high_resolution_clock::time_point gen_T_t_stop; - high_resolution_clock::time_point total_t_start; - high_resolution_clock::time_point total_t_stop; + steady_clock::time_point preallocation_t_start; + steady_clock::time_point preallocation_t_stop; + steady_clock::time_point norms_t_start; + 
steady_clock::time_point norms_t_stop; + steady_clock::time_point pivoting_t_start; + steady_clock::time_point pivoting_t_stop; + steady_clock::time_point gen_reflector_1_t_start; + steady_clock::time_point gen_reflector_1_t_stop; + steady_clock::time_point gen_reflector_2_t_start; + steady_clock::time_point gen_reflector_2_t_stop; + steady_clock::time_point downdating_t_start; + steady_clock::time_point downdating_t_stop; + steady_clock::time_point gen_T_t_start; + steady_clock::time_point gen_T_t_stop; + steady_clock::time_point total_t_start; + steady_clock::time_point total_t_stop; long preallocation_t_dur = 0; long norms_t_dur = 0; long pivoting_t_dur = 0; @@ -623,8 +623,8 @@ int64_t NoFLA_QRPmod_WY_unb_var4( long total_t_dur = 0; if(timing != nullptr) { - total_t_start = high_resolution_clock::now(); - preallocation_t_start = high_resolution_clock::now(); + total_t_start = steady_clock::now(); + preallocation_t_start = steady_clock::now(); } int64_t j, mn_A, m_a21, m_A22, n_A22, n_dB, idx_max_col, @@ -645,16 +645,16 @@ int64_t NoFLA_QRPmod_WY_unb_var4( buff_workspace = ( T * ) calloc( n_A, sizeof( T ) ); if(timing != nullptr) { - preallocation_t_stop = high_resolution_clock::now(); + preallocation_t_stop = steady_clock::now(); preallocation_t_dur = duration_cast(preallocation_t_stop - preallocation_t_start).count(); - norms_t_start = high_resolution_clock::now(); + norms_t_start = steady_clock::now(); } if( pivoting == 1 ) { // Compute initial norms of A int64_to d and e. 
NoFLA_QRP_compute_norms( m_A, n_A, buff_A, ldim_A, buff_d, buff_e ); } if(timing != nullptr) { - norms_t_stop = high_resolution_clock::now(); + norms_t_stop = steady_clock::now(); norms_t_dur = duration_cast(norms_t_stop - norms_t_start).count(); } @@ -667,7 +667,7 @@ int64_t NoFLA_QRPmod_WY_unb_var4( n_A22 = n_A - j - 1; if(timing != nullptr) { - pivoting_t_start = high_resolution_clock::now(); + pivoting_t_start = steady_clock::now(); } if( pivoting == 1 ) { // Obtain the index of the column with largest 2-norm. @@ -684,9 +684,9 @@ int64_t NoFLA_QRPmod_WY_unb_var4( } if(timing != nullptr) { - pivoting_t_stop = high_resolution_clock::now(); + pivoting_t_stop = steady_clock::now(); pivoting_t_dur += duration_cast(pivoting_t_stop - pivoting_t_start).count(); - gen_reflector_1_t_start = high_resolution_clock::now(); + gen_reflector_1_t_start = steady_clock::now(); } // Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21 @@ -702,9 +702,9 @@ int64_t NoFLA_QRPmod_WY_unb_var4( ); if(timing != nullptr) { - gen_reflector_1_t_stop = high_resolution_clock::now(); + gen_reflector_1_t_stop = steady_clock::now(); gen_reflector_1_t_dur += duration_cast(gen_reflector_1_t_stop - gen_reflector_1_t_start).count(); - gen_reflector_2_t_start = high_resolution_clock::now(); + gen_reflector_2_t_start = steady_clock::now(); } // | a12t | = H | a12t | @@ -723,9 +723,9 @@ int64_t NoFLA_QRPmod_WY_unb_var4( buff_A[ j + j * ldim_A ] = diag; if(timing != nullptr) { - gen_reflector_2_t_stop = high_resolution_clock::now(); + gen_reflector_2_t_stop = steady_clock::now(); gen_reflector_2_t_dur += duration_cast(gen_reflector_2_t_stop - gen_reflector_2_t_start).count(); - downdating_t_start = high_resolution_clock::now(); + downdating_t_start = steady_clock::now(); } if( pivoting == 1 ) { // Update partial column norms. 
@@ -736,13 +736,13 @@ int64_t NoFLA_QRPmod_WY_unb_var4( & buff_A[ ( j+1 ) + std::min( n_A-1, ( j+1 ) ) * ldim_A ], ldim_A ); } if(timing != nullptr) { - downdating_t_stop = high_resolution_clock::now(); + downdating_t_stop = steady_clock::now(); downdating_t_dur += duration_cast(downdating_t_stop - downdating_t_start).count(); } } if(timing != nullptr) { - gen_T_t_start = high_resolution_clock::now(); + gen_T_t_start = steady_clock::now(); } // Build T. @@ -754,9 +754,9 @@ int64_t NoFLA_QRPmod_WY_unb_var4( } if(timing != nullptr) { - gen_T_t_stop = high_resolution_clock::now(); + gen_T_t_stop = steady_clock::now(); gen_T_t_dur = duration_cast(gen_T_t_stop - gen_T_t_start).count(); - preallocation_t_start = high_resolution_clock::now(); + preallocation_t_start = steady_clock::now(); } // Remove auxiliary vectors. @@ -765,12 +765,12 @@ int64_t NoFLA_QRPmod_WY_unb_var4( free( buff_workspace ); if(timing != nullptr) { - preallocation_t_stop = high_resolution_clock::now(); + preallocation_t_stop = steady_clock::now(); preallocation_t_dur += duration_cast(preallocation_t_stop - preallocation_t_start).count(); } if(timing != nullptr) { - total_t_stop = high_resolution_clock::now(); + total_t_stop = steady_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); long other_t_dur = total_t_dur - (preallocation_t_dur + norms_t_dur + pivoting_t_dur + gen_reflector_1_t_dur + gen_reflector_2_t_dur + downdating_t_dur + gen_T_t_dur); @@ -827,22 +827,22 @@ int64_t hqrrp( int64_t nb_alg, int64_t pp, int64_t panel_pivoting, int64_t qr_type, RandBLAS::RNGState &state, T* timing) { //-------TIMING VARS--------/ - high_resolution_clock::time_point preallocation_t_stop; - high_resolution_clock::time_point preallocation_t_start; - high_resolution_clock::time_point sketching_t_stop; - high_resolution_clock::time_point sketching_t_start; - high_resolution_clock::time_point downdating_t_stop; - high_resolution_clock::time_point downdating_t_start; - 
high_resolution_clock::time_point qrcp_t_start; - high_resolution_clock::time_point qrcp_t_stop; - high_resolution_clock::time_point qr_t_start; - high_resolution_clock::time_point qr_t_stop; - high_resolution_clock::time_point updating_A_t_start; - high_resolution_clock::time_point updating_A_t_stop; - high_resolution_clock::time_point updating_Sketch_t_start; - high_resolution_clock::time_point updating_Sketch_t_stop; - high_resolution_clock::time_point total_t_start; - high_resolution_clock::time_point total_t_stop; + steady_clock::time_point preallocation_t_stop; + steady_clock::time_point preallocation_t_start; + steady_clock::time_point sketching_t_stop; + steady_clock::time_point sketching_t_start; + steady_clock::time_point downdating_t_stop; + steady_clock::time_point downdating_t_start; + steady_clock::time_point qrcp_t_start; + steady_clock::time_point qrcp_t_stop; + steady_clock::time_point qr_t_start; + steady_clock::time_point qr_t_stop; + steady_clock::time_point updating_A_t_start; + steady_clock::time_point updating_A_t_stop; + steady_clock::time_point updating_Sketch_t_start; + steady_clock::time_point updating_Sketch_t_stop; + steady_clock::time_point total_t_start; + steady_clock::time_point total_t_stop; long preallocation_t_dur = 0; long sketching_t_dur = 0; long downdating_t_dur = 0; @@ -863,8 +863,8 @@ int64_t hqrrp( } if(timing != nullptr) { - total_t_start = high_resolution_clock::now(); - preallocation_t_start = high_resolution_clock::now(); + total_t_start = steady_clock::now(); + preallocation_t_start = steady_clock::now(); } int64_t b, j, last_iter, mn_A, m_Y, n_Y, ldim_Y, m_V, n_V, ldim_V, @@ -928,9 +928,9 @@ int64_t hqrrp( buff_D = ( T * ) calloc( nb_alg, sizeof( T ) ); if(timing != nullptr) { - preallocation_t_stop = high_resolution_clock::now(); + preallocation_t_stop = steady_clock::now(); preallocation_t_dur = duration_cast(preallocation_t_stop - preallocation_t_start).count(); - sketching_t_start = high_resolution_clock::now(); 
+ sketching_t_start = steady_clock::now(); } // Initialize matrices G and Y. @@ -943,7 +943,7 @@ int64_t hqrrp( d_zero, buff_Y, ldim_Y ); if(timing != nullptr) { - sketching_t_stop = high_resolution_clock::now(); + sketching_t_stop = steady_clock::now(); sketching_t_dur = duration_cast(sketching_t_stop - sketching_t_start).count(); } @@ -994,7 +994,7 @@ int64_t hqrrp( if(timing != nullptr) - downdating_t_start = high_resolution_clock::now(); + downdating_t_start = steady_clock::now(); #ifdef CHECK_DOWNDATING_OF_Y // Check downdating of matrix Y: Compare downdated matrix Y with @@ -1030,7 +1030,7 @@ int64_t hqrrp( free( buff_cyr ); #endif if(timing != nullptr) { - downdating_t_stop = high_resolution_clock::now(); + downdating_t_stop = steady_clock::now(); downdating_t_dur += duration_cast(downdating_t_stop - downdating_t_start).count(); } @@ -1053,7 +1053,7 @@ int64_t hqrrp( buff_VR, ldim_V); if(timing != nullptr) - qrcp_t_start = high_resolution_clock::now(); + qrcp_t_start = steady_clock::now(); NoFLA_QRPmod_WY_unb_var4(0, 1, b, m_V, n_VR, @@ -1066,7 +1066,7 @@ int64_t hqrrp( ); if(timing != nullptr) { - qrcp_t_stop = high_resolution_clock::now(); + qrcp_t_stop = steady_clock::now(); qrcp_t_dur += duration_cast(qrcp_t_stop - qrcp_t_start).count(); } @@ -1091,7 +1091,7 @@ int64_t hqrrp( // if(timing != nullptr) - qr_t_start = high_resolution_clock::now(); + qr_t_start = steady_clock::now(); NoFLA_QRPmod_WY_unb_var4(qr_type, panel_pivoting, -1, m_AB1, n_AB1, buff_AB1, ldim_A, buff_p1, buff_s1, @@ -1100,9 +1100,9 @@ int64_t hqrrp( 1, buff_T1_T, ldim_W, buff_R, ldim_R, buff_D, timing_QR); if(timing != nullptr) { - qr_t_stop = high_resolution_clock::now(); + qr_t_stop = steady_clock::now(); qr_t_dur += duration_cast(qr_t_stop - qr_t_start).count(); - updating_A_t_start = high_resolution_clock::now(); + updating_A_t_start = steady_clock::now(); } // @@ -1121,9 +1121,9 @@ int64_t hqrrp( } if(timing != nullptr) { - updating_A_t_stop = high_resolution_clock::now(); + 
updating_A_t_stop = steady_clock::now(); updating_A_t_dur += duration_cast(updating_A_t_stop - updating_A_t_start).count(); - updating_Sketch_t_start = high_resolution_clock::now(); + updating_Sketch_t_start = steady_clock::now(); } // @@ -1141,7 +1141,7 @@ int64_t hqrrp( } if(timing != nullptr) { - updating_Sketch_t_stop = high_resolution_clock::now(); + updating_Sketch_t_stop = steady_clock::now(); updating_Sketch_t_dur += duration_cast(updating_Sketch_t_stop - updating_Sketch_t_start).count(); } } @@ -1151,7 +1151,7 @@ int64_t hqrrp( // Make sure that timing points to a sufficient amount of space. timing = ( T * ) realloc(timing, 29 * sizeof( T ) ); - total_t_stop = high_resolution_clock::now(); + total_t_stop = steady_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); long other_t_dur = total_t_dur - (preallocation_t_dur + sketching_t_dur + downdating_t_dur + qrcp_t_dur + qr_t_dur + updating_A_t_dur + updating_Sketch_t_dur); diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 65c24a24..c303c50b 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -142,30 +142,30 @@ int RBKI::call( T* Sigma, RandBLAS::RNGState &state ){ - high_resolution_clock::time_point allocation_t_start; - high_resolution_clock::time_point allocation_t_stop; - high_resolution_clock::time_point get_factors_t_start; - high_resolution_clock::time_point get_factors_t_stop; - high_resolution_clock::time_point ungqr_t_start; - high_resolution_clock::time_point ungqr_t_stop; - high_resolution_clock::time_point reorth_t_start; - high_resolution_clock::time_point reorth_t_stop; - high_resolution_clock::time_point qr_t_start; - high_resolution_clock::time_point qr_t_stop; - high_resolution_clock::time_point gemm_A_t_start; - high_resolution_clock::time_point gemm_A_t_stop; - high_resolution_clock::time_point main_loop_t_start; - high_resolution_clock::time_point main_loop_t_stop; - high_resolution_clock::time_point 
sketching_t_start; - high_resolution_clock::time_point sketching_t_stop; - high_resolution_clock::time_point r_cpy_t_start; - high_resolution_clock::time_point r_cpy_t_stop; - high_resolution_clock::time_point s_cpy_t_start; - high_resolution_clock::time_point s_cpy_t_stop; - high_resolution_clock::time_point norm_t_start; - high_resolution_clock::time_point norm_t_stop; - high_resolution_clock::time_point total_t_start; - high_resolution_clock::time_point total_t_stop; + steady_clock::time_point allocation_t_start; + steady_clock::time_point allocation_t_stop; + steady_clock::time_point get_factors_t_start; + steady_clock::time_point get_factors_t_stop; + steady_clock::time_point ungqr_t_start; + steady_clock::time_point ungqr_t_stop; + steady_clock::time_point reorth_t_start; + steady_clock::time_point reorth_t_stop; + steady_clock::time_point qr_t_start; + steady_clock::time_point qr_t_stop; + steady_clock::time_point gemm_A_t_start; + steady_clock::time_point gemm_A_t_stop; + steady_clock::time_point main_loop_t_start; + steady_clock::time_point main_loop_t_stop; + steady_clock::time_point sketching_t_start; + steady_clock::time_point sketching_t_stop; + steady_clock::time_point r_cpy_t_start; + steady_clock::time_point r_cpy_t_stop; + steady_clock::time_point s_cpy_t_start; + steady_clock::time_point s_cpy_t_stop; + steady_clock::time_point norm_t_start; + steady_clock::time_point norm_t_stop; + steady_clock::time_point total_t_start; + steady_clock::time_point total_t_stop; long allocation_t_dur = 0; long get_factors_t_dur = 0; @@ -181,8 +181,8 @@ int RBKI::call( long total_t_dur = 0; if(this -> timing) { - total_t_start = high_resolution_clock::now(); - allocation_t_start = high_resolution_clock::now(); + total_t_start = steady_clock::now(); + allocation_t_start = steady_clock::now(); } int64_t iter = 0, iter_od = 0, iter_ev = 0, end_rows = 0, end_cols = 0; @@ -230,7 +230,7 @@ int RBKI::call( T* tau = ( T * ) calloc( k, sizeof( T ) ); if(this -> timing) { - 
allocation_t_stop = high_resolution_clock::now(); + allocation_t_stop = steady_clock::now(); allocation_t_dur = duration_cast(allocation_t_stop - allocation_t_start).count(); } @@ -240,7 +240,7 @@ int RBKI::call( T threshold = std::sqrt(1 - sq_tol) * norm_A; if(this -> timing) - sketching_t_start = high_resolution_clock::now(); + sketching_t_start = steady_clock::now(); // Generate a dense Gaussian random matrx. // OMP_NUM_THREADS=4 seems to be the best option for dense sketch generation. @@ -254,35 +254,35 @@ int RBKI::call( #endif if(this -> timing) { - sketching_t_stop = high_resolution_clock::now(); + sketching_t_stop = steady_clock::now(); sketching_t_dur = duration_cast(sketching_t_stop - sketching_t_start).count(); - gemm_A_t_start = high_resolution_clock::now(); + gemm_A_t_start = steady_clock::now(); } // [X_ev, ~] = qr(A * Y_i, 0) blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); if(this -> timing) { - gemm_A_t_stop = high_resolution_clock::now(); + gemm_A_t_stop = steady_clock::now(); gemm_A_t_dur = duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); } if(this -> timing) - qr_t_start = high_resolution_clock::now(); + qr_t_start = steady_clock::now(); lapack::geqrf(m, k, X_i, m, tau); if(this -> timing) { - qr_t_stop = high_resolution_clock::now(); + qr_t_stop = steady_clock::now(); qr_t_dur = duration_cast(qr_t_stop - qr_t_start).count(); - ungqr_t_start = high_resolution_clock::now(); + ungqr_t_start = steady_clock::now(); } // Convert X_i into an explicit form. It is now stored in X_ev as it should be. lapack::ungqr(m, k, k, X_i, m, tau); if(this -> timing) { - ungqr_t_stop = high_resolution_clock::now(); + ungqr_t_stop = steady_clock::now(); ungqr_t_dur += duration_cast(ungqr_t_stop - ungqr_t_start).count(); } @@ -294,16 +294,16 @@ int RBKI::call( // Iterate until in-loop termination criteria is met. 
while(1) { if(this -> timing) - main_loop_t_start = high_resolution_clock::now(); + main_loop_t_start = steady_clock::now(); if (iter % 2 != 0) { if(this -> timing) - gemm_A_t_start = high_resolution_clock::now(); + gemm_A_t_start = steady_clock::now(); // Y_i = A' * X_i blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, k, m, 1.0, A, m, X_i, m, 0.0, Y_i, n); if(this -> timing) { - gemm_A_t_stop = high_resolution_clock::now(); + gemm_A_t_stop = steady_clock::now(); gemm_A_t_dur += duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); } @@ -320,7 +320,7 @@ int RBKI::call( blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, k, iter_ev * k, n, 1.0, Y_i, n, Y_od, n, 0.0, R_i, n); if(this -> timing) - reorth_t_start = high_resolution_clock::now(); + reorth_t_start = steady_clock::now(); // Y_i = Y_i - Y_od * R_i blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, R_i, n, 1.0, Y_i, n); @@ -330,7 +330,7 @@ int RBKI::call( blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, n, k, iter_ev * k, -1.0, Y_od, n, Y_orth_buf, k, 1.0, Y_i, n); if(this -> timing) { - reorth_t_stop = high_resolution_clock::now(); + reorth_t_stop = steady_clock::now(); reorth_t_dur += duration_cast(reorth_t_stop - reorth_t_start).count(); } } @@ -339,13 +339,13 @@ int RBKI::call( std::fill(&tau[0], &tau[k], 0.0); if(this -> timing) - qr_t_start = high_resolution_clock::now(); + qr_t_start = steady_clock::now(); lapack::geqrf(n, k, Y_i, n, tau); if(this -> timing) { - qr_t_stop = high_resolution_clock::now(); + qr_t_stop = steady_clock::now(); qr_t_dur += duration_cast(qr_t_stop - qr_t_start).count(); - r_cpy_t_start = high_resolution_clock::now(); + r_cpy_t_start = steady_clock::now(); } // Copy R_ii over to R's (in transposed format). 
@@ -358,16 +358,16 @@ int RBKI::call( #endif if(this -> timing) { - r_cpy_t_stop = high_resolution_clock::now(); + r_cpy_t_stop = steady_clock::now(); r_cpy_t_dur += duration_cast(r_cpy_t_stop - r_cpy_t_start).count(); - ungqr_t_start = high_resolution_clock::now(); + ungqr_t_start = steady_clock::now(); } // Convert Y_i into an explicit form. It is now stored in Y_odd as it should be. lapack::ungqr(n, k, k, Y_i, n, tau); if(this -> timing) { - ungqr_t_stop = high_resolution_clock::now(); + ungqr_t_stop = steady_clock::now(); ungqr_t_dur += duration_cast(ungqr_t_stop - ungqr_t_start).count(); } @@ -392,13 +392,13 @@ int RBKI::call( } else { if(this -> timing) - gemm_A_t_start = high_resolution_clock::now(); + gemm_A_t_start = steady_clock::now(); // X_i = A * Y_i blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m); if(this -> timing) { - gemm_A_t_stop = high_resolution_clock::now(); + gemm_A_t_stop = steady_clock::now(); gemm_A_t_dur += duration_cast(gemm_A_t_stop - gemm_A_t_start).count(); } @@ -414,7 +414,7 @@ int RBKI::call( blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, iter_od * k, k, m, 1.0, X_ev, m, X_i, m, 0.0, S_i, n + k); if(this -> timing) - reorth_t_start = high_resolution_clock::now(); + reorth_t_start = steady_clock::now(); //X_i = X_i - X_ev * S_i; blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, S_i, n + k, 1.0, X_i, m); @@ -424,7 +424,7 @@ int RBKI::call( blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, iter_od * k, -1.0, X_ev, m, X_orth_buf, n + k, 1.0, X_i, m); if(this -> timing) { - reorth_t_stop = high_resolution_clock::now(); + reorth_t_stop = steady_clock::now(); reorth_t_dur += duration_cast(reorth_t_stop - reorth_t_start).count(); } @@ -432,30 +432,30 @@ int RBKI::call( std::fill(&tau[0], &tau[k], 0.0); if(this -> timing) - qr_t_start = high_resolution_clock::now(); + qr_t_start = steady_clock::now(); lapack::geqrf(m, k, X_i, m, tau); 
if(this -> timing) { - qr_t_stop = high_resolution_clock::now(); + qr_t_stop = steady_clock::now(); qr_t_dur += duration_cast(qr_t_stop - qr_t_start).count(); - s_cpy_t_start = high_resolution_clock::now(); + s_cpy_t_start = steady_clock::now(); } // Copy S_ii over to S's space under S_i (offset down by iter_od * k) lapack::lacpy(MatrixType::Upper, k, k, X_i, m, S_ii, n + k); if(this -> timing) { - s_cpy_t_stop = high_resolution_clock::now(); + s_cpy_t_stop = steady_clock::now(); s_cpy_t_dur += duration_cast(s_cpy_t_stop - s_cpy_t_start).count(); - ungqr_t_start = high_resolution_clock::now(); + ungqr_t_start = steady_clock::now(); } // Convert X_i into an explicit form. It is now stored in X_ev as it should be lapack::ungqr(m, k, k, X_i, m, tau); if(this -> timing) { - ungqr_t_stop = high_resolution_clock::now(); + ungqr_t_stop = steady_clock::now(); ungqr_t_dur += duration_cast(ungqr_t_stop - ungqr_t_start).count(); } @@ -481,16 +481,16 @@ int RBKI::call( } if(this -> timing) - norm_t_start = high_resolution_clock::now(); + norm_t_start = steady_clock::now(); // This is only changed on odd iters if (iter % 2 != 0) norm_R = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, iter_ev * k, iter_ev * k, R, n); if(this -> timing) { - norm_t_stop = high_resolution_clock::now(); + norm_t_stop = steady_clock::now(); norm_t_dur += duration_cast(norm_t_stop - norm_t_start).count(); - main_loop_t_stop = high_resolution_clock::now(); + main_loop_t_stop = steady_clock::now(); main_loop_t_dur += duration_cast(main_loop_t_stop - main_loop_t_start).count(); } @@ -513,16 +513,16 @@ int RBKI::call( if(this -> timing) { - allocation_t_start = high_resolution_clock::now(); + allocation_t_start = steady_clock::now(); } U_hat = ( T * ) calloc( end_rows * end_cols, sizeof( T ) ); VT_hat = ( T * ) calloc( end_cols * end_cols, sizeof( T ) ); if(this -> timing) { - allocation_t_stop = high_resolution_clock::now(); + allocation_t_stop = steady_clock::now(); allocation_t_dur += 
duration_cast(allocation_t_stop - allocation_t_start).count(); - get_factors_t_start = high_resolution_clock::now(); + get_factors_t_start = steady_clock::now(); } if (iter % 2 != 0) { @@ -540,9 +540,9 @@ int RBKI::call( blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, end_cols, n, end_cols, 1.0, VT_hat, end_cols, Y_od, n, 0.0, VT, n); if(this -> timing) { - get_factors_t_stop = high_resolution_clock::now(); + get_factors_t_stop = steady_clock::now(); get_factors_t_dur = duration_cast(get_factors_t_stop - get_factors_t_start).count(); - allocation_t_start = high_resolution_clock::now(); + allocation_t_start = steady_clock::now(); } free(Y_od); free(X_ev); @@ -555,12 +555,12 @@ int RBKI::call( free(X_orth_buf); if(this -> timing) { - allocation_t_stop = high_resolution_clock::now(); + allocation_t_stop = steady_clock::now(); allocation_t_dur += duration_cast(allocation_t_stop - allocation_t_start).count(); } if(this -> timing) { - total_t_stop = high_resolution_clock::now(); + total_t_stop = steady_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); long t_rest = total_t_dur - (allocation_t_dur + get_factors_t_dur + ungqr_t_dur + reorth_t_dur + qr_t_dur + gemm_A_t_dur + sketching_t_dur + r_cpy_t_dur + s_cpy_t_dur + norm_t_dur); this -> times.resize(13); diff --git a/benchmark/bench_BQRRP/BQRRP_speed_comparisons.cc b/benchmark/bench_BQRRP/BQRRP_speed_comparisons.cc index b4b686dc..ddf03122 100644 --- a/benchmark/bench_BQRRP/BQRRP_speed_comparisons.cc +++ b/benchmark/bench_BQRRP/BQRRP_speed_comparisons.cc @@ -92,9 +92,9 @@ static void call_all_algs( printf("ITERATION %d, B_SZ %ld\n", i, b_sz); // Testing GEQRF - auto start_geqrf = high_resolution_clock::now(); + auto start_geqrf = steady_clock::now(); lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); - auto stop_geqrf = high_resolution_clock::now(); + auto stop_geqrf = steady_clock::now(); dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); printf("TOTAL TIME FOR 
GEQRF %ld\n", dur_geqrf); @@ -106,9 +106,9 @@ static void call_all_algs( // Testing BQRRP - QRF BQRRP.qr_tall = Subroutines::QRTall::geqrf; BQRRP.apply_trans_q = Subroutines::ApplyTransQ::ormqr; - auto start_bqrrp_qrf = high_resolution_clock::now(); + auto start_bqrrp_qrf = steady_clock::now(); BQRRP.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state_alg); - auto stop_bqrrp_qrf = high_resolution_clock::now(); + auto stop_bqrrp_qrf = steady_clock::now(); dur_bqrrp_qrf = duration_cast(stop_bqrrp_qrf - start_bqrrp_qrf).count(); printf("TOTAL TIME FOR BQRRP_QRF %ld\n", dur_bqrrp_qrf); @@ -121,9 +121,9 @@ static void call_all_algs( // Testing BQRRP - CholQR BQRRP.qr_tall = Subroutines::QRTall::cholqr; BQRRP.apply_trans_q = Subroutines::ApplyTransQ::ormqr; - auto start_bqrrp_cholqr = high_resolution_clock::now(); + auto start_bqrrp_cholqr = steady_clock::now(); BQRRP.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state_alg); - auto stop_bqrrp_cholqr = high_resolution_clock::now(); + auto stop_bqrrp_cholqr = steady_clock::now(); dur_bqrrp_cholqr = duration_cast(stop_bqrrp_cholqr - start_bqrrp_cholqr).count(); printf("TOTAL TIME FOR BQRRP_CHOLQR %ld\n", dur_bqrrp_cholqr); @@ -134,9 +134,9 @@ static void call_all_algs( data_regen(m_info, all_data, state_gen, 1); // Testing HQRRP DEFAULT - auto start_hqrrp = high_resolution_clock::now(); + auto start_hqrrp = steady_clock::now(); RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 0, state_alg, (T*) nullptr); - auto stop_hqrrp = high_resolution_clock::now(); + auto stop_hqrrp = steady_clock::now(); dur_hqrrp = duration_cast(stop_hqrrp - start_hqrrp).count(); printf("TOTAL TIME FOR HQRRP %ld\n", dur_hqrrp); @@ -147,9 +147,9 @@ static void call_all_algs( data_regen(m_info, all_data, state_gen, 1); // Testing HQRRP with GEQRF - auto start_hqrrp_geqrf = 
high_resolution_clock::now(); + auto start_hqrrp_geqrf = steady_clock::now(); RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 1, state_alg, (T*) nullptr); - auto stop_hqrrp_geqrf = high_resolution_clock::now(); + auto stop_hqrrp_geqrf = steady_clock::now(); dur_hqrrp_geqrf = duration_cast(stop_hqrrp_geqrf - start_hqrrp_geqrf).count(); printf("TOTAL TIME FOR HQRRP WITH GEQRF %ld\n", dur_hqrrp_geqrf); @@ -160,9 +160,9 @@ static void call_all_algs( data_regen(m_info, all_data, state_gen, 1); // Testing HQRRP with CholQR - auto start_hqrrp_cholqr = high_resolution_clock::now(); + auto start_hqrrp_cholqr = steady_clock::now(); RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 2, state_alg, (T*) nullptr); - auto stop_hqrrp_cholqr = high_resolution_clock::now(); + auto stop_hqrrp_cholqr = steady_clock::now(); dur_hqrrp_cholqr = duration_cast(stop_hqrrp_cholqr - start_hqrrp_cholqr).count(); printf("TOTAL TIME FOR HQRRP WITH CHOLQRQ %ld\n", dur_hqrrp_cholqr); @@ -174,9 +174,9 @@ static void call_all_algs( if ((i <= 2) && (b_sz == 256)) { // Testing GEQP3 - auto start_geqp3 = high_resolution_clock::now(); + auto start_geqp3 = steady_clock::now(); lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); - auto stop_geqp3 = high_resolution_clock::now(); + auto stop_geqp3 = steady_clock::now(); dur_geqp3 = duration_cast(stop_geqp3 - start_geqp3).count(); printf("TOTAL TIME FOR GEQP3 %ld\n", dur_geqp3); diff --git a/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc b/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc index d9ad7cf5..1657d55c 100644 --- a/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc +++ b/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc @@ -153,14 +153,14 @@ static void call_wide_qrcp( for (i = 0; i < numruns; ++i) { printf("Wide QRCP iteration %d; m==%d start.\n", i, n); // 
Testing GEQP3 - auto start_geqp3 = high_resolution_clock::now(); + auto start_geqp3 = steady_clock::now(); lapack::geqp3(n, m, all_data.A.data(), n, all_data.J.data(), all_data.tau.data()); - auto stop_geqp3 = high_resolution_clock::now(); + auto stop_geqp3 = steady_clock::now(); dur_geqp3 = duration_cast(stop_geqp3 - start_geqp3).count(); data_regen(m_info, all_data, state, state, 1); // Testing LUQR - auto start_luqr = high_resolution_clock::now(); + auto start_luqr = steady_clock::now(); // Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk. // Get a transpose of A_sk RandLAPACK::util::transposition(n, m, all_data.A.data(), n, all_data.A_trans.data(), m, 0); @@ -177,7 +177,7 @@ static void call_wide_qrcp( RandLAPACK::util::col_swap(n, m, m, all_data.A.data(), n, all_data.J); // Perform an unpivoted QR on A_sk lapack::geqrf(n, m, all_data.A.data(), n, all_data.tau.data()); - auto stop_luqr = high_resolution_clock::now(); + auto stop_luqr = steady_clock::now(); dur_luqr = duration_cast(stop_luqr - start_luqr).count(); data_regen(m_info, all_data, state, state, 1); @@ -229,45 +229,45 @@ static void call_tsqr( for(nb = geqrt_nb_start; nb <= n; nb *=2) { printf("TSQR iteration %d; n==%ld start.\n", i, n); - auto start_geqrt = high_resolution_clock::now(); + auto start_geqrt = steady_clock::now(); lapack::geqrt( m, n, nb, all_data.A.data(), m, all_data.T_mat.data(), n ); - auto stop_geqrt = high_resolution_clock::now(); + auto stop_geqrt = steady_clock::now(); dur_geqrt = duration_cast(stop_geqrt - start_geqrt).count(); if(nb == geqrt_nb_start) { // Testing GEQRF - auto start_geqrf = high_resolution_clock::now(); + auto start_geqrf = steady_clock::now(); lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); - auto stop_geqrf = high_resolution_clock::now(); + auto stop_geqrf = steady_clock::now(); dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); data_regen(m_info, all_data, state, state, 2); // Testing GEQR - auto 
start_geqr = high_resolution_clock::now(); + auto start_geqr = steady_clock::now(); lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), -1); tsize = (int64_t) all_data.tau[0]; all_data.tau.resize(tsize); lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize); - auto stop_geqr = high_resolution_clock::now(); + auto stop_geqr = steady_clock::now(); dur_geqr = duration_cast(stop_geqr - start_geqr).count(); data_regen(m_info, all_data, state, state, 2); // Testing CholQR - auto start_precond = high_resolution_clock::now(); + auto start_precond = steady_clock::now(); blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, (T) 1.0, A_sk, n, all_data.A.data(), m); - auto stop_precond = high_resolution_clock::now(); + auto stop_precond = steady_clock::now(); dur_cholqr_precond = duration_cast(stop_precond - start_precond).count(); - auto start_cholqr = high_resolution_clock::now(); + auto start_cholqr = steady_clock::now(); blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, (T) 1.0, all_data.A.data(), m, (T) 0.0, all_data.R.data(), n); lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, (T) 1.0, all_data.R.data(), n, all_data.A.data(), m); - auto stop_cholqr = high_resolution_clock::now(); + auto stop_cholqr = steady_clock::now(); dur_cholqr = duration_cast(stop_cholqr - start_cholqr).count(); - auto start_orhr_col = high_resolution_clock::now(); + auto start_orhr_col = steady_clock::now(); lapack::orhr_col(m, n, n, all_data.A.data(), m, all_data.T_mat.data(), n, all_data.D.data()); - auto stop_cholqr_orhr = high_resolution_clock::now(); + auto stop_cholqr_orhr = steady_clock::now(); dur_cholqr_house_rest = duration_cast(stop_cholqr_orhr - start_orhr_col).count(); - auto start_r_restore = high_resolution_clock::now(); + auto start_r_restore = steady_clock::now(); // Construct the proper R-factor for(int i = 0; i < n; 
++i) { for(int j = 0; j < (i + 1); ++j) { @@ -276,7 +276,7 @@ static void call_tsqr( } blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, n, n, (T) 1.0, A_sk, n, all_data.R.data(), n); lapack::lacpy(MatrixType::Upper, n, n, all_data.R.data(), n, all_data.A.data(), m); - auto stop_r_restore = high_resolution_clock::now(); + auto stop_r_restore = steady_clock::now(); dur_cholqr_r_restore = duration_cast(stop_r_restore - start_r_restore).count(); data_regen(m_info, all_data, state, state, 2); @@ -328,9 +328,9 @@ static void call_apply_q( lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_gemqrt.data(), m); lapack::orhr_col(m, n, nb, all_data.A_gemqrt.data(), m, all_data.T_gemqrt.data(), n, all_data.D.data()); - auto start_gemqrt = high_resolution_clock::now(); + auto start_gemqrt = steady_clock::now(); lapack::gemqrt(Side::Left, Op::Trans, m, m - n, n, nb, all_data.A_gemqrt.data(), m, all_data.T_gemqrt.data(), n, all_data.B1.data(), m); - auto stop_gemqrt = high_resolution_clock::now(); + auto stop_gemqrt = steady_clock::now(); dur_gemqrt = duration_cast(stop_gemqrt - start_gemqrt).count(); // We do not re-run ormqr and gemm for different nbs @@ -341,9 +341,9 @@ static void call_apply_q( for(j = 0; j < n; ++j) all_data.tau[j] = all_data.T_mat[(n + 1) * j]; - auto start_ormqr = high_resolution_clock::now(); + auto start_ormqr = steady_clock::now(); lapack::ormqr(Side::Left, Op::Trans, m, m - n, n, all_data.A.data(), m, all_data.tau.data(), all_data.B.data(), m); - auto stop_ormqr = high_resolution_clock::now(); + auto stop_ormqr = steady_clock::now(); dur_ormqr = duration_cast(stop_ormqr - start_ormqr).count(); file << dur_ormqr << ", "; diff --git a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc index 9440eba0..b19bb958 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc @@ -85,27 +85,27 @@ 
static void call_all_algs( for (int i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); // Testing GEQP3 - auto start_geqp3 = high_resolution_clock::now(); + auto start_geqp3 = steady_clock::now(); lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); - auto stop_geqp3 = high_resolution_clock::now(); + auto stop_geqp3 = steady_clock::now(); dur_geqp3 = duration_cast(stop_geqp3 - start_geqp3).count(); state_gen = state; data_regen(m_info, all_data, state_gen); // Testing GEQRF - auto start_geqrf = high_resolution_clock::now(); + auto start_geqrf = steady_clock::now(); lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); - auto stop_geqrf = high_resolution_clock::now(); + auto stop_geqrf = steady_clock::now(); dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); state_gen = state; data_regen(m_info, all_data, state_gen); // Testing CQRRPT - auto start_cqrrp = high_resolution_clock::now(); + auto start_cqrrp = steady_clock::now(); CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg); - auto stop_cqrrp = high_resolution_clock::now(); + auto stop_cqrrp = steady_clock::now(); dur_cqrrpt = duration_cast(stop_cqrrp - start_cqrrp).count(); state_gen = state; @@ -113,7 +113,7 @@ static void call_all_algs( data_regen(m_info, all_data, state_gen); // Testing SCHOLQR3 - auto start_scholqr = high_resolution_clock::now(); + auto start_scholqr = steady_clock::now(); //--------------------------------------------------------------------------------------------------------------------------// T norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); T shift = 11 * std::numeric_limits::epsilon() * n * std::pow(norm_A, 2); @@ -131,7 +131,7 @@ static void call_all_algs( lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); 
//--------------------------------------------------------------------------------------------------------------------------// - auto stop_scholqr = high_resolution_clock::now(); + auto stop_scholqr = steady_clock::now(); dur_scholqr = duration_cast(stop_scholqr - start_scholqr).count(); auto state_gen = state; @@ -139,21 +139,21 @@ static void call_all_algs( // Testing GEQR + GEQPT #if !defined(__APPLE__) - auto start_geqpt = high_resolution_clock::now(); - auto start_geqr = high_resolution_clock::now(); + auto start_geqpt = steady_clock::now(); + auto start_geqr = steady_clock::now(); // GEQR(A) part lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), -1); int64_t tsize = (int64_t) all_data.tau[0]; all_data.tau.resize(tsize); lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize); - auto stop_geqr = high_resolution_clock::now(); + auto stop_geqr = steady_clock::now(); dur_geqr = duration_cast(stop_geqr - start_geqr).count(); // GEQP3(R) part lapack::lacpy(MatrixType::Upper, n, n, all_data.A.data(), m, all_data.R.data(), n); lapack::geqp3(n, n, all_data.R.data(), n, all_data.J.data(), all_data.tau.data()); - auto stop_geqpt = high_resolution_clock::now(); + auto stop_geqpt = steady_clock::now(); dur_geqpt = duration_cast(stop_geqpt - start_geqpt).count(); state_gen = state; data_regen(m_info, all_data, state_gen); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index fdcd1f41..2a015012 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -244,9 +244,9 @@ static void call_all_algs( // There is no reason to run SVD many times, as it always outputs the same result. 
if ((b_sz == 16) && (num_matmuls == 2) && ((i == 0) || (i == 1))) { // Running SVD - auto start_svd = high_resolution_clock::now(); + auto start_svd = steady_clock::now(); lapack::gesdd(Job::SomeVec, m, n, all_data.A, m, all_data.Sigma, all_data.U, m, all_data.VT, n); - auto stop_svd = high_resolution_clock::now(); + auto stop_svd = steady_clock::now(); dur_svd = duration_cast(stop_svd - start_svd).count(); printf("TOTAL TIME FOR SVD %ld\n", dur_svd); @@ -266,9 +266,9 @@ static void call_all_algs( } // Running RBKI - auto start_rbki = high_resolution_clock::now(); + auto start_rbki = steady_clock::now(); all_algs.RBKI.call(m, n, all_data.A, m, b_sz, all_data.U, all_data.VT, all_data.Sigma, state_alg); - auto stop_rbki = high_resolution_clock::now(); + auto stop_rbki = steady_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); printf("TOTAL TIME FOR RBKI %ld\n", dur_rbki); @@ -283,10 +283,10 @@ static void call_all_algs( data_regen(m_info, all_data, state_gen, 1); // Running RSVD - auto start_rsvd = high_resolution_clock::now(); + auto start_rsvd = steady_clock::now(); int64_t threshold_RSVD = (int64_t ) (b_sz * num_matmuls / 2); all_algs.RSVD.call(m, n, all_data.A, threshold_RSVD, tol, all_data.U_RSVD, all_data.Sigma_RSVD, all_data.V_RSVD, state_alg); - auto stop_rsvd = high_resolution_clock::now(); + auto stop_rsvd = steady_clock::now(); dur_rsvd = duration_cast(stop_rsvd - start_rsvd).count(); printf("TOTAL TIME FOR RSVD %ld\n", dur_rsvd); @@ -309,10 +309,10 @@ static void call_all_algs( // There is no reason to run SVDS many times, as it always outputs the same result. 
if ((num_matmuls == 2) && ((i == 0) || (i == 1))) { // Running SVDS - auto start_svds = high_resolution_clock::now(); + auto start_svds = steady_clock::now(); Spectra::PartialSVDSolver svds(all_data.A_spectra, std::min(custom_rank, n-2), std::min(2 * custom_rank, n-1)); svds.compute(); - auto stop_svds = high_resolution_clock::now(); + auto stop_svds = steady_clock::now(); dur_svds = duration_cast(stop_svds - start_svds).count(); printf("TOTAL TIME FOR SVDS %ld\n", dur_svds); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc index 23955b3a..79911dc7 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc @@ -138,9 +138,9 @@ static void call_all_algs( printf("Iteration %d start.\n", i); // Testing RBKI - auto start_rbki = high_resolution_clock::now(); + auto start_rbki = steady_clock::now(); RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.VT.data(), all_data.Sigma.data(), state_alg); - auto stop_rbki = high_resolution_clock::now(); + auto stop_rbki = steady_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); T residual_err_custom = residual_error_comp(all_data, custom_rank); diff --git a/benchmark/bench_general/GEMM_flop_count.cc b/benchmark/bench_general/GEMM_flop_count.cc index 5c24217b..856d3544 100644 --- a/benchmark/bench_general/GEMM_flop_count.cc +++ b/benchmark/bench_general/GEMM_flop_count.cc @@ -33,9 +33,9 @@ test_flops(int64_t k, RandLAPACK::gen::mat_gen(m_info, B, state); // Get the timing - auto start = high_resolution_clock::now(); + auto start = steady_clock::now(); gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, k, k, k, 1.0, A, k, B, k, 0.0, C, k); - auto stop = high_resolution_clock::now(); + auto stop = steady_clock::now(); long dur = duration_cast(stop - start).count(); T dur_s = dur / 1e+6; diff --git a/benchmark/bench_general/Gemm_vs_ormqr.cc 
b/benchmark/bench_general/Gemm_vs_ormqr.cc index 7b5f922a..252bf73f 100644 --- a/benchmark/bench_general/Gemm_vs_ormqr.cc +++ b/benchmark/bench_general/Gemm_vs_ormqr.cc @@ -46,15 +46,15 @@ test_speed(int64_t m, // Get the implicit Q-factor in A_dat lapack::geqrf(m, n, A_dat, m, tau_dat); - auto start_ormqr = high_resolution_clock::now(); + auto start_ormqr = steady_clock::now(); lapack::ormqr(Side::Left, Op::Trans, m, n, n, A_dat, m, tau_dat, B1_dat, m); - auto stop_ormqr = high_resolution_clock::now(); + auto stop_ormqr = steady_clock::now(); long dur_ormqr = duration_cast(stop_ormqr - start_ormqr).count(); - auto start_gemm = high_resolution_clock::now(); + auto start_gemm = steady_clock::now(); lapack::ungqr(m, n, n, A_dat, m, tau_dat); gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, n, m, 1.0, A_dat, m, B2_dat, m, 0.0, Product_dat, n); - auto stop_gemm = high_resolution_clock::now(); + auto stop_gemm = steady_clock::now(); long dur_gemm = duration_cast(stop_gemm - start_gemm).count(); T gflop_count_gemm = (2 * std::pow(n, 2) * m) / std::pow(10, 9); diff --git a/benchmark/bench_general/basic_blas_speed.cc b/benchmark/bench_general/basic_blas_speed.cc index 0b1fe207..171dab5b 100644 --- a/benchmark/bench_general/basic_blas_speed.cc +++ b/benchmark/bench_general/basic_blas_speed.cc @@ -84,27 +84,27 @@ static void call_all_algs( for (int i = 0; i < numruns; ++i) { printf("ITERATION %d, DIM %ld\n", i, n); // Testing BLAS3 - auto start_blas3 = high_resolution_clock::now(); + auto start_blas3 = steady_clock::now(); blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, n, n, n, 1.0, all_data.A.data(), n, all_data.B.data(), n, 0.0, all_data.C.data(), n); - auto stop_blas3 = high_resolution_clock::now(); + auto stop_blas3 = steady_clock::now(); dur_blas3 = duration_cast(stop_blas3 - start_blas3).count(); state_gen = state; data_regen(m_info, all_data, state_gen, 3); // Testing BLAS2 - auto start_blas2 = high_resolution_clock::now(); + auto start_blas2 = 
steady_clock::now(); blas::gemv(Layout::ColMajor, Op::NoTrans, n, n, 1.0, all_data.A.data(), n, all_data.a.data(), 1, 1.0, all_data.b.data(), 1); - auto stop_blas2 = high_resolution_clock::now(); + auto stop_blas2 = steady_clock::now(); dur_blas2 = duration_cast(stop_blas2 - start_blas2).count(); state_gen = state; data_regen(m_info, all_data, state_gen, 2); // Testing BLAS1 - auto start_blas1 = high_resolution_clock::now(); + auto start_blas1 = steady_clock::now(); blas::axpy(n, -1.0, all_data.a.data(), 1, all_data.b.data(), 1); - auto stop_blas1 = high_resolution_clock::now(); + auto stop_blas1 = steady_clock::now(); dur_blas1 = duration_cast(stop_blas1 - start_blas1).count(); state_gen = state; diff --git a/test/misc/test_util.cc b/test/misc/test_util.cc index 9474ce03..27536ae6 100644 --- a/test/misc/test_util.cc +++ b/test/misc/test_util.cc @@ -230,14 +230,14 @@ class TestUtil : public ::testing::Test tau[i] = T_mat[(n + 1) * i]; - auto start_std = high_resolution_clock::now(); + auto start_std = steady_clock::now(); lapack::ormqr(Side::Left, Op::Trans, m, n, n, A, m, tau, B, m); - auto stop_std = high_resolution_clock::now(); + auto stop_std = steady_clock::now(); long dur_std = duration_cast(stop_std - start_std).count(); - auto start_own = high_resolution_clock::now(); + auto start_own = steady_clock::now(); lapack::ormqr(Side::Left, Op::Trans, m, n, n, A1, m, tau1, B1, m); - auto stop_own = high_resolution_clock::now(); + auto stop_own = steady_clock::now(); long dur_own = duration_cast(stop_own - start_own).count(); printf("Own is %fx faster than std.\n", (T) dur_std / dur_own);