diff --git a/benchmark/bench_BQRRP/BQRRP_pivot_quality.cc b/benchmark/bench_BQRRP/BQRRP_pivot_quality.cc
index fbbc46fe..9114d69f 100644
--- a/benchmark/bench_BQRRP/BQRRP_pivot_quality.cc
+++ b/benchmark/bench_BQRRP/BQRRP_pivot_quality.cc
@@ -191,18 +191,17 @@ static void sv_ratio(
 }
 
 int main(int argc, char *argv[]) {
-
-    if(argc <= 1) {
-        printf("No input provided\n");
-        return 0;
+    if (argc != 4) {
+        // Expected input into this benchmark: m, n, block size (argv[1], argv[2], argv[3]).
+        std::cerr << "Usage: " << argv[0] << " ..." << std::endl;
+        return 1;
     }
-    auto size = argv[1];
 
     // Declare parameters
-    int64_t m = std::stol(size);
-    int64_t n = std::stol(size);
+    int64_t m = std::stol(argv[1]);
+    int64_t n = std::stol(argv[2]);
     double d_factor = 1.0;
-    int64_t b_sz = 4096;
+    int64_t b_sz = std::stol(argv[3]);
     auto state = RandBLAS::RNGState();
     auto state_constant1 = state;
     auto state_constant2 = state;
diff --git a/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc b/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc
index 0281fdc0..acb2a01f 100644
--- a/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc
+++ b/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc
@@ -110,25 +110,34 @@ static void call_all_algs(
 
 int main(int argc, char *argv[]) {
 
-    if(argc <= 1) {
-        printf("No input provided\n");
-        return 0;
+    if (argc < 6) {
+        // Expected input into this benchmark: tall QR subroutine name, num runs, m, n, then one or more block sizes.
+        std::cerr << "Usage: " << argv[0] << " ..." << std::endl;
+        return 1;
     }
 
-    auto size = argv[1];
     // Declare parameters
-    int64_t m = std::stol(size);
-    int64_t n = std::stol(size);
+    int64_t m = std::stol(argv[3]);
+    int64_t n = std::stol(argv[4]);
     double d_factor = 1.0;
-    std::vector b_sz = {250, 500, 1000, 2000, 4000, 8000};
+    // Fill the block size vector from every argument after n
+    std::vector<int64_t> b_sz;
+    for (int arg = 5; arg < argc; ++arg)
+        b_sz.push_back(std::stol(argv[arg]));
+    // Save elements in string for logging purposes
+    std::ostringstream oss;
+    for (const auto &val : b_sz)
+        oss << val << ", ";
+    std::string b_sz_string = oss.str();
+    //std::vector b_sz = {256, 512, 1024, 2048, 4096, 8192};
     auto state = RandBLAS::RNGState();
     auto state_constant = state;
 
     // Timing results
     std::vector res;
     // Number of algorithm runs. We only record best times.
-    int64_t numruns = 3;
-    std::string qr_tall = argv[2];
+    int64_t numruns = std::stol(argv[2]);
+    std::string qr_tall = argv[1];
 
     // Allocate basic workspace
     QR_speed_benchmark_data all_data(m, n, d_factor);
@@ -150,7 +159,7 @@ int main(int argc, char *argv[]) {
         "\nNum OMP threads:" + std::to_string(RandLAPACK::util::get_omp_threads()) +
         "\nInput type:" + std::to_string(m_info.m_type) +
         "\nInput size:" + std::to_string(m) + " by " + std::to_string(n) +
-        "\nAdditional parameters: Tall QR subroutine " + argv[2] + " BQRRP block size start: " + std::to_string(b_sz.front()) + " BQRRP block size end: " + std::to_string(b_sz.back()) + " num runs per size " + std::to_string(numruns) + " BQRRP d factor: " + std::to_string(d_factor) +
+        "\nAdditional parameters: Tall QR subroutine " + qr_tall + " BQRRP block sizes: " + b_sz_string + "num runs per size " + std::to_string(numruns) + " BQRRP d factor: " + std::to_string(d_factor) +
         "\n";
     file.flush();
 
diff --git a/benchmark/bench_BQRRP/BQRRP_speed_comparisons_block_size.cc b/benchmark/bench_BQRRP/BQRRP_speed_comparisons_block_size.cc
index 3271c34e..75ce14a1 100644
--- a/benchmark/bench_BQRRP/BQRRP_speed_comparisons_block_size.cc
+++ b/benchmark/bench_BQRRP/BQRRP_speed_comparisons_block_size.cc
@@ -187,25 +187,31 @@ static void call_all_algs(
 
 int main(int argc, char *argv[]) {
 
-    if(argc <= 1) {
-        printf("No input provided\n");
-        return 0;
+    if (argc < 5) {
+        // Expected input into this benchmark: num runs, m, n, then one or more block sizes.
+        std::cerr << "Usage: " << argv[0] << " ..." << std::endl;
+        return 1;
     }
 
-    auto size = argv[1];
-    // Declare parameters
-    int64_t m = std::stol(size);
-    int64_t n = std::stol(size);
+    int64_t m = std::stol(argv[2]);
+    int64_t n = std::stol(argv[3]);
     double d_factor = 1.0;
-    std::vector b_sz = {250, 500, 1000, 2000, 4000, 8000};
-    //std::vector b_sz = {256, 512, 1024, 2048, 4096, 8192};
+    std::vector<int64_t> b_sz;
+    for (int arg = 4; arg < argc; ++arg)
+        b_sz.push_back(std::stol(argv[arg]));
+    // Save elements in string for logging purposes
+    std::ostringstream oss;
+    for (const auto &val : b_sz)
+        oss << val << ", ";
+    std::string b_sz_string = oss.str();
+
     auto state = RandBLAS::RNGState();
     auto state_constant = state;
 
     // Timing results
     std::vector res;
     // Number of algorithm runs. We only record best times.
-    int64_t numruns = 3;
+    int64_t numruns = std::stol(argv[1]);
 
     // Allocate basic workspace
     QR_speed_benchmark_data all_data(m, n, d_factor);
@@ -227,7 +233,7 @@ int main(int argc, char *argv[]) {
         "\nNum OMP threads:" + std::to_string(RandLAPACK::util::get_omp_threads()) +
         "\nInput type:" + std::to_string(m_info.m_type) +
         "\nInput size:" + std::to_string(m) + " by " + std::to_string(n) +
-        "\nAdditional parameters: BQRRP block size start: " + std::to_string(b_sz.front()) + " BQRRP block size end: " + std::to_string(b_sz.back()) + " num runs per size " + std::to_string(numruns) + " BQRRP d factor: " + std::to_string(d_factor) +
+        "\nAdditional parameters: BQRRP block sizes: " + b_sz_string + "num runs per size " + std::to_string(numruns) + " BQRRP d factor: " + std::to_string(d_factor) +
         "\n";
     file.flush();
 
diff --git a/benchmark/bench_BQRRP/BQRRP_speed_comparisons_mat_size.cc b/benchmark/bench_BQRRP/BQRRP_speed_comparisons_mat_size.cc
index 5f890c20..36ebbe55 100644
--- a/benchmark/bench_BQRRP/BQRRP_speed_comparisons_mat_size.cc
+++ b/benchmark/bench_BQRRP/BQRRP_speed_comparisons_mat_size.cc
@@ -190,29 +190,37 @@ static void call_all_algs(
 
 int main(int argc, char *argv[]) {
 
-    if(argc <= 1) {
-        printf("No input provided\n");
-        return 0;
+    if (argc < 4) {
+        // Expected input into this benchmark: num runs, block size, then one or more matrix sizes (max_element below needs a non-empty list).
+        std::cerr << "Usage: " << argv[0] << " ..." << std::endl;
+        return 1;
     }
 
-    auto block_size = argv[1];
-    // Declare parameters
-    int64_t m_start = std::pow(2, 10);
-    int64_t m_end = std::pow(2, 16);
+    // Fill the matrix size vector from every argument after the block size
+    std::vector<int64_t> m_sz;
+    for (int arg = 3; arg < argc; ++arg)
+        m_sz.push_back(std::stol(argv[arg]));
+    // Save elements in string for logging purposes
+    std::ostringstream oss;
+    for (const auto &val : m_sz)
+        oss << val << ", ";
+    std::string m_sz_string = oss.str();
+
     double d_factor = 1.0;
-    int64_t b_sz = std::stol(block_size);
+    int64_t b_sz = std::stol(argv[2]);
     auto state = RandBLAS::RNGState();
     auto state_constant = state;
     // Timing results
     std::vector res;
     // Number of algorithm runs. We only record best times.
-    int64_t numruns = 3;
+    int64_t numruns = std::stol(argv[1]);
 
     // Allocate basic workspace
-    QR_speed_benchmark_data all_data(m_end, m_end, b_sz, d_factor);
+    int64_t m_max = *std::max_element(m_sz.begin(), m_sz.end());
+    QR_speed_benchmark_data all_data(m_max, m_max, b_sz, d_factor);
 
     // Generate the input matrix - gaussian suffices for performance tests.
-    RandLAPACK::gen::mat_gen_info m_info(m_end, m_end, RandLAPACK::gen::gaussian);
+    RandLAPACK::gen::mat_gen_info m_info(m_max, m_max, RandLAPACK::gen::gaussian);
     RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state);
 
     // Declare a data file
@@ -228,13 +236,14 @@ int main(int argc, char *argv[]) {
         " rows correspond to BQRRP runs with mat sizes varying with powers of 2, with numruns repititions of each mat size."
         "\nNum OMP threads:" + std::to_string(RandLAPACK::util::get_omp_threads()) +
         "\nInput type:" + std::to_string(m_info.m_type) +
-        "\nInput size:" + " dim start: " + std::to_string(m_start) + " dim stop: " + std::to_string(m_end) +
+        "\nInput size:" + " dim start: " + m_sz_string +
         "\nAdditional parameters: BQRRP block size: " + std::to_string(b_sz) + " num runs per size " + std::to_string(numruns) + " BQRRP d factor: " + std::to_string(d_factor) +
         "\n";
     file.flush();
 
-    for (;m_start <= m_end; m_start *= 2) {
-        call_all_algs(m_info, numruns, m_start, all_data, state_constant, output_filename);
+    size_t i = 0;
+    for (;i < m_sz.size(); ++i) {
+        call_all_algs(m_info, numruns, m_sz[i], all_data, state_constant, output_filename);
     }
 }
 #endif
\ No newline at end of file
diff --git a/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc b/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc
index bafaa350..9af1f42c 100644
--- a/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc
+++ b/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc
@@ -352,14 +352,29 @@ static void call_apply_q(
 
 int main(int argc, char *argv[]) {
 
-    auto size = argv[1];
+    if (argc < 4) {
+        // Expected input into this benchmark: num runs, m, then one or more n sizes (max_element below needs a non-empty list).
+        std::cerr << "Usage: " << argv[0] << " ..." << std::endl;
+        return 1;
+    }
 
-    int64_t i = 0;
+    size_t i = 0;
     // Declare parameters
-    int64_t m = std::stol(size);
-    int64_t n_start = 256;
-    int64_t n_stop = 2048;
-    int64_t nb_start = 256;
+    int64_t m = std::stol(argv[2]);
+    int64_t n_start = 256;
+    int64_t n_stop = 2048;
+    // Fill the n size vector from every argument after m
+    std::vector<int64_t> n_sz;
+    for (int arg = 3; arg < argc; ++arg)
+        n_sz.push_back(std::stol(argv[arg]));
+    // Save elements in string for logging purposes
+    std::ostringstream oss;
+    for (const auto &val : n_sz)
+        oss << val << ", ";
+    std::string n_sz_string = oss.str();
+
+    // Internal block size for ORMQR, will increase by 2 at every iteration
+    int64_t nb_start = n_start;
     auto state = RandBLAS::RNGState();
     auto state_B = RandBLAS::RNGState();
     auto state_constant = state;
@@ -367,12 +382,13 @@ int main(int argc, char *argv[]) {
     // Timing results
     std::vector res;
     // Number of algorithm runs. We only record best times.
-    int64_t numruns = 3;
+    int64_t numruns = std::stol(argv[1]);
 
     // Allocate basic workspace
-    benchmark_data all_data(m, n_stop);
+    int64_t n_max = *std::max_element(n_sz.begin(), n_sz.end());
+    benchmark_data all_data(m, n_max);
     // Generate the input matrix - gaussian suffices for performance tests.
-    RandLAPACK::gen::mat_gen_info m_info(m, n_stop, RandLAPACK::gen::gaussian);
+    RandLAPACK::gen::mat_gen_info m_info(m, n_max, RandLAPACK::gen::gaussian);
     RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state);
     RandLAPACK::gen::mat_gen(m_info, all_data.B.data(), state_B);
 
@@ -391,18 +407,18 @@ int main(int argc, char *argv[]) {
         " \n In all cases, rows vary from n_start to n_stop in powers of two (with numruns runs per size)."
         "\nNum OMP threads:" + std::to_string(RandLAPACK::util::get_omp_threads()) +
         "\nInput type:" + std::to_string(m_info.m_type) +
-        "\nInput size:" + std::to_string(m) + " by " + std::to_string(n_start) + " to " + std::to_string(n_stop) +
+        "\nInput size:" + std::to_string(m) + " by " + n_sz_string +
         "\nAdditional parameters num runs per size " + std::to_string(numruns) + " nb_start " + std::to_string(nb_start) +
         "\n";
     file.flush();
 
-    for (i = n_start; i <= n_stop; i *= 2)
-        call_wide_qrcp(m_info, numruns, i, all_data, state, output_filename);
+    for (i = 0; i < n_sz.size(); ++i) // i is shared by all three sweeps, so each one restarts it at 0
+        call_wide_qrcp(m_info, numruns, n_sz[i], all_data, state, output_filename);
 
-    for (i = n_start; i <= n_stop; i *= 2)
-        call_tsqr(m_info, numruns, i, nb_start, all_data, state, output_filename);
+    for (i = 0; i < n_sz.size(); ++i) // without the reset this sweep would be skipped entirely
+        call_tsqr(m_info, numruns, n_sz[i], nb_start, all_data, state, output_filename);
 
-    for (i = n_start; i <= n_stop; i *= 2)
-        call_apply_q(m_info, numruns, i, nb_start, all_data, state, state_B, output_filename);
+    for (i = 0; i < n_sz.size(); ++i) // same reset as above
+        call_apply_q(m_info, numruns, n_sz[i], nb_start, all_data, state, state_B, output_filename);
 }
 #endif
\ No newline at end of file
diff --git a/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc b/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc
index fe430be8..3ed50749 100644
--- a/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc
+++ b/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc
@@ -96,24 +96,32 @@ static void call_all_algs(
 
 int main(int argc, char *argv[]) {
 
-    if(argc <= 1) {
-        printf("No input provided\n");
-        return 0;
+    if (argc < 5) {
+        // Expected input into this benchmark: num runs, m, n, then one or more block sizes.
+        std::cerr << "Usage: " << argv[0] << " ..." << std::endl;
+        return 1;
     }
 
-    auto size = argv[1];
     // Declare parameters
-    int64_t m = std::stol(size);
-    int64_t n = std::stol(size);
+    int64_t m = std::stol(argv[2]);
+    int64_t n = std::stol(argv[3]);
     double d_factor = 1.0;
-    int64_t b_sz_start = 32;
-    int64_t b_sz_end = 8192;
+    // Fill the block size vector from every argument after n
+    std::vector<int64_t> b_sz;
+    for (int arg = 4; arg < argc; ++arg)
+        b_sz.push_back(std::stol(argv[arg]));
+    // Save elements in string for logging purposes
+    std::ostringstream oss;
+    for (const auto &val : b_sz)
+        oss << val << ", ";
+    std::string b_sz_string = oss.str();
+
     auto state = RandBLAS::RNGState();
     auto state_constant = state;
     // Timing results
     std::vector res;
     // Number of algorithm runs.
-    int64_t numruns = 1;
+    int64_t numruns = std::stol(argv[1]);
 
     // Allocate basic workspace
     QR_speed_benchmark_data all_data(m, n, d_factor);
@@ -135,13 +143,13 @@ int main(int argc, char *argv[]) {
         "\nNum OMP threads:" + std::to_string(RandLAPACK::util::get_omp_threads()) +
         "\nInput type:" + std::to_string(m_info.m_type) +
         "\nInput size:" + std::to_string(m) + " by " + std::to_string(n) +
-        "\nAdditional parameters: HQRRP block size start: " + std::to_string(b_sz_start) + " HQRRP block size end: " + std::to_string(b_sz_end) + " num runs per size " + std::to_string(numruns) + " HQRRP d factor: " + std::to_string(d_factor) +
+        "\nAdditional parameters: HQRRP block sizes: " + b_sz_string + "num runs per size " + std::to_string(numruns) + " HQRRP d factor: " + std::to_string(d_factor) +
         "\n";
     file.flush();
 
-
-    for (;b_sz_start <= b_sz_end; b_sz_start *= 2) {
-        call_all_algs(m_info, numruns, b_sz_start, all_data, state_constant, output_filename);
+    size_t i = 0;
+    for (;i < b_sz.size(); ++i) {
+        call_all_algs(m_info, numruns, b_sz[i], all_data, state_constant, output_filename);
     }
 }
 #endif