diff --git a/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh b/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh index 5745c9a8..8d1108dc 100644 --- a/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh +++ b/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh @@ -1,8 +1,9 @@ +#include "hip/hip_runtime.h" #pragma once #include "rl_cuda_macros.hh" -#include -#include -#include +#include +#include +#include namespace RandLAPACK::cuda_kernels { @@ -487,7 +488,7 @@ __global__ void copy_mat_gpu( template void naive_rank_est( - cudaStream_t stream, + hipStream_t stream, int64_t n, const T alpha, const T* A, @@ -496,21 +497,21 @@ void naive_rank_est( ) { #ifdef USE_CUDA int64_t* rank_device; - cudaMalloc(&rank_device, sizeof(int64_t)); + hipMalloc(&rank_device, sizeof(int64_t)); constexpr int threadsPerBlock{128}; int64_t num_blocks_ger{(n + threadsPerBlock - 1) / threadsPerBlock}; naive_rank_est<<>>(n, alpha, A, lda, rank_device); // It seems that these synchs are required, after all - cudaStreamSynchronize(stream); - cudaMemcpy(&rank, rank_device, sizeof(int64_t), cudaMemcpyDeviceToHost); - cudaStreamSynchronize(stream); - cudaFree(rank_device); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipStreamSynchronize(stream); + hipMemcpy(&rank, rank_device, sizeof(int64_t), hipMemcpyDeviceToHost); + hipStreamSynchronize(stream); + hipFree(rank_device); + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch naive_rank_est. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch naive_rank_est. " << hipGetErrorString(ierr)) abort(); } #else @@ -520,7 +521,7 @@ void naive_rank_est( template void all_of( - cudaStream_t stream, + hipStream_t stream, int64_t n, const T alpha, const T* A, @@ -528,19 +529,19 @@ void all_of( ) { #ifdef USE_CUDA bool* all_greater_device; - cudaMalloc(&all_greater_device, sizeof(bool)); + hipMalloc(&all_greater_device, sizeof(bool)); constexpr int threadsPerBlock{128}; int64_t num_blocks_ger{(n + threadsPerBlock - 1) / threadsPerBlock}; all_of<<>>(n, alpha, A, all_greater_device); - cudaStreamSynchronize(stream); - cudaMemcpy(&all_greater, all_greater_device, sizeof(bool), cudaMemcpyDeviceToHost); - cudaStreamSynchronize(stream); - cudaFree(all_greater_device); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipStreamSynchronize(stream); + hipMemcpy(&all_greater, all_greater_device, sizeof(bool), hipMemcpyDeviceToHost); + hipStreamSynchronize(stream); + hipFree(all_greater_device); + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch all_greater. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch all_greater. " << hipGetErrorString(ierr)) abort(); } #else @@ -550,7 +551,7 @@ void all_of( template void ger_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t m, int64_t n, T alpha, @@ -567,10 +568,10 @@ void ger_gpu( int64_t num_blocks_ger{(m + threadsPerBlock - 1) / threadsPerBlock}; ger_gpu<<>>(m, n, alpha, x, incx, y, incy, A, lda); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch ger_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch ger_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -580,7 +581,7 @@ void ger_gpu( template void copy_mat_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t m, int64_t n, const T* A, @@ -594,10 +595,10 @@ void copy_mat_gpu( dim3 numBlocks((n + threadsPerBlock.x - 1) / threadsPerBlock.x, (m + threadsPerBlock.y - 1) / threadsPerBlock.y); copy_mat_gpu<<>>(m, n, A, lda, A_cpy, ldat, copy_upper_triangle); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch copy_mat_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch copy_mat_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -607,7 +608,7 @@ void copy_mat_gpu( template void col_swap_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t m, int64_t n, int64_t k, @@ -617,39 +618,39 @@ void col_swap_gpu( ) { #ifdef USE_CUDA int64_t* idx_copy; - cudaMallocAsync(&idx_copy, sizeof(int64_t) * n, stream); + hipMallocAsync(&idx_copy, sizeof(int64_t) * n, stream); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to allocate for col_swap_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to allocate for col_swap_gpu. " << hipGetErrorString(ierr)) abort(); } constexpr int numThreads = 128; dim3 dimBlock(numThreads, 1, 1); static int upper_bound = [&] { int numBlocksPerSm = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, col_swap_gpu_kernel, numThreads, 0); + hipDeviceProp_t deviceProp; + hipGetDeviceProperties(&deviceProp, 0); + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, col_swap_gpu_kernel, numThreads, 0); return deviceProp.multiProcessorCount * numBlocksPerSm; }(); int lower_bound = std::max(m, k); lower_bound = (lower_bound + numThreads - 1) / numThreads; dim3 dimGrid(std::min(upper_bound, lower_bound), 1, 1); void* kernelArgs[] = {(void*)&m, (void*)&n, (void*)&k, (void*)&A, (void*)&lda, (void*)&idx, (void*)&idx_copy}; - cudaLaunchCooperativeKernel((void*)col_swap_gpu_kernel_sequential, dimGrid, dimBlock, kernelArgs, 0, stream); - ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipLaunchCooperativeKernel((void*)col_swap_gpu_kernel_sequential, dimGrid, dimBlock, kernelArgs, 0, stream); + ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu sequential. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu sequential. " << hipGetErrorString(ierr)) abort(); } - cudaFreeAsync(idx_copy, stream); - ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipFreeAsync(idx_copy, stream); + ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to deallocate for col_swap_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to deallocate for col_swap_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -661,7 +662,7 @@ void col_swap_gpu( /// idx array modified ONLY within the scope of this function. template void col_swap_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t m, int64_t n, int64_t k, @@ -678,10 +679,10 @@ void col_swap_gpu( (m + threadsPerBlock.y - 1) / threadsPerBlock.y); col_swap_gpu_kernel<<>>(m, n, k, A, lda, A_cpy, ldac, idx); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu with parallel pivots. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu with parallel pivots. " << hipGetErrorString(ierr)) abort(); } #else @@ -693,7 +694,7 @@ void col_swap_gpu( /// idx array modified ONLY within the scope of this function. template void col_swap_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t m, int64_t k, int64_t* J, @@ -703,26 +704,26 @@ void col_swap_gpu( /* std::vector idx_copy(k); auto j = std::make_unique(m); - cudaMemcpyAsync(idx_copy.data(), idx, sizeof(int64_t) * k, cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(j.get(), J, sizeof(int64_t) * m, cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); + hipMemcpyAsync(idx_copy.data(), idx, sizeof(int64_t) * k, hipMemcpyDeviceToHost, stream); + hipMemcpyAsync(j.get(), J, sizeof(int64_t) * m, hipMemcpyDeviceToHost, stream); + hipStreamSynchronize(stream); RandLAPACK::util::col_swap(m, k, j.get(), std::move(idx_copy)); - cudaMemcpyAsync(J, j.get(), sizeof(int64_t) * m, cudaMemcpyHostToDevice, stream); + hipMemcpyAsync(J, j.get(), sizeof(int64_t) * m, hipMemcpyHostToDevice, stream); // must add this to avoid dangling reference during async copy - cudaStreamSynchronize(stream); + hipStreamSynchronize(stream); */ #ifdef USE_CUDA int64_t* idx_copy; - cudaMallocAsync(&idx_copy, sizeof(int64_t) * k, stream); + hipMallocAsync(&idx_copy, sizeof(int64_t) * k, stream); vec_ell_swap_gpu<<<1, 512, 0, stream>>>(m, k, J, idx, idx_copy); - cudaFreeAsync(idx_copy, stream); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipFreeAsync(idx_copy, stream); + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -732,7 +733,7 @@ void col_swap_gpu( template void col_swap_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t m, int64_t k, int64_t* J, @@ -744,10 +745,10 @@ void col_swap_gpu( int64_t num_blocks{(k + threadsPerBlock - 1) / threadsPerBlock}; vec_ell_swap_gpu<<>>(m, k, J, J_cpy, idx); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu vector. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu vector. " << hipGetErrorString(ierr)) abort(); } #else @@ -758,7 +759,7 @@ void col_swap_gpu( template void transposition_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t m, int64_t n, const T* A, @@ -772,10 +773,10 @@ void transposition_gpu( int64_t num_blocks{(m + threadsPerBlock - 1) / threadsPerBlock}; transposition_gpu<<>>(m, n, A, lda, AT, ldat, copy_upper_triangle); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch transposition_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch transposition_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -788,7 +789,7 @@ void transposition_gpu( // TODO: we get a multiple definition linker error if this function is not templated. template inline void LUQRCP_piv_process_gpu( - cudaStream_t stream, + hipStream_t stream, Idx sampling_dim, Idx cols, Idx* J_buffer, @@ -799,10 +800,10 @@ inline void LUQRCP_piv_process_gpu( Idx n{std::min(sampling_dim, cols)}; LUQRCP_piv_process_gpu_global<<<1, threadsPerBlock, 0, stream>>>(cols, n, J_buffer, J_buffer_lu); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch piv_process_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch piv_process_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -814,7 +815,7 @@ inline void LUQRCP_piv_process_gpu( // Outputs tau instead of T. template void orhr_col_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t m, int64_t n, T* A, @@ -840,10 +841,10 @@ void orhr_col_gpu( int64_t num_blocks{(n + threadsPerBlock - 1) / threadsPerBlock}; elementwise_product<<>>(n, T{-1}, A, lda + 1, D, 1, tau, 1); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch orhr_col_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch orhr_col_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -854,7 +855,7 @@ void orhr_col_gpu( // Setting signs in the R-factor from CholQR after orhr_col outputs. template void R_cholqr_signs_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t b_sz, int64_t b_sz_const, T* R, @@ -867,10 +868,10 @@ void R_cholqr_signs_gpu( R_cholqr_signs_gpu<<>>(b_sz, b_sz_const, R, D); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch R_cholqr_signs_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch R_cholqr_signs_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -880,7 +881,7 @@ void R_cholqr_signs_gpu( template void copy_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t n, T const* src, int64_t incr_src, @@ -892,10 +893,10 @@ void copy_gpu( int64_t num_blocks{(n + threadsPerBlock - 1) / threadsPerBlock}; copy_gpu<<>>(n, src, incr_src, dest, incr_dest); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch copy_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch copy_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -905,7 +906,7 @@ void copy_gpu( template void fill_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t n, T* x, int64_t incx, @@ -916,10 +917,10 @@ void fill_gpu( int64_t num_blocks{(n + threadsPerBlock - 1) / threadsPerBlock}; fill_gpu<<>>(n, x, incx, alpha); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch fill_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch fill_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -929,7 +930,7 @@ void fill_gpu( template void fill_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t n, int64_t* x, int64_t incx, @@ -940,10 +941,10 @@ void fill_gpu( int64_t num_blocks{(n + threadsPerBlock - 1) / threadsPerBlock}; fill_gpu<<>>(n, x, incx, alpha); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch fill_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch fill_gpu. " << hipGetErrorString(ierr)) abort(); } #else @@ -953,7 +954,7 @@ void fill_gpu( template void get_U_gpu( - cudaStream_t stream, + hipStream_t stream, int64_t m, int64_t n, T* A, @@ -964,10 +965,10 @@ void get_U_gpu( int64_t num_blocks{std::max((n + threadsPerBlock - 1) / threadsPerBlock, (int64_t) 1)}; get_U_gpu<<>>(m, n, A, lda); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Failed to launch get_U_gpu. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Failed to launch get_U_gpu. " << hipGetErrorString(ierr)) abort(); } #else diff --git a/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc b/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc index f8afb254..956d94cb 100644 --- a/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc +++ b/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc @@ -120,8 +120,8 @@ int main(int argc, char *argv[]) { int64_t m = std::stol(size); int64_t n = std::stol(size); double d_factor = 1.0; - std::vector b_sz = {250, 500, 1000, 2000, 4000, 8000}; - //std::vector b_sz = {256, 512, 1024, 2048, 4096, 8192}; + int64_t b_sz_start = 32; + int64_t b_sz_end = 128; auto state = RandBLAS::RNGState(); auto state_constant = state; // Timing results @@ -141,7 +141,7 @@ int main(int argc, char *argv[]) { + "_num_info_lines_" + std::to_string(6) + ".txt"; - std::ofstream file(output_filename, std::ios::out | std::ios::app); + std::ofstream file(output_filename, std::ios::out | std::ios::trunc); // Writing important data into file file << "Description: Results from the BQRRP runtime breakdown benchmark, recording the time it takes to perform every subroutine in BQRRP." @@ -149,13 +149,12 @@ int main(int argc, char *argv[]) { " rows correspond to BQRRP runs with block sizes varying in powers of 2, with numruns repititions of each block size" "\nInput type:" + std::to_string(m_info.m_type) + "\nInput size:" + std::to_string(m) + " by " + std::to_string(n) + - "\nAdditional parameters: Tall QR subroutine " + argv[2] + " BQRRP block size start: " + std::to_string(b_sz.front()) + " BQRRP block size end: " + std::to_string(b_sz.back()) + " num runs per size " + std::to_string(numruns) + " BQRRP d factor: " + std::to_string(d_factor) + + "\nAdditional parameters: Tall QR subroutine " + argv[2] + " BQRRP block size start: " + std::to_string(b_sz_start) + " BQRRP block size end: " + std::to_string(b_sz_end) + " num runs per size " + std::to_string(numruns) + " BQRRP d factor: " + std::to_string(d_factor) + "\n"; file.flush(); - int i = 0; - for (;i < b_sz.size(); ++i) { - call_all_algs(m_info, numruns, b_sz[i], qr_tall, all_data, state_constant, output_filename); + for (;b_sz_start <= b_sz_end; b_sz_start *= 2) { + call_all_algs(m_info, numruns, b_sz_start, qr_tall, all_data, state_constant, output_filename); } } #endif diff --git a/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc b/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc index 0a0919e8..1657d55c 100644 --- a/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc +++ b/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc @@ -136,26 +136,31 @@ static void call_wide_qrcp( std::string output_filename) { auto m = all_data.row; + auto tol = all_data.tolerance; + + RandLAPACK::CQRRPT CQRRPT(false, tol); + CQRRPT.nnz = 4; // timing vars long dur_geqp3 = 0; long dur_luqr = 0; + long dur_cqrrpt = 0; // Making sure the states are unchanged auto state_alg = state; int i, j = 0; for (i = 0; i < numruns; ++i) { - printf("Wide QRCP iteration %d; m==%ld start.\n", i, n); + printf("Wide QRCP iteration %d; m==%d start.\n", i, n); // Testing GEQP3 - auto start_geqp3 = high_resolution_clock::now(); + auto start_geqp3 = steady_clock::now(); lapack::geqp3(n, m, all_data.A.data(), n, all_data.J.data(), all_data.tau.data()); - auto stop_geqp3 = high_resolution_clock::now(); + auto stop_geqp3 = steady_clock::now(); dur_geqp3 = duration_cast(stop_geqp3 - start_geqp3).count(); data_regen(m_info, all_data, state, state, 1); // Testing LUQR - auto start_luqr = high_resolution_clock::now(); + auto start_luqr = steady_clock::now(); // Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk. // Get a transpose of A_sk RandLAPACK::util::transposition(n, m, all_data.A.data(), n, all_data.A_trans.data(), m, 0); @@ -172,7 +177,7 @@ static void call_wide_qrcp( RandLAPACK::util::col_swap(n, m, m, all_data.A.data(), n, all_data.J); // Perform an unpivoted QR on A_sk lapack::geqrf(n, m, all_data.A.data(), n, all_data.tau.data()); - auto stop_luqr = high_resolution_clock::now(); + auto stop_luqr = steady_clock::now(); dur_luqr = duration_cast(stop_luqr - start_luqr).count(); data_regen(m_info, all_data, state, state, 1); @@ -193,6 +198,7 @@ static void call_tsqr( std::string output_filename) { auto m = all_data.row; + auto tol = all_data.tolerance; int64_t tsize = 0; // timing vars @@ -205,11 +211,10 @@ static void call_tsqr( long dur_cholqr_r_restore = 0; // Imitating the QRCP on a sketch stage of BQRRP - needed to get a preconditioner - T* S = new T[n * m](); - T* A_sk = new T[n * n](); - int64_t* J = new int64_t[n](); - T* tau = new T[n](); - + T* S = ( T * ) calloc( n * m, sizeof( T ) ); + T* A_sk = ( T * ) calloc( n * n, sizeof( T ) ); + int64_t* J = ( int64_t * ) calloc( n, sizeof( int64_t ) ); + T* tau = ( T * ) calloc( n, sizeof( T ) ); RandBLAS::DenseDist D(n, m); auto state_const = state; RandBLAS::fill_dense(D, S, state_const); @@ -224,45 +229,45 @@ static void call_tsqr( for(nb = geqrt_nb_start; nb <= n; nb *=2) { printf("TSQR iteration %d; n==%ld start.\n", i, n); - auto start_geqrt = high_resolution_clock::now(); + auto start_geqrt = steady_clock::now(); lapack::geqrt( m, n, nb, all_data.A.data(), m, all_data.T_mat.data(), n ); - auto stop_geqrt = high_resolution_clock::now(); + auto stop_geqrt = steady_clock::now(); dur_geqrt = duration_cast(stop_geqrt - start_geqrt).count(); if(nb == geqrt_nb_start) { // Testing GEQRF - auto start_geqrf = high_resolution_clock::now(); + auto start_geqrf = steady_clock::now(); lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); - auto stop_geqrf = high_resolution_clock::now(); + auto stop_geqrf = steady_clock::now(); dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); data_regen(m_info, all_data, state, state, 2); // Testing GEQR - auto start_geqr = high_resolution_clock::now(); + auto start_geqr = steady_clock::now(); lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), -1); tsize = (int64_t) all_data.tau[0]; all_data.tau.resize(tsize); lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize); - auto stop_geqr = high_resolution_clock::now(); + auto stop_geqr = steady_clock::now(); dur_geqr = duration_cast(stop_geqr - start_geqr).count(); data_regen(m_info, all_data, state, state, 2); // Testing CholQR - auto start_precond = high_resolution_clock::now(); + auto start_precond = steady_clock::now(); blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, (T) 1.0, A_sk, n, all_data.A.data(), m); - auto stop_precond = high_resolution_clock::now(); + auto stop_precond = steady_clock::now(); dur_cholqr_precond = duration_cast(stop_precond - start_precond).count(); - auto start_cholqr = high_resolution_clock::now(); + auto start_cholqr = steady_clock::now(); blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, (T) 1.0, all_data.A.data(), m, (T) 0.0, all_data.R.data(), n); lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, (T) 1.0, all_data.R.data(), n, all_data.A.data(), m); - auto stop_cholqr = high_resolution_clock::now(); + auto stop_cholqr = steady_clock::now(); dur_cholqr = duration_cast(stop_cholqr - start_cholqr).count(); - auto start_orhr_col = high_resolution_clock::now(); + auto start_orhr_col = steady_clock::now(); lapack::orhr_col(m, n, n, all_data.A.data(), m, all_data.T_mat.data(), n, all_data.D.data()); - auto stop_cholqr_orhr = high_resolution_clock::now(); + auto stop_cholqr_orhr = steady_clock::now(); dur_cholqr_house_rest = duration_cast(stop_cholqr_orhr - start_orhr_col).count(); - auto start_r_restore = high_resolution_clock::now(); + auto start_r_restore = steady_clock::now(); // Construct the proper R-factor for(int i = 0; i < n; ++i) { for(int j = 0; j < (i + 1); ++j) { @@ -271,7 +276,7 @@ static void call_tsqr( } blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, n, n, (T) 1.0, A_sk, n, all_data.R.data(), n); lapack::lacpy(MatrixType::Upper, n, n, all_data.R.data(), n, all_data.A.data(), m); - auto stop_r_restore = high_resolution_clock::now(); + auto stop_r_restore = steady_clock::now(); dur_cholqr_r_restore = duration_cast(stop_r_restore - start_r_restore).count(); data_regen(m_info, all_data, state, state, 2); @@ -284,10 +289,10 @@ static void call_tsqr( file << "\n"; } - delete[] A_sk; - delete[] S; - delete[] J; - delete[] tau; + free(A_sk); + free(S); + free(J); + free(tau); } template @@ -306,6 +311,7 @@ static void call_apply_q( // timing vars long dur_ormqr = 0; long dur_gemqrt = 0; + long dur_gemm = 0; std::ofstream file(output_filename, std::ios::app); @@ -313,7 +319,7 @@ static void call_apply_q( int64_t nb = 0; for (i = 0; i < numruns; ++i) { for(nb = gemqrt_nb_start; nb <= n; nb *=2) { - printf("Apply Q iteration %d; n==%ld start.\n", i, n); + printf("Apply Q iteration %d; n==%d start.\n", i, n); // Performing CholQR blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, (T) 1.0, all_data.A.data(), m, (T) 0.0, all_data.R.data(), n); lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); @@ -322,9 +328,9 @@ static void call_apply_q( lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_gemqrt.data(), m); lapack::orhr_col(m, n, nb, all_data.A_gemqrt.data(), m, all_data.T_gemqrt.data(), n, all_data.D.data()); - auto start_gemqrt = high_resolution_clock::now(); + auto start_gemqrt = steady_clock::now(); lapack::gemqrt(Side::Left, Op::Trans, m, m - n, n, nb, all_data.A_gemqrt.data(), m, all_data.T_gemqrt.data(), n, all_data.B1.data(), m); - auto stop_gemqrt = high_resolution_clock::now(); + auto stop_gemqrt = steady_clock::now(); dur_gemqrt = duration_cast(stop_gemqrt - start_gemqrt).count(); // We do not re-run ormqr and gemm for different nbs @@ -335,9 +341,9 @@ static void call_apply_q( for(j = 0; j < n; ++j) all_data.tau[j] = all_data.T_mat[(n + 1) * j]; - auto start_ormqr = high_resolution_clock::now(); + auto start_ormqr = steady_clock::now(); lapack::ormqr(Side::Left, Op::Trans, m, m - n, n, all_data.A.data(), m, all_data.tau.data(), all_data.B.data(), m); - auto stop_ormqr = high_resolution_clock::now(); + auto stop_ormqr = steady_clock::now(); dur_ormqr = duration_cast(stop_ormqr - start_ormqr).count(); file << dur_ormqr << ", "; @@ -404,4 +410,4 @@ int main(int argc, char *argv[]) { for (i = n_start; i <= n_stop; i *= 2) call_apply_q(m_info, numruns, i, nb_start, all_data, state, state_B, output_filename); } -#endif \ No newline at end of file +#endif diff --git a/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc b/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc index 6c1c8a97..06e7a0fa 100644 --- a/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc +++ b/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc @@ -49,7 +49,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); - std::fill(all_data.J.begin(), all_data.J.end(), 0); + std::iota(all_data.J.begin(), all_data.J.end(), 1); } template @@ -71,7 +71,7 @@ static void call_all_algs( int panel_pivoting = 0; // Timing vars - T* times = ( T * ) calloc( 27, sizeof( T ) ); + T* times = ( T * ) calloc(27, sizeof( T ) ); for (int i = 0; i < numruns; ++i) { printf("ITERATION %d, NUMCOLS %ld\n", i, n); @@ -125,7 +125,7 @@ int main(int argc, char *argv[]) { + "_num_info_lines_" + std::to_string(6) + ".txt"; - std::ofstream file(output_filename, std::ios::out | std::ios::app); + std::ofstream file(output_filename, std::ios::out | std::ios::trunc); // Writing important data into file file << "Description: Results from the HQRRP runtime breakdown benchmark, recording the time it takes to perform every subroutine in HQRRP." diff --git a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc index c2c83b18..1c77e572 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc @@ -50,7 +50,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, // Re-generate and clear data template -static std::vector get_norms(int64_t n, std::vector Mat, int64_t lda) { +static std::vector get_norms( int64_t m, int64_t n, std::vector Mat, int64_t lda) { std::vector R_norms (n, 0.0); for (int i = 0; i < n; ++i) { @@ -81,7 +81,7 @@ static void R_norm_ratio( // Running GEQP3 lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); - std::vector R_norms_GEQP3 = get_norms(n, all_data.A, m); + std::vector R_norms_GEQP3 = get_norms(m, n, all_data.A, m); printf("\nDone with QP3\n"); // Clear and re-generate data @@ -92,7 +92,7 @@ static void R_norm_ratio( // Running CQRRP state_alg = state; CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg); - std::vector R_norms_CQRRPT = get_norms(n, all_data.R, n); + std::vector R_norms_CQRRPT = get_norms(n, n, all_data.R, n); // Declare a data file std::fstream file1("QR_R_norm_ratios_rows_" + std::to_string(m) diff --git a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc index 9440eba0..b19bb958 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc @@ -85,27 +85,27 @@ static void call_all_algs( for (int i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); // Testing GEQP3 - auto start_geqp3 = high_resolution_clock::now(); + auto start_geqp3 = steady_clock::now(); lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); - auto stop_geqp3 = high_resolution_clock::now(); + auto stop_geqp3 = steady_clock::now(); dur_geqp3 = duration_cast(stop_geqp3 - start_geqp3).count(); state_gen = state; data_regen(m_info, all_data, state_gen); // Testing GEQRF - auto start_geqrf = high_resolution_clock::now(); + auto start_geqrf = steady_clock::now(); lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); - auto stop_geqrf = high_resolution_clock::now(); + auto stop_geqrf = steady_clock::now(); dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); state_gen = state; data_regen(m_info, all_data, state_gen); // Testing CQRRPT - auto start_cqrrp = high_resolution_clock::now(); + auto start_cqrrp = steady_clock::now(); CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg); - auto stop_cqrrp = high_resolution_clock::now(); + auto stop_cqrrp = steady_clock::now(); dur_cqrrpt = duration_cast(stop_cqrrp - start_cqrrp).count(); state_gen = state; @@ -113,7 +113,7 @@ static void call_all_algs( data_regen(m_info, all_data, state_gen); // Testing SCHOLQR3 - auto start_scholqr = high_resolution_clock::now(); + auto start_scholqr = steady_clock::now(); //--------------------------------------------------------------------------------------------------------------------------// T norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); T shift = 11 * std::numeric_limits::epsilon() * n * std::pow(norm_A, 2); @@ -131,7 +131,7 @@ static void call_all_algs( lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); //--------------------------------------------------------------------------------------------------------------------------// - auto stop_scholqr = high_resolution_clock::now(); + auto stop_scholqr = steady_clock::now(); dur_scholqr = duration_cast(stop_scholqr - start_scholqr).count(); auto state_gen = state; @@ -139,21 +139,21 @@ static void call_all_algs( // Testing GEQR + GEQPT #if !defined(__APPLE__) - auto start_geqpt = high_resolution_clock::now(); - auto start_geqr = high_resolution_clock::now(); + auto start_geqpt = steady_clock::now(); + auto start_geqr = steady_clock::now(); // GEQR(A) part lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), -1); int64_t tsize = (int64_t) all_data.tau[0]; all_data.tau.resize(tsize); lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize); - auto stop_geqr = high_resolution_clock::now(); + auto stop_geqr = steady_clock::now(); dur_geqr = duration_cast(stop_geqr - start_geqr).count(); // GEQP3(R) part lapack::lacpy(MatrixType::Upper, n, n, all_data.A.data(), m, all_data.R.data(), n); lapack::geqp3(n, n, all_data.R.data(), n, all_data.J.data(), all_data.tau.data()); - auto stop_geqpt = high_resolution_clock::now(); + auto stop_geqpt = steady_clock::now(); dur_geqpt = duration_cast(stop_geqpt - start_geqpt).count(); state_gen = state; data_regen(m_info, all_data, state_gen); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 704777b1..2a015012 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -45,42 +45,27 @@ struct RBKI_benchmark_data { RBKI_benchmark_data(int64_t m, int64_t n, T tol) : A_spectra(m, n) { - A = new T[m * n](); - U = new T[m * n](); - VT = new T[n * n](); - V = new T[n * n](); - Sigma = new T[m](); - U_RSVD = new T[m * n](); - V_RSVD = new T[n * n](); - Sigma_RSVD = new T[n](); - Buffer = new T[m * n](); - Sigma_cpy = new T[n * n](); - U_cpy = new T[m * n](); - VT_cpy = new T[n * n](); - + A = ( T * ) calloc(m * n, sizeof( T ) ); + U = ( T * ) calloc(m * n, sizeof( T ) ); + VT = ( T * ) calloc(n * n, sizeof( T ) ); + V = ( T * ) calloc(n * n, sizeof( T ) ); + Sigma = ( T * ) calloc(m, sizeof( T ) ); + U_RSVD = ( T * ) calloc(m * n, sizeof( T ) ); + V_RSVD = ( T * ) calloc(n * n, sizeof( T ) ); + Sigma_RSVD = ( T * ) calloc(n, sizeof( T ) ); + Buffer = ( T * ) calloc(m * n, sizeof( T ) ); + Sigma_cpy = ( T * ) calloc(n * n, sizeof( T ) ); + U_cpy = ( T * ) calloc(m * n, sizeof( T ) ); + VT_cpy = ( T * ) calloc(n * n, sizeof( T ) ); + + //A_lowrank_svd = ( T * ) calloc(m * n, sizeof( T ) ); + //A_lowrank_svd_const = ( T * ) calloc(m * n, sizeof( T ) ); A_lowrank_svd = nullptr; A_lowrank_svd_const = nullptr; row = m; col = n; tolerance = tol; } - - ~RBKI_benchmark_data() { - delete[] A; - delete[] U; - delete[] VT; - delete[] V; - delete[] Sigma; - delete[] U_RSVD; - delete[] V_RSVD; - delete[] Sigma_RSVD; - delete[] Buffer; - delete[] Sigma_cpy; - delete[] U_cpy; - delete[] VT_cpy; - delete[] A_lowrank_svd; - delete[] A_lowrank_svd_const; - } }; template @@ -259,9 +244,9 @@ static void call_all_algs( // There is no reason to run SVD many times, as it always outputs the same result. if ((b_sz == 16) && (num_matmuls == 2) && ((i == 0) || (i == 1))) { // Running SVD - auto start_svd = high_resolution_clock::now(); + auto start_svd = steady_clock::now(); lapack::gesdd(Job::SomeVec, m, n, all_data.A, m, all_data.Sigma, all_data.U, m, all_data.VT, n); - auto stop_svd = high_resolution_clock::now(); + auto stop_svd = steady_clock::now(); dur_svd = duration_cast(stop_svd - start_svd).count(); printf("TOTAL TIME FOR SVD %ld\n", dur_svd); @@ -281,9 +266,9 @@ static void call_all_algs( } // Running RBKI - auto start_rbki = high_resolution_clock::now(); + auto start_rbki = steady_clock::now(); all_algs.RBKI.call(m, n, all_data.A, m, b_sz, all_data.U, all_data.VT, all_data.Sigma, state_alg); - auto stop_rbki = high_resolution_clock::now(); + auto stop_rbki = steady_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); printf("TOTAL TIME FOR RBKI %ld\n", dur_rbki); @@ -298,10 +283,10 @@ static void call_all_algs( data_regen(m_info, all_data, state_gen, 1); // Running RSVD - auto start_rsvd = high_resolution_clock::now(); + auto start_rsvd = steady_clock::now(); int64_t threshold_RSVD = (int64_t ) (b_sz * num_matmuls / 2); all_algs.RSVD.call(m, n, all_data.A, threshold_RSVD, tol, all_data.U_RSVD, all_data.Sigma_RSVD, all_data.V_RSVD, state_alg); - auto stop_rsvd = high_resolution_clock::now(); + auto stop_rsvd = steady_clock::now(); dur_rsvd = duration_cast(stop_rsvd - start_rsvd).count(); printf("TOTAL TIME FOR RSVD %ld\n", dur_rsvd); @@ -324,10 +309,10 @@ static void call_all_algs( // There is no reason to run SVDS many times, as it always outputs the same result. if ((num_matmuls == 2) && ((i == 0) || (i == 1))) { // Running SVDS - auto start_svds = high_resolution_clock::now(); + auto start_svds = steady_clock::now(); Spectra::PartialSVDSolver svds(all_data.A_spectra, std::min(custom_rank, n-2), std::min(2 * custom_rank, n-1)); svds.compute(); - auto stop_svds = high_resolution_clock::now(); + auto stop_svds = steady_clock::now(); dur_svds = duration_cast(stop_svds - start_svds).count(); printf("TOTAL TIME FOR SVDS %ld\n", dur_svds); @@ -418,8 +403,8 @@ int main(int argc, char *argv[]) { RandLAPACK::gen::mat_gen_info m_info_A_svd(m, n, RandLAPACK::gen::custom_input); m_info_A_svd.filename = argv[2]; m_info_A_svd.workspace_query_mod = 0; - all_data.A_lowrank_svd = new double[m * n](); - all_data.A_lowrank_svd_const = new double[m * n](); + all_data.A_lowrank_svd = ( double * ) calloc(m * n, sizeof( double ) ); + all_data.A_lowrank_svd_const = ( double * ) calloc(m * n, sizeof( double ) ); RandLAPACK::gen::mat_gen(m_info_A_svd, all_data.A_lowrank_svd_const, state); lapack::lacpy(MatrixType::General, m, n, all_data.A_lowrank_svd_const, m, all_data.A_lowrank_svd, m); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc index 23955b3a..79911dc7 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc @@ -138,9 +138,9 @@ static void call_all_algs( printf("Iteration %d start.\n", i); // Testing RBKI - auto start_rbki = high_resolution_clock::now(); + auto start_rbki = steady_clock::now(); RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.VT.data(), all_data.Sigma.data(), state_alg); - auto stop_rbki = high_resolution_clock::now(); + auto stop_rbki = steady_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); T residual_err_custom = residual_error_comp(all_data, custom_rank); diff --git a/benchmark/bench_general/GEMM_flop_count.cc b/benchmark/bench_general/GEMM_flop_count.cc index f0392760..856d3544 100644 --- a/benchmark/bench_general/GEMM_flop_count.cc +++ b/benchmark/bench_general/GEMM_flop_count.cc @@ -22,9 +22,9 @@ test_flops(int64_t k, int runs = 50; T GFLOPS_rate_best = 0; - T* A = new T[size](); - T* B = new T[size](); - T* C = new T[size](); + T* A = ( T * ) calloc( size, sizeof( T ) ); + T* B = ( T * ) calloc( size, sizeof( T ) ); + T* C = ( T * ) calloc( size, sizeof( T ) ); RandLAPACK::gen::mat_gen_info m_info(k, k, RandLAPACK::gen::gaussian); @@ -33,9 +33,9 @@ test_flops(int64_t k, RandLAPACK::gen::mat_gen(m_info, B, state); // Get the timing - auto start = high_resolution_clock::now(); + auto start = steady_clock::now(); gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, k, k, k, 1.0, A, k, B, k, 0.0, C, k); - auto stop = high_resolution_clock::now(); + auto stop = steady_clock::now(); long dur = duration_cast(stop - start).count(); T dur_s = dur / 1e+6; @@ -46,10 +46,6 @@ test_flops(int64_t k, } printf("THE SYSTEM IS CAPABLE OF %f GFLOPs/sec.\n\n", GFLOPS_rate_best); - - delete[] A; - delete[] B; - delete[] C; } int main() { diff --git a/benchmark/bench_general/Gemm_vs_ormqr.cc b/benchmark/bench_general/Gemm_vs_ormqr.cc index 7b5f922a..252bf73f 100644 --- a/benchmark/bench_general/Gemm_vs_ormqr.cc +++ b/benchmark/bench_general/Gemm_vs_ormqr.cc @@ -46,15 +46,15 @@ test_speed(int64_t m, // Get the implicit Q-factor in A_dat lapack::geqrf(m, n, A_dat, m, tau_dat); - auto start_ormqr = high_resolution_clock::now(); + auto start_ormqr = steady_clock::now(); lapack::ormqr(Side::Left, Op::Trans, m, n, n, A_dat, m, tau_dat, B1_dat, m); - auto stop_ormqr = high_resolution_clock::now(); + auto stop_ormqr = steady_clock::now(); long dur_ormqr = duration_cast(stop_ormqr - start_ormqr).count(); - auto start_gemm = high_resolution_clock::now(); + auto start_gemm = steady_clock::now(); lapack::ungqr(m, n, n, A_dat, m, tau_dat); gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, n, m, 1.0, A_dat, m, B2_dat, m, 0.0, Product_dat, n); - auto stop_gemm = high_resolution_clock::now(); + auto stop_gemm = steady_clock::now(); long dur_gemm = duration_cast(stop_gemm - start_gemm).count(); T gflop_count_gemm = (2 * std::pow(n, 2) * m) / std::pow(10, 9); diff --git a/benchmark/bench_general/basic_blas_speed.cc b/benchmark/bench_general/basic_blas_speed.cc index 0b1fe207..171dab5b 100644 --- a/benchmark/bench_general/basic_blas_speed.cc +++ b/benchmark/bench_general/basic_blas_speed.cc @@ -84,27 +84,27 @@ static void call_all_algs( for (int i = 0; i < numruns; ++i) { printf("ITERATION %d, DIM %ld\n", i, n); // Testing BLAS3 - auto start_blas3 = high_resolution_clock::now(); + auto start_blas3 = steady_clock::now(); blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, n, n, n, 1.0, all_data.A.data(), n, all_data.B.data(), n, 0.0, all_data.C.data(), n); - auto stop_blas3 = high_resolution_clock::now(); + auto stop_blas3 = steady_clock::now(); dur_blas3 = duration_cast(stop_blas3 - start_blas3).count(); state_gen = state; data_regen(m_info, all_data, state_gen, 3); // Testing BLAS2 - auto start_blas2 = high_resolution_clock::now(); + auto start_blas2 = steady_clock::now(); blas::gemv(Layout::ColMajor, Op::NoTrans, n, n, 1.0, all_data.A.data(), n, all_data.a.data(), 1, 1.0, all_data.b.data(), 1); - auto stop_blas2 = high_resolution_clock::now(); + auto stop_blas2 = steady_clock::now(); dur_blas2 = duration_cast(stop_blas2 - start_blas2).count(); state_gen = state; data_regen(m_info, all_data, state_gen, 2); // Testing BLAS1 - auto start_blas1 = high_resolution_clock::now(); + auto start_blas1 = steady_clock::now(); blas::axpy(n, -1.0, all_data.a.data(), 1, all_data.b.data(), 1); - auto stop_blas1 = high_resolution_clock::now(); + auto stop_blas1 = steady_clock::now(); dur_blas1 = duration_cast(stop_blas1 - start_blas1).count(); state_gen = state; diff --git a/test/comps/test_determiter.cc b/test/comps/test_determiter.cc index fefc1ec8..76887f7d 100644 --- a/test/comps/test_determiter.cc +++ b/test/comps/test_determiter.cc @@ -47,9 +47,9 @@ class TestDetermiterOLS : public ::testing::Test { double delta = 0.1; double tol = 1e-8; - RandLAPACK::pcg( + RandLAPACK::pcg_saddle( m, n, A.data(), m, b.data(), c.data(), delta, - resid_vec, tol, n, M.data(), n, x0.data(), x.data(), y.data()); + 10*n, resid_vec.data(), tol, n, M.data(), n, x0.data(), x.data(), y.data()); int64_t iter_count = 0; @@ -72,3 +72,129 @@ TEST_F(TestDetermiterOLS, Trivial) { run(k_idx); } } + +class TestDetermiterLockBlockPCG : public ::testing::Test { + protected: + + virtual void SetUp() {}; + + virtual void TearDown() {}; + + template + void run_simple_block(int64_t m, int64_t s, T coeff, uint32_t seed) { + using std::vector; + auto layout = blas::Layout::ColMajor; + vector G_buff(m*m, 0.0); + vector H(m*s, 0.0); + randblas_require((int64_t) H.size() == m*s); + vector X_star(m*s, 0.0); + vector X_init(m*s, 0.0); + RandBLAS::RNGState state0(seed); + vector temp(2*m*m); + auto D = RandBLAS::DenseDist {2*m, m, RandBLAS::ScalarDist::Gaussian}; + auto state1 = RandBLAS::fill_dense(D, temp.data(), state0); + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, m, 2*m, 1.0, temp.data(), 2*m, 0.0, G_buff.data(), m); + + vector regs(1, coeff); + RandLAPACK::linops::RegExplicitSymLinOp G(m, G_buff.data(), m, regs); + RandBLAS::DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian}; + auto Xsd = X_star.data(); + auto state2 = RandBLAS::fill_dense(DX_star, Xsd, state1); + G(layout, s, 1.0, X_star.data(), m, 0.0, H.data(), m); + + RandLAPACK::StatefulFrobeniusNorm seminorm{}; + + auto I_buff = eye(m); + vector zeros(1, 0.0); + RandLAPACK::linops::RegExplicitSymLinOp I(m, I_buff.data(), m, zeros); + + T tol = 100*std::numeric_limits::epsilon(); + RandLAPACK::pcg(G, H.data(), s, seminorm, tol, m, I, X_init.data(), true); + + T tol_scale = std::sqrt((T)m); + T atol = tol_scale * std::pow(std::numeric_limits::epsilon(), 0.5); + T rtol = tol_scale * atol; + test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s, + __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol + ); + return; + } + + virtual void run_simple_lockstep(int64_t m, int64_t s, uint32_t seed) { + using T = double; + randblas_require(s <= 4); + using std::vector; + vector reg_coeffs{}; + reg_coeffs.push_back(100); + if (s > 1) + reg_coeffs.push_back(7); + if (s > 2) + reg_coeffs.push_back(0.1); + if (s > 3) + reg_coeffs.push_back(0.5483); + auto layout = blas::Layout::ColMajor; + vector G_buff(m*m, 0.0); + vector H(m*s, 0.0); + vector X_star(m*s, 0.0); + vector X_init(m*s, 0.0); + RandBLAS::RNGState state0(seed); + vector temp(2*m*m); + + auto D = RandBLAS::DenseDist {2*m, m, RandBLAS::ScalarDist::Gaussian}; + auto state1 = RandBLAS::fill_dense(D, temp.data(), state0); + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, m, 2*m, 1.0, temp.data(), 2*m, 0.0, G_buff.data(), m); + + vector regs(reg_coeffs); + RandLAPACK::linops::RegExplicitSymLinOp G(m, G_buff.data(), m, regs); + RandBLAS::DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian}; + auto Xsd = X_star.data(); + auto state2 = RandBLAS::fill_dense(DX_star, Xsd, state1); + G(layout, s, 1.0, X_star.data(), m, 0.0, H.data(), m); + + auto I_buff = eye(m); + vector zeros(s, 0.0); + RandLAPACK::linops::RegExplicitSymLinOp I(m, I_buff.data(), m, zeros); + + // While we're testing lockstep PCG, let's call the overload of pcg + // that automatically chooses the seminorm. We don't care about the return value + // in this case. + T tol = 100*std::numeric_limits::epsilon(); + RandLAPACK::pcg(G, H, tol, m, I, X_init, true); + + T tol_scale = std::sqrt((T)m); + T atol = tol_scale * std::pow(std::numeric_limits::epsilon(), 0.5); + T rtol = tol_scale * atol; + test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s, + __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol + ); + return; + } +}; + + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_5_1) { + run_simple_block(5, 1, 0.5, 1997); +} + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_6_2) { + run_simple_block(6, 2, 0.5, 1997); +} + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_5_4) { + run_simple_block(5, 4, 0.5, 1997); +} + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_5_1) { + run_simple_lockstep(5, 1, 1997); + run_simple_lockstep(5, 1, 2024); +} + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_6_2) { + run_simple_lockstep(6, 2, 1997); + run_simple_lockstep(6, 2, 2024); +} + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_5_4) { + run_simple_lockstep(5, 4, 1997); + run_simple_lockstep(5, 4, 2024); +} diff --git a/test/comps/test_pcgls.cc b/test/comps/test_pcgls.cc index 9181d932..92e2f106 100644 --- a/test/comps/test_pcgls.cc +++ b/test/comps/test_pcgls.cc @@ -29,8 +29,8 @@ void run_pcgls_ex(int n, int m) double delta = 0.1; double tol = 1e-8; - RandLAPACK::pcg(m, n, A.data(), m, b.data(), c.data(), delta, - resid_vec, tol, n, M.data(), n, x0.data(), x.data(), y.data()); + RandLAPACK::pcg_saddle(m, n, A.data(), m, b.data(), c.data(), delta, + 10*n, resid_vec.data(), tol, n, M.data(), n, x0.data(), x.data(), y.data()); for (double res: resid_vec) { diff --git a/test/comps/test_syrf.cc b/test/comps/test_syrf.cc index 23d68af9..b8afc19b 100644 --- a/test/comps/test_syrf.cc +++ b/test/comps/test_syrf.cc @@ -8,6 +8,7 @@ #include #include + class TestSYRF : public ::testing::Test { protected: @@ -45,9 +46,12 @@ class TestSYRF : public ::testing::Test template struct algorithm_objects { - RandLAPACK::SYPS SYPS; - RandLAPACK::HQRQ Orth_RF; - RandLAPACK::SYRF SYRF; + using SYPS_t = RandLAPACK::SYPS; + using Orth_t = RandLAPACK::HQRQ; + using SYRF_t = RandLAPACK::SYRF; + SYPS_t syps; + Orth_t orth; + SYRF_t syrf; algorithm_objects( bool verbose, @@ -55,9 +59,9 @@ class TestSYRF : public ::testing::Test int64_t p, int64_t passes_per_iteration ) : - SYPS(p, passes_per_iteration, verbose, cond_check), - Orth_RF(cond_check, verbose), - SYRF(SYPS, Orth_RF, verbose, cond_check) + syps(p, passes_per_iteration, verbose, cond_check), + orth(cond_check, verbose), + syrf(syps, orth, verbose, cond_check) {} }; @@ -95,7 +99,7 @@ class TestSYRF : public ::testing::Test auto m = all_data.row; auto k = all_data.rank; - all_algs.SYRF.call(Uplo::Upper, m, all_data.A.data(), k, all_data.Q, state, NULL); + all_algs.syrf.call(Uplo::Upper, m, all_data.A.data(), k, all_data.Q, state, NULL); // Reassing pointers because Q, B have been resized T* Q_dat = all_data.Q.data(); diff --git a/test/comps/test_util_gpu.cu b/test/comps/test_util_gpu.cu index 5fa7ebf1..c9bbbbcb 100644 --- a/test/comps/test_util_gpu.cu +++ b/test/comps/test_util_gpu.cu @@ -4,8 +4,8 @@ #include -#include -#include +#include +#include #include #include @@ -61,18 +61,18 @@ class TestUtil_GPU : public ::testing::Test { row = m; col = n; - cudaMalloc(&A_device, m * n * sizeof(T)); - cudaMalloc(&B_device, m * n * sizeof(T)); - cudaMalloc(&J_device, n * sizeof(int64_t)); - cudaMalloc(&buf_device, n * sizeof(int64_t)); - cudaDeviceSynchronize(); + hipMalloc(&A_device, m * n * sizeof(T)); + hipMalloc(&B_device, m * n * sizeof(T)); + hipMalloc(&J_device, n * sizeof(int64_t)); + hipMalloc(&buf_device, n * sizeof(int64_t)); + hipDeviceSynchronize(); } ~ColSwpTestData() { - cudaFree(A_device); - cudaFree(B_device); - cudaFree(J_device); - cudaFree(buf_device); + hipFree(A_device); + hipFree(B_device); + hipFree(J_device); + hipFree(buf_device); } }; @@ -93,14 +93,14 @@ class TestUtil_GPU : public ::testing::Test { length_J = m; length_idx = k; - cudaMalloc(&J_device, m * sizeof(int64_t)); - cudaMalloc(&idx_device, k * sizeof(int64_t)); - cudaDeviceSynchronize(); + hipMalloc(&J_device, m * sizeof(int64_t)); + hipMalloc(&idx_device, k * sizeof(int64_t)); + hipDeviceSynchronize(); } ~ColSwpVecTestData() { - cudaFree(J_device); - cudaFree(idx_device); + hipFree(J_device); + hipFree(idx_device); } }; @@ -121,13 +121,13 @@ class TestUtil_GPU : public ::testing::Test { row = m; col = n; - cudaMalloc(&A_device, m * n * sizeof(T)); - cudaMalloc(&A_device_T, n * m * sizeof(T)); + hipMalloc(&A_device, m * n * sizeof(T)); + hipMalloc(&A_device_T, n * m * sizeof(T)); } ~TranspTestData() { - cudaFree(A_device); - cudaFree(A_device_T); + hipFree(A_device); + hipFree(A_device_T); } }; @@ -151,15 +151,15 @@ class TestUtil_GPU : public ::testing::Test { row = m; col = n; - cudaMalloc(&A_device, m * n * sizeof(T)); - cudaMalloc(&y_device, n * n * sizeof(T)); - cudaMalloc(&x_device, m * sizeof(T)); + hipMalloc(&A_device, m * n * sizeof(T)); + hipMalloc(&y_device, n * n * sizeof(T)); + hipMalloc(&x_device, m * sizeof(T)); } ~GerTestData() { - cudaFree(A_device); - cudaFree(y_device); - cudaFree(x_device); + hipFree(A_device); + hipFree(y_device); + hipFree(x_device); } }; @@ -169,26 +169,26 @@ class TestUtil_GPU : public ::testing::Test auto m = all_data.row; auto n = all_data.col; - cudaStream_t strm = cudaStreamPerThread; + hipStream_t strm = hipStreamPerThread; - cudaMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice, strm); - cudaMemcpyAsync(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), cudaMemcpyHostToDevice, strm); - cudaStreamSynchronize(strm); + hipMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(double), hipMemcpyHostToDevice, strm); + hipMemcpyAsync(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), hipMemcpyHostToDevice, strm); + hipStreamSynchronize(strm); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("GPU ERROR. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("GPU ERROR. " << hipGetErrorString(ierr)) abort(); } printf("Passed the general error check\n"); RandLAPACK::cuda_kernels::col_swap_gpu(strm, m, n, n, all_data.A_device, m, all_data.J_device); - cudaMemcpyAsync(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost, strm); - cudaMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, n * sizeof(int64_t), cudaMemcpyDeviceToHost, strm); + hipMemcpyAsync(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost, strm); + hipMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, n * sizeof(int64_t), hipMemcpyDeviceToHost, strm); RandLAPACK::util::col_swap(m, n, n, all_data.A.data(), m, all_data.J); - cudaStreamSynchronize(strm); + hipStreamSynchronize(strm); for(int i = 0; i < m*n; ++i) all_data.A[i] -= all_data.A_host_buffer[i]; @@ -196,10 +196,10 @@ class TestUtil_GPU : public ::testing::Test T norm_test = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); printf("\nNorm diff GPU CPU: %e\n", norm_test); EXPECT_NEAR(norm_test, 0.0, std::pow(std::numeric_limits::epsilon(), 0.75)); - ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr)) abort(); } } @@ -218,21 +218,21 @@ class TestUtil_GPU : public ::testing::Test auto n = all_data.col; auto lda = m; - cudaStream_t strm = cudaStreamPerThread; - cudaMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(T), cudaMemcpyHostToDevice, strm); - cudaMemcpyAsync(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), cudaMemcpyHostToDevice, strm); + hipStream_t strm = hipStreamPerThread; + hipMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(T), hipMemcpyHostToDevice, strm); + hipMemcpyAsync(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), hipMemcpyHostToDevice, strm); T* A_device = all_data.A_device; T* A_device_submat = all_data.A_device + (col_offset * m + offset); - cudaStreamSynchronize(strm); + hipStreamSynchronize(strm); RandLAPACK::cuda_kernels::col_swap_gpu(strm, m_submat, n_submat, k_submat, A_device_submat, lda, all_data.J_device); - cudaMemcpyAsync(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost, strm); - cudaMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, n * sizeof(int64_t), cudaMemcpyDeviceToHost, strm); + hipMemcpyAsync(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost, strm); + hipMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, n * sizeof(int64_t), hipMemcpyDeviceToHost, strm); T* A = all_data.A.data(); T* A_submat = all_data.A.data() + (col_offset * m + offset); RandLAPACK::util::col_swap(m_submat, n_submat, k_submat, A_submat, lda, all_data.J); - cudaStreamSynchronize(strm); + hipStreamSynchronize(strm); for(int i = 0; i < m*n; ++i) all_data.A[i] -= all_data.A_host_buffer[i]; @@ -241,10 +241,10 @@ class TestUtil_GPU : public ::testing::Test printf("\nNorm diff GPU CPU: %e\n", norm_test); EXPECT_NEAR(norm_test, 0.0, std::pow(std::numeric_limits::epsilon(), 0.75)); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr)) abort(); } } @@ -258,18 +258,18 @@ class TestUtil_GPU : public ::testing::Test auto m = all_data.length_J; auto k = all_data.length_idx; - cudaStream_t strm = cudaStreamPerThread; - cudaMemcpyAsync(all_data.J_device, all_data.J.data(), m * sizeof(int64_t), cudaMemcpyHostToDevice, strm); - cudaMemcpyAsync(all_data.idx_device, all_data.idx.data(), k * sizeof(int64_t), cudaMemcpyHostToDevice, strm); + hipStream_t strm = hipStreamPerThread; + hipMemcpyAsync(all_data.J_device, all_data.J.data(), m * sizeof(int64_t), hipMemcpyHostToDevice, strm); + hipMemcpyAsync(all_data.idx_device, all_data.idx.data(), k * sizeof(int64_t), hipMemcpyHostToDevice, strm); int64_t* J_device_subvec = all_data.J_device + offset; - cudaStreamSynchronize(strm); + hipStreamSynchronize(strm); RandLAPACK::cuda_kernels::col_swap_gpu(strm, m, k, J_device_subvec, all_data.idx_device); - cudaMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, m * sizeof(int64_t), cudaMemcpyDeviceToHost, strm); + hipMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, m * sizeof(int64_t), hipMemcpyDeviceToHost, strm); int64_t* J_subvec = all_data.J.data() + offset; std::vector buf; RandLAPACK::util::col_swap(m, k, J_subvec, all_data.idx); - cudaStreamSynchronize(strm); + hipStreamSynchronize(strm); for(int i = 0; i < m; ++i){ all_data.J[i] -= all_data.J_host_buffer[i]; @@ -279,10 +279,10 @@ class TestUtil_GPU : public ::testing::Test printf("\nNorm diff GPU CPU: %e\n", norm_test); EXPECT_NEAR(norm_test, 0.0, std::pow(std::numeric_limits::epsilon(), 0.75)); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr)) abort(); } } @@ -294,15 +294,15 @@ class TestUtil_GPU : public ::testing::Test auto m = all_data.row; auto n = all_data.col; - cudaStream_t strm = cudaStreamPerThread; + hipStream_t strm = hipStreamPerThread; - cudaMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice); + hipMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), hipMemcpyHostToDevice); // Perform Pivoted QR lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); // Swap columns in A's copy - cudaMemcpy(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), cudaMemcpyHostToDevice); + hipMemcpy(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), hipMemcpyHostToDevice); RandLAPACK::cuda_kernels::col_swap_gpu(strm, m, n, n, all_data.A_device, m, all_data.J_device); - cudaMemcpy(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost); + hipMemcpy(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost); // Create an identity and store Q in it. RandLAPACK::util::eye(m, n, all_data.Ident.data()); @@ -318,10 +318,10 @@ class TestUtil_GPU : public ::testing::Test printf("||A_piv - QR||_F: %e\n", norm); EXPECT_NEAR(norm, 0.0, std::pow(std::numeric_limits::epsilon(), 0.625)); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr)) abort(); } } @@ -332,14 +332,14 @@ class TestUtil_GPU : public ::testing::Test auto m = all_data.row; auto n = all_data.col; - cudaStream_t strm = cudaStreamPerThread; + hipStream_t strm = hipStreamPerThread; - cudaMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice, strm); - cudaStreamSynchronize(strm); + hipMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(double), hipMemcpyHostToDevice, strm); + hipStreamSynchronize(strm); RandLAPACK::cuda_kernels::transposition_gpu(strm, m, n, all_data.A_device, m, all_data.A_device_T, n, 0); - cudaMemcpyAsync(all_data.A_T_buffer.data(), all_data.A_device_T, n * m * sizeof(T), cudaMemcpyDeviceToHost, strm); + hipMemcpyAsync(all_data.A_T_buffer.data(), all_data.A_device_T, n * m * sizeof(T), hipMemcpyDeviceToHost, strm); RandLAPACK::util::transposition(m, n, all_data.A.data(), m, all_data.A_T.data(), n, 0); - cudaStreamSynchronize(strm); + hipStreamSynchronize(strm); // A_piv - A_cpy for(int i = 0; i < m * n; ++i) @@ -349,10 +349,10 @@ class TestUtil_GPU : public ::testing::Test printf("||A_T_host - A_T_device||_F: %e\n", norm); EXPECT_NEAR(norm, 0.0, std::pow(std::numeric_limits::epsilon(), 0.625)); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr)) abort(); } } @@ -365,16 +365,16 @@ class TestUtil_GPU : public ::testing::Test auto m = all_data.row; auto n = all_data.col; - cudaStream_t strm = cudaStreamPerThread; - cudaMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(T), cudaMemcpyHostToDevice, strm); - cudaStreamSynchronize(strm); - cudaMemcpyAsync(all_data.y_device, all_data.y.data(), n * n * sizeof(T), cudaMemcpyHostToDevice, strm); - cudaMemcpyAsync(all_data.x_device, all_data.x.data(), m * sizeof(T), cudaMemcpyHostToDevice, strm); + hipStream_t strm = hipStreamPerThread; + hipMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(T), hipMemcpyHostToDevice, strm); + hipStreamSynchronize(strm); + hipMemcpyAsync(all_data.y_device, all_data.y.data(), n * n * sizeof(T), hipMemcpyHostToDevice, strm); + hipMemcpyAsync(all_data.x_device, all_data.x.data(), m * sizeof(T), hipMemcpyHostToDevice, strm); RandLAPACK::cuda_kernels::ger_gpu(strm, m, n, alpha, all_data.x_device, 1, all_data.y_device, n + 1, all_data.A_device, m); - cudaMemcpyAsync(all_data.A_buffer.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost, strm); + hipMemcpyAsync(all_data.A_buffer.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost, strm); // Y has stride of n + 1 blas::ger(Layout::ColMajor, m, n, alpha, all_data.x.data(), 1, all_data.y.data(), n + 1, all_data.A.data(), m); - cudaStreamSynchronize(strm); + hipStreamSynchronize(strm); // A_piv - A_cpy for(int i = 0; i < m * n; ++i) @@ -384,10 +384,10 @@ class TestUtil_GPU : public ::testing::Test printf("||A_host - A_device||_F: %e\n", norm); EXPECT_NEAR(norm, 0.0, std::pow(std::numeric_limits::epsilon(), 0.625)); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr)) abort(); } } @@ -517,9 +517,9 @@ TEST_F(TestUtil_GPU, test_ger_gpu) { m_info.exponent = 2.0; RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); RandBLAS::DenseDist D1(n, n); - state = RandBLAS::fill_dense(D1, all_data.y.data(), state).second; + state = RandBLAS::fill_dense(D1, all_data.y.data(), state); RandBLAS::DenseDist D2(1, m); - state = RandBLAS::fill_dense(D2, all_data.x.data(), state).second; + state = RandBLAS::fill_dense(D2, all_data.x.data(), state); ger_gpu(alpha, all_data); } diff --git a/test/drivers/bench_bqrrp_gpu.cu b/test/drivers/bench_bqrrp_gpu.cu index 342f364a..93a8bb1f 100644 --- a/test/drivers/bench_bqrrp_gpu.cu +++ b/test/drivers/bench_bqrrp_gpu.cu @@ -3,9 +3,9 @@ #include "rl_lapackpp.hh" #include "rl_gen.hh" -#include -#include -#include +#include +#include +#include #include #include @@ -49,19 +49,19 @@ class BenchBQRRP : public ::testing::TestWithParam { row = m; col = n; - cudaMalloc(&A_device, m * n * sizeof(T)); - cudaMalloc(&tau_device, n * sizeof(T)); - cudaMalloc(&J_device, n * sizeof(int64_t)); - cudaMalloc(&R_device, n * n * sizeof(T)); - cudaMalloc(&D_device, n * sizeof(T)); + hipMalloc(&A_device, m * n * sizeof(T)); + hipMalloc(&tau_device, n * sizeof(T)); + hipMalloc(&J_device, n * sizeof(int64_t)); + hipMalloc(&R_device, n * n * sizeof(T)); + hipMalloc(&D_device, n * sizeof(T)); } ~BQRRPBenchData() { - cudaFree(A_device); - cudaFree(tau_device); - cudaFree(J_device); - cudaFree(R_device); - cudaFree(D_device); + hipFree(A_device); + hipFree(tau_device); + hipFree(J_device); + hipFree(R_device); + hipFree(D_device); } }; @@ -76,8 +76,8 @@ class BenchBQRRP : public ::testing::TestWithParam auto n = m_info.cols; RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state_const); - cudaMemset(all_data.J_device, 0.0, n); - cudaMemset(all_data.tau_device, 0.0, n); + hipMemset(all_data.J_device, 0.0, n); + hipMemset(all_data.tau_device, 0.0, n); } template @@ -100,14 +100,13 @@ class BenchBQRRP : public ::testing::TestWithParam // BQRRP with QRF // Skethcing in an sampling regime - cudaMalloc(&all_data.A_sk_device, d * n * sizeof(T)); - all_data.A_sk = new T[d * n](); - T* S = new T[d * m](); - + hipMalloc(&all_data.A_sk_device, d * n * sizeof(T)); + all_data.A_sk = ( T * ) calloc( d * n, sizeof( T ) ); + T* S = ( T * ) calloc( d * m, sizeof( T ) ); RandBLAS::DenseDist D(d, m); - RandBLAS::fill_dense(D, S, state_const).second; + RandBLAS::fill_dense(D, S, state_const); blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, all_data.A.data(), m, 0.0, all_data.A_sk, d); - cudaMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), cudaMemcpyHostToDevice); + hipMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), hipMemcpyHostToDevice); RandLAPACK::BQRRP_GPU BQRRP_GPU_QRF(profile_runtime, block_size); BQRRP_GPU_QRF.qr_tall = GPUSubroutines::QRTall::geqrf; auto start_bqrrp_qrf = std::chrono::steady_clock::now(); @@ -115,8 +114,8 @@ class BenchBQRRP : public ::testing::TestWithParam auto stop_bqrrp_qrf = std::chrono::steady_clock::now(); auto diff_bqrrp_qrf = std::chrono::duration_cast(stop_bqrrp_qrf - start_bqrrp_qrf).count(); data_regen(m_info, all_data, state); - cudaFree(all_data.A_sk_device); - delete[] all_data.A_sk; + hipFree(all_data.A_sk_device); + free(all_data.A_sk); if(profile_runtime) { std::ofstream file(output_filename_breakdown_QRF, std::ios::app); @@ -126,11 +125,11 @@ class BenchBQRRP : public ::testing::TestWithParam // BQRRP with CholQR // Skethcing in an sampling regime - cudaMalloc(&all_data.A_sk_device, d * n * sizeof(T)); - all_data.A_sk = new T[d * n](); + hipMalloc(&all_data.A_sk_device, d * n * sizeof(T)); + all_data.A_sk = ( T * ) calloc( d * n, sizeof( T ) ); blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, all_data.A.data(), m, 0.0, all_data.A_sk, d); - delete[] S; - cudaMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), cudaMemcpyHostToDevice); + free(S); + hipMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), hipMemcpyHostToDevice); RandLAPACK::BQRRP_GPU BQRRP_GPU_CholQR(profile_runtime, block_size); BQRRP_GPU_CholQR.qr_tall = GPUSubroutines::QRTall::cholqr; auto start_bqrrp_cholqr = std::chrono::steady_clock::now(); @@ -138,8 +137,8 @@ class BenchBQRRP : public ::testing::TestWithParam auto stop_bqrrp_cholqr = std::chrono::steady_clock::now(); auto diff_bqrrp_cholqr = std::chrono::duration_cast(stop_bqrrp_cholqr - start_bqrrp_cholqr).count(); data_regen(m_info, all_data, state); - cudaFree(all_data.A_sk_device); - delete[] all_data.A_sk; + hipFree(all_data.A_sk_device); + free(all_data.A_sk); if(profile_runtime) { std::ofstream file(output_filename_breakdown_CholQR, std::ios::app); @@ -172,10 +171,10 @@ class BenchBQRRP : public ::testing::TestWithParam printf(" BLOCK SIZE = %ld BQRRP+QRF TIME (MS) = %ld BQRRP+CholQR TIME (MS) = %ld\n", block_size, diff_bqrrp_qrf, diff_bqrrp_cholqr); std::ofstream file(output_filename_speed, std::ios::app); file << m << " " << n << " " << block_size << " " << diff_bqrrp_qrf << " " << diff_bqrrp_cholqr << " " << diff_qrf << "\n"; - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Error before bench_bqrrp returned. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Error before bench_bqrrp returned. " << hipGetErrorString(ierr)) abort(); } } @@ -195,7 +194,7 @@ class BenchBQRRP : public ::testing::TestWithParam // Initialize GPU stuff lapack::Queue lapack_queue(0); - cudaStream_t strm = lapack_queue.stream(); + hipStream_t strm = lapack_queue.stream(); using lapack::device_info_int; device_info_int* d_info = blas::device_malloc< device_info_int >( 1, lapack_queue ); char* d_work_geqrf; @@ -215,7 +214,7 @@ class BenchBQRRP : public ::testing::TestWithParam // ORHR_COL part RandLAPACK::cuda_kernels::orhr_col_gpu(strm, m, n, all_data.A_device, m, all_data.tau_device, all_data.D_device); RandLAPACK::cuda_kernels::R_cholqr_signs_gpu(strm, n, n, all_data.R_device, all_data.D_device); - cudaStreamSynchronize(strm); + hipStreamSynchronize(strm); auto stop_orhr_col = std::chrono::steady_clock::now(); auto diff_orhr_col = std::chrono::duration_cast(stop_orhr_col - start_orhr_col).count(); @@ -239,10 +238,10 @@ class BenchBQRRP : public ::testing::TestWithParam std::ofstream file(output_filename, std::ios::app); file << m << " " << n << " " << diff_cholqr << " " << diff_orhr_col << " " << diff_qrf << "\n"; - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Error before bench_CholQR returned. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Error before bench_CholQR returned. " << hipGetErrorString(ierr)) abort(); } } @@ -262,7 +261,7 @@ TEST_P(BenchBQRRP, GPU_fixed_blocksize) { BQRRPBenchData all_data(m, n); RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); - cudaMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice); + hipMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), hipMemcpyHostToDevice); std::string file1 = "BQRRP_GPU_runtime_breakdown_innerQRF_1_rows_" diff --git a/test/drivers/test_bqrrp_gpu.cu b/test/drivers/test_bqrrp_gpu.cu index 99858865..5c5ea120 100644 --- a/test/drivers/test_bqrrp_gpu.cu +++ b/test/drivers/test_bqrrp_gpu.cu @@ -3,9 +3,9 @@ #include "rl_lapackpp.hh" #include "rl_gen.hh" -#include -#include -#include +#include +#include +#include #include #include @@ -73,17 +73,17 @@ class TestBQRRP : public ::testing::TestWithParam row = m; col = n; rank = k; - cudaMalloc(&A_device, m * n * sizeof(T)); - cudaMalloc(&A_sk_device, d * n * sizeof(T)); - cudaMalloc(&tau_device, n * sizeof(T)); - cudaMalloc(&J_device, n * sizeof(int64_t)); + hipMalloc(&A_device, m * n * sizeof(T)); + hipMalloc(&A_sk_device, d * n * sizeof(T)); + hipMalloc(&tau_device, n * sizeof(T)); + hipMalloc(&J_device, n * sizeof(int64_t)); } ~BQRRPTestData() { - cudaFree(A_device); - cudaFree(A_sk_device); - cudaFree(tau_device); - cudaFree(J_device); + hipFree(A_device); + hipFree(A_sk_device); + hipFree(tau_device); + hipFree(J_device); } }; @@ -95,14 +95,14 @@ class TestBQRRP : public ::testing::TestWithParam auto state_const = state; // Skethcing in an sampling regime - T* S = new T[d * m](); + T* S = ( T * ) calloc( d * m, sizeof( T ) ); RandBLAS::DenseDist D(d, m); - RandBLAS::fill_dense(D, S, state_const).second; + RandBLAS::fill_dense(D, S, state_const); blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, all_data.A.data(), m, 0.0, all_data.A_sk.data(), d); - delete[] S; - cudaMemcpy(all_data.A_sk_device, all_data.A_sk.data(), d * n * sizeof(double), cudaMemcpyHostToDevice); + free(S); + hipMemcpy(all_data.A_sk_device, all_data.A_sk.data(), d * n * sizeof(double), hipMemcpyHostToDevice); - cudaMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice); + hipMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), hipMemcpyHostToDevice); lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpu.data(), m); lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), m); lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), m); @@ -174,7 +174,7 @@ class TestBQRRP : public ::testing::TestWithParam BQRRP_GPU.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device); if(BQRRP_GPU.rank == 0) { - cudaMemcpy(all_data.A.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost); + hipMemcpy(all_data.A.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost); for(int i = 0; i < m * n; ++i) { ASSERT_NEAR(all_data.A[i], 0.0, atol); } @@ -182,10 +182,10 @@ class TestBQRRP : public ::testing::TestWithParam all_data.rank = BQRRP_GPU.rank; printf("RANK AS RETURNED BY BQRRP GPU %4ld\n", all_data.rank); - cudaMemcpy(all_data.R_full.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost); - cudaMemcpy(all_data.Q.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost); - cudaMemcpy(all_data.tau.data(), all_data.tau_device, n * sizeof(T), cudaMemcpyDeviceToHost); - cudaMemcpy(all_data.J.data(), all_data.J_device, n * sizeof(int64_t), cudaMemcpyDeviceToHost); + hipMemcpy(all_data.R_full.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost); + hipMemcpy(all_data.Q.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost); + hipMemcpy(all_data.tau.data(), all_data.tau_device, n * sizeof(T), hipMemcpyDeviceToHost); + hipMemcpy(all_data.J.data(), all_data.J_device, n * sizeof(int64_t), hipMemcpyDeviceToHost); lapack::ungqr(m, n, n, all_data.Q.data(), m, all_data.tau.data()); RandLAPACK::util::upsize(all_data.rank * n, all_data.R); @@ -197,10 +197,10 @@ class TestBQRRP : public ::testing::TestWithParam error_check(norm_A, all_data, atol); } - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Error before test_BQRRP_general returned. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Error before test_BQRRP_general returned. " << hipGetErrorString(ierr)) abort(); } } @@ -219,9 +219,9 @@ class TestBQRRP : public ::testing::TestWithParam BQRRP_GPU.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device); BQRRP_CPU.call(m, n, all_data.A_cpu.data(), m, (T) (d / BQRRP_CPU.block_size) , all_data.tau_cpu.data(), all_data.J_cpu.data(), state); - cudaMemcpy(all_data.R_full.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost); - cudaMemcpy(all_data.tau.data(), all_data.tau_device, n * sizeof(T), cudaMemcpyDeviceToHost); - cudaMemcpy(all_data.J.data(), all_data.J_device, n * sizeof(int64_t), cudaMemcpyDeviceToHost); + hipMemcpy(all_data.R_full.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost); + hipMemcpy(all_data.tau.data(), all_data.tau_device, n * sizeof(T), hipMemcpyDeviceToHost); + hipMemcpy(all_data.J.data(), all_data.J_device, n * sizeof(int64_t), hipMemcpyDeviceToHost); for(int i = 0; i < n; ++i) { all_data.tau[i] -= all_data.tau_cpu[i]; @@ -243,10 +243,10 @@ class TestBQRRP : public ::testing::TestWithParam ASSERT_NEAR(col_nrm_tau, 0.0, atol1); ASSERT_NEAR(norm_R_diff, 0.0, atol2); - cudaError_t ierr = cudaGetLastError(); - if (ierr != cudaSuccess) + hipError_t ierr = hipGetLastError(); + if (ierr != hipSuccess) { - RandLAPACK_CUDA_ERROR("Error before test_BQRRP_compare_with_CPU returned. " << cudaGetErrorString(ierr)) + RandLAPACK_CUDA_ERROR("Error before test_BQRRP_compare_with_CPU returned. " << hipGetErrorString(ierr)) abort(); } } diff --git a/test/drivers/test_cqrrpt_gpu.cu b/test/drivers/test_cqrrpt_gpu.cu index fd51e6b2..4be8a1d6 100644 --- a/test/drivers/test_cqrrpt_gpu.cu +++ b/test/drivers/test_cqrrpt_gpu.cu @@ -3,13 +3,13 @@ #include "rl_lapackpp.hh" #include "rl_gen.hh" -#include -#include +#include +#include #include #include #include -#include +#include #ifndef USE_CUDA #define USE_CUDA diff --git a/test/drivers/test_hqrrp.cc b/test/drivers/test_hqrrp.cc index 4e079620..ffa09f2b 100644 --- a/test/drivers/test_hqrrp.cc +++ b/test/drivers/test_hqrrp.cc @@ -53,7 +53,7 @@ class TestHQRRP : public ::testing::Test lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), m); lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), m); norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); - std::fill(all_data.J.begin(), all_data.J.end(), 0); + std::iota(all_data.J.begin(), all_data.J.end(), 1); } diff --git a/test/drivers/test_revd2.cc b/test/drivers/test_revd2.cc index 4feaf31b..6cfad98b 100644 --- a/test/drivers/test_revd2.cc +++ b/test/drivers/test_revd2.cc @@ -7,6 +7,7 @@ #include #include + class TestREVD2 : public ::testing::Test { protected: @@ -75,12 +76,13 @@ class TestREVD2 : public ::testing::Test template struct algorithm_objects { - RandLAPACK::SYPS SYPS; - RandLAPACK::HQRQ Orth_RF; - RandLAPACK::SYRF SYRF; - // ^ Needs a symmetric power skether and an orthogonalizer - RandLAPACK::REVD2 REVD2; - // ^ Needs a symmetric rangefinder. + using SYPS_t = RandLAPACK::SYPS; + using Orth_t = RandLAPACK::HQRQ; + using SYRF_t = RandLAPACK::SYRF; + SYPS_t syps; + Orth_t orth; + SYRF_t syrf; + RandLAPACK::REVD2 revd2; algorithm_objects( @@ -90,10 +92,10 @@ class TestREVD2 : public ::testing::Test int64_t passes_per_syps_stabilization, int64_t num_steps_power_iter_error_est ) : - SYPS(num_syps_passes, passes_per_syps_stabilization, verbose, cond_check), - Orth_RF(cond_check, verbose), - SYRF(SYPS, Orth_RF, verbose, cond_check), - REVD2(SYRF, num_steps_power_iter_error_est, verbose) + syps(num_syps_passes, passes_per_syps_stabilization, verbose, cond_check), + orth(cond_check, verbose), + syrf(syps, orth, verbose, cond_check), + revd2(syrf, num_steps_power_iter_error_est, verbose) {} }; @@ -150,7 +152,7 @@ class TestREVD2 : public ::testing::Test auto m = all_data.dim; int64_t k = k_start; - all_algs.REVD2.call(blas::Uplo::Upper, m, all_data.A.data(), k, tol, all_data.V, all_data.eigvals, state); + all_algs.revd2.call(blas::Uplo::Upper, m, all_data.A.data(), k, tol, all_data.V, all_data.eigvals, state); T* E_dat = RandLAPACK::util::upsize(k * k, all_data.E); T* Buf_dat = RandLAPACK::util::upsize(m * k, all_data.Buf); @@ -188,8 +190,8 @@ class TestREVD2 : public ::testing::Test auto m = all_data.dim; int64_t k = k_start; - all_algs.REVD2.call(blas::Uplo::Upper, m, all_data.A_u.data(), k, tol, all_data.V_u, all_data.eigvals_u, state); - all_algs.REVD2.call(blas::Uplo::Lower, m, all_data.A_l.data(), k, tol, all_data.V_l, all_data.eigvals_l, state); + all_algs.revd2.call(blas::Uplo::Upper, m, all_data.A_u.data(), k, tol, all_data.V_u, all_data.eigvals_u, state); + all_algs.revd2.call(blas::Uplo::Lower, m, all_data.A_l.data(), k, tol, all_data.V_l, all_data.eigvals_l, state); T* E_u_dat = RandLAPACK::util::upsize(k * k, all_data.E_u); T* E_l_dat = RandLAPACK::util::upsize(k * k, all_data.E_l);