diff --git a/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh b/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh
index 5745c9a8..8d1108dc 100644
--- a/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh
+++ b/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh
@@ -1,8 +1,9 @@
+#include "hip/hip_runtime.h"
 #pragma once
 #include "rl_cuda_macros.hh"
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cooperative_groups.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
 
 namespace RandLAPACK::cuda_kernels {
 
@@ -487,7 +488,7 @@ __global__ void copy_mat_gpu(
 
 template <typename T>
 void naive_rank_est(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t n,
     const T alpha,
     const T* A,
@@ -496,21 +497,21 @@ void naive_rank_est(
 ) {
 #ifdef USE_CUDA
     int64_t* rank_device;
-    cudaMalloc(&rank_device, sizeof(int64_t));
+    hipMalloc(&rank_device, sizeof(int64_t));
 
     constexpr int threadsPerBlock{128};
     int64_t num_blocks_ger{(n + threadsPerBlock - 1) / threadsPerBlock};
     naive_rank_est<<<num_blocks_ger, threadsPerBlock, sizeof(int64_t), stream>>>(n, alpha, A, lda, rank_device);
 
     // It seems that these synchs are required, after all
-    cudaStreamSynchronize(stream);
-    cudaMemcpy(&rank, rank_device, sizeof(int64_t), cudaMemcpyDeviceToHost);
-    cudaStreamSynchronize(stream);
-    cudaFree(rank_device);
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipStreamSynchronize(stream);
+    hipMemcpy(&rank, rank_device, sizeof(int64_t), hipMemcpyDeviceToHost);
+    hipStreamSynchronize(stream);
+    hipFree(rank_device);
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch naive_rank_est. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch naive_rank_est. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -520,7 +521,7 @@ void naive_rank_est(
 
 template <typename T>
 void all_of(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t n,
     const T alpha,
     const T* A,
@@ -528,19 +529,19 @@ void all_of(
 ) {
 #ifdef USE_CUDA
     bool* all_greater_device;
-    cudaMalloc(&all_greater_device, sizeof(bool));
+    hipMalloc(&all_greater_device, sizeof(bool));
 
     constexpr int threadsPerBlock{128};
     int64_t num_blocks_ger{(n + threadsPerBlock - 1) / threadsPerBlock};
     all_of<<<num_blocks_ger, threadsPerBlock, sizeof(bool), stream>>>(n, alpha, A, all_greater_device);
-    cudaStreamSynchronize(stream);
-    cudaMemcpy(&all_greater, all_greater_device, sizeof(bool), cudaMemcpyDeviceToHost);
-    cudaStreamSynchronize(stream);
-    cudaFree(all_greater_device);
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipStreamSynchronize(stream);
+    hipMemcpy(&all_greater, all_greater_device, sizeof(bool), hipMemcpyDeviceToHost);
+    hipStreamSynchronize(stream);
+    hipFree(all_greater_device);
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch all_greater. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch all_greater. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -550,7 +551,7 @@ void all_of(
 
 template <typename T>
 void ger_gpu(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t m, 
     int64_t n, 
     T alpha, 
@@ -567,10 +568,10 @@ void ger_gpu(
     int64_t num_blocks_ger{(m + threadsPerBlock - 1) / threadsPerBlock};
     ger_gpu<<<num_blocks_ger, threadsPerBlock, 0, stream>>>(m, n, alpha, x, incx, y, incy, A, lda);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch ger_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch ger_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -580,7 +581,7 @@ void ger_gpu(
 
 template <typename T>
 void copy_mat_gpu(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t m,
     int64_t n,
     const T* A,
@@ -594,10 +595,10 @@ void copy_mat_gpu(
     dim3 numBlocks((n + threadsPerBlock.x - 1) / threadsPerBlock.x,
                    (m + threadsPerBlock.y - 1) / threadsPerBlock.y);
     copy_mat_gpu<<<numBlocks, threadsPerBlock, 0, stream>>>(m, n, A, lda, A_cpy, ldat, copy_upper_triangle);
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch copy_mat_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch copy_mat_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -607,7 +608,7 @@ void copy_mat_gpu(
 
 template <typename T>
 void col_swap_gpu(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t m, 
     int64_t n, 
     int64_t k,
@@ -617,39 +618,39 @@ void col_swap_gpu(
 ) {
 #ifdef USE_CUDA
     int64_t* idx_copy;
-    cudaMallocAsync(&idx_copy, sizeof(int64_t) * n, stream);
+    hipMallocAsync(&idx_copy, sizeof(int64_t) * n, stream);
     
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to allocate for col_swap_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to allocate for col_swap_gpu. " << hipGetErrorString(ierr))
         abort();
     }
     constexpr int numThreads = 128;
     dim3 dimBlock(numThreads, 1, 1);
     static int upper_bound = [&] {
         int numBlocksPerSm = 0;
-        cudaDeviceProp deviceProp;
-        cudaGetDeviceProperties(&deviceProp, 0);
-        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, col_swap_gpu_kernel<T>, numThreads, 0);
+        hipDeviceProp_t deviceProp;
+        hipGetDeviceProperties(&deviceProp, 0);
+        hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, col_swap_gpu_kernel<T>, numThreads, 0);
         return deviceProp.multiProcessorCount * numBlocksPerSm;
     }();
     int lower_bound = std::max(m, k);
     lower_bound = (lower_bound + numThreads - 1) / numThreads;
     dim3 dimGrid(std::min(upper_bound, lower_bound), 1, 1);
     void* kernelArgs[] = {(void*)&m, (void*)&n, (void*)&k, (void*)&A, (void*)&lda, (void*)&idx, (void*)&idx_copy};
-    cudaLaunchCooperativeKernel((void*)col_swap_gpu_kernel_sequential<T>, dimGrid, dimBlock, kernelArgs, 0, stream);
-    ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipLaunchCooperativeKernel((void*)col_swap_gpu_kernel_sequential<T>, dimGrid, dimBlock, kernelArgs, 0, stream);
+    ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu sequential. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu sequential. " << hipGetErrorString(ierr))
         abort();
     }
-    cudaFreeAsync(idx_copy, stream);
-    ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipFreeAsync(idx_copy, stream);
+    ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to deallocate for col_swap_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to deallocate for col_swap_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -661,7 +662,7 @@ void col_swap_gpu(
 /// idx array modified ONLY within the scope of this function.
 template <typename T>
 void col_swap_gpu(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t m, 
     int64_t n, 
     int64_t k,
@@ -678,10 +679,10 @@ void col_swap_gpu(
                          (m + threadsPerBlock.y - 1) / threadsPerBlock.y);
     col_swap_gpu_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(m, n, k, A, lda, A_cpy, ldac, idx);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu with parallel pivots. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu with parallel pivots. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -693,7 +694,7 @@ void col_swap_gpu(
 /// idx array modified ONLY within the scope of this function.
 template <typename T>
 void col_swap_gpu(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t m,  
     int64_t k,
     int64_t* J, 
@@ -703,26 +704,26 @@ void col_swap_gpu(
 /*
     std::vector<int64_t> idx_copy(k);
     auto j = std::make_unique<int64_t[]>(m);
-    cudaMemcpyAsync(idx_copy.data(), idx, sizeof(int64_t) * k, cudaMemcpyDeviceToHost, stream);
-    cudaMemcpyAsync(j.get(), J, sizeof(int64_t) * m, cudaMemcpyDeviceToHost, stream);
-    cudaStreamSynchronize(stream);
+    hipMemcpyAsync(idx_copy.data(), idx, sizeof(int64_t) * k, hipMemcpyDeviceToHost, stream);
+    hipMemcpyAsync(j.get(), J, sizeof(int64_t) * m, hipMemcpyDeviceToHost, stream);
+    hipStreamSynchronize(stream);
 
     RandLAPACK::util::col_swap<T>(m, k, j.get(), std::move(idx_copy));
 
-    cudaMemcpyAsync(J, j.get(), sizeof(int64_t) * m, cudaMemcpyHostToDevice, stream);
+    hipMemcpyAsync(J, j.get(), sizeof(int64_t) * m, hipMemcpyHostToDevice, stream);
     // must add this to avoid dangling reference during async copy
-    cudaStreamSynchronize(stream);
+    hipStreamSynchronize(stream);
 */
 #ifdef USE_CUDA
     int64_t* idx_copy;
-    cudaMallocAsync(&idx_copy, sizeof(int64_t) * k, stream);
+    hipMallocAsync(&idx_copy, sizeof(int64_t) * k, stream);
     vec_ell_swap_gpu<T><<<1, 512, 0, stream>>>(m, k, J, idx, idx_copy);
 
-    cudaFreeAsync(idx_copy, stream);
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipFreeAsync(idx_copy, stream);
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -732,7 +733,7 @@ void col_swap_gpu(
 
 template <typename T>
 void col_swap_gpu(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t m,  
     int64_t k,
     int64_t* J, 
@@ -744,10 +745,10 @@ void col_swap_gpu(
     int64_t num_blocks{(k + threadsPerBlock - 1) / threadsPerBlock};
     vec_ell_swap_gpu<T><<<num_blocks, threadsPerBlock, 0, stream>>>(m, k, J, J_cpy, idx);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu vector. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch col_swap_gpu vector. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -758,7 +759,7 @@ void col_swap_gpu(
 
 template <typename T>
 void transposition_gpu(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t m,
     int64_t n,
     const T* A,
@@ -772,10 +773,10 @@ void transposition_gpu(
     int64_t num_blocks{(m + threadsPerBlock - 1) / threadsPerBlock};
     transposition_gpu<<<num_blocks, threadsPerBlock, 0, stream>>>(m, n, A, lda, AT, ldat, copy_upper_triangle);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch transposition_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch transposition_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -788,7 +789,7 @@ void transposition_gpu(
 // TODO: we get a multiple definition linker error if this function is not templated.
 template <typename Idx>
 inline void LUQRCP_piv_process_gpu(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     Idx sampling_dim,
     Idx cols,
     Idx* J_buffer,
@@ -799,10 +800,10 @@ inline void LUQRCP_piv_process_gpu(
     Idx n{std::min(sampling_dim, cols)};
     LUQRCP_piv_process_gpu_global<<<1, threadsPerBlock, 0, stream>>>(cols, n, J_buffer, J_buffer_lu);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch piv_process_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch piv_process_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -814,7 +815,7 @@ inline void LUQRCP_piv_process_gpu(
 // Outputs tau instead of T.
 template <typename T>
 void orhr_col_gpu(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t m,
     int64_t n,
     T* A,
@@ -840,10 +841,10 @@ void orhr_col_gpu(
     int64_t num_blocks{(n + threadsPerBlock - 1) / threadsPerBlock};
     elementwise_product<<<num_blocks, threadsPerBlock, 0, stream>>>(n, T{-1}, A, lda + 1, D, 1, tau, 1);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch orhr_col_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch orhr_col_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -854,7 +855,7 @@ void orhr_col_gpu(
 // Setting signs in the R-factor from CholQR after orhr_col outputs.
 template <typename T>
 void R_cholqr_signs_gpu(
-    cudaStream_t stream, 
+    hipStream_t stream, 
     int64_t b_sz,
     int64_t b_sz_const, 
     T* R,  
@@ -867,10 +868,10 @@ void R_cholqr_signs_gpu(
 
     R_cholqr_signs_gpu<<<numBlocks, threadsPerBlock, 0, stream>>>(b_sz, b_sz_const, R, D);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch R_cholqr_signs_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch R_cholqr_signs_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -880,7 +881,7 @@ void R_cholqr_signs_gpu(
 
 template <typename T>
 void copy_gpu(
-    cudaStream_t stream,
+    hipStream_t stream,
     int64_t n,  
     T const* src,
     int64_t incr_src,  
@@ -892,10 +893,10 @@ void copy_gpu(
     int64_t num_blocks{(n + threadsPerBlock - 1) / threadsPerBlock};
     copy_gpu<<<num_blocks, threadsPerBlock, 0, stream>>>(n, src, incr_src, dest, incr_dest);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch copy_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch copy_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -905,7 +906,7 @@ void copy_gpu(
 
 template <typename T>
 void fill_gpu(
-    cudaStream_t stream,
+    hipStream_t stream,
     int64_t n,  
     T* x,
     int64_t incx,  
@@ -916,10 +917,10 @@ void fill_gpu(
     int64_t num_blocks{(n + threadsPerBlock - 1) / threadsPerBlock};
     fill_gpu<<<num_blocks, threadsPerBlock, 0, stream>>>(n, x, incx, alpha);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch fill_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch fill_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -929,7 +930,7 @@ void fill_gpu(
 
 template <typename T>
 void fill_gpu(
-    cudaStream_t stream,
+    hipStream_t stream,
     int64_t n,  
     int64_t* x,
     int64_t incx,  
@@ -940,10 +941,10 @@ void fill_gpu(
     int64_t num_blocks{(n + threadsPerBlock - 1) / threadsPerBlock};
     fill_gpu<<<num_blocks, threadsPerBlock, 0, stream>>>(n, x, incx, alpha);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch fill_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch fill_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
@@ -953,7 +954,7 @@ void fill_gpu(
 
 template <typename T>
 void get_U_gpu(
-    cudaStream_t stream,
+    hipStream_t stream,
     int64_t m,
     int64_t n,
     T* A,
@@ -964,10 +965,10 @@ void get_U_gpu(
     int64_t num_blocks{std::max((n + threadsPerBlock - 1) / threadsPerBlock, (int64_t) 1)};
     get_U_gpu<<<num_blocks, threadsPerBlock, 0, stream>>>(m, n, A, lda);
 
-    cudaError_t ierr = cudaGetLastError();
-    if (ierr != cudaSuccess)
+    hipError_t ierr = hipGetLastError();
+    if (ierr != hipSuccess)
     {
-        RandLAPACK_CUDA_ERROR("Failed to launch get_U_gpu. " << cudaGetErrorString(ierr))
+        RandLAPACK_CUDA_ERROR("Failed to launch get_U_gpu. " << hipGetErrorString(ierr))
         abort();
     }
 #else
diff --git a/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc b/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc
index f8afb254..956d94cb 100644
--- a/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc
+++ b/benchmark/bench_BQRRP/BQRRP_runtime_breakdown.cc
@@ -120,8 +120,8 @@ int main(int argc, char *argv[]) {
     int64_t m          = std::stol(size);
     int64_t n          = std::stol(size);
     double  d_factor   = 1.0;
-    std::vector<int64_t> b_sz = {250, 500, 1000, 2000, 4000, 8000};
-    //std::vector<int64_t> b_sz = {256, 512, 1024, 2048, 4096, 8192};
+    int64_t b_sz_start = 32;
+    int64_t b_sz_end   = 128;
     auto state         = RandBLAS::RNGState<r123::Philox4x32>();
     auto state_constant = state;
     // Timing results
@@ -141,7 +141,7 @@ int main(int argc, char *argv[]) {
                                                                  + "_num_info_lines_" + std::to_string(6) +
                                                                    ".txt";
 
-    std::ofstream file(output_filename, std::ios::out | std::ios::app);
+    std::ofstream file(output_filename, std::ios::out | std::ios::trunc);
 
     // Writing important data into file
     file << "Description: Results from the BQRRP runtime breakdown benchmark, recording the time it takes to perform every subroutine in BQRRP."
@@ -149,13 +149,12 @@ int main(int argc, char *argv[]) {
               "               rows correspond to BQRRP runs with block sizes varying in powers of 2, with numruns repititions of each block size"
               "\nInput type:"       + std::to_string(m_info.m_type) +
               "\nInput size:"       + std::to_string(m) + " by "  + std::to_string(n) +
-              "\nAdditional parameters: Tall QR subroutine " + argv[2] + " BQRRP block size start: " + std::to_string(b_sz.front()) + " BQRRP block size end: " + std::to_string(b_sz.back()) + " num runs per size " + std::to_string(numruns) + " BQRRP d factor: "   + std::to_string(d_factor) +
+              "\nAdditional parameters: Tall QR subroutine " + argv[2] + " BQRRP block size start: " + std::to_string(b_sz_start) + " BQRRP block size end: " + std::to_string(b_sz_end) + " num runs per size " + std::to_string(numruns) + " BQRRP d factor: "   + std::to_string(d_factor) +
               "\n";
     file.flush();
 
-    int i = 0;
-    for (;i < b_sz.size(); ++i) {
-        call_all_algs(m_info, numruns, b_sz[i], qr_tall, all_data, state_constant, output_filename);
+    for (;b_sz_start <= b_sz_end; b_sz_start *= 2) {
+        call_all_algs(m_info, numruns, b_sz_start, qr_tall, all_data, state_constant, output_filename);
     }
 }
 #endif
diff --git a/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc b/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc
index 0a0919e8..1657d55c 100644
--- a/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc
+++ b/benchmark/bench_BQRRP/BQRRP_subroutines_speed.cc
@@ -136,26 +136,31 @@ static void call_wide_qrcp(
     std::string output_filename) {
 
     auto m = all_data.row;  
+    auto tol = all_data.tolerance;
+
+    RandLAPACK::CQRRPT<double, r123::Philox4x32> CQRRPT(false, tol);
+    CQRRPT.nnz = 4;
 
     // timing vars
     long dur_geqp3  = 0;
     long dur_luqr   = 0;
+    long dur_cqrrpt = 0;
  
     // Making sure the states are unchanged
     auto state_alg = state;
 
     int i, j = 0;
     for (i = 0; i < numruns; ++i) {
-        printf("Wide QRCP iteration %d; m==%ld start.\n", i, n);
+        printf("Wide QRCP iteration %d; m==%d start.\n", i, n);
         // Testing GEQP3
-        auto start_geqp3 = high_resolution_clock::now();
+        auto start_geqp3 = steady_clock::now();
         lapack::geqp3(n, m, all_data.A.data(), n, all_data.J.data(), all_data.tau.data());
-        auto stop_geqp3 = high_resolution_clock::now();
+        auto stop_geqp3 = steady_clock::now();
         dur_geqp3 = duration_cast<microseconds>(stop_geqp3 - start_geqp3).count();
         data_regen(m_info, all_data, state, state, 1);
 
         // Testing LUQR
-        auto start_luqr = high_resolution_clock::now();
+        auto start_luqr = steady_clock::now();
         // Perform pivoted LU on A_sk', follow it up by unpivoted QR on a permuted A_sk.
         // Get a transpose of A_sk 
         RandLAPACK::util::transposition(n, m, all_data.A.data(), n, all_data.A_trans.data(), m, 0);
@@ -172,7 +177,7 @@ static void call_wide_qrcp(
         RandLAPACK::util::col_swap(n, m, m, all_data.A.data(), n, all_data.J);
         // Perform an unpivoted QR on A_sk
         lapack::geqrf(n, m, all_data.A.data(), n, all_data.tau.data());
-        auto stop_luqr = high_resolution_clock::now();
+        auto stop_luqr = steady_clock::now();
         dur_luqr = duration_cast<microseconds>(stop_luqr - start_luqr).count();
         data_regen(m_info, all_data, state, state, 1);
     
@@ -193,6 +198,7 @@ static void call_tsqr(
     std::string output_filename) {
 
     auto m   = all_data.row;
+    auto tol = all_data.tolerance;
     int64_t tsize = 0;
 
     // timing vars
@@ -205,11 +211,10 @@ static void call_tsqr(
     long dur_cholqr_r_restore  = 0;
 
     // Imitating the QRCP on a sketch stage of BQRRP - needed to get a preconditioner
-    T* S       = new T[n * m]();
-    T* A_sk    = new T[n * n]();
-    int64_t* J = new int64_t[n]();
-    T* tau     = new T[n]();
-    
+    T* S       = ( T * )       calloc( n * m, sizeof( T ) );
+    T* A_sk    = ( T * )       calloc( n * n, sizeof( T ) );
+    int64_t* J = ( int64_t * ) calloc( n,     sizeof( int64_t ) );
+    T* tau     = ( T * )       calloc( n,     sizeof( T ) );
     RandBLAS::DenseDist D(n, m);
     auto state_const = state;
     RandBLAS::fill_dense(D, S, state_const);
@@ -224,45 +229,45 @@ static void call_tsqr(
         for(nb = geqrt_nb_start; nb <= n; nb *=2) {
             printf("TSQR iteration %d; n==%ld start.\n", i, n);
 
-            auto start_geqrt = high_resolution_clock::now();
+            auto start_geqrt = steady_clock::now();
             lapack::geqrt( m, n, nb, all_data.A.data(), m, all_data.T_mat.data(), n );
-            auto stop_geqrt = high_resolution_clock::now();
+            auto stop_geqrt = steady_clock::now();
             dur_geqrt = duration_cast<microseconds>(stop_geqrt - start_geqrt).count();
 
             if(nb == geqrt_nb_start) {
                 // Testing GEQRF
-                auto start_geqrf = high_resolution_clock::now();
+                auto start_geqrf = steady_clock::now();
                 lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data());
-                auto stop_geqrf = high_resolution_clock::now();
+                auto stop_geqrf = steady_clock::now();
                 dur_geqrf = duration_cast<microseconds>(stop_geqrf - start_geqrf).count();
                 data_regen(m_info, all_data, state, state, 2);
 
                 // Testing GEQR
-                auto start_geqr = high_resolution_clock::now();
+                auto start_geqr = steady_clock::now();
                 lapack::geqr(m, n, all_data.A.data(), m,  all_data.tau.data(), -1);
                 tsize = (int64_t) all_data.tau[0]; 
                 all_data.tau.resize(tsize);
                 lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize);
-                auto stop_geqr = high_resolution_clock::now();
+                auto stop_geqr = steady_clock::now();
                 dur_geqr = duration_cast<microseconds>(stop_geqr - start_geqr).count();
                 data_regen(m_info, all_data, state, state, 2);
 
                 // Testing CholQR
-                auto start_precond = high_resolution_clock::now();
+                auto start_precond = steady_clock::now();
                 blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, (T) 1.0, A_sk, n, all_data.A.data(), m);
-                auto stop_precond = high_resolution_clock::now();
+                auto stop_precond = steady_clock::now();
                 dur_cholqr_precond = duration_cast<microseconds>(stop_precond - start_precond).count();
-                auto start_cholqr = high_resolution_clock::now();
+                auto start_cholqr = steady_clock::now();
                 blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, (T) 1.0, all_data.A.data(), m, (T) 0.0, all_data.R.data(), n);
                 lapack::potrf(Uplo::Upper, n, all_data.R.data(), n);
                 blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, (T) 1.0, all_data.R.data(), n, all_data.A.data(), m);
-                auto stop_cholqr = high_resolution_clock::now();
+                auto stop_cholqr = steady_clock::now();
                 dur_cholqr = duration_cast<microseconds>(stop_cholqr - start_cholqr).count();
-                auto start_orhr_col = high_resolution_clock::now();
+                auto start_orhr_col = steady_clock::now();
                 lapack::orhr_col(m, n, n, all_data.A.data(), m, all_data.T_mat.data(), n, all_data.D.data());
-                auto stop_cholqr_orhr = high_resolution_clock::now();
+                auto stop_cholqr_orhr = steady_clock::now();
                 dur_cholqr_house_rest = duration_cast<microseconds>(stop_cholqr_orhr - start_orhr_col).count();
-                auto start_r_restore = high_resolution_clock::now();
+                auto start_r_restore = steady_clock::now();
                 // Construct the proper R-factor
                 for(int i = 0; i < n; ++i) {
                     for(int j = 0; j < (i + 1); ++j) {
@@ -271,7 +276,7 @@ static void call_tsqr(
                 }
                 blas::trmm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, n, n, (T) 1.0, A_sk, n, all_data.R.data(), n);
                 lapack::lacpy(MatrixType::Upper, n, n, all_data.R.data(), n, all_data.A.data(), m);
-                auto stop_r_restore = high_resolution_clock::now();
+                auto stop_r_restore = steady_clock::now();
                 dur_cholqr_r_restore = duration_cast<microseconds>(stop_r_restore - start_r_restore).count();
                 data_regen(m_info, all_data, state, state, 2);
             
@@ -284,10 +289,10 @@ static void call_tsqr(
         file << "\n";
     }
 
-    delete[] A_sk;
-    delete[] S;
-    delete[] J;
-    delete[] tau;
+    free(A_sk);
+    free(S);
+    free(J);
+    free(tau);
 }
 
 template <typename T, typename RNG>
@@ -306,6 +311,7 @@ static void call_apply_q(
     // timing vars
     long dur_ormqr  = 0;
     long dur_gemqrt = 0;
+    long dur_gemm   = 0;
 
     std::ofstream file(output_filename, std::ios::app);
 
@@ -313,7 +319,7 @@ static void call_apply_q(
     int64_t nb = 0;
     for (i = 0; i < numruns; ++i) {
         for(nb = gemqrt_nb_start; nb <= n; nb *=2) {
-            printf("Apply Q iteration %d; n==%ld start.\n", i, n);
+            printf("Apply Q iteration %d; n==%d start.\n", i, n);
             // Performing CholQR
             blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, (T) 1.0, all_data.A.data(), m, (T) 0.0, all_data.R.data(), n);
             lapack::potrf(Uplo::Upper, n, all_data.R.data(), n);
@@ -322,9 +328,9 @@ static void call_apply_q(
             lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_gemqrt.data(), m);
             lapack::orhr_col(m, n, nb, all_data.A_gemqrt.data(), m, all_data.T_gemqrt.data(), n, all_data.D.data());
             
-            auto start_gemqrt = high_resolution_clock::now();
+            auto start_gemqrt = steady_clock::now();
             lapack::gemqrt(Side::Left, Op::Trans, m, m - n, n, nb, all_data.A_gemqrt.data(), m, all_data.T_gemqrt.data(), n, all_data.B1.data(), m);
-            auto stop_gemqrt = high_resolution_clock::now();
+            auto stop_gemqrt = steady_clock::now();
             dur_gemqrt = duration_cast<microseconds>(stop_gemqrt - start_gemqrt).count();
 
             // We do not re-run ormqr and gemm for different nbs
@@ -335,9 +341,9 @@ static void call_apply_q(
                 for(j = 0; j < n; ++j)
                     all_data.tau[j] = all_data.T_mat[(n + 1) * j];
 
-                auto start_ormqr = high_resolution_clock::now();
+                auto start_ormqr = steady_clock::now();
                 lapack::ormqr(Side::Left, Op::Trans, m, m - n, n, all_data.A.data(), m, all_data.tau.data(), all_data.B.data(), m);
-                auto stop_ormqr = high_resolution_clock::now();
+                auto stop_ormqr = steady_clock::now();
                 dur_ormqr = duration_cast<microseconds>(stop_ormqr - start_ormqr).count();
             
                 file << dur_ormqr << ",  ";                
@@ -404,4 +410,4 @@ int main(int argc, char *argv[]) {
     for (i = n_start; i <= n_stop; i *= 2)
         call_apply_q(m_info, numruns, i, nb_start, all_data, state, state_B, output_filename);
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc b/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc
index 6c1c8a97..06e7a0fa 100644
--- a/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc
+++ b/benchmark/bench_BQRRP/HQRRP_runtime_breakdown.cc
@@ -49,7 +49,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info<T> m_info,
 
     RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state);
     std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0);
-    std::fill(all_data.J.begin(), all_data.J.end(), 0);
+    std::iota(all_data.J.begin(), all_data.J.end(), 1);
 }
 
 template <typename T, typename RNG>
@@ -71,7 +71,7 @@ static void call_all_algs(
     int panel_pivoting = 0;
 
     // Timing vars
-    T* times  = ( T * ) calloc( 27, sizeof( T ) );
+    T* times = ( T * ) calloc(27, sizeof( T ) );
 
     for (int i = 0; i < numruns; ++i) {
         printf("ITERATION %d, NUMCOLS %ld\n", i, n);
@@ -125,7 +125,7 @@ int main(int argc, char *argv[]) {
                                                                  + "_num_info_lines_" + std::to_string(6) +
                                                                    ".txt";
 
-    std::ofstream file(output_filename, std::ios::out | std::ios::app);
+    std::ofstream file(output_filename, std::ios::out | std::ios::trunc);
 
     // Writing important data into file
     file << "Description: Results from the HQRRP runtime breakdown benchmark, recording the time it takes to perform every subroutine in HQRRP."
diff --git a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc
index c2c83b18..1c77e572 100644
--- a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc
+++ b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc
@@ -50,7 +50,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info<T> m_info,
 
 // Re-generate and clear data
 template <typename T>
-static std::vector<T> get_norms(int64_t n, std::vector<T> Mat, int64_t lda) {
+static std::vector<T> get_norms( int64_t m, int64_t n, std::vector<T> Mat, int64_t lda) {
 
     std::vector<T> R_norms (n, 0.0);
     for (int i = 0; i < n; ++i) {
@@ -81,7 +81,7 @@ static void R_norm_ratio(
 
     // Running GEQP3
     lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data());
-    std::vector<T> R_norms_GEQP3 = get_norms(n, all_data.A, m);
+    std::vector<T> R_norms_GEQP3 = get_norms(m, n, all_data.A, m);
     printf("\nDone with QP3\n");
 
     // Clear and re-generate data
@@ -92,7 +92,7 @@ static void R_norm_ratio(
     // Running CQRRP
     state_alg = state;
     CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg);
-    std::vector<T> R_norms_CQRRPT = get_norms(n, all_data.R, n);
+    std::vector<T> R_norms_CQRRPT = get_norms(n, n, all_data.R, n);
 
     // Declare a data file
     std::fstream file1("QR_R_norm_ratios_rows_"        + std::to_string(m)
diff --git a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc
index 9440eba0..b19bb958 100644
--- a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc
+++ b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc
@@ -85,27 +85,27 @@ static void call_all_algs(
     for (int i = 0; i < numruns; ++i) {
         printf("Iteration %d start.\n", i);
         // Testing GEQP3
-        auto start_geqp3 = high_resolution_clock::now();
+        auto start_geqp3 = steady_clock::now();
         lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data());
-        auto stop_geqp3 = high_resolution_clock::now();
+        auto stop_geqp3 = steady_clock::now();
         dur_geqp3 = duration_cast<microseconds>(stop_geqp3 - start_geqp3).count();
 
         state_gen = state;
         data_regen(m_info, all_data, state_gen);
 
         // Testing GEQRF
-        auto start_geqrf = high_resolution_clock::now();
+        auto start_geqrf = steady_clock::now();
         lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data());
-        auto stop_geqrf = high_resolution_clock::now();
+        auto stop_geqrf = steady_clock::now();
         dur_geqrf = duration_cast<microseconds>(stop_geqrf - start_geqrf).count();
 
         state_gen = state;
         data_regen(m_info, all_data, state_gen);
 
         // Testing CQRRPT
-        auto start_cqrrp = high_resolution_clock::now();
+        auto start_cqrrp = steady_clock::now();
         CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg);
-        auto stop_cqrrp = high_resolution_clock::now();
+        auto stop_cqrrp = steady_clock::now();
         dur_cqrrpt = duration_cast<microseconds>(stop_cqrrp - start_cqrrp).count();
 
         state_gen = state;
@@ -113,7 +113,7 @@ static void call_all_algs(
         data_regen(m_info, all_data, state_gen);
 
         // Testing SCHOLQR3
-        auto start_scholqr = high_resolution_clock::now();
+        auto start_scholqr = steady_clock::now();
         //--------------------------------------------------------------------------------------------------------------------------//
         T norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m);
         T shift = 11 * std::numeric_limits<T>::epsilon() * n * std::pow(norm_A, 2);
@@ -131,7 +131,7 @@ static void call_all_algs(
         lapack::potrf(Uplo::Upper, n, all_data.R.data(), n);
         blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m);
         //--------------------------------------------------------------------------------------------------------------------------//
-        auto stop_scholqr = high_resolution_clock::now();
+        auto stop_scholqr = steady_clock::now();
         dur_scholqr = duration_cast<microseconds>(stop_scholqr - start_scholqr).count();
 
         auto state_gen = state;
@@ -139,21 +139,21 @@ static void call_all_algs(
 
         // Testing GEQR + GEQPT
 #if !defined(__APPLE__)
-        auto start_geqpt = high_resolution_clock::now();
-        auto start_geqr  = high_resolution_clock::now();
+        auto start_geqpt = steady_clock::now();
+        auto start_geqr  = steady_clock::now();
         // GEQR(A) part
         lapack::geqr(m, n, all_data.A.data(), m,  all_data.tau.data(), -1);
         int64_t tsize = (int64_t) all_data.tau[0]; 
         all_data.tau.resize(tsize);
         lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize);
 
-        auto stop_geqr = high_resolution_clock::now();
+        auto stop_geqr = steady_clock::now();
         dur_geqr = duration_cast<microseconds>(stop_geqr - start_geqr).count();
 
         // GEQP3(R) part
         lapack::lacpy(MatrixType::Upper, n, n, all_data.A.data(), m, all_data.R.data(), n);
         lapack::geqp3(n, n, all_data.R.data(), n, all_data.J.data(), all_data.tau.data());
-        auto stop_geqpt = high_resolution_clock::now();
+        auto stop_geqpt = steady_clock::now();
         dur_geqpt = duration_cast<microseconds>(stop_geqpt - start_geqpt).count();
         state_gen = state;
         data_regen(m_info, all_data, state_gen);
diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc
index 704777b1..2a015012 100644
--- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc
+++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc
@@ -45,42 +45,27 @@ struct RBKI_benchmark_data {
     RBKI_benchmark_data(int64_t m, int64_t n, T tol) :
     A_spectra(m, n)
     {
-        A          = new T[m * n]();
-        U          = new T[m * n]();
-        VT         = new T[n * n]();
-        V          = new T[n * n]();
-        Sigma      = new T[m]();
-        U_RSVD     = new T[m * n]();
-        V_RSVD     = new T[n * n]();
-        Sigma_RSVD = new T[n]();
-        Buffer     = new T[m * n]();
-        Sigma_cpy  = new T[n * n]();
-        U_cpy      = new T[m * n]();
-        VT_cpy     = new T[n * n]();
-
+        A                   = ( T * ) calloc(m * n, sizeof( T ) );
+        U                   = ( T * ) calloc(m * n, sizeof( T ) );
+        VT                  = ( T * ) calloc(n * n, sizeof( T ) );
+        V                   = ( T * ) calloc(n * n, sizeof( T ) );
+        Sigma               = ( T * ) calloc(m,     sizeof( T ) );
+        U_RSVD              = ( T * ) calloc(m * n, sizeof( T ) );
+	    V_RSVD              = ( T * ) calloc(n * n, sizeof( T ) );
+	    Sigma_RSVD          = ( T * ) calloc(n,     sizeof( T ) );
+	    Buffer              = ( T * ) calloc(m * n, sizeof( T ) );
+        Sigma_cpy           = ( T * ) calloc(n * n, sizeof( T ) );
+        U_cpy               = ( T * ) calloc(m * n, sizeof( T ) );
+        VT_cpy              = ( T * ) calloc(n * n, sizeof( T ) );
+
+        //A_lowrank_svd       = ( T * ) calloc(m * n, sizeof( T ) );
+        //A_lowrank_svd_const = ( T * ) calloc(m * n, sizeof( T ) );
         A_lowrank_svd       = nullptr;
         A_lowrank_svd_const = nullptr;
         row                 = m;
         col                 = n;
         tolerance           = tol;
     }
-
-    ~RBKI_benchmark_data() {
-        delete[] A;
-        delete[] U;
-        delete[] VT;
-        delete[] V;
-        delete[] Sigma;
-        delete[] U_RSVD;
-        delete[] V_RSVD;
-        delete[] Sigma_RSVD;
-        delete[] Buffer;
-        delete[] Sigma_cpy;
-        delete[] U_cpy;
-        delete[] VT_cpy;
-        delete[] A_lowrank_svd;
-        delete[] A_lowrank_svd_const;
-    }
 };
 
 template <typename T, typename RNG>
@@ -259,9 +244,9 @@ static void call_all_algs(
         // There is no reason to run SVD many times, as it always outputs the same result.
         if ((b_sz == 16) && (num_matmuls == 2) && ((i == 0) || (i == 1))) {
             // Running SVD
-            auto start_svd = high_resolution_clock::now();
+            auto start_svd = steady_clock::now();
             lapack::gesdd(Job::SomeVec, m, n, all_data.A, m, all_data.Sigma, all_data.U, m, all_data.VT, n);
-            auto stop_svd = high_resolution_clock::now();
+            auto stop_svd = steady_clock::now();
             dur_svd = duration_cast<microseconds>(stop_svd - start_svd).count();
             printf("TOTAL TIME FOR SVD %ld\n", dur_svd);
 
@@ -281,9 +266,9 @@ static void call_all_algs(
         }
         
         // Running RBKI
-        auto start_rbki = high_resolution_clock::now();
+        auto start_rbki = steady_clock::now();
         all_algs.RBKI.call(m, n, all_data.A, m, b_sz, all_data.U, all_data.VT, all_data.Sigma, state_alg);
-        auto stop_rbki = high_resolution_clock::now();
+        auto stop_rbki = steady_clock::now();
         dur_rbki = duration_cast<microseconds>(stop_rbki - start_rbki).count();
         printf("TOTAL TIME FOR RBKI %ld\n", dur_rbki);
 
@@ -298,10 +283,10 @@ static void call_all_algs(
         data_regen(m_info, all_data, state_gen, 1);
         
         // Running RSVD
-        auto start_rsvd = high_resolution_clock::now();
+        auto start_rsvd = steady_clock::now();
         int64_t threshold_RSVD = (int64_t ) (b_sz * num_matmuls / 2);
         all_algs.RSVD.call(m, n, all_data.A, threshold_RSVD, tol, all_data.U_RSVD, all_data.Sigma_RSVD, all_data.V_RSVD, state_alg);
-        auto stop_rsvd = high_resolution_clock::now();
+        auto stop_rsvd = steady_clock::now();
         dur_rsvd = duration_cast<microseconds>(stop_rsvd - start_rsvd).count();
         printf("TOTAL TIME FOR RSVD %ld\n", dur_rsvd);
 
@@ -324,10 +309,10 @@ static void call_all_algs(
         // There is no reason to run SVDS many times, as it always outputs the same result.
         if ((num_matmuls == 2) && ((i == 0) || (i == 1))) {
             // Running SVDS
-            auto start_svds = high_resolution_clock::now();
+            auto start_svds = steady_clock::now();
             Spectra::PartialSVDSolver<Eigen::MatrixXd> svds(all_data.A_spectra, std::min(custom_rank, n-2), std::min(2 * custom_rank, n-1));
             svds.compute();
-            auto stop_svds = high_resolution_clock::now();
+            auto stop_svds = steady_clock::now();
             dur_svds = duration_cast<microseconds>(stop_svds - start_svds).count();
             printf("TOTAL TIME FOR SVDS %ld\n", dur_svds);
 
@@ -418,8 +403,8 @@ int main(int argc, char *argv[]) {
         RandLAPACK::gen::mat_gen_info<double> m_info_A_svd(m, n, RandLAPACK::gen::custom_input);
         m_info_A_svd.filename            = argv[2];
         m_info_A_svd.workspace_query_mod = 0;
-        all_data.A_lowrank_svd       = new double[m * n]();
-        all_data.A_lowrank_svd_const = new double[m * n]();
+        all_data.A_lowrank_svd           = ( double * ) calloc(m * n, sizeof( double ) );
+        all_data.A_lowrank_svd_const     = ( double * ) calloc(m * n, sizeof( double ) );
         RandLAPACK::gen::mat_gen<double>(m_info_A_svd, all_data.A_lowrank_svd_const, state);
         lapack::lacpy(MatrixType::General, m, n, all_data.A_lowrank_svd_const, m, all_data.A_lowrank_svd, m);
     
diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc
index 23955b3a..79911dc7 100644
--- a/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc
+++ b/benchmark/bench_RBKI/RBKI_speed_comparisons_just_RBKI.cc
@@ -138,9 +138,9 @@ static void call_all_algs(
         printf("Iteration %d start.\n", i);
         
         // Testing RBKI
-        auto start_rbki = high_resolution_clock::now();
+        auto start_rbki = steady_clock::now();
         RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.VT.data(), all_data.Sigma.data(), state_alg);
-        auto stop_rbki = high_resolution_clock::now();
+        auto stop_rbki = steady_clock::now();
         dur_rbki = duration_cast<microseconds>(stop_rbki - start_rbki).count();
 
         T residual_err_custom = residual_error_comp<T>(all_data, custom_rank);
diff --git a/benchmark/bench_general/GEMM_flop_count.cc b/benchmark/bench_general/GEMM_flop_count.cc
index f0392760..856d3544 100644
--- a/benchmark/bench_general/GEMM_flop_count.cc
+++ b/benchmark/bench_general/GEMM_flop_count.cc
@@ -22,9 +22,9 @@ test_flops(int64_t k,
     int runs = 50;
     T GFLOPS_rate_best = 0;
 
-    T* A = new T[size]();
-    T* B = new T[size]();
-    T* C = new T[size]();
+    T* A = ( T * ) calloc( size, sizeof( T ) );
+    T* B = ( T * ) calloc( size, sizeof( T ) );
+    T* C = ( T * ) calloc( size, sizeof( T ) );
 
     RandLAPACK::gen::mat_gen_info<double> m_info(k, k, RandLAPACK::gen::gaussian);  
 
@@ -33,9 +33,9 @@ test_flops(int64_t k,
         RandLAPACK::gen::mat_gen(m_info, B, state);
 
         // Get the timing
-        auto start = high_resolution_clock::now();
+        auto start = steady_clock::now();
         gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, k, k, k, 1.0, A, k, B, k, 0.0, C, k);
-        auto stop = high_resolution_clock::now();
+        auto stop = steady_clock::now();
         long dur = duration_cast<microseconds>(stop - start).count();
     
         T dur_s = dur / 1e+6;
@@ -46,10 +46,6 @@ test_flops(int64_t k,
     }
 
     printf("THE SYSTEM IS CAPABLE OF %f GFLOPs/sec.\n\n", GFLOPS_rate_best);
-
-    delete[] A;
-    delete[] B;
-    delete[] C;
 }
 
 int main() {
diff --git a/benchmark/bench_general/Gemm_vs_ormqr.cc b/benchmark/bench_general/Gemm_vs_ormqr.cc
index 7b5f922a..252bf73f 100644
--- a/benchmark/bench_general/Gemm_vs_ormqr.cc
+++ b/benchmark/bench_general/Gemm_vs_ormqr.cc
@@ -46,15 +46,15 @@ test_speed(int64_t m,
         // Get the implicit Q-factor in A_dat
         lapack::geqrf(m, n, A_dat, m, tau_dat);
 
-        auto start_ormqr = high_resolution_clock::now();
+        auto start_ormqr = steady_clock::now();
         lapack::ormqr(Side::Left, Op::Trans, m, n, n, A_dat, m, tau_dat, B1_dat, m);
-        auto stop_ormqr = high_resolution_clock::now();
+        auto stop_ormqr = steady_clock::now();
         long dur_ormqr = duration_cast<microseconds>(stop_ormqr - start_ormqr).count();
 
-        auto start_gemm = high_resolution_clock::now();
+        auto start_gemm = steady_clock::now();
         lapack::ungqr(m, n, n, A_dat, m, tau_dat);
         gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, n, m, 1.0, A_dat, m, B2_dat, m, 0.0, Product_dat, n);
-        auto stop_gemm = high_resolution_clock::now();
+        auto stop_gemm = steady_clock::now();
         long dur_gemm = duration_cast<microseconds>(stop_gemm - start_gemm).count();
 
         T gflop_count_gemm     = (2 * std::pow(n, 2) * m) / std::pow(10, 9);
diff --git a/benchmark/bench_general/basic_blas_speed.cc b/benchmark/bench_general/basic_blas_speed.cc
index 0b1fe207..171dab5b 100644
--- a/benchmark/bench_general/basic_blas_speed.cc
+++ b/benchmark/bench_general/basic_blas_speed.cc
@@ -84,27 +84,27 @@ static void call_all_algs(
     for (int i = 0; i < numruns; ++i) {
         printf("ITERATION %d, DIM %ld\n", i, n);
         // Testing BLAS3
-        auto start_blas3 = high_resolution_clock::now();
+        auto start_blas3 = steady_clock::now();
         blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, n, n, n, 1.0, all_data.A.data(), n, all_data.B.data(), n, 0.0, all_data.C.data(), n);
-        auto stop_blas3 = high_resolution_clock::now();
+        auto stop_blas3 = steady_clock::now();
         dur_blas3 = duration_cast<microseconds>(stop_blas3 - start_blas3).count();
 
         state_gen = state;
         data_regen(m_info, all_data, state_gen, 3);
 
         // Testing BLAS2
-        auto start_blas2 = high_resolution_clock::now();
+        auto start_blas2 = steady_clock::now();
         blas::gemv(Layout::ColMajor, Op::NoTrans, n, n, 1.0, all_data.A.data(), n, all_data.a.data(), 1, 1.0, all_data.b.data(), 1);
-        auto stop_blas2 = high_resolution_clock::now();
+        auto stop_blas2 = steady_clock::now();
         dur_blas2 = duration_cast<microseconds>(stop_blas2 - start_blas2).count();
     
         state_gen = state;
         data_regen(m_info, all_data, state_gen, 2);
 
         // Testing BLAS1
-        auto start_blas1 = high_resolution_clock::now();
+        auto start_blas1 = steady_clock::now();
         blas::axpy(n, -1.0, all_data.a.data(), 1, all_data.b.data(), 1);
-        auto stop_blas1 = high_resolution_clock::now();
+        auto stop_blas1 = steady_clock::now();
         dur_blas1 = duration_cast<microseconds>(stop_blas1 - start_blas1).count();
 
         state_gen = state;
diff --git a/test/comps/test_determiter.cc b/test/comps/test_determiter.cc
index fefc1ec8..76887f7d 100644
--- a/test/comps/test_determiter.cc
+++ b/test/comps/test_determiter.cc
@@ -47,9 +47,9 @@ class TestDetermiterOLS : public ::testing::Test {
         double delta = 0.1;
 	    double tol = 1e-8;
 
-        RandLAPACK::pcg(
+        RandLAPACK::pcg_saddle(
             m, n, A.data(), m, b.data(), c.data(), delta,
-            resid_vec, tol, n, M.data(), n, x0.data(), x.data(), y.data());
+            10*n, resid_vec.data(), tol, n, M.data(), n, x0.data(), x.data(), y.data());
         
 
         int64_t iter_count = 0;
@@ -72,3 +72,129 @@ TEST_F(TestDetermiterOLS, Trivial) {
         run(k_idx);
     }
 }
+
+class TestDetermiterLockBlockPCG : public ::testing::Test {
+    protected:
+    
+    virtual void SetUp() {};
+
+    virtual void TearDown() {};
+
+    template <typename T>
+    void run_simple_block(int64_t m, int64_t s, T coeff, uint32_t seed) {   
+        using std::vector;
+        auto layout = blas::Layout::ColMajor;
+        vector<T> G_buff(m*m, 0.0);
+        vector<T> H(m*s, 0.0);
+        randblas_require((int64_t) H.size() == m*s);
+        vector<T> X_star(m*s, 0.0);
+        vector<T> X_init(m*s, 0.0);
+        RandBLAS::RNGState state0(seed);
+        vector<T> temp(2*m*m);
+        auto D = RandBLAS::DenseDist {2*m, m, RandBLAS::ScalarDist::Gaussian};
+        auto state1 = RandBLAS::fill_dense(D, temp.data(), state0);
+        blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, m, 2*m, 1.0, temp.data(), 2*m, 0.0, G_buff.data(), m);
+
+        vector<T> regs(1, coeff);
+        RandLAPACK::linops::RegExplicitSymLinOp G(m, G_buff.data(), m, regs);
+        RandBLAS::DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian};
+        auto Xsd = X_star.data();
+        auto state2 = RandBLAS::fill_dense(DX_star, Xsd, state1);
+        G(layout, s, 1.0, X_star.data(), m, 0.0, H.data(), m);
+
+        RandLAPACK::StatefulFrobeniusNorm<T> seminorm{};
+
+        auto I_buff = eye<T>(m);
+        vector<T> zeros(1, 0.0);
+        RandLAPACK::linops::RegExplicitSymLinOp I(m, I_buff.data(), m, zeros);
+
+        T tol = 100*std::numeric_limits<T>::epsilon();
+        RandLAPACK::pcg(G, H.data(), s, seminorm, tol, m, I, X_init.data(), true);
+
+        T tol_scale = std::sqrt((T)m);
+        T atol = tol_scale * std::pow(std::numeric_limits<T>::epsilon(), 0.5);
+        T rtol = tol_scale * atol;
+        test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s,
+            __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol
+        );
+        return;
+    }
+
+    virtual void run_simple_lockstep(int64_t m, int64_t s, uint32_t seed) {  
+        using T = double;
+        randblas_require(s <= 4);
+        using std::vector;
+        vector<T> reg_coeffs{};
+        reg_coeffs.push_back(100);
+        if (s > 1)
+            reg_coeffs.push_back(7);
+        if (s > 2)
+            reg_coeffs.push_back(0.1);
+        if (s > 3)
+            reg_coeffs.push_back(0.5483);
+        auto layout = blas::Layout::ColMajor;
+        vector<T> G_buff(m*m, 0.0);
+        vector<T> H(m*s, 0.0);
+        vector<T> X_star(m*s, 0.0);
+        vector<T> X_init(m*s, 0.0);
+        RandBLAS::RNGState state0(seed);
+        vector<T> temp(2*m*m);
+        
+        auto D = RandBLAS::DenseDist {2*m, m, RandBLAS::ScalarDist::Gaussian};
+        auto state1 = RandBLAS::fill_dense(D, temp.data(), state0);
+        blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, m, 2*m, 1.0, temp.data(), 2*m, 0.0, G_buff.data(), m);
+
+        vector<T> regs(reg_coeffs);
+        RandLAPACK::linops::RegExplicitSymLinOp G(m, G_buff.data(), m, regs);
+        RandBLAS::DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian};
+        auto Xsd = X_star.data();
+        auto state2 = RandBLAS::fill_dense(DX_star, Xsd, state1);
+        G(layout, s, 1.0, X_star.data(), m, 0.0, H.data(), m);
+
+        auto I_buff = eye<T>(m);
+        vector<T> zeros(s, 0.0);
+        RandLAPACK::linops::RegExplicitSymLinOp I(m, I_buff.data(), m, zeros);
+
+        // While we're testing lockstep PCG, let's call the overload of pcg
+        // that automatically chooses the seminorm. We don't care about the return value
+        // in this case.
+        T tol = 100*std::numeric_limits<T>::epsilon();
+        RandLAPACK::pcg(G, H, tol, m, I, X_init, true);
+
+        T tol_scale = std::sqrt((T)m);
+        T atol = tol_scale * std::pow(std::numeric_limits<T>::epsilon(), 0.5);
+        T rtol = tol_scale * atol;
+        test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s,
+            __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol
+        );
+        return;
+    }
+};
+
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_5_1) {
+    run_simple_block<double>(5, 1, 0.5, 1997);
+}
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_6_2) {
+    run_simple_block<double>(6, 2, 0.5, 1997);
+}
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_5_4) {
+    run_simple_block<double>(5, 4, 0.5, 1997);
+}
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_5_1) {
+    run_simple_lockstep(5, 1, 1997);
+    run_simple_lockstep(5, 1, 2024);
+}
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_6_2) {
+    run_simple_lockstep(6, 2, 1997);
+    run_simple_lockstep(6, 2, 2024);
+}
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_5_4) {
+    run_simple_lockstep(5, 4, 1997);
+    run_simple_lockstep(5, 4, 2024);
+}
diff --git a/test/comps/test_pcgls.cc b/test/comps/test_pcgls.cc
index 9181d932..92e2f106 100644
--- a/test/comps/test_pcgls.cc
+++ b/test/comps/test_pcgls.cc
@@ -29,8 +29,8 @@ void run_pcgls_ex(int n, int m)
     double delta = 0.1;
     double tol = 1e-8;
 
-    RandLAPACK::pcg(m, n, A.data(), m, b.data(), c.data(), delta,
-        resid_vec, tol, n, M.data(), n, x0.data(), x.data(), y.data());
+    RandLAPACK::pcg_saddle(m, n, A.data(), m, b.data(), c.data(), delta,
+        10*n, resid_vec.data(), tol, n, M.data(), n, x0.data(), x.data(), y.data());
 
     for (double res: resid_vec)
     {
diff --git a/test/comps/test_syrf.cc b/test/comps/test_syrf.cc
index 23d68af9..b8afc19b 100644
--- a/test/comps/test_syrf.cc
+++ b/test/comps/test_syrf.cc
@@ -8,6 +8,7 @@
 #include <fstream>
 #include <gtest/gtest.h>
 
+
 class TestSYRF : public ::testing::Test
 {
     protected:
@@ -45,9 +46,12 @@ class TestSYRF : public ::testing::Test
 
     template <typename T, typename RNG>
     struct algorithm_objects {
-        RandLAPACK::SYPS<T, RNG> SYPS;
-        RandLAPACK::HQRQ<T> Orth_RF; 
-        RandLAPACK::SYRF<T, RNG> SYRF;
+        using SYPS_t = RandLAPACK::SYPS<T, RNG>;
+        using Orth_t = RandLAPACK::HQRQ<T>;
+        using SYRF_t = RandLAPACK::SYRF<SYPS_t, Orth_t>;
+        SYPS_t syps;
+        Orth_t orth; 
+        SYRF_t syrf;
 
         algorithm_objects(
             bool verbose, 
@@ -55,9 +59,9 @@ class TestSYRF : public ::testing::Test
             int64_t p, 
             int64_t passes_per_iteration
         ) :
-            SYPS(p, passes_per_iteration, verbose, cond_check),
-            Orth_RF(cond_check, verbose),
-            SYRF(SYPS, Orth_RF, verbose, cond_check)
+            syps(p, passes_per_iteration, verbose, cond_check),
+            orth(cond_check, verbose),
+            syrf(syps, orth, verbose, cond_check)
             {}
     };
 
@@ -95,7 +99,7 @@ class TestSYRF : public ::testing::Test
         auto m = all_data.row;
         auto k = all_data.rank;
 
-        all_algs.SYRF.call(Uplo::Upper, m, all_data.A.data(), k, all_data.Q, state, NULL);
+        all_algs.syrf.call(Uplo::Upper, m, all_data.A.data(), k, all_data.Q, state, NULL);
 
         // Reassing pointers because Q, B have been resized
         T* Q_dat = all_data.Q.data();
diff --git a/test/comps/test_util_gpu.cu b/test/comps/test_util_gpu.cu
index 5fa7ebf1..c9bbbbcb 100644
--- a/test/comps/test_util_gpu.cu
+++ b/test/comps/test_util_gpu.cu
@@ -4,8 +4,8 @@
 
 #include <RandBLAS.hh>
 
-#include <cuda.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>
 
 #include <math.h>
 #include <chrono>
@@ -61,18 +61,18 @@ class TestUtil_GPU : public ::testing::Test
         {
             row = m;
             col = n;
-            cudaMalloc(&A_device,   m * n * sizeof(T));
-            cudaMalloc(&B_device,   m * n * sizeof(T));
-            cudaMalloc(&J_device,   n * sizeof(int64_t));
-            cudaMalloc(&buf_device, n * sizeof(int64_t));
-            cudaDeviceSynchronize();
+            hipMalloc(&A_device,   m * n * sizeof(T));
+            hipMalloc(&B_device,   m * n * sizeof(T));
+            hipMalloc(&J_device,   n * sizeof(int64_t));
+            hipMalloc(&buf_device, n * sizeof(int64_t));
+            hipDeviceSynchronize();
         }
         
         ~ColSwpTestData() {
-            cudaFree(A_device);
-            cudaFree(B_device);
-            cudaFree(J_device);
-            cudaFree(buf_device);
+            hipFree(A_device);
+            hipFree(B_device);
+            hipFree(J_device);
+            hipFree(buf_device);
         }
     };
 
@@ -93,14 +93,14 @@ class TestUtil_GPU : public ::testing::Test
         {
             length_J   = m;
             length_idx = k;
-            cudaMalloc(&J_device,   m * sizeof(int64_t));
-            cudaMalloc(&idx_device, k * sizeof(int64_t));
-            cudaDeviceSynchronize();
+            hipMalloc(&J_device,   m * sizeof(int64_t));
+            hipMalloc(&idx_device, k * sizeof(int64_t));
+            hipDeviceSynchronize();
         }
 
         ~ColSwpVecTestData() {
-            cudaFree(J_device);
-            cudaFree(idx_device);
+            hipFree(J_device);
+            hipFree(idx_device);
         }
     };
 
@@ -121,13 +121,13 @@ class TestUtil_GPU : public ::testing::Test
         {
             row = m;
             col = n;
-            cudaMalloc(&A_device,   m * n * sizeof(T));
-            cudaMalloc(&A_device_T, n * m * sizeof(T));
+            hipMalloc(&A_device,   m * n * sizeof(T));
+            hipMalloc(&A_device_T, n * m * sizeof(T));
         }
 
         ~TranspTestData() {
-            cudaFree(A_device);
-            cudaFree(A_device_T);
+            hipFree(A_device);
+            hipFree(A_device_T);
         }
     };
 
@@ -151,15 +151,15 @@ class TestUtil_GPU : public ::testing::Test
         {
             row = m;
             col = n;
-            cudaMalloc(&A_device, m * n * sizeof(T));
-            cudaMalloc(&y_device, n * n * sizeof(T));
-            cudaMalloc(&x_device, m * sizeof(T));
+            hipMalloc(&A_device, m * n * sizeof(T));
+            hipMalloc(&y_device, n * n * sizeof(T));
+            hipMalloc(&x_device, m * sizeof(T));
         }
 
         ~GerTestData() {
-            cudaFree(A_device);
-            cudaFree(y_device);
-            cudaFree(x_device);
+            hipFree(A_device);
+            hipFree(y_device);
+            hipFree(x_device);
         }
     };
 
@@ -169,26 +169,26 @@ class TestUtil_GPU : public ::testing::Test
 
         auto m = all_data.row;
         auto n = all_data.col;
-        cudaStream_t strm = cudaStreamPerThread;
+        hipStream_t strm = hipStreamPerThread;
 
-        cudaMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice, strm);
-        cudaMemcpyAsync(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), cudaMemcpyHostToDevice, strm);
-        cudaStreamSynchronize(strm);
+        hipMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(double), hipMemcpyHostToDevice, strm);
+        hipMemcpyAsync(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), hipMemcpyHostToDevice, strm);
+        hipStreamSynchronize(strm);
 
 
-        cudaError_t ierr = cudaGetLastError();
-        if (ierr != cudaSuccess)
+        hipError_t ierr = hipGetLastError();
+        if (ierr != hipSuccess)
         {
-            RandLAPACK_CUDA_ERROR("GPU ERROR. " << cudaGetErrorString(ierr))
+            RandLAPACK_CUDA_ERROR("GPU ERROR. " << hipGetErrorString(ierr))
             abort();
         }
         printf("Passed the general error check\n");
 
         RandLAPACK::cuda_kernels::col_swap_gpu(strm, m, n, n, all_data.A_device, m, all_data.J_device);
-        cudaMemcpyAsync(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost, strm);
-        cudaMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, n * sizeof(int64_t), cudaMemcpyDeviceToHost, strm);
+        hipMemcpyAsync(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost, strm);
+        hipMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, n * sizeof(int64_t), hipMemcpyDeviceToHost, strm);
         RandLAPACK::util::col_swap(m, n, n, all_data.A.data(), m, all_data.J);
-        cudaStreamSynchronize(strm);
+        hipStreamSynchronize(strm);
 
         for(int i = 0; i < m*n; ++i)
             all_data.A[i] -= all_data.A_host_buffer[i];
@@ -196,10 +196,10 @@ class TestUtil_GPU : public ::testing::Test
         T norm_test = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m);
         printf("\nNorm diff GPU CPU: %e\n", norm_test);
         EXPECT_NEAR(norm_test, 0.0, std::pow(std::numeric_limits<T>::epsilon(), 0.75));
-    	ierr = cudaGetLastError();
-    	if (ierr != cudaSuccess)
+    	ierr = hipGetLastError();
+    	if (ierr != hipSuccess)
     	{
-        	RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr))
+        	RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr))
         	abort();
     	}
     }
@@ -218,21 +218,21 @@ class TestUtil_GPU : public ::testing::Test
         auto n   = all_data.col;
         auto lda = m;
 
-        cudaStream_t strm = cudaStreamPerThread;
-        cudaMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(T), cudaMemcpyHostToDevice, strm);
-        cudaMemcpyAsync(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), cudaMemcpyHostToDevice, strm);
+        hipStream_t strm = hipStreamPerThread;
+        hipMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(T), hipMemcpyHostToDevice, strm);
+        hipMemcpyAsync(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), hipMemcpyHostToDevice, strm);
         T* A_device = all_data.A_device;
         T* A_device_submat = all_data.A_device + (col_offset * m + offset);
-        cudaStreamSynchronize(strm);
+        hipStreamSynchronize(strm);
         RandLAPACK::cuda_kernels::col_swap_gpu(strm, m_submat, n_submat, k_submat, A_device_submat, lda, all_data.J_device);
-        cudaMemcpyAsync(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost, strm);
-        cudaMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, n * sizeof(int64_t), cudaMemcpyDeviceToHost, strm);
+        hipMemcpyAsync(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost, strm);
+        hipMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, n * sizeof(int64_t), hipMemcpyDeviceToHost, strm);
 
         T* A = all_data.A.data();
         T* A_submat = all_data.A.data() + (col_offset * m + offset);
         RandLAPACK::util::col_swap(m_submat, n_submat, k_submat, A_submat, lda, all_data.J);
 
-        cudaStreamSynchronize(strm);
+        hipStreamSynchronize(strm);
 
         for(int i = 0; i < m*n; ++i)
             all_data.A[i] -= all_data.A_host_buffer[i];
@@ -241,10 +241,10 @@ class TestUtil_GPU : public ::testing::Test
         printf("\nNorm diff GPU CPU: %e\n", norm_test);
         EXPECT_NEAR(norm_test, 0.0, std::pow(std::numeric_limits<T>::epsilon(), 0.75));
     
-    	cudaError_t ierr  = cudaGetLastError();
-    	if (ierr != cudaSuccess)
+    	hipError_t ierr  = hipGetLastError();
+    	if (ierr != hipSuccess)
     	{
-        	RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr))
+        	RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr))
         	abort();
     	}
     }
@@ -258,18 +258,18 @@ class TestUtil_GPU : public ::testing::Test
         auto m   = all_data.length_J;
         auto k   = all_data.length_idx;
 
-        cudaStream_t strm = cudaStreamPerThread;
-        cudaMemcpyAsync(all_data.J_device, all_data.J.data(),     m * sizeof(int64_t), cudaMemcpyHostToDevice, strm);
-        cudaMemcpyAsync(all_data.idx_device, all_data.idx.data(), k * sizeof(int64_t), cudaMemcpyHostToDevice, strm);
+        hipStream_t strm = hipStreamPerThread;
+        hipMemcpyAsync(all_data.J_device, all_data.J.data(),     m * sizeof(int64_t), hipMemcpyHostToDevice, strm);
+        hipMemcpyAsync(all_data.idx_device, all_data.idx.data(), k * sizeof(int64_t), hipMemcpyHostToDevice, strm);
         int64_t* J_device_subvec = all_data.J_device + offset;
-        cudaStreamSynchronize(strm);
+        hipStreamSynchronize(strm);
         RandLAPACK::cuda_kernels::col_swap_gpu<T>(strm, m, k, J_device_subvec, all_data.idx_device);
-        cudaMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, m * sizeof(int64_t), cudaMemcpyDeviceToHost, strm);
+        hipMemcpyAsync(all_data.J_host_buffer.data(), all_data.J_device, m * sizeof(int64_t), hipMemcpyDeviceToHost, strm);
 
         int64_t* J_subvec = all_data.J.data() + offset;
         std::vector<int64_t> buf;
         RandLAPACK::util::col_swap<T>(m, k, J_subvec, all_data.idx);
-        cudaStreamSynchronize(strm);
+        hipStreamSynchronize(strm);
 
         for(int i = 0; i < m; ++i){
             all_data.J[i] -= all_data.J_host_buffer[i];
@@ -279,10 +279,10 @@ class TestUtil_GPU : public ::testing::Test
         printf("\nNorm diff GPU CPU: %e\n", norm_test);
         EXPECT_NEAR(norm_test, 0.0, std::pow(std::numeric_limits<T>::epsilon(), 0.75));
    	 
-    	cudaError_t ierr = cudaGetLastError();
-    	if (ierr != cudaSuccess)
+    	hipError_t ierr = hipGetLastError();
+    	if (ierr != hipSuccess)
     	{
-        	RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr))
+        	RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr))
         	abort();
     	}
     }
@@ -294,15 +294,15 @@ class TestUtil_GPU : public ::testing::Test
 
         auto m = all_data.row;
         auto n = all_data.col;
-        cudaStream_t strm = cudaStreamPerThread;
+        hipStream_t strm = hipStreamPerThread;
     
-        cudaMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice);
+        hipMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), hipMemcpyHostToDevice);
         // Perform Pivoted QR
         lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data());
         // Swap columns in A's copy
-        cudaMemcpy(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), cudaMemcpyHostToDevice);
+        hipMemcpy(all_data.J_device, all_data.J.data(), n * sizeof(int64_t), hipMemcpyHostToDevice);
         RandLAPACK::cuda_kernels::col_swap_gpu(strm, m, n, n, all_data.A_device, m, all_data.J_device);
-        cudaMemcpy(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost);
+        hipMemcpy(all_data.A_host_buffer.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost);
 
         // Create an identity and store Q in it.
         RandLAPACK::util::eye(m, n, all_data.Ident.data());
@@ -318,10 +318,10 @@ class TestUtil_GPU : public ::testing::Test
         printf("||A_piv - QR||_F:  %e\n", norm);
         EXPECT_NEAR(norm, 0.0, std::pow(std::numeric_limits<T>::epsilon(), 0.625));
     	
-    	cudaError_t ierr = cudaGetLastError();
-    	if (ierr != cudaSuccess)
+    	hipError_t ierr = hipGetLastError();
+    	if (ierr != hipSuccess)
     	{
-        	RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr))
+        	RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr))
         	abort();
     	}
     }
@@ -332,14 +332,14 @@ class TestUtil_GPU : public ::testing::Test
 
         auto m = all_data.row;
         auto n = all_data.col;
-        cudaStream_t strm = cudaStreamPerThread;
+        hipStream_t strm = hipStreamPerThread;
     
-        cudaMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice, strm);
-        cudaStreamSynchronize(strm);
+        hipMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(double), hipMemcpyHostToDevice, strm);
+        hipStreamSynchronize(strm);
         RandLAPACK::cuda_kernels::transposition_gpu(strm, m, n, all_data.A_device, m, all_data.A_device_T, n, 0);
-        cudaMemcpyAsync(all_data.A_T_buffer.data(), all_data.A_device_T, n * m * sizeof(T), cudaMemcpyDeviceToHost, strm);
+        hipMemcpyAsync(all_data.A_T_buffer.data(), all_data.A_device_T, n * m * sizeof(T), hipMemcpyDeviceToHost, strm);
         RandLAPACK::util::transposition(m, n, all_data.A.data(), m, all_data.A_T.data(), n, 0);
-        cudaStreamSynchronize(strm);
+        hipStreamSynchronize(strm);
 
         // A_piv - A_cpy
         for(int i = 0; i < m * n; ++i)
@@ -349,10 +349,10 @@ class TestUtil_GPU : public ::testing::Test
         printf("||A_T_host - A_T_device||_F:  %e\n", norm);
         EXPECT_NEAR(norm, 0.0, std::pow(std::numeric_limits<T>::epsilon(), 0.625));
     	
-    	cudaError_t ierr = cudaGetLastError();
-    	if (ierr != cudaSuccess)
+    	hipError_t ierr = hipGetLastError();
+    	if (ierr != hipSuccess)
     	{
-        	RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr))
+        	RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr))
         	abort();
     	}
     }
@@ -365,16 +365,16 @@ class TestUtil_GPU : public ::testing::Test
 
         auto m = all_data.row;
         auto n = all_data.col;
-        cudaStream_t strm = cudaStreamPerThread;
-        cudaMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(T), cudaMemcpyHostToDevice, strm);
-        cudaStreamSynchronize(strm);
-        cudaMemcpyAsync(all_data.y_device, all_data.y.data(), n * n * sizeof(T), cudaMemcpyHostToDevice, strm);
-        cudaMemcpyAsync(all_data.x_device, all_data.x.data(), m * sizeof(T),     cudaMemcpyHostToDevice, strm);
+        hipStream_t strm = hipStreamPerThread;
+        hipMemcpyAsync(all_data.A_device, all_data.A.data(), m * n * sizeof(T), hipMemcpyHostToDevice, strm);
+        hipStreamSynchronize(strm);
+        hipMemcpyAsync(all_data.y_device, all_data.y.data(), n * n * sizeof(T), hipMemcpyHostToDevice, strm);
+        hipMemcpyAsync(all_data.x_device, all_data.x.data(), m * sizeof(T),     hipMemcpyHostToDevice, strm);
         RandLAPACK::cuda_kernels::ger_gpu(strm, m, n, alpha, all_data.x_device, 1, all_data.y_device, n + 1, all_data.A_device, m);
-        cudaMemcpyAsync(all_data.A_buffer.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost, strm);
+        hipMemcpyAsync(all_data.A_buffer.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost, strm);
         // Y has stride of n + 1
         blas::ger(Layout::ColMajor, m, n, alpha, all_data.x.data(), 1, all_data.y.data(), n + 1, all_data.A.data(), m);
-        cudaStreamSynchronize(strm);
+        hipStreamSynchronize(strm);
         
         // A_piv - A_cpy
         for(int i = 0; i < m * n; ++i)
@@ -384,10 +384,10 @@ class TestUtil_GPU : public ::testing::Test
         printf("||A_host - A_device||_F:  %e\n", norm);
         EXPECT_NEAR(norm, 0.0, std::pow(std::numeric_limits<T>::epsilon(), 0.625));
     	
-    	cudaError_t ierr = cudaGetLastError();
-    	if (ierr != cudaSuccess)
+    	hipError_t ierr = hipGetLastError();
+    	if (ierr != hipSuccess)
     	{
-        	RandLAPACK_CUDA_ERROR("Error before test returned. " << cudaGetErrorString(ierr))
+        	RandLAPACK_CUDA_ERROR("Error before test returned. " << hipGetErrorString(ierr))
         	abort();
     	}
     }
@@ -517,9 +517,9 @@ TEST_F(TestUtil_GPU, test_ger_gpu) {
     m_info.exponent = 2.0;
     RandLAPACK::gen::mat_gen<double, r123::Philox4x32>(m_info, all_data.A.data(), state); 
     RandBLAS::DenseDist D1(n, n);
-    state = RandBLAS::fill_dense(D1, all_data.y.data(), state).second;
+    state = RandBLAS::fill_dense(D1, all_data.y.data(), state);
     RandBLAS::DenseDist D2(1, m);
-    state = RandBLAS::fill_dense(D2, all_data.x.data(), state).second;
+    state = RandBLAS::fill_dense(D2, all_data.x.data(), state);
     
     ger_gpu(alpha, all_data);
 }
diff --git a/test/drivers/bench_bqrrp_gpu.cu b/test/drivers/bench_bqrrp_gpu.cu
index 342f364a..93a8bb1f 100644
--- a/test/drivers/bench_bqrrp_gpu.cu
+++ b/test/drivers/bench_bqrrp_gpu.cu
@@ -3,9 +3,9 @@
 #include "rl_lapackpp.hh"
 #include "rl_gen.hh"
 
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cusolverDn.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>
+#include <hipsolver.h>
 
 #include <RandBLAS.hh>
 #include <fstream>
@@ -49,19 +49,19 @@ class BenchBQRRP : public ::testing::TestWithParam<int64_t>
         {
             row = m;
             col = n;
-            cudaMalloc(&A_device,    m * n * sizeof(T));
-            cudaMalloc(&tau_device,  n *     sizeof(T));
-            cudaMalloc(&J_device,    n *     sizeof(int64_t));
-            cudaMalloc(&R_device,    n * n * sizeof(T));
-            cudaMalloc(&D_device,    n *     sizeof(T));
+            hipMalloc(&A_device,    m * n * sizeof(T));
+            hipMalloc(&tau_device,  n *     sizeof(T));
+            hipMalloc(&J_device,    n *     sizeof(int64_t));
+            hipMalloc(&R_device,    n * n * sizeof(T));
+            hipMalloc(&D_device,    n *     sizeof(T));
         }
 
         ~BQRRPBenchData() {
-            cudaFree(A_device);
-            cudaFree(tau_device);
-            cudaFree(J_device);
-            cudaFree(R_device);
-            cudaFree(D_device);
+            hipFree(A_device);
+            hipFree(tau_device);
+            hipFree(J_device);
+            hipFree(R_device);
+            hipFree(D_device);
         }
     };
 
@@ -76,8 +76,8 @@ class BenchBQRRP : public ::testing::TestWithParam<int64_t>
         auto n = m_info.cols;
 
         RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state_const);
-        cudaMemset(all_data.J_device, 0.0, n);
-        cudaMemset(all_data.tau_device, 0.0, n);
+        hipMemset(all_data.J_device, 0.0, n);
+        hipMemset(all_data.tau_device, 0.0, n);
     }
 
     template <typename T, typename RNG>
@@ -100,14 +100,13 @@ class BenchBQRRP : public ::testing::TestWithParam<int64_t>
 
         // BQRRP with QRF
         // Skethcing in an sampling regime
-        cudaMalloc(&all_data.A_sk_device, d * n * sizeof(T));
-        all_data.A_sk = new T[d * n]();
-        T* S          = new T[d * m]();
-
+        hipMalloc(&all_data.A_sk_device, d * n * sizeof(T));
+        all_data.A_sk = ( T * ) calloc( d * n, sizeof( T ) );
+        T* S          = ( T * ) calloc( d * m, sizeof( T ) );
         RandBLAS::DenseDist D(d, m);
-        RandBLAS::fill_dense(D, S, state_const).second;
+        RandBLAS::fill_dense(D, S, state_const);
         blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, all_data.A.data(), m, 0.0, all_data.A_sk, d);
-        cudaMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), cudaMemcpyHostToDevice);
+        hipMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), hipMemcpyHostToDevice);
         RandLAPACK::BQRRP_GPU<double, r123::Philox4x32> BQRRP_GPU_QRF(profile_runtime, block_size);
         BQRRP_GPU_QRF.qr_tall = GPUSubroutines::QRTall::geqrf;
 	    auto start_bqrrp_qrf = std::chrono::steady_clock::now();
@@ -115,8 +114,8 @@ class BenchBQRRP : public ::testing::TestWithParam<int64_t>
         auto stop_bqrrp_qrf = std::chrono::steady_clock::now();
 	    auto diff_bqrrp_qrf = std::chrono::duration_cast<std::chrono::microseconds>(stop_bqrrp_qrf - start_bqrrp_qrf).count();
         data_regen(m_info, all_data, state);
-        cudaFree(all_data.A_sk_device);
-        delete[] all_data.A_sk;
+        hipFree(all_data.A_sk_device);
+        free(all_data.A_sk);
 
         if(profile_runtime) {
             std::ofstream file(output_filename_breakdown_QRF, std::ios::app);
@@ -126,11 +125,11 @@ class BenchBQRRP : public ::testing::TestWithParam<int64_t>
 
         // BQRRP with CholQR
         // Skethcing in an sampling regime
-        cudaMalloc(&all_data.A_sk_device, d * n * sizeof(T));
-        all_data.A_sk = new T[d * n]();
+        hipMalloc(&all_data.A_sk_device, d * n * sizeof(T));
+        all_data.A_sk = ( T * ) calloc( d * n, sizeof( T ) );
         blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, all_data.A.data(), m, 0.0, all_data.A_sk, d);
-        delete[] S;
-        cudaMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), cudaMemcpyHostToDevice);
+        free(S);
+        hipMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), hipMemcpyHostToDevice);
         RandLAPACK::BQRRP_GPU<double, r123::Philox4x32> BQRRP_GPU_CholQR(profile_runtime, block_size);
         BQRRP_GPU_CholQR.qr_tall = GPUSubroutines::QRTall::cholqr;
 	    auto start_bqrrp_cholqr = std::chrono::steady_clock::now();
@@ -138,8 +137,8 @@ class BenchBQRRP : public ::testing::TestWithParam<int64_t>
 	    auto stop_bqrrp_cholqr = std::chrono::steady_clock::now();
 	    auto diff_bqrrp_cholqr = std::chrono::duration_cast<std::chrono::microseconds>(stop_bqrrp_cholqr - start_bqrrp_cholqr).count();
         data_regen(m_info, all_data, state);
-        cudaFree(all_data.A_sk_device);
-        delete[] all_data.A_sk;
+        hipFree(all_data.A_sk_device);
+        free(all_data.A_sk);
 
         if(profile_runtime) {
             std::ofstream file(output_filename_breakdown_CholQR, std::ios::app);
@@ -172,10 +171,10 @@ class BenchBQRRP : public ::testing::TestWithParam<int64_t>
 	    printf("  BLOCK SIZE = %ld BQRRP+QRF TIME (MS) = %ld BQRRP+CholQR TIME (MS) = %ld\n", block_size, diff_bqrrp_qrf, diff_bqrrp_cholqr);
         std::ofstream file(output_filename_speed, std::ios::app);
         file << m << "  " << n << "  " << block_size << "  " << diff_bqrrp_qrf << "  " << diff_bqrrp_cholqr << "  " << diff_qrf << "\n";
-        cudaError_t ierr = cudaGetLastError();
-    	if (ierr != cudaSuccess)
+        hipError_t ierr = hipGetLastError();
+    	if (ierr != hipSuccess)
     	{
-        	RandLAPACK_CUDA_ERROR("Error before bench_bqrrp returned. " << cudaGetErrorString(ierr))
+        	RandLAPACK_CUDA_ERROR("Error before bench_bqrrp returned. " << hipGetErrorString(ierr))
         	abort();
     	}
     }
@@ -195,7 +194,7 @@ class BenchBQRRP : public ::testing::TestWithParam<int64_t>
 
         // Initialize GPU stuff
         lapack::Queue lapack_queue(0);
-        cudaStream_t strm = lapack_queue.stream();
+        hipStream_t strm = lapack_queue.stream();
         using lapack::device_info_int;
         device_info_int* d_info = blas::device_malloc< device_info_int >( 1, lapack_queue );
         char* d_work_geqrf;
@@ -215,7 +214,7 @@ class BenchBQRRP : public ::testing::TestWithParam<int64_t>
         // ORHR_COL part
         RandLAPACK::cuda_kernels::orhr_col_gpu(strm, m, n, all_data.A_device, m, all_data.tau_device, all_data.D_device);  
         RandLAPACK::cuda_kernels::R_cholqr_signs_gpu(strm, n, n, all_data.R_device, all_data.D_device);
-        cudaStreamSynchronize(strm);
+        hipStreamSynchronize(strm);
         auto stop_orhr_col  = std::chrono::steady_clock::now();
         auto diff_orhr_col  = std::chrono::duration_cast<std::chrono::microseconds>(stop_orhr_col  - start_orhr_col).count();
 
@@ -239,10 +238,10 @@ class BenchBQRRP : public ::testing::TestWithParam<int64_t>
         std::ofstream file(output_filename, std::ios::app);
         file << m << "  " << n << "  " << diff_cholqr << "  " << diff_orhr_col << "  " << diff_qrf << "\n";
 
-        cudaError_t ierr = cudaGetLastError();
-    	if (ierr != cudaSuccess)
+        hipError_t ierr = hipGetLastError();
+    	if (ierr != hipSuccess)
     	{
-        	RandLAPACK_CUDA_ERROR("Error before bench_CholQR returned. " << cudaGetErrorString(ierr))
+        	RandLAPACK_CUDA_ERROR("Error before bench_CholQR returned. " << hipGetErrorString(ierr))
         	abort();
     	}    
     }
@@ -262,7 +261,7 @@ TEST_P(BenchBQRRP, GPU_fixed_blocksize) {
     BQRRPBenchData<double> all_data(m, n);
     RandLAPACK::gen::mat_gen_info<double> m_info(m, n, RandLAPACK::gen::gaussian);
     RandLAPACK::gen::mat_gen<double, r123::Philox4x32>(m_info, all_data.A.data(), state);
-    cudaMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice);
+    hipMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), hipMemcpyHostToDevice);
 
 
     std::string file1 = "BQRRP_GPU_runtime_breakdown_innerQRF_1_rows_"       
diff --git a/test/drivers/test_bqrrp_gpu.cu b/test/drivers/test_bqrrp_gpu.cu
index 99858865..5c5ea120 100644
--- a/test/drivers/test_bqrrp_gpu.cu
+++ b/test/drivers/test_bqrrp_gpu.cu
@@ -3,9 +3,9 @@
 #include "rl_lapackpp.hh"
 #include "rl_gen.hh"
 
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cusolverDn.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>
+#include <hipsolver.h>
 
 #include <RandBLAS.hh>
 #include <fstream>
@@ -73,17 +73,17 @@ class TestBQRRP : public ::testing::TestWithParam<int64_t>
             row = m;
             col = n;
             rank = k;
-            cudaMalloc(&A_device,    m * n * sizeof(T));
-            cudaMalloc(&A_sk_device, d * n * sizeof(T));
-            cudaMalloc(&tau_device,  n *     sizeof(T));
-            cudaMalloc(&J_device,    n *     sizeof(int64_t));
+            hipMalloc(&A_device,    m * n * sizeof(T));
+            hipMalloc(&A_sk_device, d * n * sizeof(T));
+            hipMalloc(&tau_device,  n *     sizeof(T));
+            hipMalloc(&J_device,    n *     sizeof(int64_t));
         }
 
         ~BQRRPTestData() {
-            cudaFree(A_device);
-            cudaFree(A_sk_device);
-            cudaFree(tau_device);
-            cudaFree(J_device);
+            hipFree(A_device);
+            hipFree(A_sk_device);
+            hipFree(tau_device);
+            hipFree(J_device);
         }
     };
 
@@ -95,14 +95,14 @@ class TestBQRRP : public ::testing::TestWithParam<int64_t>
         auto state_const = state;
 
         // Skethcing in an sampling regime
-        T* S = new T[d * m]();
+        T* S  = ( T * ) calloc( d * m, sizeof( T ) );
         RandBLAS::DenseDist D(d, m);
-        RandBLAS::fill_dense(D, S, state_const).second;
+        RandBLAS::fill_dense(D, S, state_const);
         blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, all_data.A.data(), m, 0.0, all_data.A_sk.data(), d);
-        delete[] S;
-        cudaMemcpy(all_data.A_sk_device, all_data.A_sk.data(), d * n * sizeof(double), cudaMemcpyHostToDevice);
+        free(S);
+        hipMemcpy(all_data.A_sk_device, all_data.A_sk.data(), d * n * sizeof(double), hipMemcpyHostToDevice);
 
-        cudaMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice);
+        hipMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), hipMemcpyHostToDevice);
         lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpu.data(), m);
         lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), m);
         lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), m);
@@ -174,7 +174,7 @@ class TestBQRRP : public ::testing::TestWithParam<int64_t>
         BQRRP_GPU.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device);
 
         if(BQRRP_GPU.rank == 0) {
-            cudaMemcpy(all_data.A.data(), all_data.A_device, m * n * sizeof(T), cudaMemcpyDeviceToHost);
+            hipMemcpy(all_data.A.data(), all_data.A_device, m * n * sizeof(T), hipMemcpyDeviceToHost);
             for(int i = 0; i < m * n; ++i) {
                 ASSERT_NEAR(all_data.A[i], 0.0, atol);
             }
@@ -182,10 +182,10 @@ class TestBQRRP : public ::testing::TestWithParam<int64_t>
             all_data.rank = BQRRP_GPU.rank;
             printf("RANK AS RETURNED BY BQRRP GPU %4ld\n", all_data.rank);
             
-            cudaMemcpy(all_data.R_full.data(), all_data.A_device,   m * n * sizeof(T),   cudaMemcpyDeviceToHost);
-            cudaMemcpy(all_data.Q.data(),      all_data.A_device,   m * n * sizeof(T),   cudaMemcpyDeviceToHost);
-            cudaMemcpy(all_data.tau.data(),    all_data.tau_device, n * sizeof(T),       cudaMemcpyDeviceToHost);
-            cudaMemcpy(all_data.J.data(),      all_data.J_device,   n * sizeof(int64_t), cudaMemcpyDeviceToHost);
+            hipMemcpy(all_data.R_full.data(), all_data.A_device,   m * n * sizeof(T),   hipMemcpyDeviceToHost);
+            hipMemcpy(all_data.Q.data(),      all_data.A_device,   m * n * sizeof(T),   hipMemcpyDeviceToHost);
+            hipMemcpy(all_data.tau.data(),    all_data.tau_device, n * sizeof(T),       hipMemcpyDeviceToHost);
+            hipMemcpy(all_data.J.data(),      all_data.J_device,   n * sizeof(int64_t), hipMemcpyDeviceToHost);
 
             lapack::ungqr(m, n, n, all_data.Q.data(), m, all_data.tau.data());
             RandLAPACK::util::upsize(all_data.rank * n, all_data.R);
@@ -197,10 +197,10 @@ class TestBQRRP : public ::testing::TestWithParam<int64_t>
             error_check(norm_A, all_data, atol);
         }
 
-        cudaError_t ierr = cudaGetLastError();
-        if (ierr != cudaSuccess)
+        hipError_t ierr = hipGetLastError();
+        if (ierr != hipSuccess)
         {
-                RandLAPACK_CUDA_ERROR("Error before test_BQRRP_general returned. " << cudaGetErrorString(ierr))
+                RandLAPACK_CUDA_ERROR("Error before test_BQRRP_general returned. " << hipGetErrorString(ierr))
                 abort();
         }
     }
@@ -219,9 +219,9 @@ class TestBQRRP : public ::testing::TestWithParam<int64_t>
         BQRRP_GPU.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device);
         BQRRP_CPU.call(m, n, all_data.A_cpu.data(), m, (T) (d / BQRRP_CPU.block_size) , all_data.tau_cpu.data(), all_data.J_cpu.data(), state);
         
-        cudaMemcpy(all_data.R_full.data(), all_data.A_device,   m * n * sizeof(T),   cudaMemcpyDeviceToHost);
-        cudaMemcpy(all_data.tau.data(),    all_data.tau_device, n * sizeof(T),       cudaMemcpyDeviceToHost);
-        cudaMemcpy(all_data.J.data(),      all_data.J_device,   n * sizeof(int64_t), cudaMemcpyDeviceToHost);
+        hipMemcpy(all_data.R_full.data(), all_data.A_device,   m * n * sizeof(T),   hipMemcpyDeviceToHost);
+        hipMemcpy(all_data.tau.data(),    all_data.tau_device, n * sizeof(T),       hipMemcpyDeviceToHost);
+        hipMemcpy(all_data.J.data(),      all_data.J_device,   n * sizeof(int64_t), hipMemcpyDeviceToHost);
 
         for(int i = 0; i < n; ++i) {
             all_data.tau[i] -= all_data.tau_cpu[i];
@@ -243,10 +243,10 @@ class TestBQRRP : public ::testing::TestWithParam<int64_t>
         ASSERT_NEAR(col_nrm_tau, 0.0, atol1);
         ASSERT_NEAR(norm_R_diff, 0.0, atol2);
     
-    	cudaError_t ierr = cudaGetLastError();
-    	if (ierr != cudaSuccess)
+    	hipError_t ierr = hipGetLastError();
+    	if (ierr != hipSuccess)
     	{
-        	RandLAPACK_CUDA_ERROR("Error before test_BQRRP_compare_with_CPU returned. " << cudaGetErrorString(ierr))
+        	RandLAPACK_CUDA_ERROR("Error before test_BQRRP_compare_with_CPU returned. " << hipGetErrorString(ierr))
         	abort();
     	}
     }
diff --git a/test/drivers/test_cqrrpt_gpu.cu b/test/drivers/test_cqrrpt_gpu.cu
index fd51e6b2..4be8a1d6 100644
--- a/test/drivers/test_cqrrpt_gpu.cu
+++ b/test/drivers/test_cqrrpt_gpu.cu
@@ -3,13 +3,13 @@
 #include "rl_lapackpp.hh"
 #include "rl_gen.hh"
 
-#include <cuda.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>
 
 #include <RandBLAS.hh>
 #include <fstream>
 #include <gtest/gtest.h>
-#include <cusolverDn.h>
+#include <hipsolver.h>
 
 #ifndef USE_CUDA
 #define USE_CUDA
diff --git a/test/drivers/test_hqrrp.cc b/test/drivers/test_hqrrp.cc
index 4e079620..ffa09f2b 100644
--- a/test/drivers/test_hqrrp.cc
+++ b/test/drivers/test_hqrrp.cc
@@ -53,7 +53,7 @@ class TestHQRRP : public ::testing::Test
         lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy1.data(), m);
         lapack::lacpy(MatrixType::General, m, n, all_data.A.data(), m, all_data.A_cpy2.data(), m);
         norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m);
-        std::fill(all_data.J.begin(), all_data.J.end(), 0);
+        std::iota(all_data.J.begin(), all_data.J.end(), 1);
     }
 
 
diff --git a/test/drivers/test_revd2.cc b/test/drivers/test_revd2.cc
index 4feaf31b..6cfad98b 100644
--- a/test/drivers/test_revd2.cc
+++ b/test/drivers/test_revd2.cc
@@ -7,6 +7,7 @@
 #include <fstream>
 #include <gtest/gtest.h>
 
+
 class TestREVD2 : public ::testing::Test
 {
     protected:
@@ -75,12 +76,13 @@ class TestREVD2 : public ::testing::Test
 
     template <typename T, typename RNG>
     struct algorithm_objects {
-        RandLAPACK::SYPS<T, RNG> SYPS;
-        RandLAPACK::HQRQ<T> Orth_RF; 
-        RandLAPACK::SYRF<T, RNG> SYRF;
-        //  ^ Needs a symmetric power skether and an orthogonalizer
-        RandLAPACK::REVD2<T, RNG> REVD2;
-        //  ^ Needs a symmetric rangefinder.
+        using SYPS_t = RandLAPACK::SYPS<T, RNG>;
+        using Orth_t = RandLAPACK::HQRQ<T>;
+        using SYRF_t = RandLAPACK::SYRF<SYPS_t, Orth_t>;
+        SYPS_t syps;
+        Orth_t orth; 
+        SYRF_t syrf;
+        RandLAPACK::REVD2<SYRF_t> revd2;
 
 
         algorithm_objects(
@@ -90,10 +92,10 @@ class TestREVD2 : public ::testing::Test
             int64_t passes_per_syps_stabilization, 
             int64_t num_steps_power_iter_error_est
         ) : 
-            SYPS(num_syps_passes, passes_per_syps_stabilization, verbose, cond_check),
-            Orth_RF(cond_check, verbose),
-            SYRF(SYPS, Orth_RF, verbose, cond_check),
-            REVD2(SYRF, num_steps_power_iter_error_est, verbose)
+            syps(num_syps_passes, passes_per_syps_stabilization, verbose, cond_check),
+            orth(cond_check, verbose),
+            syrf(syps, orth, verbose, cond_check),
+            revd2(syrf, num_steps_power_iter_error_est, verbose)
             {}
     };
 
@@ -150,7 +152,7 @@ class TestREVD2 : public ::testing::Test
         auto m = all_data.dim;
 
         int64_t k = k_start;
-        all_algs.REVD2.call(blas::Uplo::Upper, m, all_data.A.data(), k, tol, all_data.V, all_data.eigvals, state);
+        all_algs.revd2.call(blas::Uplo::Upper, m, all_data.A.data(), k, tol, all_data.V, all_data.eigvals, state);
 
         T* E_dat = RandLAPACK::util::upsize(k * k, all_data.E);
         T* Buf_dat = RandLAPACK::util::upsize(m * k, all_data.Buf);
@@ -188,8 +190,8 @@ class TestREVD2 : public ::testing::Test
         auto m = all_data.dim;
 
         int64_t k = k_start;
-        all_algs.REVD2.call(blas::Uplo::Upper, m, all_data.A_u.data(), k, tol, all_data.V_u, all_data.eigvals_u, state);
-        all_algs.REVD2.call(blas::Uplo::Lower, m, all_data.A_l.data(), k, tol, all_data.V_l, all_data.eigvals_l, state);
+        all_algs.revd2.call(blas::Uplo::Upper, m, all_data.A_u.data(), k, tol, all_data.V_u, all_data.eigvals_u, state);
+        all_algs.revd2.call(blas::Uplo::Lower, m, all_data.A_l.data(), k, tol, all_data.V_l, all_data.eigvals_l, state);
 
         T* E_u_dat = RandLAPACK::util::upsize(k * k, all_data.E_u);
         T* E_l_dat = RandLAPACK::util::upsize(k * k, all_data.E_l);