Skip to content

Commit

Permalink
[Test] zdot_real_op (and some other math ops) benchmark on cpu (#3840)
Browse files Browse the repository at this point in the history
* Finished benchmark for basic Hsolver kernels.

* Tested function:
 *      - zdot_real_cpu_op
 *      - vector_div_constant_op_cpu
 *      - vector_mul_vector_op_cpu
 *      - vector_div_vector_op_cpu
 *      - constantvector_addORsub_constantVector_op_cpu
 *      - axpy_cpu
 *      - scal_cpu

 *      - zdot_real_gpu_op
 *      - vector_div_constant_op_gpu
 *      - vector_mul_vector_op_gpu
 *      - vector_div_vector_op_gpu
 *      - constantvector_addORsub_constantVector_op_gpu
 *      - axpy_gpu
 *      - scal_gpu

* Remove unnecessary manual timer.

An example of how to write a manual timer is kept in a code comment.
  • Loading branch information
Critsium-xy authored May 21, 2024
1 parent 0f294e8 commit 1a10cfc
Show file tree
Hide file tree
Showing 2 changed files with 297 additions and 0 deletions.
8 changes: 8 additions & 0 deletions source/module_hsolver/kernels/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,12 @@ elseif()
LIBS ${math_libs} base device
SOURCES math_kernel_test.cpp
)
endif()

# Build the performance benchmark target for the hsolver math kernels only
# when the ENABLE_GOOGLEBENCH option is set.
if(ENABLE_GOOGLEBENCH)
AddTest(
TARGET Perf_Hsolver_Kernels
LIBS ${math_libs} base device
SOURCES perf_math_kernel.cpp
)
endif()
289 changes: 289 additions & 0 deletions source/module_hsolver/kernels/test/perf_math_kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
#include "module_base/blas_connector.h"
#include "module_base/constants.h"
#include "module_hsolver/kernels/math_kernel_op.h"
#include "module_psi/kernels/memory_op.h"

#include <complex>
#include <benchmark/benchmark.h>
#include <iostream>
#include <math.h>
#include <stdlib.h>
#include <chrono>

/************************************************
* performance test of class math_kernel_op
***********************************************/

/**
* Tested function:
* - zdot_real_cpu_op
* - vector_div_constant_op_cpu
* - vector_mul_vector_op_cpu
* - vector_div_vector_op_cpu
* - constantvector_addORsub_constantVector_op_cpu
* - axpy_cpu
* - scal_cpu
* - zdot_real_gpu_op
* - vector_div_constant_op_gpu
* - vector_mul_vector_op_gpu
* - vector_div_vector_op_gpu
* - constantvector_addORsub_constantVector_op_gpu
* - axpy_gpu
* - scal_gpu
*/

class PerfModuleHsolverMathKernel : public benchmark::Fixture {
public:

// DEVICE SYMBOL
const psi::DEVICE_CPU* cpu_ctx = {};

int dim_vector = 1;

std::complex<double>* test_zvector_a = nullptr;
std::complex<double>* test_zvector_b = nullptr;
std::complex<double>* result_zvector = nullptr;

double* test_dvector_a = nullptr;

double dconstant_a = 1.0;
double dconstant_b = 1.0;

std::complex<double> zconstant_a = {1.0,1.0};

#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
const psi::DEVICE_GPU * gpu_ctx = {};

// from CPU to GPU
using synchronize_memory_op
= psi::memory::synchronize_memory_op<std::complex<double>, psi::DEVICE_GPU, psi::DEVICE_CPU>;

// form GPU to CPU
using synchronize_memory_op_gpu
= psi::memory::synchronize_memory_op<std::complex<double>, psi::DEVICE_CPU, psi::DEVICE_GPU>;

using resize_memory_op = psi::memory::resize_memory_op<std::complex<double>, psi::DEVICE_GPU>;
using delete_memory_op = psi::memory::delete_memory_op<std::complex<double>, psi::DEVICE_GPU>;
using resize_memory_op_double = psi::memory::resize_memory_op<double, psi::DEVICE_GPU>;
using delete_memory_op_double = psi::memory::delete_memory_op<double, psi::DEVICE_GPU>;
using synchronize_memory_op_double = psi::memory::synchronize_memory_op<double, psi::DEVICE_GPU, psi::DEVICE_CPU>;

using set_memory_op = psi::memory::set_memory_op<std::complex<double>, psi::DEVICE_GPU>;
using set_memory_op_double = psi::memory::set_memory_op<double, psi::DEVICE_GPU>;

std::complex<double>* test_zvector_a_gpu = nullptr;
std::complex<double>* test_zvector_b_gpu = nullptr;
std::complex<double>* result_zvector_gpu = nullptr;

double* test_dvector_a_gpu = nullptr;

#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM

void SetUp(const benchmark::State& state){
dim_vector = state.range(0); // Generate vectors with different diminsions (1,10,100,...,1e6)

// This should be complex vectors
test_zvector_a = new std::complex<double>[dim_vector + 1];
test_zvector_b = new std::complex<double>[dim_vector + 1];
result_zvector = new std::complex<double>[dim_vector + 1];

// The following is double vectors
test_dvector_a = new double[dim_vector + 1];


for (int i=0;i<dim_vector;i++){
// Generate vectors using random
test_zvector_a[i] = std::complex<double>{(double)rand()+(double)rand()/(RAND_MAX+1.0),(double)rand()+(double)rand()/(RAND_MAX+1.0)};
test_zvector_b[i] = std::complex<double>{(double)rand()+(double)rand()/(RAND_MAX+1.0),(double)rand()+(double)rand()/(RAND_MAX+1.0)};
test_dvector_a[i] = (double)rand()+(double)rand()/(RAND_MAX+1.0);
}

dconstant_a = (double)rand()+(double)rand()/(RAND_MAX+1.0);
dconstant_b = (double)rand()+(double)rand()/(RAND_MAX+1.0);

zconstant_a = std::complex<double>{(double)rand()+(double)rand()/(RAND_MAX+1.0),(double)rand()+(double)rand()/(RAND_MAX+1.0)};
#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM

resize_memory_op()(gpu_ctx, test_zvector_a_gpu, dim_vector);
resize_memory_op()(gpu_ctx, test_zvector_b_gpu, dim_vector);
synchronize_memory_op()(gpu_ctx, cpu_ctx, test_zvector_a_gpu, test_zvector_a, dim_vector);
synchronize_memory_op()(gpu_ctx, cpu_ctx, test_zvector_b_gpu, test_zvector_b, dim_vector);

resize_memory_op()(gpu_ctx, result_zvector_gpu, dim_vector);
resize_memory_op_double()(gpu_ctx, test_dvector_a_gpu, dim_vector);
synchronize_memory_op_double()(gpu_ctx, cpu_ctx, test_dvector_a_gpu, test_dvector_a, dim_vector);

hsolver::createGpuBlasHandle();


#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
}
void TearDown(const benchmark::State& state){
delete[] test_zvector_a;
delete[] test_zvector_b;
delete[] result_zvector;
delete[] test_dvector_a;
#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
hsolver::destoryBLAShandle();
#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
}


// OPs need benchmark
// CPU operator
using zdot_real_cpu_op = hsolver::dot_real_op<std::complex<double>, psi::DEVICE_CPU>;

using vector_div_constant_op_cpu = hsolver::vector_div_constant_op<std::complex<double>, psi::DEVICE_CPU>;
using vector_mul_vector_op_cpu = hsolver::vector_mul_vector_op<std::complex<double>, psi::DEVICE_CPU>;
using vector_div_vector_op_cpu = hsolver::vector_div_vector_op<std::complex<double>, psi::DEVICE_CPU>;
using constantvector_addORsub_constantVector_op_cpu
= hsolver::constantvector_addORsub_constantVector_op<std::complex<double>, psi::DEVICE_CPU>;
using axpy_op_cpu = hsolver::axpy_op<std::complex<double>, psi::DEVICE_CPU>;
using scal_op_cpu = hsolver::scal_op<double, psi::DEVICE_CPU>;
using gemv_op_cpu = hsolver::gemv_op<std::complex<double>, psi::DEVICE_CPU>;

#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM

// GPU operator
using zdot_real_gpu_op = hsolver::dot_real_op<std::complex<double>, psi::DEVICE_GPU>;

using vector_div_constant_op_gpu = hsolver::vector_div_constant_op<std::complex<double>, psi::DEVICE_GPU>;
using vector_mul_vector_op_gpu = hsolver::vector_mul_vector_op<std::complex<double>, psi::DEVICE_GPU>;
using vector_div_vector_op_gpu = hsolver::vector_div_vector_op<std::complex<double>, psi::DEVICE_GPU>;
using constantvector_addORsub_constantVector_op_gpu
= hsolver::constantvector_addORsub_constantVector_op<std::complex<double>, psi::DEVICE_GPU>;
using axpy_op_gpu = hsolver::axpy_op<std::complex<double>, psi::DEVICE_GPU>;
using scal_op_gpu = hsolver::scal_op<double, psi::DEVICE_GPU>;

#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
};


// Times hsolver::dot_real_op on the CPU for the two complex test vectors.
// The final `false` argument is forwarded unchanged; its meaning is defined by
// dot_real_op (NOTE(review): presumably disables a cross-process reduction — confirm).
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_zdot_real_cpu_op)(benchmark::State& state) {
    for (auto _ : state) {
        double result = zdot_real_cpu_op()(cpu_ctx, dim_vector, test_zvector_a, test_zvector_b, false);
        // The value is otherwise unused; keep the compiler from discarding the computation.
        benchmark::DoNotOptimize(result);
    }
}


// Times hsolver::vector_div_constant_op on the CPU: result_zvector is the output,
// test_zvector_a the input vector and dconstant_a the real constant.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_cpu)(benchmark::State& state)
{
    vector_div_constant_op_cpu div_by_constant;
    for (auto _ : state)
    {
        div_by_constant(cpu_ctx,
                        dim_vector,
                        result_zvector,
                        test_zvector_a,
                        dconstant_a);
    }
}

// Times hsolver::vector_mul_vector_op on the CPU: result_zvector is the output,
// test_zvector_a the complex operand and test_dvector_a the real operand.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_mul_vector_op_cpu)(benchmark::State& state)
{
    vector_mul_vector_op_cpu multiply_vectors;
    for (auto _ : state)
    {
        multiply_vectors(cpu_ctx,
                         dim_vector,
                         result_zvector,
                         test_zvector_a,
                         test_dvector_a);
    }
}

// Times hsolver::vector_div_vector_op on the CPU: result_zvector is the output,
// test_zvector_a the complex operand and test_dvector_a the real operand.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_div_vector_op_cpu)(benchmark::State& state)
{
    vector_div_vector_op_cpu divide_vectors;
    for (auto _ : state)
    {
        divide_vectors(cpu_ctx,
                       dim_vector,
                       result_zvector,
                       test_zvector_a,
                       test_dvector_a);
    }
}

// Times hsolver::constantvector_addORsub_constantVector_op on the CPU, combining
// (test_zvector_a, dconstant_a) with (test_zvector_b, dconstant_b) into result_zvector.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_constantvector_addORsub_constantVector_op_cpu)(benchmark::State& state)
{
    constantvector_addORsub_constantVector_op_cpu combine_scaled_vectors;
    for (auto _ : state)
    {
        combine_scaled_vectors(cpu_ctx,
                               dim_vector,
                               result_zvector,
                               test_zvector_a,
                               dconstant_a,
                               test_zvector_b,
                               dconstant_b);
    }
}

// Times hsolver::axpy_op on the CPU with scalar zconstant_a, vectors test_zvector_a
// and test_zvector_b, both with stride 1.
// NOTE(review): if this follows BLAS axpy semantics, test_zvector_b is updated in
// place and accumulates across iterations — confirm this is acceptable for timing.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_axpy_op_cpu)(benchmark::State& state)
{
    axpy_op_cpu axpy;
    for (auto _ : state)
    {
        axpy(cpu_ctx,
             dim_vector,
             &zconstant_a,
             test_zvector_a,
             1,
             test_zvector_b,
             1);
    }
}

// Times hsolver::scal_op on the CPU with scalar zconstant_a applied to
// test_zvector_a with stride 1.
// NOTE(review): if this follows BLAS scal semantics, test_zvector_a is rescaled in
// place every iteration, so its values may overflow to inf/NaN after enough
// iterations — confirm this does not skew the measurement.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_scal_op_cpu)(benchmark::State& state)
{
    scal_op_cpu scale;
    for (auto _ : state)
    {
        scale(cpu_ctx,
              dim_vector,
              &zconstant_a,
              test_zvector_a,
              1);
    }
}


// Register every CPU benchmark for vector lengths 1, 10, 100, ..., 10^7
// (the upper bound 10000000 equals the literal 10e6), reporting microseconds.
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_zdot_real_cpu_op)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_cpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_mul_vector_op_cpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_vector_op_cpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_constantvector_addORsub_constantVector_op_cpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_axpy_op_cpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_scal_op_cpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);


#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM

// Reference example: timing an iteration with a manual timer (register it with ->UseManualTime()).
/*
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_zdot_real_gpu_op)(benchmark::State& state) {
for (auto _ : state) {
auto start = std::chrono::high_resolution_clock::now();
double result = zdot_real_gpu_op()(gpu_ctx, dim_vector, test_zvector_a_gpu, test_zvector_b_gpu, false);
auto end = std::chrono::high_resolution_clock::now();
auto elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
state.SetIterationTime(elapsed_seconds.count());
}
}
*/

// Times hsolver::dot_real_op on the GPU for the two device-resident complex vectors.
// The final `false` argument is forwarded unchanged; its meaning is defined by
// dot_real_op (NOTE(review): presumably disables a cross-process reduction — confirm).
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_zdot_real_gpu_op)(benchmark::State& state) {
    for (auto _ : state) {
        double result = zdot_real_gpu_op()(gpu_ctx, dim_vector, test_zvector_a_gpu, test_zvector_b_gpu, false);
        // The value is otherwise unused; keep the compiler from discarding the computation.
        benchmark::DoNotOptimize(result);
    }
}

// Times hsolver::vector_div_constant_op on the GPU: result_zvector_gpu is the output,
// test_zvector_a_gpu the input vector and dconstant_a the real constant.
// NOTE(review): timed with the default CPU clock; if the op returns before the
// device finishes, only launch cost is measured — confirm.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_gpu)(benchmark::State& state)
{
    vector_div_constant_op_gpu div_by_constant;
    for (auto _ : state)
    {
        div_by_constant(gpu_ctx,
                        dim_vector,
                        result_zvector_gpu,
                        test_zvector_a_gpu,
                        dconstant_a);
    }
}

// Times hsolver::vector_mul_vector_op on the GPU: result_zvector_gpu is the output,
// test_zvector_a_gpu the complex operand and test_dvector_a_gpu the real operand.
// NOTE(review): timed with the default CPU clock; if the op returns before the
// device finishes, only launch cost is measured — confirm.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_mul_vector_op_gpu)(benchmark::State& state)
{
    vector_mul_vector_op_gpu multiply_vectors;
    for (auto _ : state)
    {
        multiply_vectors(gpu_ctx,
                         dim_vector,
                         result_zvector_gpu,
                         test_zvector_a_gpu,
                         test_dvector_a_gpu);
    }
}

// Times hsolver::vector_div_vector_op on the GPU: result_zvector_gpu is the output,
// test_zvector_a_gpu the complex operand and test_dvector_a_gpu the real operand.
// NOTE(review): timed with the default CPU clock; if the op returns before the
// device finishes, only launch cost is measured — confirm.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_div_vector_op_gpu)(benchmark::State& state)
{
    vector_div_vector_op_gpu divide_vectors;
    for (auto _ : state)
    {
        divide_vectors(gpu_ctx,
                       dim_vector,
                       result_zvector_gpu,
                       test_zvector_a_gpu,
                       test_dvector_a_gpu);
    }
}

// Times hsolver::constantvector_addORsub_constantVector_op on the GPU, combining
// (test_zvector_a_gpu, dconstant_a) with (test_zvector_b_gpu, dconstant_b) into
// result_zvector_gpu.
// NOTE(review): timed with the default CPU clock; if the op returns before the
// device finishes, only launch cost is measured — confirm.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_constantvector_addORsub_constantVector_op_gpu)(benchmark::State& state)
{
    constantvector_addORsub_constantVector_op_gpu combine_scaled_vectors;
    for (auto _ : state)
    {
        combine_scaled_vectors(gpu_ctx,
                               dim_vector,
                               result_zvector_gpu,
                               test_zvector_a_gpu,
                               dconstant_a,
                               test_zvector_b_gpu,
                               dconstant_b);
    }
}

// Times hsolver::axpy_op on the GPU with the host-side scalar zconstant_a and the
// device vectors test_zvector_a_gpu / test_zvector_b_gpu, both with stride 1.
// NOTE(review): timed with the default CPU clock; if the op returns before the
// device finishes, only launch cost is measured — confirm.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_axpy_op_gpu)(benchmark::State& state)
{
    axpy_op_gpu axpy;
    for (auto _ : state)
    {
        axpy(gpu_ctx,
             dim_vector,
             &zconstant_a,
             test_zvector_a_gpu,
             1,
             test_zvector_b_gpu,
             1);
    }
}

// Times hsolver::scal_op on the GPU with the host-side scalar zconstant_a applied
// to the device vector test_zvector_a_gpu with stride 1.
// NOTE(review): timed with the default CPU clock; if the op returns before the
// device finishes, only launch cost is measured. If the op scales in place, the
// vector is also rescaled on every iteration — confirm both.
BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_scal_op_gpu)(benchmark::State& state)
{
    scal_op_gpu scale;
    for (auto _ : state)
    {
        scale(gpu_ctx,
              dim_vector,
              &zconstant_a,
              test_zvector_a_gpu,
              1);
    }
}

// If you want to use manual timer, you can refer to this.
// BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_zdot_real_gpu_op)->RangeMultiplier(10)->Range(1,10e6)->UseManualTime()->Unit(benchmark::kMicrosecond);

// Register every GPU benchmark for vector lengths 1, 10, 100, ..., 10^7
// (the upper bound 10000000 equals the literal 10e6), reporting microseconds.
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_zdot_real_gpu_op)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_gpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_mul_vector_op_gpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_vector_op_gpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_constantvector_addORsub_constantVector_op_gpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_axpy_op_gpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_scal_op_gpu)
    ->RangeMultiplier(10)
    ->Range(1, 10000000)
    ->Unit(benchmark::kMicrosecond);

#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM


// Google Benchmark macro that supplies the program entry point and runs the
// benchmarks registered above.
BENCHMARK_MAIN();

0 comments on commit 1a10cfc

Please sign in to comment.