Merge branch 'TDDFT_GPU_phase_1' of github.com:AsTonyshment/abacus-develop into TDDFT_GPU_phase_1
deepmodeling · Jan 21, 2025 · 2bdc83f · 2bdc83f
2 parents a02a352 + 8b526a9
commit 2bdc83f
Show file tree

Hide file tree

Showing 85 changed files with 1,826 additions and 868 deletions.
diff --git a/python/pyabacus/CONTRIBUTING.md b/python/pyabacus/CONTRIBUTING.md
@@ -8,10 +8,13 @@ Welcome to the `pyabacus` project! This document provides guidelines and instruc
 
 <!-- toc -->
 
-- [Project structure](#project-structure)
+- [Developer Guide](#developer-guide)
+  - [Introduction](#introduction)
+  - [Project Structure](#project-structure)
     - [Root CMake Configuration](#root-cmake-configuration)
     - [Module CMake Configuration](#module-cmake-configuration)
-- [Development Process](#development-process)
+  - [Development Process](#development-process)
+  - [Conclusion](#conclusion)
 
 <!-- tocstop -->
 
@@ -187,7 +190,7 @@ list(APPEND _diago
     ${HSOLVER_PATH}/diag_const_nums.cpp
     ${HSOLVER_PATH}/diago_iter_assist.cpp
     ${HSOLVER_PATH}/kernels/dngvd_op.cpp
-    ${HSOLVER_PATH}/kernels/math_kernel_op.cpp
+    ${BASE_PATH}/kernels/math_kernel_op.cpp
     ${BASE_PATH}/kernels/math_op.cpp
     ${BASE_PATH}/module_device/device.cpp
     ${BASE_PATH}/module_device/memory_op.cpp

diff --git a/python/pyabacus/src/ModuleBase/CMakeLists.txt b/python/pyabacus/src/ModuleBase/CMakeLists.txt
@@ -1,6 +1,7 @@
 list(APPEND pymodule_base
     ${PROJECT_SOURCE_DIR}/src/ModuleBase/py_base_math.cpp
     ${BASE_PATH}/kernels/math_op.cpp
+    ${BASE_PATH}/kernels/math_kernel_op.cpp
     ${BASE_PATH}/module_device/memory_op.cpp
     ${BASE_PATH}/module_device/device.cpp
     )

diff --git a/python/pyabacus/src/ModuleNAO/CMakeLists.txt b/python/pyabacus/src/ModuleNAO/CMakeLists.txt
@@ -14,6 +14,7 @@ list(APPEND _naos
     ${NAO_PATH}/two_center_table.cpp
     # dependency
     ${ABACUS_SOURCE_DIR}/module_base/kernels/math_op.cpp
+    ${ABACUS_SOURCE_DIR}/module_base/kernels/math_kernel_op.cpp
     # ${ABACUS_SOURCE_DIR}/module_psi/kernels/psi_memory_op.cpp
     ${ABACUS_SOURCE_DIR}/module_base/module_device/memory_op.cpp
     ${ABACUS_SOURCE_DIR}/module_base/module_device/device.cpp

diff --git a/python/pyabacus/src/hsolver/CMakeLists.txt b/python/pyabacus/src/hsolver/CMakeLists.txt
@@ -10,8 +10,8 @@ list(APPEND _diago
 
 
     ${HSOLVER_PATH}/kernels/dngvd_op.cpp
-    ${HSOLVER_PATH}/kernels/math_kernel_op.cpp
     # dependency
+    ${BASE_PATH}/kernels/math_kernel_op.cpp
     ${BASE_PATH}/kernels/math_op.cpp
     ${BASE_PATH}/module_device/device.cpp
     ${BASE_PATH}/module_device/memory_op.cpp

diff --git a/python/pyabacus/src/hsolver/py_hsolver.cpp b/python/pyabacus/src/hsolver/py_hsolver.cpp
@@ -6,7 +6,7 @@
 #include <pybind11/numpy.h>
 
 #include "module_hsolver/diago_dav_subspace.h"
-#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_base/kernels/math_kernel_op.h"
 #include "module_base/module_device/types.h"
 
 #include "./py_diago_dav_subspace.hpp"

diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
@@ -36,14 +36,14 @@ list(APPEND device_srcs
   module_hamilt_pw/hamilt_stodft/kernels/hpsi_norm_op.cpp
   module_basis/module_pw/kernels/pw_op.cpp
   module_hsolver/kernels/dngvd_op.cpp
-  module_hsolver/kernels/math_kernel_op.cpp
   module_elecstate/kernels/elecstate_op.cpp
 
   # module_psi/kernels/psi_memory_op.cpp
   # module_psi/kernels/device.cpp
 
   module_base/module_device/device.cpp
   module_base/module_device/memory_op.cpp
+  module_base/kernels/math_kernel_op.cpp
 
   module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
   module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -64,7 +64,6 @@ if(USE_CUDA)
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu
     module_basis/module_pw/kernels/cuda/pw_op.cu
     module_hsolver/kernels/cuda/dngvd_op.cu
-    module_hsolver/kernels/cuda/math_kernel_op.cu
     module_elecstate/kernels/cuda/elecstate_op.cu
 
     # module_psi/kernels/cuda/memory_op.cu
@@ -75,6 +74,7 @@ if(USE_CUDA)
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/wf_op.cu
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/vnl_op.cu
     module_base/kernels/cuda/math_op.cu
+    module_base/kernels/cuda/math_kernel_op.cu
     module_hamilt_general/module_xc/kernels/cuda/xc_functional_op.cu
   )
 endif()
@@ -89,7 +89,6 @@ if(USE_ROCM)
     module_hamilt_pw/hamilt_stodft/kernels/rocm/hpsi_norm_op.hip.cu
     module_basis/module_pw/kernels/rocm/pw_op.hip.cu
     module_hsolver/kernels/rocm/dngvd_op.hip.cu
-    module_hsolver/kernels/rocm/math_kernel_op.hip.cu
     module_elecstate/kernels/rocm/elecstate_op.hip.cu
 
     # module_psi/kernels/rocm/memory_op.hip.cu
@@ -99,6 +98,7 @@ if(USE_ROCM)
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/wf_op.hip.cu
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/vnl_op.hip.cu
+    module_base/kernels/rocm/math_kernel_op.hip.cu
     module_base/kernels/rocm/math_op.hip.cu
     module_hamilt_general/module_xc/kernels/rocm/xc_functional_op.hip.cu
   )

diff --git a/source/Makefile.Objects b/source/Makefile.Objects
@@ -146,11 +146,13 @@ OBJS_BASE=abfs-vector3_order.o\
     math_bspline.o\
     math_chebyshev.o\
     math_op.o\
+    math_kernel_op.o\
     mathzone_add1.o\
     matrix.o\
     matrix3.o\
     memory.o\
     mymath.o\
+    para_gemm.o\
     realarray.o\
     sph_bessel_recursive-d1.o\
     sph_bessel_recursive-d2.o\
@@ -336,7 +338,6 @@ OBJS_HSOLVER=diago_cg.o\
     hsolver_lcaopw.o\
     hsolver_pw_sdft.o\
     diago_iter_assist.o\
-    math_kernel_op.o\
     dngvd_op.o\
     diag_const_nums.o\
     diag_hs_para.o\

diff --git a/source/module_base/CMakeLists.txt b/source/module_base/CMakeLists.txt
@@ -37,6 +37,7 @@ add_library(
     mymath.cpp
     opt_CG.cpp
     opt_DCsrch.cpp
+    para_gemm.cpp
     realarray.cpp
     sph_bessel_recursive-d1.cpp
     sph_bessel_recursive-d2.cpp

diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp
@@ -10,7 +10,7 @@
 #include <base/macros/macros.h>
 #include <cuda_runtime.h>
 #include "cublas_v2.h"
-#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_base/kernels/math_kernel_op.h"
 #include "module_base/module_device/memory_op.h"
 
 
@@ -668,7 +668,7 @@ void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vec
 	}
 	else if (device_type == base_device::AbacusDevice_t::GpuDevice){
 #ifdef __CUDA
-		hsolver::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
+		ModuleBase::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
 #endif
 	}
 }
@@ -688,7 +688,7 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec
 	}
 	else if (device_type == base_device::AbacusDevice_t::GpuDevice){
 #ifdef __CUDA
-		hsolver::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
+		ModuleBase::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
 #endif
 	}
 }
@@ -706,7 +706,7 @@ void vector_add_vector(const int& dim, float *result, const float *vector1, cons
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		hsolver::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -724,7 +724,7 @@ void vector_add_vector(const int& dim, double *result, const double *vector1, co
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		hsolver::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -742,7 +742,7 @@ void vector_add_vector(const int& dim, std::complex<float> *result, const std::c
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		hsolver::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -760,7 +760,7 @@ void vector_add_vector(const int& dim, std::complex<double> *result, const std::
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		hsolver::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu
@@ -1,5 +1,5 @@
 #include "module_base/module_device/memory_op.h"
-#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_base/kernels/math_kernel_op.h"
 #include "module_psi/psi.h"
 #include "module_base/tool_quit.h"
 
@@ -9,7 +9,7 @@
 #include <thrust/execution_policy.h>
 #include <thrust/inner_product.h>
 
-namespace hsolver
+namespace ModuleBase
 {
 const int warp_size = 32;
 // const unsigned int full_mask = 0xffffffff;
@@ -24,7 +24,7 @@ template <>
 struct GetTypeReal<thrust::complex<double>> {
     using type = double; /**< The return type specialization for std::complex<double>. */
 };
-namespace hsolver {
+namespace ModuleBase {
 template <typename T>
 struct GetTypeThrust {
     using type = T;
@@ -817,6 +817,27 @@ void scal_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEV
     cublasErrcheck(cublasZscal(cublas_handle, N, (double2*)alpha, (double2*)X, incx));
 }
 
+template <>
+void gemm_op<float, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+                                                         const char& transa,
+                                                         const char& transb,
+                                                         const int& m,
+                                                         const int& n,
+                                                         const int& k,
+                                                         const float* alpha,
+                                                         const float* a,
+                                                         const int& lda,
+                                                         const float* b,
+                                                         const int& ldb,
+                                                         const float* beta,
+                                                         float* c,
+                                                         const int& ldc)
+{
+    cublasOperation_t cutransA = judge_trans_op(false, transa, "gemm_op");
+    cublasOperation_t cutransB = judge_trans_op(false, transb, "gemm_op");
+    cublasErrcheck(cublasSgemm(cublas_handle, cutransA, cutransB, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
+}
+
 template <>
 void gemm_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
                                                           const char& transa,
@@ -1060,4 +1081,4 @@ template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
-}  // namespace hsolver
+}  // namespace ModuleBase
diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_base/kernels/math_kernel_op.cpp
@@ -1,9 +1,9 @@
-#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_base/kernels/math_kernel_op.h"
 
 #include <iomanip>
 #include <iostream>
 
-namespace hsolver
+namespace ModuleBase
 {
 
 template <typename T>

diff --git a/source/module_hsolver/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h
@@ -17,7 +17,7 @@
 #include "cublas_v2.h"
 #endif //__CUDA || __UT_USE_CUDA
 
-namespace hsolver {
+namespace ModuleBase {
 
 inline std::complex<double> set_real_tocomplex(const std::complex<double> &x) {
   return {x.real(), 0.0};

diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
@@ -1,5 +1,5 @@
 #include "module_base/module_device/memory_op.h"
-#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_base/kernels/math_kernel_op.h"
 #include "module_psi/psi.h"
 #include "module_base/tool_quit.h"
 
@@ -20,7 +20,7 @@ struct GetTypeReal<thrust::complex<double>> {
     using type = double; /**< The return type specialization for std::complex<double>. */
 };
 
-namespace hsolver {
+namespace ModuleBase {
 
 template <typename T>
 struct GetTypeThrust {
@@ -735,6 +735,27 @@ void scal_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEV
     hipblasErrcheck(hipblasZscal(cublas_handle, N, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)X, incx));
 }
 
+template <>
+void gemm_op<float, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+                                                         const char& transa,
+                                                         const char& transb,
+                                                         const int& m,
+                                                         const int& n,
+                                                         const int& k,
+                                                         const float* alpha,
+                                                         const float* a,
+                                                         const int& lda,
+                                                         const float* b,
+                                                         const int& ldb,
+                                                         const float* beta,
+                                                         float* c,
+                                                         const int& ldc)
+{
+    hipblasOperation_t cutransA = judge_trans_op(false, transa, "gemm_op");
+    hipblasOperation_t cutransB = judge_trans_op(false, transb, "gemm_op");
+    hipblasErrcheck(hipblasSgemm(cublas_handle, cutransA, cutransB, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
+}
+
 template <>
 void gemm_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
                                                           const char& transa,
@@ -968,4 +989,4 @@ template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
-}  // namespace hsolver
+}  // namespace ModuleBase
diff --git a/source/module_base/kernels/test/CMakeLists.txt b/source/module_base/kernels/test/CMakeLists.txt
@@ -3,6 +3,5 @@ remove_definitions(-D__MPI)
 AddTest(
     TARGET Base_Kernels_UTs
     LIBS parameter ${math_libs} base device 
-    SOURCES math_op_test.cpp
+    SOURCES math_op_test.cpp math_kernel_test.cpp
 )
-