Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[QST]cutlass gemm error #1151

Closed
zwshan opened this issue Oct 18, 2023 · 2 comments
Closed

[QST]cutlass gemm error #1151

zwshan opened this issue Oct 18, 2023 · 2 comments

Comments

@zwshan
Copy link

zwshan commented Oct 18, 2023

What is your question?
I can run the following code without any issues.

#include <cutlass/gemm/device/gemm.h>
#include <iostream>


// Runs a double-precision CUTLASS GEMM, C = alpha * A * B + beta * C, with all
// operands declared row-major, then prints two elements of C to stdout.
//
// Parameters:
//   stream_cut  Pointer to the CUDA stream the GEMM is launched on. A null
//               pointer selects the default stream.
//   M, N, K     GEMM problem dimensions.
//   alpha, beta Epilogue scalars.
//   A, lda      Device pointer and leading dimension of source matrix A.
//   B, ldb      Device pointer and leading dimension of source matrix B.
//   C, ldc      Device pointer and leading dimension of matrix C. C is both
//               the source operand and the destination (in-place update).
//   D, ldd      Accepted for interface compatibility but currently unused:
//               the result is written back to C, not to D.
//
// Returns:
//   cudaErrorUnknown if CUTLASS reports a failure, the failing CUDA runtime
//   error code if synchronisation or the read-back copy fails, and
//   cudaSuccess otherwise.
cudaError_t cutlass_dgemm_nn(
  cudaStream_t *stream_cut,
  int M, int N, int K,
  double alpha,
  double *A, int lda,
  double *B, int ldb,
  double beta, 
  double *C, int ldc,
  double *D, int ldd
  ) {
  using ElementOutput = double;
  using ElementAccumulator = double;
  using RowMajor = cutlass::layout::RowMajor;
  using CutlassGemm = cutlass::gemm::device::Gemm<
    double,                               // Data-type of A matrix
    RowMajor,                             // Layout of A matrix
    double,                               // Data-type of B matrix
    RowMajor,                             // Layout of B matrix
    ElementOutput,                        // Data-type of C matrix
    RowMajor,                             // Layout of C matrix
    ElementAccumulator,                   // Accumulator data-type
    cutlass::arch::OpClassTensorOp,       // Tag indicating Tensor Cores
    cutlass::arch::Sm80,                  // Tag indicating target GPU compute architecture
    cutlass::gemm::GemmShape<64, 64, 16>, // Threadblock tile shape
    cutlass::gemm::GemmShape<32, 32, 16>, // Warp tile shape
    cutlass::gemm::GemmShape<8, 8, 4>     // Instruction shape
  >;

  // The destination operand is deliberately C (in-place); D is not written.
  (void)D;
  (void)ldd;

  // Fall back to the default stream when no stream is supplied.
  cudaStream_t stream = (stream_cut != nullptr) ? *stream_cut : nullptr;

  CutlassGemm gemm_operator;

  CutlassGemm::Arguments args({M , N, K},      // Gemm problem dimensions
                              {A, lda},        // Tensor-ref for source matrix A
                              {B, ldb},        // Tensor-ref for source matrix B
                              {C, ldc},        // Tensor-ref for source matrix C
                              {C, ldc},        // Tensor-ref for destination (same memory as source C)
                              {alpha, beta});  // Scalars used in the epilogue

  // Initialise and launch in one call, on the caller's stream. The operator
  // must not be invoked before arguments are supplied: the stream-only
  // overload launches with whatever parameters were last initialised.
  cutlass::Status status = gemm_operator(args, nullptr, stream);

  // Return a cudaError_t if the CUTLASS GEMM operator returned an error code.
  if (status != cutlass::Status::kSuccess) {
    return cudaErrorUnknown;
  }

  // Wait for the GEMM to finish so the read-back below observes the result
  // and any asynchronous execution error is reported here.
  cudaError_t err = cudaStreamSynchronize(stream);
  if (err != cudaSuccess) {
    return err;
  }

  // Nothing to print for an empty output.
  if (M <= 0 || N <= 0) {
    return cudaSuccess;
  }

  // Read back only the two values that are printed: the first element of C
  // and the element at linear offset M * N - 1 (computed in size_t so large
  // problems do not overflow int).
  const size_t element_count = static_cast<size_t>(M) * static_cast<size_t>(N);
  double first_value = 0.0;
  double last_value = 0.0;

  err = cudaMemcpy(&first_value, C, sizeof(double), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {
    return err;
  }
  err = cudaMemcpy(&last_value, C + (element_count - 1), sizeof(double),
                   cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {
    return err;
  }

  std::cout << first_value << std::endl;  // first element of the result
  std::cout << last_value << std::endl;   // element at offset M * N - 1

  // Return success, if no errors were encountered.
  return cudaSuccess;
}

// Fills a densely packed, row-major i_M x i_N matrix with 1.0.
//
// Parameters:
//   ptr   Destination buffer; must hold at least i_M * i_N doubles.
//   i_M   Number of rows. Non-positive values leave the buffer untouched.
//   i_N   Number of columns. Non-positive values leave the buffer untouched.
void generate_tensor_2D(double *ptr, int i_M, int i_N){
    if (i_M <= 0 || i_N <= 0) {
        return;
    }

    // Index in size_t: row * cols overflows int once the matrix holds more
    // than 2^31 elements.
    const size_t rows = static_cast<size_t>(i_M);
    const size_t cols = static_cast<size_t>(i_N);

    for (size_t r = 0; r < rows; ++r) {
        double *row = ptr + r * cols;
        for (size_t c = 0; c < cols; ++c) {
            row[c] = 1.0;
        }
    }
}

// Host driver: builds all-ones matrices A (M x K), B (K x N) and C (M x N),
// uploads them to the device, runs cutlass_dgemm_nn on the second of two
// streams, and releases every resource before exiting.
//
// Returns 0 on success and 1 if any allocation, CUDA call or the GEMM fails.
int main(int argc, const char *arg[]) {
    (void)argc;
    (void)arg;

    int M = 3840;           // M
    int N = 4096;           // N
    int K = 4096;           // K

    // Leading dimensions of densely packed row-major operands equal their
    // column counts.
    int lda = K;            // A is M x K
    int ldb = N;            // B is K x N
    int ldc = N;            // C is M x N
    int ldd = N;            // D is M x N

    double alpha = 1.0;      // alpha
    double beta = 1.0;       // beta

    size_t A_mem_size = sizeof(double) * M * K; // memory size of matrix A = M * K * sizeof(double)
    size_t B_mem_size = sizeof(double) * K * N; // memory size of matrix B = K * N * sizeof(double)
    size_t C_mem_size = sizeof(double) * M * N; // memory size of matrix C = M * N * sizeof(double)
    size_t D_mem_size = sizeof(double) * M * N; // memory size of matrix D = M * N * sizeof(double)

    // Host-side inputs. No host copy of D is needed: D is never uploaded.
    double *A = (double*)malloc(A_mem_size);
    double *B = (double*)malloc(B_mem_size);
    double *C = (double*)malloc(C_mem_size);

    // Device buffers start as nullptr so cleanup can free unconditionally.
    double *d_A = nullptr;
    double *d_B = nullptr;
    double *d_C = nullptr;
    double *d_D = nullptr;

    cudaStream_t stream[2];
    int stream_count = 0;   // number of streams successfully created

    // Reports a failed CUDA call on stderr and tells the caller to bail out.
    auto succeeded = [](cudaError_t err, const char *what) -> bool {
        if (err != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            return false;
        }
        return true;
    };

    int exit_code = 1;
    do {
        if (A == nullptr || B == nullptr || C == nullptr) {
            std::cerr << "host allocation failed" << std::endl;
            break;
        }

        generate_tensor_2D(A, M, K);
        generate_tensor_2D(B, K, N);
        generate_tensor_2D(C, M, N);

        if (!succeeded(cudaMalloc((void**)&d_A, A_mem_size), "cudaMalloc(d_A)")) break;
        if (!succeeded(cudaMalloc((void**)&d_B, B_mem_size), "cudaMalloc(d_B)")) break;
        if (!succeeded(cudaMalloc((void**)&d_C, C_mem_size), "cudaMalloc(d_C)")) break;
        if (!succeeded(cudaMalloc((void**)&d_D, D_mem_size), "cudaMalloc(d_D)")) break;

        bool streams_ok = true;
        for (int i = 0; i < 2; ++i) {
            if (!succeeded(cudaStreamCreate(&stream[i]), "cudaStreamCreate")) {
                streams_ok = false;
                break;
            }
            ++stream_count;
        }
        if (!streams_ok) break;

        if (!succeeded(cudaMemcpy(d_A, A, A_mem_size, cudaMemcpyHostToDevice), "cudaMemcpy(d_A)")) break;
        if (!succeeded(cudaMemcpy(d_B, B, B_mem_size, cudaMemcpyHostToDevice), "cudaMemcpy(d_B)")) break;
        if (!succeeded(cudaMemcpy(d_C, C, C_mem_size, cudaMemcpyHostToDevice), "cudaMemcpy(d_C)")) break;

        cudaError_t gemm_status = cutlass_dgemm_nn(&stream[1],
                                                   M, N, K,
                                                   alpha,
                                                   d_A, lda,
                                                   d_B, ldb,
                                                   beta,
                                                   d_C, ldc,
                                                   d_D, ldd
                                                   );
        if (!succeeded(gemm_status, "cutlass_dgemm_nn")) break;

        exit_code = 0;
    } while (false);

    // Release everything that was acquired, regardless of where we stopped.
    for (int i = 0; i < stream_count; ++i) {
        cudaStreamDestroy(stream[i]);
    }
    cudaFree(d_D);
    cudaFree(d_C);
    cudaFree(d_B);
    cudaFree(d_A);
    free(C);
    free(B);
    free(A);

    return exit_code;
}

When I compile the above code into a static library, `lib.a`, it compiles successfully. However, when I run a program that links against `lib.a`, I encounter the following error (repeated hundreds of times):

void cutlass::arch::Mma<cutlass::gemm::GemmShape<8, 8, 4>, 32, double, cutlass::layout::RowMajor, double, cutlass::layout::ColumnMajor, double, cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::operator()(cutlass::Array<double, 2, true> &, const cutlass::Array<double, 1, true> &, const cutlass::Array<double, 1, true> &, const cutlass::Array<double, 2, true> &) const not implemented
@zwshan zwshan changed the title [QST]cutlass gemm erro [QST]cutlass gemm error Oct 18, 2023
@hwu36
Copy link
Collaborator

hwu36 commented Oct 18, 2023

Maybe you can take a look at how cutlass_library is built (see https://github.com/NVIDIA/cutlass/blob/main/media/docs/profiler.md). Turn on KEEP in the CMake configuration to check the full command line.

@zwshan
Copy link
Author

zwshan commented Oct 19, 2023

I have solved my problem, thank you.

@zwshan zwshan closed this as completed Oct 19, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants