
Matmul verify #1

Open
com9009 opened this issue Oct 2, 2019 · 0 comments
com9009 commented Oct 2, 2019

I'm testing matmul and found a strange result: the output of grnn_matmul looks shifted one element to the left compared with the reference GEMM. The verification code and the output are attached below.

// Project and standard library includes
#include "LSTM.h"
#include "misc.h"

#include <iostream>
#include <sstream>
#include <vector>
#include <cmath>
#include <cstdlib>
#include <sys/time.h>
#include <cassert>
#include <limits>
#include <cstring>
#include <cooperative_groups.h>
#define MM_BLOCK_SIZE 16
#define MM_REG_TILE 4
#define MM_TILE_SIZE 64


/// Naive reference GEMM computation.
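/// Indexes A, B, and C column-major: element (i, j) is stored at i + j * ld.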
__global__ void ReferenceGemm_kernel(
  int M,
  int N,
  int K,
  float alpha,
  float const *A,
  int lda,
  float const *B,
  int ldb,
  float beta,
  float *C,
  int ldc) {

  int i = threadIdx.x + blockIdx.x * blockDim.x;
  int j = threadIdx.y + blockIdx.y * blockDim.y;

  if (i < M && j < N) {
    float accumulator = 0;

    for (int k = 0; k < K; ++k) {
      accumulator += A[i + k * lda] * B[k + j * ldb];
    }

    C[i + j * ldc] = accumulator; //alpha * accumulator + beta * C[i + j * ldc];
  }
}

/// Reference GEMM computation.
cudaError_t ReferenceGemm(
  int M,
  int N,
  int K,
  float alpha,
  float const *A,
  int lda,
  float const *B,
  int ldb,
  float beta,
  float *C,
  int ldc) {

  dim3 block(16, 16);
  dim3 grid(
    (M + block.x - 1) / block.x,
    (N + block.y - 1) / block.y
  );

    ReferenceGemm_kernel<<< grid, block >>>(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);

  return cudaGetLastError();
}

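// GRNN tiled matmul: each 16x16 thread block computes a 64x64 tile of C,
// each thread a 4x4 register tile. A, B, and C are indexed row-major
// (A[row * K + col], B[row * N + col], C[row * N + col]).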
__global__ void grnn_matmul(float * A, float * B, float * C,
                       uint32_t M, uint32_t K, uint32_t N) {

  extern __shared__ float base[];
  float* bufferA = base;
  float* bufferB = &bufferA[MM_TILE_SIZE * MM_TILE_SIZE];

  float regA[MM_REG_TILE]; //4
  float regB[MM_REG_TILE]; // 4
  float regC[MM_REG_TILE][MM_REG_TILE]; // 16

  uint32_t tidx = threadIdx.x;
  uint32_t tidy = threadIdx.y;
  uint32_t id = threadIdx.y * blockDim.x + threadIdx.x;
  uint32_t bidx = blockIdx.x;
  uint32_t bidy = blockIdx.y;

  // Number of coalesced load steps needed to fill one tile (each step loads one element per thread)
  constexpr uint32_t LOAD_STEPS = MM_TILE_SIZE * MM_TILE_SIZE / (MM_BLOCK_SIZE * MM_BLOCK_SIZE);
  constexpr uint32_t NUM_THREADS = MM_BLOCK_SIZE * MM_BLOCK_SIZE;

  // Zero the intermediate output
  for (uint32_t y = 0; y < MM_REG_TILE; y++) {
    for (uint32_t x = 0; x < MM_REG_TILE; x++) {
      regC[y][x] = 0.0f;
    }
  }

  for (uint32_t i = 0; i < K; i += MM_TILE_SIZE) {

    // Load lhs tile from global memory to shared memory (fully coalesced)
    #pragma unroll
    for (uint32_t j = 0; j < LOAD_STEPS; j++) {
      uint32_t index = j * NUM_THREADS + id;
      if (((bidy * MM_TILE_SIZE + index / MM_TILE_SIZE) < M) && ((i + index % MM_TILE_SIZE) < K)) {
        bufferA[index] = A[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * K + i + index % MM_TILE_SIZE];
      } else {
        bufferA[index] = 0.0f;
      }
    }

    // Not necessary for correctness, but improves performance by avoiding thrashing shared memory
    __syncthreads();

    // Load rhs tile from global memory to shared memory (fully coalesced)
    #pragma unroll
    for (uint32_t j = 0; j < LOAD_STEPS; j++) {
      uint32_t index = j * NUM_THREADS + id;
      if (((i + index / MM_TILE_SIZE) < K) && ((bidx * MM_TILE_SIZE + index % MM_TILE_SIZE) < N)) {
        bufferB[index] = B[ ((index / MM_TILE_SIZE) + i) * N + bidx * MM_TILE_SIZE + index % MM_TILE_SIZE];
      } else {
        bufferB[index] = 0.0f;
      }
    }

    // Ensures all data is written from global memory to shared memory before it is streamed
    // into register arrays.
    __syncthreads();



    // Loop through full tile
    for (uint32_t j  = 0; j < MM_TILE_SIZE; j++) {

      // Load vector from lhs and rhs
      #pragma unroll
      for (uint32_t l = 0; l < MM_REG_TILE; l++) {
        regA[l] = bufferA[(tidy * MM_REG_TILE + l) * MM_TILE_SIZE + j];
        regB[l] = bufferB[j * MM_TILE_SIZE + tidx * MM_REG_TILE + l];
      }

      #pragma unroll
      // Perform a narrow matmul
      for (uint32_t y = 0; y < MM_REG_TILE; y++) {
        for (uint32_t x = 0; x < MM_REG_TILE; x++) {
          regC[y][x] += regA[y] * regB[x];
        }
      }
    }

    __syncthreads();
  }

  // Write register intermediates to shared memory (possibly unnecessary)
  for (uint32_t y = 0; y < MM_REG_TILE; y++) {
    for (uint32_t x = 0; x < MM_REG_TILE; x++) {
      bufferA[(tidy * MM_REG_TILE + y) * MM_TILE_SIZE + tidx * MM_REG_TILE + x] = regC[y][x];
    }
  }

  __syncthreads();


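  // Write the output tile from shared memory back to global memory, with bounds checks against M and N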
  for (uint32_t j = 0; j < LOAD_STEPS; j++) {
    uint32_t index = j * NUM_THREADS + id;
    if (((bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) < M) && ((bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)) < N)) {
      C[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * N + bidx * MM_TILE_SIZE +  (index % MM_TILE_SIZE)] = bufferA[index];
    }
  }
}
/// Kernel to initialize a matrix with small integers.
__global__ void InitializeMatrix_kernel(
  float *matrix,
  int ldm,
  int rows,
  int columns,
  int seed = 0) {

  int i = threadIdx.x + blockIdx.x * blockDim.x;
  int j = threadIdx.y + blockIdx.y * blockDim.y;

  if (i < rows && j < columns) {
    int offset = i + j * ldm;

    // Generate arbitrary elements.
    int const k = 16807;
    int const m = 16;
    float value = float(((offset + seed) * k % m) - m / 2);

    matrix[offset] = value;
  }
}

/// Simple function to initialize a matrix to arbitrary small integers.
cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) {

  dim3 block(16, 16);
  dim3 grid(
    (rows + block.x - 1) / block.x,
    (columns + block.y - 1) / block.y
  );

  InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);

  return cudaGetLastError();
}

/// Allocates device memory for a matrix then fills with arbitrary small integers.
cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) {
  cudaError_t result;

  size_t sizeof_matrix = sizeof(float) * ldm * columns;

  // Allocate device memory.
  result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);

  if (result != cudaSuccess) {
    std::cerr << "Failed to allocate matrix: "
      << cudaGetErrorString(result) << std::endl;
    return result;
  }

  // Clear the allocation.
  result = cudaMemset(*matrix, 0, sizeof_matrix);

  if (result != cudaSuccess) {
    std::cerr << "Failed to clear matrix device memory: "
      << cudaGetErrorString(result) << std::endl;
    return result;
  }

  // Initialize matrix elements to arbitrary small integers.
  result = InitializeMatrix(*matrix, ldm, rows, columns, seed);

  if (result != cudaSuccess) {
    std::cerr << "Failed to initialize matrix: "
      << cudaGetErrorString(result) << std::endl;
    return result;
  }

  return result;
}


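// Build 32x32 test matrices, run the reference GEMM and grnn_matmul on the same inputs, and print both results.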
int main(int argc, const char *arg[]) {
	float alpha = 1;
	float beta = 0;
	int M = 32;
	int K = 32;
	int N = 32;

	float *A;
	float *B;
	float *C_grnn;
	float *C_reference;

	AllocateMatrix(&A, M, M, K, 1);
	AllocateMatrix(&B, K, K, N, 2);
	AllocateMatrix(&C_grnn, M, M, N, 0);
	AllocateMatrix(&C_reference, M, M, N, 0);

	// Reference GEMM
	ReferenceGemm(M, N, K, alpha, A, M, B, K, beta, C_reference, M);

	// grnn
	dim3 mm_grid = dim3((N + MM_TILE_SIZE - 1) / MM_TILE_SIZE, (M + MM_TILE_SIZE - 1) / MM_TILE_SIZE);
	dim3 mm_block = dim3(MM_BLOCK_SIZE, MM_BLOCK_SIZE);

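	// cudaLaunchKernel takes the kernel arguments as an array of pointers to each argument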
	void* paramsMM[6];
	paramsMM[0] = (void*) &A;
	paramsMM[1] = (void*) &B;
	paramsMM[2] = (void*) &C_grnn;
	paramsMM[3] = (void*) &M;
	paramsMM[4] = (void*) &K;
	paramsMM[5] = (void*) &N;

	size_t mm_sm_requirement = MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float);

	cudaLaunchKernel((void *)grnn_matmul, mm_grid, mm_block, paramsMM, mm_sm_requirement);

	// verification of both results
	std::vector<float> reference_result(M * N, 0);
	std::vector<float> grnn_result(M * N, 0);
	size_t sizeof_C = sizeof(float) * M * N;
	cudaMemcpy(reference_result.data(), C_reference, sizeof_C, cudaMemcpyDeviceToHost);
	cudaMemcpy(grnn_result.data(), C_grnn, sizeof_C, cudaMemcpyDeviceToHost);

	std::cout << "Reference" << std::endl;
	for(int i=0; i<M*N; i++){
		std::cout << reference_result[i] << ' ';
		if ((i+1)%N == 0){
			std::cout << std::endl;
		}
	}

	std::cout << "GRNN" << std::endl;
	for(int i=0; i<M*N; i++){
		std::cout << grnn_result[i] << ' ';
		if ((i+1)%N == 0){
			std::cout << std::endl;
		}
	}
}
Reference
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
(all 32 rows of the Reference output are identical)
GRNN
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
(all 32 rows of the GRNN output are identical)
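
Looking at the verify code again, the shift may come from a storage-layout mismatch rather than from grnn_matmul itself. InitializeMatrix_kernel and ReferenceGemm_kernel index the matrices column-major (element (i, j) at i + j * ld), while grnn_matmul reads and writes the same buffers row-major (A[i * K + k], B[k * N + j], C[i * N + j]). Because the init pattern repeats with period 16 and the seeds for A and B differ by one step, the two conventions appear to produce rows that look shifted by one element. The host-side sketch below is not part of the original test (the helper init_value is made up here, mirroring InitializeMatrix_kernel); it reproduces the same init pattern on the CPU and computes the product under both conventions, so each GPU result can be compared against the matching host result.

// Host-side layout check (sketch, not part of the original issue code).
#include <cstdio>
#include <vector>

static float init_value(int offset, int seed) {
  // Same generator as InitializeMatrix_kernel: k = 16807, m = 16.
  return float(((offset + seed) * 16807 % 16) - 16 / 2);
}

int main() {
  const int M = 32, N = 32, K = 32;
  std::vector<float> A(M * K), B(K * N), C_col(M * N, 0.0f), C_row(M * N, 0.0f);

  // AllocateMatrix fills each buffer by linear offset, column-major with ld = rows.
  for (int j = 0; j < K; ++j)
    for (int i = 0; i < M; ++i) A[i + j * M] = init_value(i + j * M, 1);
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < K; ++i) B[i + j * K] = init_value(i + j * K, 2);

  // Column-major product, the way ReferenceGemm_kernel indexes the buffers.
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < M; ++i)
      for (int k = 0; k < K; ++k)
        C_col[i + j * M] += A[i + k * M] * B[k + j * K];

  // Row-major product over the *same* buffers, the way grnn_matmul indexes them.
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      for (int k = 0; k < K; ++k)
        C_row[i * N + j] += A[i * K + k] * B[k * N + j];

  // Print the first line of each result in plain memory order, matching the
  // print loop above (one line per N consecutive elements).
  printf("column-major host product, first row in memory order:\n");
  for (int p = 0; p < N; ++p) printf("%g ", C_col[p]);
  printf("\nrow-major host product, first row in memory order:\n");
  for (int p = 0; p < N; ++p) printf("%g ", C_row[p]);
  printf("\n");
}

If the column-major host product matches the Reference output and the row-major host product matches the GRNN output, both kernels are internally consistent, and the comparison just needs to use a single convention (for example, initialize and read back row-major when testing grnn_matmul, or transpose one result before comparing).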