I'm testing matmul and I found a strange result: the output looks like it has been shifted to the left. I've appended my verification code and the results below.
```cpp
// Standard Library includes
#include "LSTM.h"
#include "misc.h"
#include <iostream>
#include <sstream>
#include <vector>
#include <cmath>
#include <cstdlib>
#include <sys/time.h>
#include <cassert>
#include <limits>
#include <cstring>
#include <cooperative_groups.h>

#define MM_BLOCK_SIZE 16
#define MM_REG_TILE 4
#define MM_TILE_SIZE 64

/// Naive reference GEMM computation (column-major indexing: A[i + k * lda]).
__global__ void ReferenceGemm_kernel(
    int M, int N, int K,
    float alpha, float const *A, int lda,
    float const *B, int ldb,
    float beta, float *C, int ldc) {

  int i = threadIdx.x + blockIdx.x * blockDim.x;
  int j = threadIdx.y + blockIdx.y * blockDim.y;

  if (i < M && j < N) {
    float accumulator = 0;
    for (int k = 0; k < K; ++k) {
      accumulator += A[i + k * lda] * B[k + j * ldb];
    }
    C[i + j * ldc] = accumulator; // alpha * accumulator + beta * C[i + j * ldc];
  }
}

/// Reference GEMM computation.
cudaError_t ReferenceGemm(
    int M, int N, int K,
    float alpha, float const *A, int lda,
    float const *B, int ldb,
    float beta, float *C, int ldc) {

  dim3 block(16, 16);
  dim3 grid(
      (M + block.x - 1) / block.x,
      (N + block.y - 1) / block.y);

  ReferenceGemm_kernel<<< grid, block >>>(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
  return cudaGetLastError();
}

/// Tiled GEMM (row-major indexing: A[row * K + col]).
__global__ void grnn_matmul(float *A, float *B, float *C,
                            uint32_t M, uint32_t K, uint32_t N) {
  extern __shared__ float base[];
  float *bufferA = base;
  float *bufferB = &bufferA[MM_TILE_SIZE * MM_TILE_SIZE];

  float regA[MM_REG_TILE];              // 4
  float regB[MM_REG_TILE];              // 4
  float regC[MM_REG_TILE][MM_REG_TILE]; // 16

  uint32_t tidx = threadIdx.x;
  uint32_t tidy = threadIdx.y;
  uint32_t id   = threadIdx.y * blockDim.x + threadIdx.x;
  uint32_t bidx = blockIdx.x;
  uint32_t bidy = blockIdx.y;

  // Number of rows that are traversed in a single fully coalesced load sequence
  constexpr uint32_t LOAD_STEPS  = MM_TILE_SIZE * MM_TILE_SIZE / (MM_BLOCK_SIZE * MM_BLOCK_SIZE);
  constexpr uint32_t NUM_THREADS = MM_BLOCK_SIZE * MM_BLOCK_SIZE;

  // Zero the intermediate output
  for (uint32_t y = 0; y < MM_REG_TILE; y++) {
    for (uint32_t x = 0; x < MM_REG_TILE; x++) {
      regC[y][x] = 0.0f;
    }
  }

  for (uint32_t i = 0; i < K; i += MM_TILE_SIZE) {
    // Load lhs tile from global memory to shared memory (fully coalesced)
    #pragma unroll
    for (uint32_t j = 0; j < LOAD_STEPS; j++) {
      uint32_t index = j * NUM_THREADS + id;
      if (((bidy * MM_TILE_SIZE + index / MM_TILE_SIZE) < M) &&
          ((i + index % MM_TILE_SIZE) < K)) {
        bufferA[index] =
            A[(bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * K + i + index % MM_TILE_SIZE];
      } else {
        bufferA[index] = 0.0f;
      }
    }

    // Not necessary for correctness, but improves performance by avoiding thrashing shared memory
    __syncthreads();

    // Load rhs tile from global memory to shared memory (fully coalesced)
    #pragma unroll
    for (uint32_t j = 0; j < LOAD_STEPS; j++) {
      uint32_t index = j * NUM_THREADS + id;
      if (((i + index / MM_TILE_SIZE) < K) &&
          ((bidx * MM_TILE_SIZE + index % MM_TILE_SIZE) < N)) {
        bufferB[index] =
            B[((index / MM_TILE_SIZE) + i) * N + bidx * MM_TILE_SIZE + index % MM_TILE_SIZE];
      } else {
        bufferB[index] = 0.0f;
      }
    }

    // Ensures all data is written from global memory to shared memory before it is streamed
    // into register arrays.
    __syncthreads();

    // Loop through full tile
    for (uint32_t j = 0; j < MM_TILE_SIZE; j++) {
      // Load vector from lhs and rhs
      #pragma unroll
      for (uint32_t l = 0; l < MM_REG_TILE; l++) {
        regA[l] = bufferA[(tidy * MM_REG_TILE + l) * MM_TILE_SIZE + j];
        regB[l] = bufferB[j * MM_TILE_SIZE + tidx * MM_REG_TILE + l];
      }

      // Perform a narrow matmul
      #pragma unroll
      for (uint32_t y = 0; y < MM_REG_TILE; y++) {
        for (uint32_t x = 0; x < MM_REG_TILE; x++) {
          regC[y][x] += regA[y] * regB[x];
        }
      }
    }

    __syncthreads();
  }

  // Write register intermediates to shared memory (possibly unnecessary)
  for (uint32_t y = 0; y < MM_REG_TILE; y++) {
    for (uint32_t x = 0; x < MM_REG_TILE; x++) {
      bufferA[(tidy * MM_REG_TILE + y) * MM_TILE_SIZE + tidx * MM_REG_TILE + x] = regC[y][x];
    }
  }

  __syncthreads();

  for (uint32_t j = 0; j < LOAD_STEPS; j++) {
    uint32_t index = j * NUM_THREADS + id;
    if (((bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) < M) &&
        ((bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)) < N)) {
      C[(bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * N +
        bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)] = bufferA[index];
    }
  }
}

/// Kernel to initialize a matrix with small integers.
__global__ void InitializeMatrix_kernel(
    float *matrix, int ldm, int rows, int columns, int seed = 0) {

  int i = threadIdx.x + blockIdx.x * blockDim.x;
  int j = threadIdx.y + blockIdx.y * blockDim.y;

  if (i < rows && j < columns) {
    int offset = i + j * ldm;

    // Generate arbitrary elements.
    int const k = 16807;
    int const m = 16;
    float value = float(((offset + seed) * k % m) - m / 2);

    matrix[offset] = value;
  }
}

/// Simple function to initialize a matrix to arbitrary small integers.
cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) {
  dim3 block(16, 16);
  dim3 grid(
      (rows + block.x - 1) / block.x,
      (columns + block.y - 1) / block.y);

  InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);
  return cudaGetLastError();
}

/// Allocates device memory for a matrix then fills it with arbitrary small integers.
cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) {
  cudaError_t result;
  size_t sizeof_matrix = sizeof(float) * ldm * columns;

  // Allocate device memory.
  result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);
  if (result != cudaSuccess) {
    std::cerr << "Failed to allocate matrix: " << cudaGetErrorString(result) << std::endl;
    return result;
  }

  // Clear the allocation.
  result = cudaMemset(*matrix, 0, sizeof_matrix);
  if (result != cudaSuccess) {
    std::cerr << "Failed to clear matrix device memory: " << cudaGetErrorString(result) << std::endl;
    return result;
  }

  // Initialize matrix elements to arbitrary small integers.
  result = InitializeMatrix(*matrix, ldm, rows, columns, seed);
  if (result != cudaSuccess) {
    std::cerr << "Failed to initialize matrix: " << cudaGetErrorString(result) << std::endl;
    return result;
  }

  return result;
}

int main(int argc, const char *arg[]) {
  float alpha = 1;
  float beta = 0;

  int M = 32;
  int K = 32;
  int N = 32;

  float *A;
  float *B;
  float *C_grnn;
  float *C_reference;

  AllocateMatrix(&A, M, M, K, 1);
  AllocateMatrix(&B, K, K, N, 2);
  AllocateMatrix(&C_grnn, M, M, N, 0);
  AllocateMatrix(&C_reference, M, M, N, 0);

  // Reference GEMM
  ReferenceGemm(M, N, K, alpha, A, M, B, K, beta, C_reference, M);

  // grnn
  dim3 mm_grid  = dim3((N + MM_TILE_SIZE - 1) / MM_TILE_SIZE,
                       (M + MM_TILE_SIZE - 1) / MM_TILE_SIZE);
  dim3 mm_block = dim3(MM_BLOCK_SIZE, MM_BLOCK_SIZE);

  void *paramsMM[6];
  paramsMM[0] = (void *)&A;
  paramsMM[1] = (void *)&B;
  paramsMM[2] = (void *)&C_grnn;
  paramsMM[3] = (void *)&M;
  paramsMM[4] = (void *)&K;
  paramsMM[5] = (void *)&N;

  size_t mm_sm_requirement = MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float);
  cudaLaunchKernel((void *)grnn_matmul, mm_grid, mm_block, paramsMM, mm_sm_requirement);

  // Verification of both results
  std::vector<float> reference_result(M * N, 0);
  std::vector<float> grnn_result(M * N, 0);

  size_t sizeof_C = sizeof(float) * M * N;
  cudaMemcpy(reference_result.data(), C_reference, sizeof_C, cudaMemcpyDeviceToHost);
  cudaMemcpy(grnn_result.data(), C_grnn, sizeof_C, cudaMemcpyDeviceToHost);

  std::cout << "Reference" << std::endl;
  for (int i = 0; i < M * N; i++) {
    std::cout << reference_result[i] << ' ';
    if ((i + 1) % N == 0) {
      std::cout << std::endl;
    }
  }

  std::cout << "GRNN" << std::endl;
  for (int i = 0; i < M * N; i++) {
    std::cout << grnn_result[i] << ' ';
    if ((i + 1) % N == 0) {
      std::cout << std::endl;
    }
  }
}
```
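One caveat with this repro: the return codes of `cudaLaunchKernel` and the `cudaMemcpy` calls are ignored, so a failed launch would silently leave `C_grnn` as the zeroed allocation. A minimal error-checking sketch (my own addition using only standard CUDA runtime calls, with the same variables as above, not part of GRNN itself) could look like:

```cpp
// Sketch: check the launch result and synchronize before reading back C_grnn,
// so that launch-time and runtime errors surface instead of being dropped.
cudaError_t err = cudaLaunchKernel((void *)grnn_matmul, mm_grid, mm_block,
                                   paramsMM, mm_sm_requirement, 0);
if (err != cudaSuccess) {
  std::cerr << "grnn_matmul launch failed: " << cudaGetErrorString(err) << std::endl;
}
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
  std::cerr << "grnn_matmul execution failed: " << cudaGetErrorString(err) << std::endl;
}
```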
Output (both matrices are 32×32; all 32 rows within each matrix are identical, so one row is shown per matrix):

```
Reference (this row repeated 32 times):
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128

GRNN (this row repeated 32 times):
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
```
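To make the "shifted left" observation concrete, here is a small host-side check (a sketch assuming the `reference_result` and `grnn_result` vectors from the code above, placed after the two `cudaMemcpy` calls). It tests whether every GRNN element equals the reference element one position later in the flattened row-major output:

```cpp
// Sketch: test the one-element-left-shift hypothesis over the flattened output.
bool shifted_by_one = true;
for (int i = 0; i + 1 < M * N; i++) {
  if (grnn_result[i] != reference_result[i + 1]) {
    shifted_by_one = false;
    break;
  }
}
std::cout << (shifted_by_one
                  ? "GRNN output equals the reference shifted left by one element"
                  : "mismatch is not a simple one-element shift")
          << std::endl;
```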