-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgemmbasic.cu
128 lines (102 loc) · 3.33 KB
/
gemmbasic.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
// A minimal CUDA example: naive GEMM (matrix multiplication) with GPU timing and result validation.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define TILE_WIDTH 16
// Naive GEMM kernel: C = A * B with A (m x n), B (n x k), C (m x k), all
// row-major. One thread computes one element of C.
// Expected launch: 2D grid of 2D blocks covering a k x m index space
// (x -> column of C, y -> row of C); any block shape works because of the
// bounds check below.
// A and B are read-only here, so they are marked const __restrict__ to let
// the compiler route loads through the read-only data cache.
__global__ void matrixMul(const float *__restrict__ A,
                          const float *__restrict__ B,
                          float *__restrict__ C, int m, int n, int k) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;  // row of C in [0, m)
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // col of C in [0, k)
    // Guard the grid tail: grid dims are rounded up, so threads past the
    // matrix edge must do nothing.
    if (row < m && col < k) {
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            // size_t indexing: (row * n + i) in int overflows once the
            // element count exceeds 2^31 (e.g. matrices > ~46k per side).
            sum += A[(size_t)row * n + i] * B[(size_t)i * k + col];
        }
        C[(size_t)row * k + col] = sum;
    }
}
// Reference CPU implementation of C = A * B for row-major matrices:
// A is m x n, B is n x k, C is m x k. Used to validate the GPU kernel.
void matrixMulCPU(float *A, float *B, float *C, int m, int n, int k) {
    for (int row = 0; row < m; ++row) {
        float *aRow = A + row * n;   // start of A's current row
        float *cRow = C + row * k;   // start of C's current row
        for (int col = 0; col < k; ++col) {
            float acc = 0.0f;
            // Dot product of A's row with B's column.
            for (int t = 0; t < n; ++t)
                acc += aRow[t] * B[t * k + col];
            cRow[col] = acc;
        }
    }
}
// Check every CUDA API call: launches and async work fail silently if
// return codes are ignored, and one sticky error corrupts all later calls.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

int main() {
    // Matrix dimensions: A is m x n, B is n x k, C is m x k.
    int m = 16384;
    int n = 16384;
    int k = 16384;

    // Size arithmetic in size_t from the start: m * n evaluated in int
    // overflows for matrices beyond ~46k per side.
    size_t size_A = (size_t)m * n * sizeof(float);
    size_t size_B = (size_t)n * k * sizeof(float);
    size_t size_C = (size_t)m * k * sizeof(float);

    cudaEvent_t startWMMA, stopWMMA;
    CUDA_CHECK(cudaEventCreate(&startWMMA));
    CUDA_CHECK(cudaEventCreate(&stopWMMA));

    // Allocate host memory (~1 GB per matrix at the default sizes, so the
    // allocations can genuinely fail and must be checked).
    float *h_A = (float *)malloc(size_A);
    float *h_B = (float *)malloc(size_B);
    float *h_C_CPU = (float *)malloc(size_C);
    float *h_C_CUDA = (float *)malloc(size_C);
    if (h_A == NULL || h_B == NULL || h_C_CPU == NULL || h_C_CUDA == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Initialize A to all 1s and B to all 2s, so every element of C must
    // equal 2 * n exactly (32768 is exactly representable in float).
    for (size_t i = 0; i < (size_t)m * n; ++i) {
        h_A[i] = 1.0f;
    }
    for (size_t i = 0; i < (size_t)n * k; ++i) {
        h_B[i] = 2.0f;
    }

    // Allocate device memory and upload the inputs.
    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc((void **)&d_A, size_A));
    CUDA_CHECK(cudaMalloc((void **)&d_B, size_B));
    CUDA_CHECK(cudaMalloc((void **)&d_C, size_C));
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice));

    // One thread per C element, TILE_WIDTH x TILE_WIDTH blocks; grid dims
    // rounded up (ceil-div) so the whole k x m output is covered.
    dim3 blockDim(TILE_WIDTH, TILE_WIDTH);
    dim3 gridDim((k + TILE_WIDTH - 1) / TILE_WIDTH,
                 (m + TILE_WIDTH - 1) / TILE_WIDTH);

    // Time the kernel with events (measures device time, not host time).
    CUDA_CHECK(cudaEventRecord(startWMMA));
    matrixMul<<<gridDim, blockDim>>>(d_A, d_B, d_C, m, n, k);
    CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors
    CUDA_CHECK(cudaEventRecord(stopWMMA));
    CUDA_CHECK(cudaEventSynchronize(stopWMMA));

    // Download the result (blocking copy also surfaces kernel faults).
    CUDA_CHECK(cudaMemcpy(h_C_CUDA, d_C, size_C, cudaMemcpyDeviceToHost));

    // Optional CPU reference, timed with gettimeofday. Left disabled: a
    // 16384^3 triple loop is impractically slow on one core.
    bool ranCPU = false;
    struct timeval start;
    struct timeval end;
    gettimeofday(&start, NULL);
    // matrixMulCPU(h_A, h_B, h_C_CPU, m, n, k); ranCPU = true;
    gettimeofday(&end, NULL);
    long long elapsed = (end.tv_sec - start.tv_sec) * 1000000LL + (end.tv_usec - start.tv_usec);

    // Validate the GPU result. BUG FIX: the original compared against
    // h_C_CPU even though the CPU path is commented out, i.e. against
    // uninitialized memory. With A == 1 and B == 2 every element of C is
    // exactly 2 * n, so check analytically unless the CPU reference
    // actually ran; use a relative tolerance rather than exact float ==.
    bool isEqual = true;
    const float expected = 2.0f * (float)n;
    for (size_t i = 0; i < (size_t)m * k; ++i) {
        float ref = ranCPU ? h_C_CPU[i] : expected;
        if (fabsf(h_C_CUDA[i] - ref) > 1e-3f * fabsf(ref)) {
            isEqual = false;
            break;
        }
    }
    if (isEqual) {
        printf("Results match between CPU and CUDA.\n");
    } else {
        printf("Results do not match between CPU and CUDA.\n");
    }

    float gpuTime;
    CUDA_CHECK(cudaEventElapsedTime(&gpuTime, startWMMA, stopWMMA));
    printf("GPU took %fms\n", gpuTime);
    printf("CPU elapsed time = %lld ms\n", elapsed / 1000);

    // Release device resources.
    CUDA_CHECK(cudaEventDestroy(startWMMA));
    CUDA_CHECK(cudaEventDestroy(stopWMMA));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // Release host memory.
    free(h_A);
    free(h_B);
    free(h_C_CPU);
    free(h_C_CUDA);
    return 0;
}