-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathgemm_test.cc
193 lines (169 loc) · 5.59 KB
/
gemm_test.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#include <cassert>
#include <chrono>
#include <cstring>
#include <functional>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include "gemm_cpu_naive.h"
#include "gemm_cpu_simd.h"
#include "gemm_gpu_1thread.h"
#include "gemm_gpu_mult_thread.h"
#include "gemm_gpu_mult_block.h"
#include "gemm_gpu_mult_block_no_restrict.h"
#include "gemm_gpu_mult_block_no_restrict_reg.h"
#include "gemm_gpu_tiling.h"
// gemm_impl_t - A function pointer type for gemm implementations
typedef void (*gemm_impl_t)(
int* __restrict__ C,
const int* __restrict__ A,
const int* __restrict__ B,
const int n,
const int m,
const int k
);
struct GemmImpl {
std::string name;
gemm_impl_t impl;
bool is_gpu;
};
// All GEMM implementations to benchmark
std::vector<GemmImpl> gemm_impls = {
{ "cpu_naive", gemm_cpu_naive, false },
{ "cpu_simd", gemm_cpu_simd, false },
{ "gpu_1thread", gemm_gpu_1thread, true },
{ "gpu_mult_thread", gemm_gpu_mult_thread, true },
{ "gpu_mult_block", gemm_gpu_mult_block, true },
{ "gpu_mult_block_no_restrict", gemm_gpu_mult_block_no_restrict, true },
{ "gpu_mult_block_no_restrict_reg", gemm_gpu_mult_block_no_restrict_reg, true },
{ "gpu_tiling", gemm_gpu_tiling, true }
};
// cuda_sync_check_error - Sync with the CUDA device, check if there
// is any error, and print the error message if there is any.
void cuda_sync_check_error_helper(const char* filename, const int line) {
cudaDeviceSynchronize();
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
printf("CUDA error at %s:%d: %s\n", filename, line, cudaGetErrorString(error));
exit(1);
}
}
#define cuda_sync_check_error() cuda_sync_check_error_helper(__FILE__, __LINE__)
constexpr int BENCHMARK_ROUNDS = 8;
// benchmark_gemm_impl - Benchmark a gemm implementation, return the
// avg time usage on BENCKMARK_ROUNDS rounds
double benchmark_gemm_impl(
GemmImpl gemm_impl,
const int n,
const int m,
const int k
) {
// Prepare test data
// Allocate A, B, C, and C_ref on CPU
int* A = new int[n * k];
int* B = new int[k * m];
int* C = new int[n * m];
int* C_ref = new int[n * m];
// Allocate A_gpu, B_gpu, and C_gpu on GPU
int* A_gpu;
int* B_gpu;
int* C_gpu;
cudaMalloc(&A_gpu, sizeof(int) * n * k);
cudaMalloc(&B_gpu, sizeof(int) * k * m);
cudaMalloc(&C_gpu, sizeof(int) * n * m);
// Initialize A and B and copy them to GPU
for (int i = 0; i < n * k; ++i) A[i] = rand() % 1000;
for (int i = 0; i < k * m; ++i) B[i] = rand() % 1000;
cudaMemcpy(A_gpu, A, sizeof(int) * n * k, cudaMemcpyHostToDevice);
cudaMemcpy(B_gpu, B, sizeof(int) * k * m, cudaMemcpyHostToDevice);
// Initialize C_ref
memset(C_ref, 0, sizeof(int) * n * m);
gemm_cpu_naive(C_ref, A, B, n, m, k);
// run_once: Run the gemm_impl once, and return its time usage (in microseconds (us))
std::function<long(void)> run_once = [&]() -> long {
if (gemm_impl.is_gpu) {
cudaMemset(C_gpu, 0, sizeof(int) * n * m);
auto start = std::chrono::high_resolution_clock::now();
gemm_impl.impl(C_gpu, A_gpu, B_gpu, n, m, k);
cuda_sync_check_error();
auto end = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
} else {
memset(C, 0, sizeof(int) * n * m);
auto start = std::chrono::high_resolution_clock::now();
gemm_impl.impl(C, A, B, n, m, k);
auto end = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
}
};
// Warm up
printf("Warming up...\n");
run_once();
// Verift its correctness
if (gemm_impl.is_gpu) {
cudaMemcpy(C, C_gpu, sizeof(int) * n * m, cudaMemcpyDeviceToHost);
}
for (int i = 0; i < n * m; ++i) {
if (C[i] != C_ref[i]) {
printf("Verification failed!\n");
printf("C[%d, %d] = %d, C_ref[%d, %d] = %d\n", i / m, i % m, C[i], i / m, i % m, C_ref[i]);
return -1;
}
}
std::cout << "Verification passed!" << std::endl;
// Warm up again since correct verification may corrupt cache
printf("Warming up (again)...\n");
run_once();
// Benchmark
long total_time_usage = 0;
for (int round = 0; round < BENCHMARK_ROUNDS; ++round) {
long time_usage = run_once();
printf("Round %d: %ld us\n", round, time_usage);
total_time_usage += time_usage;
}
double avg_time_usage = total_time_usage / (double)BENCHMARK_ROUNDS;
// Free memory
delete[] A;
delete[] B;
delete[] C;
delete[] C_ref;
cudaFree(A_gpu);
cudaFree(B_gpu);
cudaFree(C_gpu);
return avg_time_usage;
}
int main(int argc, char* argv[]) {
if (argc != 4 && argc != 5) {
printf("Usage: %s <n> <m> <k> [implementation]\n", argv[0]);
exit(1);
}
// Parse command line arguments
int n = atoi(argv[1]);
int m = atoi(argv[2]);
int k = atoi(argv[3]);
std::string impl = argc == 5 ? argv[4] : "*";
assert (n > 0 && m > 0 && k > 0);
// We allocate a small block of memory on GPU here to initialize the
// CUDA context. If we does not do this and the user hasn't enabled
// GPU's persistent mode, then the first CUDA call will take a long
// time to finish
int* dummy;
cudaMalloc(&dummy, sizeof(int));
std::vector<std::pair<std::string, double>> results;
for (auto gemm_impl : gemm_impls) {
if (impl == "*" || gemm_impl.name == impl) {
printf("----------------\n");
printf("Benchmarking %s...\n", gemm_impl.name.c_str());
double avg_time_usage = benchmark_gemm_impl(gemm_impl, n, m, k);
printf("Average time usage: %lf us\n", avg_time_usage);
results.push_back({ gemm_impl.name, avg_time_usage });
}
}
// Print results
printf("----------------\n");
printf("Results:\n");
for (auto result : results) {
printf("%16s %16.2lf us\n", result.first.c_str(), result.second);
}
return 0;
}