-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathgemm_cpu_simd.cc
28 lines (27 loc) · 854 Bytes
/
gemm_cpu_simd.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#include "gemm_cpu_simd.h"

#include <stddef.h>
// gemm_cpu_simd - A SIMD GEMM (GEneral Matrix Multiply) implementation on CPU.
//
// Accumulates the product of two row-major int matrices into C, i.e. performs
// C += A * B. No intrinsics are used: the loops are written so that the
// innermost one walks C and B contiguously, and vectorization is left to the
// compiler.
//
// Input:
//   - A: An n x k matrix; element (i, l) is stored at A[i * k + l]
//   - B: A k x m matrix; element (l, j) is stored at B[l * m + j]
//   - n: The number of rows of A (and of C)
//   - m: The number of columns of B (and of C)
//   - k: The number of columns of A and the number of rows of B
// Output:
//   - C: An n x m matrix; element (i, j) is stored at C[i * m + j]. Every
//        product term is added onto the value already present in C.
// Requirements:
//   - C must be initialized to 0 before the call for it to end up holding
//     exactly A * B
//   - C, A and B must not overlap (all three pointers are __restrict__)
// Notes:
//   - If any of n, m, k is <= 0 the function returns without touching C.
//   - Element arithmetic is plain int, so products or sums that do not fit in
//     an int are signed overflow (undefined behaviour); keeping values in
//     range is the caller's responsibility.
//   - Flat offsets are computed in size_t rather than int, so matrices with
//     more than INT_MAX elements are addressed correctly.
__attribute__((optimize("O3"))) // Enforce O3 optimization to utilize loop unrolling and SIMD (GCC attribute)
void gemm_cpu_simd(
    int* __restrict__ C,        // [n, m]
    const int* __restrict__ A,  // [n, k]
    const int* __restrict__ B,  // [k, m]
    const int n,
    const int m,
    const int k
) {
    // An empty dimension means there is nothing to accumulate. Returning early
    // also guarantees the size_t conversions below only see positive values.
    if (n <= 0 || m <= 0 || k <= 0) {
        return;
    }

    const size_t c_cols = (size_t)m;  // row stride of B and C
    const size_t a_cols = (size_t)k;  // row stride of A

    for (int i = 0; i < n; ++i) {
        int* c_row = C + (size_t)i * c_cols;
        const int* a_row = A + (size_t)i * a_cols;

        // i-l-j order: the innermost loop streams through one row of B and one
        // row of C with unit stride.
        for (int l = 0; l < k; ++l) {
            const int a_il = a_row[l];  // invariant across the j loop
            const int* b_row = B + (size_t)l * c_cols;

            for (int j = 0; j < m; ++j) {
                c_row[j] += a_il * b_row[j];
            }
        }
    }
}