// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <functional>
#include <mutex>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/bgemm.h"
#include "bench/utils.h"

#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/pack.h>
#include <xnnpack/packw.h>
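
// Benchmark harness for x16 (16-bit) weight-packing microkernels. Each kernel
// repacks GOI-layout weights (groups x output channels x input channels) into
// the tiled layout consumed by XNNPACK GEMM microkernels: nr output channels
// per tile, with input channels grouped by the kr and sr tiling parameters.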
static void x16_packw(benchmark::State& state,
  xnn_x16_packw_gemm_goi_ukernel_fn packw,
  size_t nr, size_t kr, size_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }
  const size_t batch = state.range(0);  // batch is the g (groups) parameter for packw
  const size_t dim_n = state.range(2);  // dim_n is the nc parameter
  const size_t dim_k = state.range(3);  // dim_k is the kc parameter

  const size_t rounded_n = benchmark::utils::RoundUp(dim_n, nr);
  const size_t rounded_k = benchmark::utils::RoundUp(dim_k, kr * sr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto u16rng = std::bind(std::uniform_int_distribution<uint16_t>(), std::ref(rng));

  // Compute the number of buffers that fit in cache, counting both the source
  // weights and the packed weights.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * batch * (dim_n * dim_k + rounded_n * rounded_k + rounded_n));

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> weights(num_buffers * batch * dim_n * dim_k);
  std::generate(weights.begin(), weights.end(), std::ref(u16rng));
  // The packed buffer holds rounded_n * rounded_k weights plus rounded_n bias
  // slots per group.
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(num_buffers * batch * (rounded_n * rounded_k + rounded_n));
  std::fill(packed_weights.begin(), packed_weights.end(), 0);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Rotate through the buffers so successive iterations do not benefit from
    // cache-resident source weights.
    if (++buffer_index == num_buffers) {
      buffer_index = 0;
    }

    packw(batch, dim_n, dim_k, nr, kr, sr,
      reinterpret_cast<uint16_t*>(weights.data() + buffer_index * batch * dim_n * dim_k),
      /*bias=*/nullptr,
      reinterpret_cast<uint16_t*>(packed_weights.data() + buffer_index * batch * (rounded_n * rounded_k + rounded_n)),
      /*extra_bytes=*/0, /*params=*/nullptr);
  }
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  const size_t elements_per_iteration = batch * dim_n * dim_k;
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);

  // All buffers hold 16-bit elements, so bytes are counted with sizeof(uint16_t).
  const size_t bytes_per_iteration = (elements_per_iteration + batch * (rounded_n * rounded_k + rounded_n)) * sizeof(uint16_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
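
// Each wrapper below binds one packing microkernel to its nr/kr/sr tile
// parameters and, where required, an ISA check that skips the benchmark on
// unsupported hardware.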
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void x16_packw_x16__neon_ld4lane_x4(benchmark::State& state, const char* net) {
    x16_packw(state,
      xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_x4,
      /*nr=*/16, /*kr=*/1, /*sr=*/1,
      benchmark::utils::CheckNEON);
  }

  static void x16_packw_x16__neon_ld4lane_prfm_x4(benchmark::State& state, const char* net) {
    x16_packw(state,
      xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_prfm_x4,
      /*nr=*/16, /*kr=*/1, /*sr=*/1,
      benchmark::utils::CheckNEON);
  }

  static void x16_packw_x8__neon_ld4lane_x4(benchmark::State& state, const char* net) {
    x16_packw(state,
      xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_x4,
      /*nr=*/8, /*kr=*/1, /*sr=*/1,
      benchmark::utils::CheckNEON);
  }

  static void x16_packw_x8__neon_ld4lane_prfm_x4(benchmark::State& state, const char* net) {
    x16_packw(state,
      xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_prfm_x4,
      /*nr=*/8, /*kr=*/1, /*sr=*/1,
      benchmark::utils::CheckNEON);
  }

  BENCHMARK_BGEMM(x16_packw_x16__neon_ld4lane_x4)
  BENCHMARK_BGEMM(x16_packw_x16__neon_ld4lane_prfm_x4)
  BENCHMARK_BGEMM(x16_packw_x8__neon_ld4lane_x4)
  BENCHMARK_BGEMM(x16_packw_x8__neon_ld4lane_prfm_x4)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
static void x16_packw_x16__scalar_int_x4(benchmark::State& state, const char* net) {
  x16_packw(state,
    xnn_x16_packw_gemm_goi_ukernel_x16__scalar_int_x4,
    /*nr=*/16, /*kr=*/1, /*sr=*/1);
}

static void x16_packw_x8__scalar_int_x4(benchmark::State& state, const char* net) {
  x16_packw(state,
    xnn_x16_packw_gemm_goi_ukernel_x8__scalar_int_x4,
    /*nr=*/8, /*kr=*/1, /*sr=*/1);
}

BENCHMARK_BGEMM(x16_packw_x16__scalar_int_x4)
BENCHMARK_BGEMM(x16_packw_x8__scalar_int_x4)
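
// BENCHMARK_BGEMM (from bench/bgemm.h) registers each function above across a
// predefined set of batched-GEMM weight shapes. The guard below omits main()
// when this file is linked into a combined benchmark binary.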
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif