-
Notifications
You must be signed in to change notification settings - Fork 0
/
fma_24x.c
148 lines (125 loc) · 6.29 KB
/
fma_24x.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
// This code was taken from https://colfaxresearch.com/skl-avx512
#include <stdio.h>
// #include <omp.h>
// unsigned long rdtsc();
// unsigned long rdtscp();
// unsigned long tacc_rdtscp(int *chip, int *core);
// int tacc_get_core_number();
// int tacc_get_socket_number();
// unsigned long rdpmc_instructions();
// unsigned long rdpmc_actual_cycles();
// unsigned long rdpmc_reference_cycles();
// unsigned long rdpmc(int c);
// float get_TSC_frequency();
#include "low_overhead_timers.h"
// Number of double-precision elements handled by one pass of the inner loop in main();
// also sets the sizes of the fa/fb/fc arrays. The #ifndef guard lets the build override it
// (the kernel comment in main() suggests 4 for AVX2 and 8 for AVX-512).
#ifndef VECTOR_WIDTH
#define VECTOR_WIDTH 8
#endif
// Argument handed to the unroll_and_jam pragma on the trial loop in main();
// the #ifndef guard lets the build override it.
#ifndef UNROLLFACTOR
#define UNROLLFACTOR 1
#endif
const int n_trials = 1000000000; // Number of passes of the timed loop: enough to keep cores busy for a while and observe a steady state
const int flops_per_calc = 2; // Each a*b+c update counts as 2 floating-point operations (one multiply + one add)
// Number of multiply-add statements per lane in block (E) of main(); it is NOT used to generate
// them, so it must be kept equal to the count of hand-written statements there when tuning for
// an architecture. It also sizes fa and scales the reported GFLOP count.
// NOTE(review): the original comment also mentioned a block (R); no such block is marked in this file.
const int n_chained_fmas = 24; // Must be tuned for architectures here and in block (E)
// FMA throughput microbenchmark (single-threaded: the OpenMP pragmas are commented out).
//
// Runs n_trials passes of 24 multiply-add updates per lane over the fa array, with counter
// readings taken right before and right after the loop. It then prints the operation count,
// elapsed time, GFLOP/s, the raw counter deltas and several ratios derived from them.
// The timer/counter helpers (rdtscp, rdpmc*, get_TSC_frequency, get_core_counter_width,
// corrected_pmc_delta) are not defined in this file; presumably they come from
// low_overhead_timers.h.
//
// Returns 0 unconditionally.
int main() {
float TSC_freq; // Value returned by get_TSC_frequency(); used to turn TSC cycles into time and to scale frequencies
double t0, t1; // t0 is fixed at 0.0; t1 is the elapsed time computed from the TSC delta
// Raw readings taken immediately before (_start) and after (_end) the timed loop.
long tsc_start,tsc_end,ref_cycle_start,ref_cycle_end,core_cycle_start,core_cycle_end,instr_start,instr_end;
// End-minus-start differences of the readings above.
long delta_tsc, delta_ref_cycle, delta_core_cycle, delta_instr;
// Readings of rdpmc(0) and rdpmc(1). What these two counters measure is not configured in
// this file -- presumably they are programmed externally before the run; confirm.
long pmc0_start, pmc1_start, pmc0_end, pmc1_end;
long delta_pmc0, delta_pmc1;
int pmc_width; // Result of get_core_counter_width(), forwarded to corrected_pmc_delta()
// #pragma omp parallel
{ // Scope left over from the disabled OpenMP parallel region above; it now runs once, in the calling thread
// double t0 = omp_get_wtime(); // start timer
// fa holds n_chained_fmas slices of VECTOR_WIDTH accumulators; fb and fc are the multiplier
// and addend shared by every slice. All three are requested with 64-byte alignment.
// NOTE(review): n_chained_fmas is a const int, not a macro, so under C rules fa is likely
// treated as a variable-length array -- confirm with the compiler in use.
double fa[VECTOR_WIDTH*n_chained_fmas] __attribute__((aligned(64)));
double fb[VECTOR_WIDTH] __attribute__((aligned(64)));
double fc[VECTOR_WIDTH] __attribute__((aligned(64)));
int i, j;
// Initialise the operands. The fa index here (i*n_chained_fmas+j) is a different layout from
// the one the kernel uses (j + k*VECTOR_WIDTH), but it still visits every element exactly
// once and every element gets the same value, so the result is the same.
for (i = 0; i < VECTOR_WIDTH; i++) {
fb[i] = 0.1;
fc[i] = 0.2;
for (j = 0; j < n_chained_fmas; j++) {
fa[i*n_chained_fmas+j] = 0.0;
}
}
TSC_freq = get_TSC_frequency();
pmc_width = get_core_counter_width();
// Start readings. The end readings after the loop are taken in the same order.
ref_cycle_start = rdpmc_reference_cycles();
core_cycle_start = rdpmc_actual_cycles();
instr_start = rdpmc_instructions();
tsc_start = rdtscp();
pmc0_start = rdpmc(0);
pmc1_start = rdpmc(1);
// Timed region. Each pass updates all 24 slices of fa in place as fa = fa*fb + fc.
// The 24 statements touch disjoint slices, so within one pass they do not depend on each
// other; each slice depends only on its own value from the previous pass.
// The number of statements must stay equal to n_chained_fmas.
#pragma unroll_and_jam(UNROLLFACTOR)
for (i = 0; i < n_trials; i++) {
for (j = 0; j < VECTOR_WIDTH; j++) { // VECTOR_WIDTH=4 for AVX2, =8 for AVX-512
fa[j + 0*VECTOR_WIDTH] = fa[j + 0*VECTOR_WIDTH]*fb[j] + fc[j]; // This is block (E)
fa[j + 1*VECTOR_WIDTH] = fa[j + 1*VECTOR_WIDTH]*fb[j] + fc[j]; // To tune for a specific architecture,
fa[j + 2*VECTOR_WIDTH] = fa[j + 2*VECTOR_WIDTH]*fb[j] + fc[j]; // more or fewer such FMA constructs
fa[j + 3*VECTOR_WIDTH] = fa[j + 3*VECTOR_WIDTH]*fb[j] + fc[j]; // must be used
fa[j + 4*VECTOR_WIDTH] = fa[j + 4*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 5*VECTOR_WIDTH] = fa[j + 5*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 6*VECTOR_WIDTH] = fa[j + 6*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 7*VECTOR_WIDTH] = fa[j + 7*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 8*VECTOR_WIDTH] = fa[j + 8*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 9*VECTOR_WIDTH] = fa[j + 9*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 10*VECTOR_WIDTH] = fa[j + 10*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 11*VECTOR_WIDTH] = fa[j + 11*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 12*VECTOR_WIDTH] = fa[j + 12*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 13*VECTOR_WIDTH] = fa[j + 13*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 14*VECTOR_WIDTH] = fa[j + 14*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 15*VECTOR_WIDTH] = fa[j + 15*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 16*VECTOR_WIDTH] = fa[j + 16*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 17*VECTOR_WIDTH] = fa[j + 17*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 18*VECTOR_WIDTH] = fa[j + 18*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 19*VECTOR_WIDTH] = fa[j + 19*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 20*VECTOR_WIDTH] = fa[j + 20*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 21*VECTOR_WIDTH] = fa[j + 21*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 22*VECTOR_WIDTH] = fa[j + 22*VECTOR_WIDTH]*fb[j] + fc[j];
fa[j + 23*VECTOR_WIDTH] = fa[j + 23*VECTOR_WIDTH]*fb[j] + fc[j];
}
}
// End readings, in the same order as the start readings.
ref_cycle_end = rdpmc_reference_cycles();
core_cycle_end = rdpmc_actual_cycles();
instr_end = rdpmc_instructions();
tsc_end = rdtscp();
pmc0_end = rdpmc(0);
pmc1_end = rdpmc(1);
// The TSC delta is a plain subtraction; the performance-counter deltas go through
// corrected_pmc_delta() together with the counter width (presumably to handle counter
// wrap-around -- confirm against the helper's definition).
delta_tsc = tsc_end - tsc_start;
delta_ref_cycle = corrected_pmc_delta(ref_cycle_end, ref_cycle_start, pmc_width);
delta_core_cycle = corrected_pmc_delta(core_cycle_end, core_cycle_start, pmc_width);
delta_instr = corrected_pmc_delta(instr_end, instr_start, pmc_width);
delta_pmc0 = corrected_pmc_delta(pmc0_end, pmc0_start, pmc_width);
delta_pmc1 = corrected_pmc_delta(pmc1_end, pmc1_start, pmc_width);
// Derived ratios. NOTE(review): none of the divisors (delta_tsc, delta_ref_cycle,
// delta_core_cycle) is checked for zero before dividing.
const double utilization = (double)delta_ref_cycle / (double)delta_tsc; // reference cycles per TSC cycle
const double active_frequency = (double)delta_core_cycle / (double)delta_ref_cycle * TSC_freq; // core cycles per reference cycle, scaled by TSC_freq
const double net_frequency = (double)delta_core_cycle / (double)delta_tsc * TSC_freq; // core cycles per TSC cycle, scaled by TSC_freq
const double instr_per_cycle = (double)delta_instr / (double)delta_core_cycle; // instructions per core cycle
// double t1 = omp_get_wtime();
// Do something with fa: reduce it to one number that is printed below, so the results of the
// timed loop are actually consumed.
double sum = 0.0;
for (i = 0; i < VECTOR_WIDTH*n_chained_fmas; i++) {
sum += fa[i];
}
// Elapsed time is taken from the TSC rather than a wall-clock call; it is printed as seconds,
// which assumes TSC_freq is in cycles per second.
t0 = 0.0;
t1 = (double) (delta_tsc) / TSC_freq;
// Total operation count in units of 1e9: lanes * passes * operations per update * updates per lane.
const double gflops = 1.0e-9*(double)VECTOR_WIDTH*(double)n_trials*(double)flops_per_calc* (double)n_chained_fmas;
printf("Sum=%.1f, Chained FMAs=%d, vector width=%d, GFLOPs=%.1f, time=%.6f s, performance=%.1f GFLOP/s\n", sum, n_chained_fmas, VECTOR_WIDTH, gflops, t1 - t0, gflops/(t1 - t0));
printf("TSC_cycles %ld ref_cycles %ld core_cycles %ld instructions %ld pmc0 %ld pmc1 %ld\n",
delta_tsc, delta_ref_cycle, delta_core_cycle, delta_instr, delta_pmc0, delta_pmc1);
printf("Utilization %f\n",utilization);
printf("average core frequency while active %f\n",active_frequency);
printf("average core frequency per wall second %f\n",net_frequency);
printf("average instructions per core cycle %f\n",instr_per_cycle);
printf("average pmc0 per core cycle %f\n",(double)delta_pmc0/(double)delta_core_cycle);
printf("average pmc1 per core cycle %f\n",(double)delta_pmc1/(double)delta_core_cycle);
// Disabled OpenMP reporting path, kept for reference (it additionally scaled the operation
// count by omp_get_max_threads()):
// #pragma omp master
// {
// const double gflops = 1.0e-9*(double)VECTOR_WIDTH*(double)n_trials*(double)flops_per_calc* (double)omp_get_max_threads()*(double)n_chained_fmas;
// printf("Sum=%.1f, Chained FMAs=%d, vector width=%d, GFLOPs=%.1f, time=%.6f s, performance=%.1f GFLOP/s\n", sum, n_chained_fmas, VECTOR_WIDTH, gflops, t1 - t0, gflops/(t1 -
//t0));
// }
}
return 0;
}