//! This binary runs various microbenchmarks, measuring the CPU cycles spent executing them.
//!
//! It measures the number of CPU cycles spent executing a given number of iterations of various
//! looping blocks of assembly code, and tries to define a set of rules that describe/predict how
//! many CPU cycles are spent executing each benchmark.
//!
//! WARNING: The observed results, and the rules defined in this example, are inherently fragile and
//! chip-specific. They have currently only been run on an ESP32-C3 and an ESP32-C6 chip. I can
//! re-run the benchmark and consistently get the same results when using a given toolchain version,
//! but I wouldn't be surprised if some of the results change when run by others or when run with
//! newer/older versions of the toolchain. The benchmarks are most useful when they reflect the
//! particular situation you're interested in as closely as possible, and it's easy to add new
//! benchmarks for additional cases.
//!
//! Some of the benchmarks seem particularly fragile. For example, the memory load/store benchmarks
//! seem to depend on the location and/or alignment of the memory buffers. In the past I've
//! noticed that a benchmark result would change as a result of adding another unrelated benchmark
//! to the set, with the only impact on the load/store benchmark assembly code being that the data
//! buffer's address changed as a result. I've tried to make the code more robust to this by
//! introducing variations of some benchmarks with buffers with different levels of guaranteed
//! alignment, but I wouldn't be surprised if some amount of non-determinism is still hiding
//! somewhere.
//!
//! The set of 'rules' I've come up with also isn't fully consistent. E.g.
//! [cycles::BRANCH_FWD_TAKEN_EXTRA_SLOW] represents a situation in which a taken conditional
//! forward branch takes an extra CPU cycle compared to all other cases of taken conditional forward
//! branches, which I can't fully explain.
//!
//! However, the upside is that all or most of the exceptions to the rules seem to apply only to the
//! first or second iterations, but not the subsequent repeated iterations after that. This means
//! that even with some exceptions to the rule, we can probably reason about the behavior of a loop
//! as long as there are enough repetitions of it.
#![no_std]
#![no_main]
#![feature(asm_const)]
#[cfg(feature = "esp32c3")]
use esp32c3_hal as hal;
#[cfg(feature = "esp32c6")]
use esp32c6_hal as hal;
use core::arch::asm;
use esp_backtrace as _;
use hal::{clock::ClockControl, peripherals::Peripherals, prelude::*, Delay, IO};
use log::{info, log};
use niccle_proc_macros::asm_with_perf_counter;
/// The address of the Machine Performance Counter Event Register.
const MPCER: usize = 0x7E0;
/// The address of the Machine Performance Counter Mode Register.
const MPCMR: usize = 0x7E1;
/// This CSR isn't defined by the esp-rs/esp-hal framework yet (nor by the espressif/svd files),
/// so we hardcode its address. See the "1.14 Dedicated IO" chapter of the technical
/// reference manual.
const CSR_CPU_GPIO_OUT: u32 = 0x805;
/// For now the implementation is hardcoded to always use CPU output signal 0 when using
/// dedicated IO on the TX pin. In the future we could try to make this more flexible, e.g.
/// using an argument, or perhaps even a const generic parameter with a const expression bound.
#[cfg(feature = "esp32c3")]
pub const TX_CPU_OUTPUT_SIGNAL: hal::gpio::OutputSignal = hal::gpio::OutputSignal::CPU_GPIO_0;
#[cfg(feature = "esp32c6")]
pub const TX_CPU_OUTPUT_SIGNAL: hal::gpio::OutputSignal = hal::gpio::OutputSignal::CPU_GPIO_OUT0;
/// The dedicated IO output signal index to use (from 0 to 7, where 0 corresponds to
/// CPU_GPIO_OUT0). Based on whatever [TX_CPU_OUTPUT_SIGNAL] is set to.
#[cfg(feature = "esp32c3")]
const TX_CPU_OUTPUT_SIGNAL_CSR_IDX: isize =
(TX_CPU_OUTPUT_SIGNAL as isize) - hal::gpio::OutputSignal::CPU_GPIO_0 as isize;
#[cfg(feature = "esp32c6")]
const TX_CPU_OUTPUT_SIGNAL_CSR_IDX: isize =
(TX_CPU_OUTPUT_SIGNAL as isize) - hal::gpio::OutputSignal::CPU_GPIO_OUT0 as isize;
/// Runs the set of benchmarks in a loop, printing results to the serial line.
#[entry]
fn main() -> ! {
let peripherals = Peripherals::take();
let system = peripherals.SYSTEM.split();
let clocks = ClockControl::max(system.clock_control).freeze();
esp_println::logger::init_logger_from_env();
info!("Booted up!");
// Set up the dedicated GPIO feature on pin GPIO7, which some benchmarks use to allow us to
// observe the CPU timing with an oscilloscope.
let io = IO::new(peripherals.GPIO, peripherals.IO_MUX);
let mut tx_pin = io.pins.gpio7;
tx_pin.set_to_push_pull_output();
tx_pin.connect_peripheral_to_output_with_options(
TX_CPU_OUTPUT_SIGNAL,
false,
false,
true,
false,
);
let mut delay = Delay::new(&clocks);
loop {
do_bench(branch_fwd_usually_taken, &mut delay);
do_bench(branch_fwd_usually_taken_w_extra_instr, &mut delay);
do_bench(branch_fwd_rarely_taken, &mut delay);
do_bench(branch_fwd_rarely_taken_w_unaligned_jmp_back, &mut delay);
do_bench(branch_back_rarely_taken, &mut delay);
do_bench(branch_back_rarely_taken_w_gpio, &mut delay);
do_bench(branch_back_usually_taken_aligned, &mut delay);
do_bench(branch_back_usually_taken_unaligned, &mut delay);
do_bench(branch_back_usually_taken_with_gpio_aligned, &mut delay);
do_bench(branch_back_usually_taken_with_gpio_unaligned, &mut delay);
do_bench(branch_back_usually_taken_with_final_nop, &mut delay);
do_bench(branch_back_usually_taken_w_dead_branch_fwd, &mut delay);
do_bench(branch_back_usually_taken_w_aligned_jmp_fwd, &mut delay);
do_bench(branch_back_usually_taken_w_unaligned_jmp_fwd, &mut delay);
do_bench(branch_back_usually_taken_w_lbu_align1, &mut delay);
do_bench(branch_back_usually_taken_w_lbu_align2, &mut delay);
do_bench(branch_back_usually_taken_w_lbu_align4, &mut delay);
do_bench(branch_back_usually_taken_w_lbu_align8, &mut delay);
do_bench(branch_back_usually_taken_w_lbu_align256, &mut delay);
do_bench(branch_back_usually_taken_w_lbu_align1_w_gpio, &mut delay);
do_bench(branch_back_usually_taken_w_lw_align1, &mut delay);
do_bench(branch_back_usually_taken_w_lw_align2, &mut delay);
do_bench(branch_back_usually_taken_w_lw_align4, &mut delay);
do_bench(branch_back_usually_taken_w_lw_align8, &mut delay);
do_bench(branch_back_usually_taken_w_lw_align256, &mut delay);
do_bench(branch_back_usually_taken_w_sb_align1, &mut delay);
do_bench(branch_back_usually_taken_w_sb_align2, &mut delay);
do_bench(branch_back_usually_taken_w_sb_align4, &mut delay);
do_bench(branch_back_usually_taken_w_sb_align8, &mut delay);
do_bench(branch_back_usually_taken_w_sb_align256, &mut delay);
do_bench(branch_back_usually_taken_w_sb_align1_w_gpio, &mut delay);
do_bench(branch_back_usually_taken_w_sw_align1, &mut delay);
do_bench(branch_back_usually_taken_w_sw_align2, &mut delay);
do_bench(branch_back_usually_taken_w_sw_align4, &mut delay);
do_bench(branch_back_usually_taken_w_sw_align8, &mut delay);
do_bench(branch_back_usually_taken_w_sw_align256, &mut delay);
info!("--------------------------------------------------");
}
}
/// Runs a given benchmark for 1, 2, 3, 4, and 1000 iterations, to check that our predictions hold
/// for any number of iterations. So far I haven't seen any CPU behavior that differs between the
/// 3rd and subsequent iterations.
fn do_bench(bench: fn(u32) -> (&'static str, u32, u32), delay: &mut Delay) {
do_bench_single(bench, delay, 1);
do_bench_single(bench, delay, 2);
do_bench_single(bench, delay, 3);
do_bench_single(bench, delay, 4);
do_bench_single(bench, delay, 1000);
}
/// Runs a given number of iterations for a single benchmark, and prints results.
fn do_bench_single(bench: fn(u32) -> (&'static str, u32, u32), delay: &mut Delay, iters: u32) {
// Enable performance counting, starting with CPU cycles.
unsafe {
asm!(
// What to count:
// - 1<<0: CPU cycles
// - 1<<1: instructions
// - 1<<2: load hazards
// - 1<<3: jump hazards
// - 1<<4: idle cycles
"csrw {mpcer}, (1<<0)",
"csrw {mpcmr}, 0", // Disable the counter
"csrw {mpcmr}, 1", // Enable the counter.
mpcer = const(MPCER),
mpcmr = const(MPCMR),
);
}
let (name, cpu_predicted, cpu_actual) = bench(iters);
// Count instructions (we ignore the "predicted" return value).
unsafe { asm!("csrwi {mpcer}, (1<<1)", mpcer=const(MPCER)) };
let (_, _, instrs_actual) = bench(iters);
// Count load hazards.
unsafe { asm!("csrwi {mpcer}, (1<<2)", mpcer=const(MPCER)) };
let (_, _, load_hazards_actual) = bench(iters);
// Count jump hazards.
unsafe { asm!("csrwi {mpcer}, (1<<3)", mpcer=const(MPCER)) };
let (_, _, jump_hazards_actual) = bench(iters);
// Count idle cycles.
unsafe { asm!("csrwi {mpcer}, (1<<4)", mpcer=const(MPCER)) };
let (_, _, idle_actual) = bench(iters);
// Print at error level if the prediction is wrong.
let log_level = if cpu_predicted == cpu_actual {
log::Level::Info
} else {
log::Level::Error
};
// Print the results of the benchmark.
log!(
log_level,
"{name:45} {iters:4}x | \
{cpu_predicted:4} vs {cpu_actual:4} actual \
({instrs_actual:4} INS, {idle_actual:4} IDL, {load_hazards_actual:4} LH, \
{jump_hazards_actual} JH)"
);
// Throttle the benchmarks a bit so that we don't flood the serial display.
delay.delay_ms(25u32);
}
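/// Purely illustrative sketch, not part of the benchmark suite: conceptually, a measurement boils
/// down to reading the performance counter's value register immediately before and after the code
/// under test and taking the difference; the actual bracketing is generated by the
/// `asm_with_perf_counter!` proc macro. The MPCCR address used here (0x7E2, the Machine
/// Performance Counter Count Register) is an assumption based on the technical reference manual
/// and isn't defined elsewhere in this file. This assumes the counter was already configured via
/// MPCER/MPCMR as in [do_bench_single], and the measured delta includes some overhead from the
/// `csrr` reads themselves.
#[allow(dead_code)]
#[ram]
fn example_manual_cycle_measurement() -> u32 {
// Assumed address of the Machine Performance Counter Count Register.
const MPCCR: usize = 0x7E2;
let before: u32;
let after: u32;
unsafe {
asm!(
"csrr {before}, {mpccr}", // Read the counter before the measured block.
"nop",
"nop",
"csrr {after}, {mpccr}", // Read the counter after the measured block.
before = out(reg) before,
after = out(reg) after,
mpccr = const(MPCCR),
);
}
after.wrapping_sub(before)
}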
/// A set of constants describing the number of CPU cycles various instructions take, in various
/// situations, on the ESP32-C3 and ESP32-C6 chips I used.
///
/// I arrived at these by trying to work backwards from the measured results I observed in the
/// benchmarks below.
///
/// The RISC-V instruction manual has the following to say about conditional branch instructions.
///
/// Software should be optimized such that the sequential code path is the most common path, with
/// less-frequently taken code paths placed out of line.
///
/// Software should also assume that backward branches will be predicted taken and forward
/// branches as not taken, at least the first time they are encountered.
///
/// Dynamic predictors should quickly learn any predictable branch behavior.
///
/// We'll reference this part of the spec in some of the branch instruction comments below.
pub mod cycles {
/// A macro that selects one of the provided values, based on the chip currently being targeted.
macro_rules! chip_dependent {
(esp32c3=$esp32c3:expr, esp32c6=$esp32c6:expr) => {
if cfg!(feature = "esp32c3") {
$esp32c3
} else if cfg!(feature = "esp32c6") {
$esp32c6
} else {
panic!("pick a feature, either esp32c3 or esp32c6")
}
};
}
/// The `nop` instruction.
pub const NOP: u32 = 1;
/// The `csrr*` instructions.
pub const CSRR: u32 = 1;
/// The `add` and `addi` instructions.
pub const ADD: u32 = 1;
/// The `xor` instructions.
pub const XOR: u32 = 1;
/// The `j` instruction. On ESP32-C6 some jumps take only one cycle. On ESP32-C3 they all take
/// two cycles.
pub const JUMP: u32 = chip_dependent!(esp32c3 = JUMP_EXTRA_SLOW, esp32c6 = 1);
/// The `j` instruction. On ESP32-C6, jumps sometimes take two cycles instead of one. At first I
/// thought this happened with all unaligned jumps, but that doesn't seem to be the case, at least
/// not consistently.
pub const JUMP_EXTRA_SLOW: u32 = 2;
/// The `lb`, `lbu`, `lw` instructions.
pub const LOAD: u32 = 2;
/// The `sb` and `sw` instructions.
pub const STORE: u32 = 1;
// --- INITIAL ENCOUNTERS OF CONDITIONAL BRANCHES WITH EXPECTED OUTCOMES.
//
/// The `beq`, `bne`, and similar instructions with a forward target label. When a forward
/// branch is encountered for the first time and is not taken (in line with the RISC-V manual
/// spec), it takes two cycles on ESP32-C6, but only one cycle on ESP32-C3. In both cases this
/// is faster than a *taken* forward branch ([BRANCH_FWD_TAKEN]), the outcome the spec considers
/// unexpected.
pub const BRANCH_FWD_NOT_TAKEN_INITIAL: u32 =
chip_dependent!(esp32c3 = BRANCH_FWD_NOT_TAKEN_SUBSEQUENT, esp32c6 = 2);
/// The `beq`, `bne`, and similar instructions with a backward target label. When a backward
/// branch is encountered for the first time and is taken (in line with the RISC-V manual spec),
/// it takes two cycles on ESP32-C6 and three cycles on ESP32-C3. On ESP32-C6 this is faster
/// than a *non-taken* backward branch ([BRANCH_BACK_NOT_TAKEN]), the outcome the spec considers
/// unexpected. On ESP32-C3 this is actually slower than [BRANCH_BACK_NOT_TAKEN].
pub const BRANCH_BACK_TAKEN_INITIAL: u32 = chip_dependent!(esp32c3 = 3, esp32c6 = 2);
// --- REPEATED ENCOUNTERS OF CONDITIONAL BRANCHES WITH EXPECTED OUTCOMES.
//
/// The `beq`, `bne`, and similar instructions with a forward target label. When a forward
/// non-taken branch is encountered more than once, it only takes one cycle, presumably because
/// some further branch prediction mechanism or pipelining kicks in. This is the same as for
/// repeated backward taken branches.
pub const BRANCH_FWD_NOT_TAKEN_SUBSEQUENT: u32 = 1;
/// The `beq`, `bne`, and similar instructions with a backward target label. When a backward
/// taken branch is encountered more than once, it only takes one cycle on ESP32-C6, presumably
/// because some further branch prediction mechanism or pipelining kicks in. This is the same as
/// for repeated forward non-taken branches. On ESP32-C3 it actually takes three cycles, the
/// same as when the instruction is initially encountered, as if no branch prediction exists for
/// such branches.
pub const BRANCH_BACK_TAKEN_SUBSEQUENT: u32 =
chip_dependent!(esp32c3 = BRANCH_BACK_TAKEN_INITIAL, esp32c6 = 1);
// --- ENCOUNTERS OF CONDITIONAL BRANCHES WITH UNEXPECTED OUTCOMES.
//
/// The `beq`, `bne`, and similar instructions with a forward target label. When a taken forward
/// branch is encountered it takes three cycles. On ESP32-C6 this is the same as a non-taken
/// backward branch. It is also longer than a non-taken forward branch, probably because forward
/// branches are not expected to be taken in the usual case, as per the RISC-V manual.
pub const BRANCH_FWD_TAKEN: u32 = 3;
/// The `beq`, `bne`, and similar instructions with a forward target label. On ESP32-C6 a taken
/// forward branch sometimes takes an extra cycle compared to the usual [BRANCH_FWD_TAKEN]. In our
/// benchmark this is the case when there's a non-jump instruction right after the conditional
/// branch instruction.
pub const BRANCH_FWD_TAKEN_EXTRA_SLOW: u32 = chip_dependent!(esp32c3 = 3, esp32c6 = 4);
/// The `beq`, `bne`, and similar instructions with a backward target label. On ESP32-C6 when a
/// backward non-taken branch is encountered it takes three cycles in some cases, or four cycles
/// in other cases ([BRANCH_BACK_NOT_TAKEN_INSTR_UNALIGNED]). This is longer than a taken
/// backward branch, probably because backward branches are expected to be taken in the usual
/// case, as per the RISC-V spec. On ESP32-C3 backward non-taken branches take a single cycle,
/// just like non-taken forward branches.
pub const BRANCH_BACK_NOT_TAKEN: u32 = chip_dependent!(esp32c3 = 1, esp32c6 = 3);
/// The `beq`, `bne`, and similar instructions with a backward target label sometimes take an
/// extra cycle on ESP32-C6, when the instruction falls on a non-4 byte-aligned address (ending
/// in 0x2, 0x6, ...).
pub const BRANCH_BACK_NOT_TAKEN_INSTR_UNALIGNED: u32 =
chip_dependent!(esp32c3 = BRANCH_BACK_NOT_TAKEN, esp32c6 = 4);
}
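/// A worked example (illustrative only): every benchmark below composes its prediction from the
/// constants above using the same first/rest/last iteration split. Instantiated for the simple
/// `addi` + `nop` + `bne` loop of [branch_back_usually_taken_aligned], the total for 3 or more
/// iterations is last + first + rest * (iters - 2); on ESP32-C6 that works out to
/// (1 + 1 + 3) + (1 + 1 + 2) + (1 + 1 + 1) * (iters - 2) cycles.
#[allow(dead_code)]
fn example_predicted_total(iters: u32) -> u32 {
let last = cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_NOT_TAKEN;
let first = cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_TAKEN_INITIAL;
let rest = cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_TAKEN_SUBSEQUENT;
match iters {
0 => 0,
1 => last,
2 => last + first,
3.. => last + first + rest * (iters - 2),
}
}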
/// Measures the case of a loop with a forward branch instruction that is taken for all but the last
/// iteration. This goes against the RISC-V manual guidance that forward branch instructions will be
/// predicted not-taken initially.
#[ram]
fn branch_fwd_usually_taken(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"1:",
"addi {i}, {i}, 1",
"bne {i}, {iters}, 2f",
"j 3f",
".align 2",
"2: j 1b",
".align 2",
"3:",
iters = in(reg) iters,
i = inout(reg) 0 => _
)
};
let predicted_iter_last = cycles::ADD + cycles::BRANCH_FWD_NOT_TAKEN_INITIAL + cycles::JUMP;
let predicted_iter_rest = cycles::ADD + cycles::BRANCH_FWD_TAKEN + cycles::JUMP;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2.. => predicted_iter_last + (predicted_iter_rest) * (iters - 1),
};
("BNE FWD usually taken", predicted, cycles)
}
/// Like [branch_fwd_usually_taken] but with an extra `nop` instruction after the `bne` instruction,
/// which seems to slow the 2nd iteration down a bit (see [cycles::BRANCH_FWD_TAKEN_EXTRA_SLOW]). I
/// can't really explain why.
#[ram]
fn branch_fwd_usually_taken_w_extra_instr(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"1:",
"addi {i}, {i}, 1",
"bne {i}, {iters}, 2f",
"nop",
"j 3f",
".align 2",
"2: j 1b",
".align 2",
"3:",
iters = in(reg) iters,
i = inout(reg) 0 => _
)
};
let predicted_iter_last =
cycles::ADD + cycles::BRANCH_FWD_NOT_TAKEN_INITIAL + cycles::NOP + cycles::JUMP;
let predicted_iter_first = cycles::ADD + cycles::BRANCH_FWD_TAKEN_EXTRA_SLOW + cycles::JUMP;
let predicted_iter_rest = cycles::ADD + cycles::BRANCH_FWD_TAKEN + cycles::JUMP;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
(
"BNE FWD usually taken w/ extra instruction",
predicted,
cycles,
)
}
/// Measures the case of a loop with a forward branch instruction that is taken only in the last
/// iteration of the loop. This is in line with the RISC-V manual guidance that forward branch
/// instructions will be predicted not-taken initially, and is a type of loop you might actually
/// encounter in real code (e.g. a bail-out break condition).
///
/// See [branch_fwd_rarely_taken_w_unaligned_jmp_back] for a similar benchmark but with an unaligned
/// jump target for the unconditional jump instruction.
#[ram]
fn branch_fwd_rarely_taken(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"1:",
"addi {y}, {y}, 1",
"beq {y}, {iters}, 2f",
"j 1b",
".align 2",
"2:",
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
let predicted_iter_last = cycles::ADD + cycles::BRANCH_FWD_TAKEN;
let predicted_iter_first = cycles::ADD + cycles::BRANCH_FWD_NOT_TAKEN_INITIAL + cycles::JUMP;
let predicted_iter_rest = cycles::ADD + cycles::BRANCH_FWD_NOT_TAKEN_SUBSEQUENT + cycles::JUMP;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
("BEQ FWD rarely taken", predicted, cycles)
}
/// Like [branch_fwd_rarely_taken] but with an unaligned jump target for the unconditional jump
/// instruction.
///
/// The behavior ends up being exactly the same (even though I've seen unaligned jumps take two
/// cycles in other situations...).
#[ram]
fn branch_fwd_rarely_taken_w_unaligned_jmp_back(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"nop",
"1:", // Not 4 byte-aligned, due to the 16-bit instruction right before this one.
"addi {y}, {y}, 1",
"beq {y}, {iters}, 2f",
"j 1b",
".align 2",
"2:",
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
let predicted_iter_last = cycles::NOP + cycles::ADD + cycles::BRANCH_FWD_TAKEN;
let predicted_iter_first = cycles::ADD + cycles::BRANCH_FWD_NOT_TAKEN_INITIAL + cycles::JUMP;
let predicted_iter_rest = cycles::ADD + cycles::BRANCH_FWD_NOT_TAKEN_SUBSEQUENT + cycles::JUMP;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
(
"BEQ FWD rarely taken w/ unaligned J BACK",
predicted,
cycles,
)
}
/// Measures the case of a loop with a backward branch instruction that is only taken in the last
/// iteration of the loop. This goes against the RISC-V manual guidance that backward branch
/// instructions will be predicted taken initially, and is not something you'd commonly write.
#[ram]
fn branch_back_rarely_taken(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"j 2f", // 16-bit instruction.
"nop", // 16-bit instruction to ensure the next one is 4 byte-aligned. (NEVER EXECUTED)
"1:", // Hence, 4 byte-aligned.
"j 3f", // 16-bit instruction.
"nop", // 16-bit instruction to ensure the next one is 4 byte-aligned. (NEVER EXECUTED)
"2:",
"nop", // 16-bit instruction to ensure the next one is 4 byte-aligned.
"addi {y}, {y}, 1", // 16-bit instruction.
"beq {y}, {iters}, 1b", // Hence, 4 byte-aligned.
"j 2b", // Also 4 byte-aligned.
"nop", // 16-bit instruction to ensure the next one is 4 byte-aligned. (NEVER EXECUTED)
"3:", // Hence, 4 byte-aligned.
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
let predicted_iter_last = cycles::JUMP_EXTRA_SLOW
+ cycles::NOP
+ cycles::ADD
+ cycles::BRANCH_BACK_TAKEN_INITIAL
+ cycles::JUMP_EXTRA_SLOW;
let predicted_iter_rest =
cycles::NOP + cycles::ADD + cycles::BRANCH_BACK_NOT_TAKEN + cycles::JUMP;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2.. => predicted_iter_last + (predicted_iter_rest) * (iters - 1),
};
("BEQ BACK rarely taken", predicted, cycles)
}
/// Like [branch_back_rarely_taken], but with dedicated GPIO instructions inserted between some of
/// the instructions in the loop. This allows us to inspect the per-iteration timing with an
/// oscilloscope. When we do so, we can confirm that it is the jump instructions that take one extra
/// cycle each in the last iteration (rather than, say, the taken branch instruction taking longer).
#[ram]
fn branch_back_rarely_taken_w_gpio(iters: u32) -> (&'static str, u32, u32) {
// Make sure the pin is set low at the start of the benchmark, without including this
// instruction in the cycle count.
unsafe {
asm!(
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX))
};
let cycles = unsafe {
asm_with_perf_counter!(
"csrrsi zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Set, 32-bit instruction
"j 4f", // 16-bit instruction.
"nop", // 16-bit instruction to ensure the next one is 4 byte-aligned. (NEVER EXECUTED)
"1:", // Hence, 4 byte-aligned.
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear, 32-bit instruction
"j 3f", // 16-bit instruction.
"nop", // 16-bit instruction to ensure the next one is 4 byte-aligned. (NEVER EXECUTED)
"4:",
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear, 32-bit instruction
"2:",
"csrrsi zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Set, 32-bit instruction
"nop", // 16-bit instruction to ensure the next one is 4 byte-aligned.
"addi {y}, {y}, 1", // 16-bit instruction.
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear, 32-bit instruction
"csrrsi zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Set, 32-bit instruction
"beq {y}, {iters}, 1b", // Hence, 4 byte-aligned.
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear, 32-bit instruction
"j 2b", // Also 4 byte-aligned.
"nop", // 16-bit instruction to ensure the next one is 4 byte-aligned. (NEVER EXECUTED)
"3:", // Hence, 4 byte-aligned.
"csrrsi zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Set, 32-bit instruction
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX),
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
// Make sure the pin is set low at the end of the benchmark, without including this instruction
// in the cycle count.
unsafe {
asm!(
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX))
};
let predicted_iter_last = cycles::CSRR
+ cycles::JUMP_EXTRA_SLOW
+ cycles::CSRR * 2
+ cycles::NOP
+ cycles::ADD
+ cycles::CSRR * 2
+ cycles::BRANCH_BACK_TAKEN_INITIAL
+ cycles::CSRR
+ cycles::JUMP_EXTRA_SLOW
+ cycles::CSRR;
let predicted_iter_rest = cycles::CSRR
+ cycles::NOP
+ cycles::ADD
+ cycles::CSRR * 2
+ cycles::BRANCH_BACK_NOT_TAKEN
+ cycles::CSRR
+ cycles::JUMP;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2.. => predicted_iter_last + (predicted_iter_rest) * (iters - 1),
};
("BEQ BACK rarely taken w/ GPIO", predicted, cycles)
}
/// Measures the case of a loop with a backward branch instruction that is taken for all but the
/// last iteration of the loop. This is in line with the RISC-V manual guidance that backward branch
/// instructions will be predicted taken initially, and is a type of loop you might actually
/// encounter in real code (e.g. a simple for loop with a counter as the exit condition).
#[ram]
fn branch_back_usually_taken_aligned(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"1:", // This label and the first instruction are two-byte aligned.
"addi {y}, {y}, 1", // This is a 16-bit instruction.
"nop", // This is a 16-bit instruction, to ensure the next one is 4 byte-aligned.
"bne {y}, {iters}, 1b", // Hence, this is a 4 byte-aligned instruction.
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
let predicted_iter_last = cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_NOT_TAKEN;
let predicted_iter_first = cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_TAKEN_INITIAL;
let predicted_iter_rest = cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_TAKEN_SUBSEQUENT;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
("BNE BACK usually taken aligned", predicted, cycles)
}
/// Like [branch_back_usually_taken_aligned] but with the branch instruction at an address that is
/// not 4 byte-aligned. The behavior is mostly the same (though on ESP32-C6 the final non-taken
/// branch takes an extra cycle, see [cycles::BRANCH_BACK_NOT_TAKEN_INSTR_UNALIGNED]), but it's
/// good to verify since I previously saw unaligned unconditional jumps take longer than aligned
/// ones.
#[ram]
fn branch_back_usually_taken_unaligned(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"1:", // This label and the first instruction are two-byte aligned.
"addi {y}, {y}, 1", // This is a 16-bit instruction
"bne {y}, {iters}, 1b", // Hence this instruction's address is *not* 4-byte-aligned.
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
// The branch is extra slow b/c the instruction is not 4 byte-aligned.
let predicted_iter_last = cycles::ADD + cycles::BRANCH_BACK_NOT_TAKEN_INSTR_UNALIGNED;
let predicted_iter_first = cycles::ADD + cycles::BRANCH_BACK_TAKEN_INITIAL;
let predicted_iter_rest = cycles::ADD + cycles::BRANCH_BACK_TAKEN_SUBSEQUENT;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
("BNE BACK usually taken unaligned", predicted, cycles)
}
/// Like [branch_back_usually_taken_aligned], but with dedicated GPIO instructions inserted between
/// each instruction in the loop. This allows us to inspect the per-iteration timing with an
/// oscilloscope. When we do so, we can confirm that in all cases the final iteration takes the same
/// number of CPU cycles (i.e. the untaken backward branch always takes the same number of cycles,
/// regardless of whether one, two or more iterations are run).
#[ram]
fn branch_back_usually_taken_with_gpio_aligned(iters: u32) -> (&'static str, u32, u32) {
// Make sure the pin is set low at the start of the benchmark, without including this
// instruction in the cycle count.
unsafe {
asm!(
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX))
};
// Run the benchmark, setting and clearing the pin between each instruction.
let cycles = unsafe {
asm_with_perf_counter!(
"1:", // 4 byte-aligned.
"csrrsi zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Set, 32-bit instruction
"addi {y}, {y}, 1", // 16-bit instruction
"nop", // 16-bit instruction, to ensure next instruction is 4 byte-aligned.
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear, 32-bit instruction.
"bne {y}, {iters}, 1b", // Hence, 4 byte-aligned.
"csrrsi zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Set
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX),
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
// Make sure the pin is set low at the end of the benchmark, without including this instruction
// in the cycle count.
unsafe {
asm!(
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX))
};
let predicted_iter_last = cycles::CSRR
+ cycles::ADD
+ cycles::NOP
+ cycles::CSRR
+ cycles::BRANCH_BACK_NOT_TAKEN
+ cycles::CSRR;
let predicted_iter_first =
cycles::CSRR + cycles::ADD + cycles::NOP + cycles::CSRR + cycles::BRANCH_BACK_TAKEN_INITIAL;
let predicted_iter_rest = cycles::CSRR
+ cycles::ADD
+ cycles::NOP
+ cycles::CSRR
+ cycles::BRANCH_BACK_TAKEN_SUBSEQUENT;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
("BNE BACK usually taken w/ GPIO aligned", predicted, cycles)
}
/// Like [branch_back_usually_taken_unaligned], but with dedicated GPIO instructions inserted
/// between each instruction in the loop. This allows us to inspect the per-iteration timing with an
/// oscilloscope. When we do so, we can confirm that in all cases the final iteration takes the same
/// number of CPU cycles (i.e. the untaken backward branch always takes the same number of cycles,
/// regardless of whether one, two or more iterations are run).
#[ram]
fn branch_back_usually_taken_with_gpio_unaligned(iters: u32) -> (&'static str, u32, u32) {
// Make sure the pin is set low at the start of the benchmark, without including this
// instruction in the cycle count.
unsafe {
asm!(
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX))
};
// Run the benchmark, setting and clearing the pin between each instruction.
let cycles = unsafe {
asm_with_perf_counter!(
"1:",
"csrrsi zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Set, 32-bit instruction
"addi {y}, {y}, 1", // 16-bit instruction
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear, 32-bit instruction
"bne {y}, {iters}, 1b", // Hence, not 4 byte-aligned. !!!
"csrrsi zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Set
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX),
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
// Make sure the pin is set low at the end of the benchmark, without including this instruction
// in the cycle count.
unsafe {
asm!(
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX))
};
let predicted_iter_last = cycles::CSRR
+ cycles::ADD
+ cycles::CSRR
// The branch is extra slow b/c the instruction is not 4 byte-aligned.
+ cycles::BRANCH_BACK_NOT_TAKEN_INSTR_UNALIGNED
+ cycles::CSRR;
let predicted_iter_first =
cycles::CSRR + cycles::ADD + cycles::CSRR + cycles::BRANCH_BACK_TAKEN_INITIAL;
let predicted_iter_rest =
cycles::CSRR + cycles::ADD + cycles::CSRR + cycles::BRANCH_BACK_TAKEN_SUBSEQUENT;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
(
"BNE BACK usually taken w/ GPIO unaligned",
predicted,
cycles,
)
}
/// Like [branch_back_usually_taken_aligned] but with a final nop at the end. In this case the branch
/// instruction takes one less CPU cycle, but the nop takes one additional CPU cycle, resulting in
/// the same total number of CPU cycles.
#[ram]
fn branch_back_usually_taken_with_final_nop(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"1:",
"addi {y}, {y}, 1",
"bne {y}, {iters}, 1b",
"nop", // This nop causes the non-taken branch instruction to take one fewer CPU cycle.
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
let predicted_iter_last = cycles::ADD + cycles::BRANCH_BACK_NOT_TAKEN + cycles::NOP;
let predicted_iter_first = cycles::ADD + cycles::BRANCH_BACK_TAKEN_INITIAL;
let predicted_iter_rest = cycles::ADD + cycles::BRANCH_BACK_TAKEN_SUBSEQUENT;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
("BNE BACK usually taken w/ final NOP", predicted, cycles)
}
/// Measures the case of a loop with a backward branch instruction that is taken for all but the
/// last iteration of the loop. The loop in turn also contains a forward branch instruction that is
/// never taken. This represents a common scenario of a conditional loop with a rarely-taken
/// bail-out break condition.
#[ram]
fn branch_back_usually_taken_w_dead_branch_fwd(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"1:",
"addi {y}, {y}, 1", // 16 bit instruction
"beq zero, {tmp}, 2f", // never taken, i.e. dead branch, 32bit instruction
"bne {y}, {iters}, 1b", // 32 bit instruction
"2:", // Guaranteed to be aligned
iters = in(reg) iters,
y = inout(reg) 0 => _,
tmp = inout(reg) 1 => _
)
};
let predicted_iter_last =
cycles::ADD + cycles::BRANCH_FWD_NOT_TAKEN_INITIAL + cycles::BRANCH_BACK_NOT_TAKEN;
let predicted_iter_first =
cycles::ADD + cycles::BRANCH_FWD_NOT_TAKEN_SUBSEQUENT + cycles::BRANCH_BACK_TAKEN_INITIAL;
let predicted_iter_rest = cycles::ADD
+ cycles::BRANCH_FWD_NOT_TAKEN_SUBSEQUENT
+ cycles::BRANCH_BACK_TAKEN_SUBSEQUENT;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
("BNE BACK usually taken w/ dead FWD BEQ", predicted, cycles)
}
/// Measures the case of a loop with a backward branch instruction that is taken for all but the
/// last iteration of the loop. The loop in turn also contains a forward jump instruction to an
/// aligned target.
///
/// This is meant to gauge the cost of the unconditional jump instructions.
#[ram]
fn branch_back_usually_taken_w_aligned_jmp_fwd(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"1:",
"j 2f",
".align 4",
"2:",
"addi {y}, {y}, 1", // 16-bit instruction
"nop", // 16-bit instruction
"bne {y}, {iters}, 1b", // Hence, 4 byte-aligned.
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
let predicted_iter_last =
cycles::JUMP_EXTRA_SLOW + cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_NOT_TAKEN;
let predicted_iter_first =
cycles::JUMP + cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_TAKEN_INITIAL;
let predicted_iter_rest =
cycles::JUMP_EXTRA_SLOW + cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_TAKEN_SUBSEQUENT;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
("BNE BACK usually taken w/ aligned J FWD", predicted, cycles)
}
/// Like [branch_back_usually_taken_w_aligned_jmp_fwd] but with an unconditional jump target that is
/// *not* 4 byte-aligned. This is meant to check whether the jump target alignment matters (it
/// doesn't seem to).
#[ram]
fn branch_back_usually_taken_w_unaligned_jmp_fwd(iters: u32) -> (&'static str, u32, u32) {
let cycles = unsafe {
asm_with_perf_counter!(
"1:",
"j 2f",
".align 4",
"nop", // 16-bit instruction
"2:", // Guaranteed to be unaligned.
"addi {y}, {y}, 1", // 16-bit-instruction
"bne {y}, {iters}, 1b", // Hence, 4 byte-aligned.
iters = in(reg) iters,
y = inout(reg) 0 => _
)
};
let predicted_iter_last = cycles::JUMP_EXTRA_SLOW + cycles::ADD + cycles::BRANCH_BACK_NOT_TAKEN;
let predicted_iter_first = cycles::JUMP + cycles::ADD + cycles::BRANCH_BACK_TAKEN_INITIAL;
let predicted_iter_rest =
cycles::JUMP_EXTRA_SLOW + cycles::ADD + cycles::BRANCH_BACK_TAKEN_SUBSEQUENT;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
(
"BNE BACK usually taken w/ unaligned J FWD",
predicted,
cycles,
)
}
/// See [_branch_back_usually_taken_w_lbu]. This runs the benchmark with 1-byte aligned data.
#[ram]
fn branch_back_usually_taken_w_lbu_align1(iters: u32) -> (&'static str, u32, u32) {
_branch_back_usually_taken_w_lbu(iters, "BNE BACK usually taken w/ LBU align(1)", unsafe {
&mut DATA_U8_ALIGN1.data
})
}
/// See [_branch_back_usually_taken_w_lbu]. This runs the benchmark with 2-byte aligned data.
#[ram]
fn branch_back_usually_taken_w_lbu_align2(iters: u32) -> (&'static str, u32, u32) {
_branch_back_usually_taken_w_lbu(iters, "BNE BACK usually taken w/ LBU align(2)", unsafe {
&mut DATA_U8_ALIGN2.data
})
}
/// See [_branch_back_usually_taken_w_lbu]. This runs the benchmark with 4 byte-aligned data.
#[ram]
fn branch_back_usually_taken_w_lbu_align4(iters: u32) -> (&'static str, u32, u32) {
_branch_back_usually_taken_w_lbu(iters, "BNE BACK usually taken w/ LBU align(4)", unsafe {
&mut DATA_U8_ALIGN4.data
})
}
/// See [_branch_back_usually_taken_w_lbu]. This runs the benchmark with 8-byte aligned data.
#[ram]
fn branch_back_usually_taken_w_lbu_align8(iters: u32) -> (&'static str, u32, u32) {
_branch_back_usually_taken_w_lbu(iters, "BNE BACK usually taken w/ LBU align(8)", unsafe {
&mut DATA_U8_ALIGN8.data
})
}
/// See [_branch_back_usually_taken_w_lbu]. This runs the benchmark with 256-byte aligned data.
#[ram]
fn branch_back_usually_taken_w_lbu_align256(iters: u32) -> (&'static str, u32, u32) {
_branch_back_usually_taken_w_lbu(iters, "BNE BACK usually taken w/ LBU align(256) ", unsafe {
&mut DATA_U8_ALIGN256.data
})
}
/// Measures the case of a loop with a backward branch instruction that is taken for all but the
/// last iteration of the loop. The loop in turn also contains a load byte (`lbu`) instruction,
/// which takes a varying number of CPU cycles.
#[ram]
fn _branch_back_usually_taken_w_lbu(
iters: u32,
name: &'static str,
data: &mut [u8],
) -> (&'static str, u32, u32) {
// Ensure that the data is filled with ones, since we rely on that to advance the pointer by one
// each iteration and eventually end the loop.
data.fill(1u8);
let data_ptr_range = data[0..iters as usize].as_ptr_range();
let cycles = unsafe {
asm_with_perf_counter!(
"1:",
// LBU takes 2 cycles here. However, we must ensure there's actually a data dependency
// on the result of the load instruction, otherwise the CPU can just pipeline it, and
// then it will often take just a single cycle (or more precisely, it seems to take 1
// cycle every 3 out of 4 iterations, and 2 cycles every 4th iteration).
"lbu {tmp}, 0({data_ptr})", // 32-bit instruction
"add {data_ptr}, {data_ptr}, {tmp}", // 16-bit instruction
"nop", // 16-bit instruction
"bne {data_ptr}, {data_end_ptr}, 1b", // Hence, 4 byte-aligned.
data_ptr = inout(reg) data_ptr_range.start => _,
data_end_ptr = in(reg) data_ptr_range.end,
tmp = out(reg) _
)
};
let predicted_iter_last =
cycles::LOAD + cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_NOT_TAKEN;
let predicted_iter_first =
cycles::LOAD + cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_TAKEN_INITIAL;
let predicted_iter_rest =
cycles::LOAD + cycles::ADD + cycles::NOP + cycles::BRANCH_BACK_TAKEN_SUBSEQUENT;
let predicted = match iters {
0 => 0,
1 => predicted_iter_last,
2 => predicted_iter_last + predicted_iter_first,
3.. => predicted_iter_last + predicted_iter_first + (predicted_iter_rest) * (iters - 2),
};
(name, predicted, cycles)
}
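/// Purely illustrative sketch (hypothetical, never run by [main]): the same loop as
/// [_branch_back_usually_taken_w_lbu] but *without* a data dependency on the loaded value,
/// demonstrating the pipelining effect described in the comment above. The loaded value is thrown
/// away and the pointer advances by a constant instead, so per the observations above the `lbu`
/// would often retire in a single cycle rather than the 2 cycles of [cycles::LOAD]. No prediction
/// is attempted here.
#[allow(dead_code)]
#[ram]
fn _example_lbu_without_dependency(iters: u32, data: &mut [u8]) -> u32 {
data.fill(1u8);
let data_ptr_range = data[0..iters as usize].as_ptr_range();
unsafe {
asm_with_perf_counter!(
"1:",
"lbu {tmp}, 0({data_ptr})", // Load result is never used: no data dependency.
"addi {data_ptr}, {data_ptr}, 1", // Advance by a constant instead of by {tmp}.
"nop",
"bne {data_ptr}, {data_end_ptr}, 1b",
data_ptr = inout(reg) data_ptr_range.start => _,
data_end_ptr = in(reg) data_ptr_range.end,
tmp = out(reg) _
)
}
}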
/// Like [branch_back_usually_taken_w_lbu_align1], but with dedicated GPIO instructions inserted
/// between some of the instructions in the loop. This allows us to inspect the per-iteration
/// timing with an oscilloscope. When we do so, we can confirm that the load instruction takes the
/// same number of CPU cycles in all cases, and does not have differing behavior between the first
/// execution and subsequent executions.
#[ram]
fn branch_back_usually_taken_w_lbu_align1_w_gpio(iters: u32) -> (&'static str, u32, u32) {
// Ensure that the data is filled with ones, since we rely on that to advance the pointer by one
// each iteration and eventually end the loop.
let data_ptr_range;
unsafe {
DATA_U8_ALIGN1.data.fill(1u8);
data_ptr_range = DATA_U8_ALIGN1.data[0..iters as usize].as_ptr_range();
}
// Make sure the pin is set low at the start of the benchmark, without including this
// instruction in the cycle count.
unsafe {
asm!(
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX))
};
// Run the benchmark, setting and clearing the pin between each instruction.
let cycles = unsafe {
asm_with_perf_counter!(
"1:",
"csrrsi zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Set, 32-bit instruction
// Note that just like in _branch_back_usually_taken_w_lbu, we must use an `add`
// instruction to introduce a data dependency. Even more importantly in this benchmark:
// the `add` instruction must immediately follow the `lbu` instruction; we must not
// place the `csrr` instruction in between them, otherwise the CPU can seemingly make
// use of instruction pipelining, and then the `lbu` instruction only takes a single
// CPU cycle three out of four times it's executed.
"lbu {tmp}, 0({data_ptr})", // 32-bit instruction
"add {data_ptr}, {data_ptr}, {tmp}", // 16-bit instruction
"nop", // 16-bit instruction
"csrrci zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Clear, 32-bit instruction
"bne {data_ptr}, {data_end_ptr}, 1b", // Hence, 4 byte-aligned.
"csrrsi zero, {csr_cpu_gpio_out}, 1<<{cpu_signal_idx}", // Set
csr_cpu_gpio_out = const(CSR_CPU_GPIO_OUT),
cpu_signal_idx= const(TX_CPU_OUTPUT_SIGNAL_CSR_IDX),
data_ptr = inout(reg) data_ptr_range.start => _,
data_end_ptr = in(reg) data_ptr_range.end,
tmp = out(reg) _
)
};
// Make sure the pin is set low at the end of the benchmark, without including this instruction
// in the cycle count.