qsdr_benchmarks/
asm.rs

1use std::collections::HashMap;
2
/// Number of samples that [`run_benchmark`] takes from the measured closure.
const BENCHMARK_ITERATIONS: usize = 10000;

/// Calls `f` [`BENCHMARK_ITERATIONS`] times and tallies the values it returns.
///
/// The returned map goes from each distinct value produced by `f` to the
/// number of times that value was observed, so the counts always add up to
/// [`BENCHMARK_ITERATIONS`].
#[doc(hidden)]
pub fn run_benchmark(f: impl FnMut() -> u64) -> HashMap<u64, usize> {
    let mut measure = f;
    // All samples are taken before any of them is tallied; presumably this
    // keeps the hash map work out of the measurement loop.
    let samples: Vec<u64> = (0..BENCHMARK_ITERATIONS).map(|_| measure()).collect();
    samples
        .into_iter()
        .fold(HashMap::new(), |mut counts, sample| {
            *counts.entry(sample).or_insert(0) += 1;
            counts
        })
}
19
/// Measures how many cycles a sequence of AArch64 instructions takes.
///
/// Invocation syntax: one or more assembly template strings separated by
/// commas, then `,;`, then any extra `asm!` arguments (operands, clobbers,
/// options) that the instructions need. The extra arguments are spliced
/// verbatim after the two output operands used for the counter reads.
///
/// The macro reads `pmccntr_el0` immediately before and immediately after the
/// instructions and evaluates to `end - start - 1` as a `u64`.
///
/// The expansion contains a bare `std::arch::asm!` invocation with no
/// `unsafe` block of its own, so the macro must be used inside an `unsafe`
/// context.
///
/// NOTE(review): if both counter reads return the same value, `end - start - 1`
/// underflows (a panic when overflow checks are enabled, a wrap-around
/// otherwise).
#[doc(hidden)]
#[cfg(target_arch = "aarch64")]
#[macro_export]
macro_rules! time_asm {
    ($($instruction:expr),*,; $($extra:tt)*) => {
        {
            // Counter values read right before and right after the measured
            // instructions.
            let start: u64;
            let end: u64;
            // The following 32 nops are used to flush the pipeline after
            // branching back to the beginning of this block in a loop, which
            // can affect the timing of some load instructions (for instance
            // ld1.4s {v0-v3} shows an issue latency of 10 cycles if these nops
            // are not included).
            std::arch::asm!(
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                "nop",
                // First counter read, then the instructions under test, then
                // the second counter read.
                "mrs {__time_asm_start}, pmccntr_el0",
                $($instruction),*,
                "mrs {__time_asm_end}, pmccntr_el0",
                // The operand names carry a `__time_asm_` prefix, presumably
                // so they cannot clash with operand names used in `$extra`.
                __time_asm_start = out(reg) start,
                __time_asm_end = out(reg) end,
                $($extra)*);
            // The extra `- 1` presumably discounts the cycle taken by the
            // counter read itself -- TODO confirm.
            end - start - 1
        }
    }
}
76
/// Fallback used on architectures other than x86-64 and AArch64.
///
/// No cycle counter is read on these targets; the function always returns 0.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
#[inline(always)]
pub fn get_cpu_cycles() -> u64 {
    // Placeholder value reported when no counter implementation exists.
    const NO_CYCLE_COUNTER: u64 = 0;
    NO_CYCLE_COUNTER
}
82
/// Reads the x86-64 time-stamp counter with `rdtsc`.
///
/// Note that this counter ticks in reference clock cycles, so its value is
/// not affected by frequency scaling, even though a true CPU cycle count
/// would be.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
pub fn get_cpu_cycles() -> u64 {
    let low: u64;
    let high: u64;
    // SAFETY: `rdtsc` takes no inputs and only writes the two registers
    // declared as outputs here. This assumes the operating system allows the
    // instruction in user mode.
    unsafe {
        std::arch::asm!("rdtsc", out("rax") low, out("rdx") high);
    }
    // The low half of the counter arrives in rax and the high half in rdx.
    low | (high << 32)
}
95
/// Reads the current value of the `pmccntr_el0` system register.
///
/// NOTE(review): user-space reads of this register generally have to be
/// enabled by the operating system first; confirm this for the target
/// platform.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
pub fn get_cpu_cycles() -> u64 {
    let cycles: u64;
    // SAFETY: the instruction only writes the general-purpose register
    // declared as output. This assumes reading `pmccntr_el0` is permitted at
    // the current exception level.
    unsafe {
        std::arch::asm!("mrs {count}, pmccntr_el0", count = out(reg) cycles);
    }
    cycles
}
105
/// Runs an assembly benchmark, prints a histogram of the measured cycle
/// counts and evaluates to the mode (the most frequently measured count).
///
/// Invocation syntax:
/// `benchmark!(name; expected; "instr", "instr", ...,; extra asm! arguments)`
///
/// * `name`: heading printed above the results.
/// * `expected`: an `Option` with the cycle count expected on this CPU (for
///   instance the result of `expected_cycles!`). When it is `Some`, the
///   measured mode must be equal to it.
/// * instructions and extra arguments: forwarded unchanged to `time_asm!`,
///   which is run repeatedly through `run_benchmark`.
///
/// Because the expansion uses `time_asm!`, this macro is only usable on
/// AArch64 and must be invoked inside an `unsafe` context. The calling crate
/// must also be able to resolve the `owo_colors` crate.
///
/// # Panics
///
/// * In test builds of the calling crate, if the mode accounts for fewer than
///   90% of all runs.
/// * If `expected` is `Some` and differs from the measured mode.
#[macro_export]
macro_rules! benchmark {
    ($benchmark_name:expr; $expected_cycles:expr;
     $($instruction:expr),*,; $($extra:tt)*) => {
        {
            use owo_colors::OwoColorize;

            let hist = $crate::asm::run_benchmark(
                || $crate::time_asm!($($instruction),*, ; $($extra)*)
            );

            // Heading: the benchmark name underlined with '='.
            let name = $benchmark_name;
            println!();
            println!("{}", name.blue());
            println!("{}", "=".repeat(name.len()).blue());
            println!();

            // Echo the instructions that were measured.
            { $(println!("    {}", $instruction);)* }
            println!();

            // Most frequent cycle count. Ties are resolved in favour of the
            // lower cycle count so that the result does not depend on the
            // iteration order of the hash map.
            let (&mode_cycles, &mode_count) = hist
                .iter()
                .max_by_key(|&(&cycles, &count)| (count, std::cmp::Reverse(cycles)))
                .expect("benchmark produced no measurements");

            // Histogram table, sorted by cycle count, with the mode in green.
            println!("{}", "cycles | runs".blue());
            println!("{}", "-------------".blue());
            let mut cycles: Vec<u64> = hist.keys().copied().collect();
            cycles.sort_unstable();
            for cyc in &cycles {
                let color = if *cyc == mode_cycles {
                    owo_colors::AnsiColors::Green
                } else {
                    owo_colors::AnsiColors::Default
                };
                println!("{:6} {} {:4}",
                         cyc.color(color),
                         "|".blue(),
                         hist[cyc].color(color));
            }
            println!();

            // The mode must account for at least 90% (rounded up) of the runs
            // that were actually performed; the threshold follows the number
            // of iterations instead of assuming a fixed run count.
            let total_runs: usize = hist.values().sum();
            let mode_threshold = (total_runs * 9 + 9) / 10;
            // Only enforced in test builds of the calling crate.
            if cfg!(test) {
                assert!(mode_count >= mode_threshold,
                        "mode of test cycles not obtained in enough runs; benchmark results unreliable");
            }

            if let Some(expected_cycles) = $expected_cycles {
                assert_eq!(mode_cycles, expected_cycles,
                           "mode of test cycles does not match expected for this CPU: {expected_cycles}");
            }

            mode_cycles
        }
    }
}
159
/// Selects the expected cycle count for the CPU named in the `CPU`
/// environment variable.
///
/// Takes a comma-separated list of `"cpu name" => cycles` entries; a trailing
/// comma and an empty list are both accepted. Each name is used as a `match`
/// pattern against the value of `CPU`, so it has to be something that is
/// valid in pattern position, such as a string literal.
///
/// Evaluates to `Some(cycles)` for the entry whose name equals the value of
/// `CPU`, and to `None` when `CPU` cannot be read (unset or not valid
/// Unicode) or matches none of the entries.
#[macro_export]
macro_rules! expected_cycles {
    ($($cpu:expr => $cycles:expr),*$(,)?) => {
        {
            if let Ok(cpu_env) = std::env::var("CPU") {
                match &cpu_env[..] {
                    // Every arm carries its own trailing comma, so an empty
                    // list still expands to a valid `match`.
                    $($cpu => Some($cycles),)*
                    _ => None,
                }
            } else {
                None
            }
        }
    };
}