tango_bench/
lib.rs

1#[cfg(feature = "async")]
2pub use asynchronous::async_benchmark_fn;
3use core::ptr;
4use num_traits::ToPrimitive;
5use rand::{rngs::SmallRng, Rng, SeedableRng};
6use std::{
7    cmp::Ordering,
8    hint::black_box,
9    io, mem,
10    ops::{Deref, RangeInclusive},
11    str::Utf8Error,
12    time::Duration,
13};
14use thiserror::Error;
15use timer::{ActiveTimer, Timer};
16
17pub mod cli;
18pub mod dylib;
19#[cfg(target_os = "linux")]
20pub mod linux;
21
22#[derive(Debug, Error)]
23pub enum Error {
24    #[error("No measurements given")]
25    NoMeasurements,
26
27    #[error("Invalid string pointer from FFI")]
28    InvalidFFIString(Utf8Error),
29
30    #[error("Spi::self() was already called")]
31    SpiSelfWasMoved,
32
33    #[error("Unable to load library symbol")]
34    UnableToLoadSymbol(#[source] libloading::Error),
35
36    #[error("Unknown sampler type. Available options are: flat and linear")]
37    UnknownSamplerType,
38
39    #[error("Invalid test name given")]
40    InvalidTestName,
41
42    #[error("IO Error")]
43    IOError(#[from] io::Error),
44}
45
46/// Registers benchmark in the system
47///
48/// Macros accepts a list of functions that produce any [`IntoBenchmarks`] type. All of the benchmarks
49/// created by those functions are registered in the harness.
50///
51/// ## Example
52/// ```rust
53/// use std::time::Instant;
54/// use tango_bench::{benchmark_fn, IntoBenchmarks, tango_benchmarks};
55///
56/// fn time_benchmarks() -> impl IntoBenchmarks {
57///     [benchmark_fn("current_time", |b| b.iter(|| Instant::now()))]
58/// }
59///
60/// tango_benchmarks!(time_benchmarks());
61/// ```
62#[macro_export]
63macro_rules! tango_benchmarks {
64    ($($func_expr:expr),+) => {
65        /// Type checking tango_init() function
66        const TANGO_INIT: $crate::dylib::ffi::InitFn = tango_init;
67
68        /// Exported function for initializing the benchmark harness
69        #[no_mangle]
70        unsafe extern "C" fn tango_init() {
71            let mut benchmarks = vec![];
72            $(benchmarks.extend($crate::IntoBenchmarks::into_benchmarks($func_expr));)*
73            $crate::dylib::__tango_init(benchmarks)
74        }
75
76    };
77}
78
79/// Main entrypoint for benchmarks
80///
81/// This macro generate `main()` function for the benchmark harness. Can be used in a form with providing
82/// measurement settings:
83/// ```rust
84/// use tango_bench::{tango_main, tango_benchmarks, MeasurementSettings};
85///
86/// // Register benchmarks
87/// tango_benchmarks!([]);
88///
89/// tango_main!(MeasurementSettings {
90///     samples_per_haystack: 1000,
91///     min_iterations_per_sample: 10,
92///     max_iterations_per_sample: 10_000,
93///     ..Default::default()
94/// });
95/// ```
96#[macro_export]
97macro_rules! tango_main {
98    ($settings:expr) => {
99        fn main() -> $crate::cli::Result<std::process::ExitCode> {
100            // Initialize Tango for SelfVTable usage
101            unsafe { tango_init() };
102            $crate::cli::run($settings)
103        }
104    };
105    () => {
106        tango_main! {$crate::MeasurementSettings::default()}
107    };
108}
109
110pub struct BenchmarkParams {
111    pub seed: u64,
112}
113
114pub struct Bencher {
115    params: BenchmarkParams,
116}
117
118impl Deref for Bencher {
119    type Target = BenchmarkParams;
120
121    fn deref(&self) -> &Self::Target {
122        &self.params
123    }
124}
125
126impl Bencher {
127    pub fn iter<O: 'static, F: FnMut() -> O + 'static>(self, func: F) -> Box<dyn ErasedSampler> {
128        Box::new(Sampler(func))
129    }
130}
131
132struct Sampler<F>(F);
133
134pub trait ErasedSampler {
135    /// Measures the performance if the function
136    ///
137    /// Returns the cumulative execution time (all iterations) with nanoseconds precision,
138    /// but not necessarily accuracy. Usually this time is get by `clock_gettime()` call or some other
139    /// platform-specific call.
140    ///
141    /// This method should use the same arguments for measuring the test function unless [`prepare_state()`]
142    /// method is called. Only then new set of input arguments should be generated. It is NOT allowed
143    /// to call this method without first calling [`prepare_state()`].
144    ///
145    /// [`prepare_state()`]: Self::prepare_state()
146    fn measure(&mut self, iterations: usize) -> u64;
147
148    /// Estimates the number of iterations achievable within given time.
149    ///
150    /// Time span is given in milliseconds (`time_ms`). Estimate can be an approximation and it is important
151    /// for implementation to be fast (in the order of 10 ms).
152    /// If possible the same input arguments should be used when building the estimate.
153    /// If the single call of a function is longer than provided timespan the implementation should return 0.
154    fn estimate_iterations(&mut self, time_ms: u32) -> usize {
155        let mut iters = 1;
156        let time_ns = Duration::from_millis(time_ms as u64).as_nanos() as u64;
157
158        for _ in 0..5 {
159            // Never believe short measurements because they are very unreliable. Pretending that
160            // measurement at least took 1us guarantees that we won't end up with an unreasonably large number
161            // of iterations
162            let time = self.measure(iters).max(1_000);
163            let time_per_iteration = (time / iters as u64).max(1);
164            let new_iters = (time_ns / time_per_iteration) as usize;
165
166            // Do early stop if new estimate has the same order of magnitude. It is good enough.
167            if new_iters < 2 * iters {
168                return new_iters;
169            }
170
171            iters = new_iters;
172        }
173
174        iters
175    }
176}
177
178impl<O, F: FnMut() -> O> ErasedSampler for Sampler<F> {
179    fn measure(&mut self, iterations: usize) -> u64 {
180        let start = ActiveTimer::start();
181        for _ in 0..iterations {
182            black_box((self.0)());
183        }
184        ActiveTimer::stop(start)
185    }
186}
187
188pub struct Benchmark {
189    name: String,
190    sampler_factory: Box<dyn SamplerFactory>,
191}
192
193pub fn benchmark_fn<F: FnMut(Bencher) -> Box<dyn ErasedSampler> + 'static>(
194    name: impl Into<String>,
195    sampler_factory: F,
196) -> Benchmark {
197    let name = name.into();
198    assert!(!name.is_empty());
199    Benchmark {
200        name,
201        sampler_factory: Box::new(SyncSampleFactory(sampler_factory)),
202    }
203}
204
205pub trait SamplerFactory {
206    fn create_sampler(&mut self, params: BenchmarkParams) -> Box<dyn ErasedSampler>;
207}
208
209struct SyncSampleFactory<F>(F);
210
211impl<F: FnMut(Bencher) -> Box<dyn ErasedSampler>> SamplerFactory for SyncSampleFactory<F> {
212    fn create_sampler(&mut self, params: BenchmarkParams) -> Box<dyn ErasedSampler> {
213        (self.0)(Bencher { params })
214    }
215}
216
217impl Benchmark {
218    /// Generates next haystack for the measurement
219    ///
220    /// Calling this method should update internal haystack used for measurement.
221    /// Returns `true` if update happens, `false` if implementation doesn't support haystack generation.
222    /// Haystack/Needle distinction is described in [`Generator`] trait.
223    pub fn prepare_state(&mut self, seed: u64) -> Box<dyn ErasedSampler> {
224        self.sampler_factory
225            .create_sampler(BenchmarkParams { seed })
226    }
227
228    /// Name of the benchmark
229    pub fn name(&self) -> &str {
230        self.name.as_str()
231    }
232}
233
234/// Converts the implementing type into a vector of [`Benchmark`].
235pub trait IntoBenchmarks {
236    fn into_benchmarks(self) -> Vec<Benchmark>;
237}
238
239impl<const N: usize> IntoBenchmarks for [Benchmark; N] {
240    fn into_benchmarks(self) -> Vec<Benchmark> {
241        self.into_iter().collect()
242    }
243}
244
245impl IntoBenchmarks for Vec<Benchmark> {
246    fn into_benchmarks(self) -> Vec<Benchmark> {
247        self
248    }
249}
250
251/// Describes basic settings for the benchmarking process
252///
253/// This structure is passed to [`cli::run()`].
254///
255/// Should be created only with overriding needed properties, like so:
256/// ```rust
257/// use tango_bench::MeasurementSettings;
258///
259/// let settings = MeasurementSettings {
260///     min_iterations_per_sample: 1000,
261///     ..Default::default()
262/// };
263/// ```
264#[derive(Clone, Copy, Debug)]
265pub struct MeasurementSettings {
266    pub filter_outliers: bool,
267
268    /// The number of samples per one generated haystack
269    pub samples_per_haystack: usize,
270
271    /// Minimum number of iterations in a sample for each of 2 tested functions
272    pub min_iterations_per_sample: usize,
273
274    /// The number of iterations in a sample for each of 2 tested functions
275    pub max_iterations_per_sample: usize,
276
277    pub sampler_type: SampleLengthKind,
278
279    /// If true scheduler performs warmup iterations before measuring function
280    pub warmup_enabled: bool,
281
282    /// Size of a CPU cache firewall in KBytes
283    ///
284    /// If set, the scheduler will perform a dummy data read between samples generation to spoil the CPU cache
285    ///
286    /// Cache firewall is a way to reduce the impact of the CPU cache on the benchmarking process. It tries
287    /// to minimize discrepancies in performance between two algorithms due to the CPU cache state.
288    pub cache_firewall: Option<usize>,
289
290    /// If true, scheduler will perform a yield of control back to the OS before taking each sample
291    ///
292    /// Yielding control to the OS is a way to reduce the impact of OS scheduler on the benchmarking process.
293    pub yield_before_sample: bool,
294
295    /// If set, use alloca to allocate a random offset for the stack each sample.
296    /// This to reduce memory alignment effects on the benchmarking process.
297    ///
298    /// May cause UB if the allocation is larger then the thread stack size.
299    pub randomize_stack: Option<usize>,
300}
301
302#[derive(Clone, Copy, Debug)]
303pub enum SampleLengthKind {
304    Flat,
305    Linear,
306    Random,
307}
308
309/// Performs a dummy reads from memory to spoil given amount of CPU cache
310///
311/// Uses cache aligned data arrays to perform minimum amount of reads possible to spoil the cache
312struct CacheFirewall {
313    cache_lines: Vec<CacheLine>,
314}
315
316impl CacheFirewall {
317    fn new(bytes: usize) -> Self {
318        let n = bytes / mem::size_of::<CacheLine>();
319        let cache_lines = vec![CacheLine::default(); n];
320        Self { cache_lines }
321    }
322
323    fn issue_read(&self) {
324        for line in &self.cache_lines {
325            // Because CacheLine is aligned on 64 bytes it is enough to read single element from the array
326            // to spoil the whole cache line
327            unsafe { ptr::read_volatile(&line.0[0]) };
328        }
329    }
330}
331
332#[repr(C)]
333#[repr(align(64))]
334#[derive(Default, Clone, Copy)]
335struct CacheLine([u16; 32]);
336
337pub const DEFAULT_SETTINGS: MeasurementSettings = MeasurementSettings {
338    filter_outliers: false,
339    samples_per_haystack: 1,
340    min_iterations_per_sample: 1,
341    max_iterations_per_sample: 5000,
342    sampler_type: SampleLengthKind::Random,
343    cache_firewall: None,
344    yield_before_sample: false,
345    warmup_enabled: true,
346    randomize_stack: None,
347};
348
349impl Default for MeasurementSettings {
350    fn default() -> Self {
351        DEFAULT_SETTINGS
352    }
353}
354
355/// Responsible for determining the number of iterations to run for each sample
356///
357/// Different sampler strategies can influence the results heavily. For example, if function is dependent heavily
358/// on a memory subsystem, then it should be tested with different number of iterations to be representative
359/// for different memory access patterns and cache states.
360trait SampleLength {
361    /// Returns the number of iterations to run for the next sample
362    ///
363    /// Accepts the number of iteration being run starting from 0 and cumulative time spent by both functions
364    fn next_sample_iterations(&mut self, iteration_no: usize, estimate: usize) -> usize;
365}
366
367/// Runs the same number of iterations for each sample
368///
369/// Estimates the number of iterations based on the number of iterations achieved in 10 ms and uses
370/// this number as a base for the number of iterations for each sample.
371struct FlatSampleLength {
372    min: usize,
373    max: usize,
374}
375
376impl FlatSampleLength {
377    fn new(settings: &MeasurementSettings) -> Self {
378        FlatSampleLength {
379            min: settings.min_iterations_per_sample.max(1),
380            max: settings.max_iterations_per_sample,
381        }
382    }
383}
384
385impl SampleLength for FlatSampleLength {
386    fn next_sample_iterations(&mut self, _iteration_no: usize, estimate: usize) -> usize {
387        estimate.clamp(self.min, self.max)
388    }
389}
390
391struct LinearSampleLength {
392    min: usize,
393    max: usize,
394}
395
396impl LinearSampleLength {
397    fn new(settings: &MeasurementSettings) -> Self {
398        Self {
399            min: settings.min_iterations_per_sample.max(1),
400            max: settings.max_iterations_per_sample,
401        }
402    }
403}
404
405impl SampleLength for LinearSampleLength {
406    fn next_sample_iterations(&mut self, iteration_no: usize, estimate: usize) -> usize {
407        let estimate = estimate.clamp(self.min, self.max);
408        (iteration_no % estimate) + 1
409    }
410}
411
412/// Sampler that randomly determines the number of iterations to run for each sample
413///
414/// This sampler uses a random number generator to decide the number of iterations for each sample.
415struct RandomSampleLength {
416    rng: SmallRng,
417    min: usize,
418    max: usize,
419}
420
421impl RandomSampleLength {
422    pub fn new(settings: &MeasurementSettings, seed: u64) -> Self {
423        Self {
424            rng: SmallRng::seed_from_u64(seed),
425            min: settings.min_iterations_per_sample.max(1),
426            max: settings.max_iterations_per_sample,
427        }
428    }
429}
430
431impl SampleLength for RandomSampleLength {
432    fn next_sample_iterations(&mut self, _iteration_no: usize, estimate: usize) -> usize {
433        let estimate = estimate.clamp(self.min, self.max);
434        self.rng.gen_range(1..=estimate)
435    }
436}
437
438/// Calculates the result of the benchmarking run
439///
440/// Return None if no measurements were made
441pub(crate) fn calculate_run_result<N: Into<String>>(
442    name: N,
443    baseline: &[u64],
444    candidate: &[u64],
445    iterations_per_sample: &[usize],
446    filter_outliers: bool,
447) -> Option<RunResult> {
448    assert!(baseline.len() == candidate.len());
449    assert!(baseline.len() == iterations_per_sample.len());
450
451    let mut iterations_per_sample = iterations_per_sample.to_vec();
452
453    let mut diff = candidate
454        .iter()
455        .zip(baseline.iter())
456        // Calculating difference between candidate and baseline
457        .map(|(&c, &b)| (c as f64 - b as f64))
458        .zip(iterations_per_sample.iter())
459        // Normalizing difference to iterations count
460        .map(|(diff, &iters)| diff / iters as f64)
461        .collect::<Vec<_>>();
462
463    // need to save number of original samples to calculate number of outliers correctly
464    let n = diff.len();
465
466    // Normalizing measurements to iterations count
467    let mut baseline = baseline
468        .iter()
469        .zip(iterations_per_sample.iter())
470        .map(|(&v, &iters)| (v as f64) / (iters as f64))
471        .collect::<Vec<_>>();
472    let mut candidate = candidate
473        .iter()
474        .zip(iterations_per_sample.iter())
475        .map(|(&v, &iters)| (v as f64) / (iters as f64))
476        .collect::<Vec<_>>();
477
478    // Calculating measurements range. All measurements outside this interval considered outliers
479    let range = if filter_outliers {
480        iqr_variance_thresholds(diff.to_vec())
481    } else {
482        None
483    };
484
485    // Cleaning measurements from outliers if needed
486    if let Some(range) = range {
487        // We filtering outliers to build statistical Summary and the order of elements in arrays
488        // doesn't matter, therefore swap_remove() is used. But we need to make sure that all arrays
489        // has the same length
490        assert_eq!(diff.len(), baseline.len());
491        assert_eq!(diff.len(), candidate.len());
492
493        let mut i = 0;
494        while i < diff.len() {
495            if range.contains(&diff[i]) {
496                i += 1;
497            } else {
498                diff.swap_remove(i);
499                iterations_per_sample.swap_remove(i);
500                baseline.swap_remove(i);
501                candidate.swap_remove(i);
502            }
503        }
504    };
505
506    let diff_summary = Summary::from(&diff)?;
507    let baseline_summary = Summary::from(&baseline)?;
508    let candidate_summary = Summary::from(&candidate)?;
509
510    let diff_estimate = DiffEstimate::build(&baseline_summary, &diff_summary);
511
512    Some(RunResult {
513        baseline: baseline_summary,
514        candidate: candidate_summary,
515        diff: diff_summary,
516        name: name.into(),
517        diff_estimate,
518        outliers: n - diff_summary.n,
519    })
520}
521
522/// Contains the estimation of how much faster or slower is candidate function compared to baseline
523pub(crate) struct DiffEstimate {
524    // Percentage of difference between candidate and baseline
525    //
526    // Negative value means that candidate is faster than baseline, positive - slower.
527    pct: f64,
528
529    // Is the difference statistically significant
530    significant: bool,
531}
532
533impl DiffEstimate {
534    /// Builds [`DiffEstimate`] from flat sampling
535    ///
536    /// Flat sampling is a sampling where each measurement is normalized by the number of iterations.
537    /// This is needed to make measurements comparable between each other. Linear sampling is more
538    /// robust to outliers, but it is requiring more iterations.
539    ///
540    /// It is assumed that baseline and candidate are already normalized by iterations count.
541    fn build(baseline: &Summary<f64>, diff: &Summary<f64>) -> Self {
542        let std_dev = diff.variance.sqrt();
543        let std_err = std_dev / (diff.n as f64).sqrt();
544        let z_score = diff.mean / std_err;
545
546        // significant result is far away from 0 and have more than 0.5% base/candidate difference
547        // z_score = 2.6 corresponds to 99% significance level
548        let significant = z_score.abs() >= 2.6
549            && (diff.mean / baseline.mean).abs() > 0.005
550            && diff.mean.abs() >= ActiveTimer::precision() as f64;
551        let pct = diff.mean / baseline.mean * 100.0;
552
553        Self { pct, significant }
554    }
555}
556
557/// Describes the results of a single benchmark run
558pub(crate) struct RunResult {
559    /// name of a test
560    name: String,
561
562    /// statistical summary of baseline function measurements
563    baseline: Summary<f64>,
564
565    /// statistical summary of candidate function measurements
566    candidate: Summary<f64>,
567
568    /// individual measurements of a benchmark (candidate - baseline)
569    diff: Summary<f64>,
570
571    diff_estimate: DiffEstimate,
572
573    /// Numbers of detected and filtered outliers
574    outliers: usize,
575}
576
577/// Statistical summary for a given iterator of numbers.
578///
579/// Calculates all the information using single pass over the data. Mean and variance are calculated using
580/// streaming algorithm described in _Art of Computer Programming, Vol 2, page 232_.
581#[derive(Clone, Copy)]
582pub struct Summary<T> {
583    pub n: usize,
584    pub min: T,
585    pub max: T,
586    pub mean: f64,
587    pub variance: f64,
588}
589
590impl<T: PartialOrd> Summary<T> {
591    pub fn from<'a, C>(values: C) -> Option<Self>
592    where
593        C: IntoIterator<Item = &'a T>,
594        T: ToPrimitive + Copy + Default + 'a,
595    {
596        Self::running(values.into_iter().copied()).last()
597    }
598
599    pub fn running<I>(iter: I) -> impl Iterator<Item = Summary<T>>
600    where
601        T: ToPrimitive + Copy + Default,
602        I: Iterator<Item = T>,
603    {
604        RunningSummary {
605            iter,
606            n: 0,
607            min: T::default(),
608            max: T::default(),
609            mean: 0.,
610            s: 0.,
611        }
612    }
613}
614
615struct RunningSummary<T, I> {
616    iter: I,
617    n: usize,
618    min: T,
619    max: T,
620    mean: f64,
621    s: f64,
622}
623
624impl<T, I> Iterator for RunningSummary<T, I>
625where
626    T: Copy + PartialOrd,
627    I: Iterator<Item = T>,
628    T: ToPrimitive,
629{
630    type Item = Summary<T>;
631
632    fn next(&mut self) -> Option<Self::Item> {
633        let value = self.iter.next()?;
634        let fvalue = value.to_f64().expect("f64 overflow detected");
635
636        if self.n == 0 {
637            self.min = value;
638            self.max = value;
639        }
640
641        if let Some(Ordering::Less) = value.partial_cmp(&self.min) {
642            self.min = value;
643        }
644        if let Some(Ordering::Greater) = value.partial_cmp(&self.max) {
645            self.max = value;
646        }
647
648        self.n += 1;
649        let mean_p = self.mean;
650        self.mean += (fvalue - self.mean) / self.n as f64;
651        self.s += (fvalue - mean_p) * (fvalue - self.mean);
652        let variance = if self.n > 1 {
653            self.s / (self.n - 1) as f64
654        } else {
655            0.
656        };
657
658        Some(Summary {
659            n: self.n,
660            min: self.min,
661            max: self.max,
662            mean: self.mean,
663            variance,
664        })
665    }
666}
667
668/// Outlier detection algorithm based on interquartile range
669///
670/// Observations that are 1.5 IQR away from the corresponding quartile are consideted as outliers
671/// as described in original Tukey's paper.
672pub fn iqr_variance_thresholds(mut input: Vec<f64>) -> Option<RangeInclusive<f64>> {
673    const MINIMUM_IQR: f64 = 1.;
674
675    input.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
676    let (q1, q3) = (input.len() / 4, input.len() * 3 / 4 - 1);
677    if q1 >= q3 || q3 >= input.len() {
678        return None;
679    }
680    // In case q1 and q3 are equal, we need to make sure that IQR is not 0
681    // In the future it would be nice to measure system timer precision empirically.
682    let iqr = (input[q3] - input[q1]).max(MINIMUM_IQR);
683
684    let low_threshold = input[q1] - iqr * 1.5;
685    let high_threshold = input[q3] + iqr * 1.5;
686
687    // Calculating the indices of the thresholds in an dataset
688    let low_threshold_idx =
689        match input[0..q1].binary_search_by(|probe| probe.total_cmp(&low_threshold)) {
690            Ok(idx) => idx,
691            Err(idx) => idx,
692        };
693
694    let high_threshold_idx =
695        match input[q3..].binary_search_by(|probe| probe.total_cmp(&high_threshold)) {
696            Ok(idx) => idx,
697            Err(idx) => idx,
698        };
699
700    if low_threshold_idx == 0 || high_threshold_idx >= input.len() {
701        return None;
702    }
703
704    // Calculating the equal number of observations which should be removed from each "side" of observations
705    let outliers_cnt = low_threshold_idx.min(input.len() - high_threshold_idx);
706
707    Some(input[outliers_cnt]..=(input[input.len() - outliers_cnt - 1]))
708}
709
710mod timer {
711    use std::time::Instant;
712
713    #[cfg(all(feature = "hw-timer", target_arch = "x86_64"))]
714    pub(super) type ActiveTimer = x86::RdtscpTimer;
715
716    #[cfg(not(all(feature = "hw-timer", target_arch = "x86_64")))]
717    pub(super) type ActiveTimer = PlatformTimer;
718
719    pub(super) trait Timer<T> {
720        fn start() -> T;
721        fn stop(start_time: T) -> u64;
722
723        /// Timer precision in nanoseconds
724        ///
725        /// The results less than the precision of a timer are considered not significant
726        fn precision() -> u64 {
727            1
728        }
729    }
730
731    pub(super) struct PlatformTimer;
732
733    impl Timer<Instant> for PlatformTimer {
734        #[inline]
735        fn start() -> Instant {
736            Instant::now()
737        }
738
739        #[inline]
740        fn stop(start_time: Instant) -> u64 {
741            start_time.elapsed().as_nanos() as u64
742        }
743    }
744
745    #[cfg(all(feature = "hw-timer", target_arch = "x86_64"))]
746    pub(super) mod x86 {
747        use super::Timer;
748        use std::arch::x86_64::{__rdtscp, _mm_mfence};
749
750        pub struct RdtscpTimer;
751
752        impl Timer<u64> for RdtscpTimer {
753            #[inline]
754            fn start() -> u64 {
755                unsafe {
756                    _mm_mfence();
757                    __rdtscp(&mut 0)
758                }
759            }
760
761            #[inline]
762            fn stop(start: u64) -> u64 {
763                unsafe {
764                    let end = __rdtscp(&mut 0);
765                    _mm_mfence();
766                    end - start
767                }
768            }
769        }
770    }
771}
772
773#[cfg(feature = "async")]
774pub mod asynchronous {
775    use super::{Benchmark, BenchmarkParams, ErasedSampler, Sampler, SamplerFactory};
776    use std::{future::Future, ops::Deref};
777
778    pub fn async_benchmark_fn<R, F>(
779        name: impl Into<String>,
780        runtime: R,
781        sampler_factory: F,
782    ) -> Benchmark
783    where
784        R: AsyncRuntime + 'static,
785        F: FnMut(AsyncBencher<R>) -> Box<dyn ErasedSampler> + 'static,
786    {
787        let name = name.into();
788        assert!(!name.is_empty());
789        Benchmark {
790            name,
791            sampler_factory: Box::new(AsyncSampleFactory(sampler_factory, runtime)),
792        }
793    }
794
795    pub struct AsyncSampleFactory<F, R>(pub F, pub R);
796
797    impl<R: AsyncRuntime, F: FnMut(AsyncBencher<R>) -> Box<dyn ErasedSampler>> SamplerFactory
798        for AsyncSampleFactory<F, R>
799    {
800        fn create_sampler(&mut self, params: BenchmarkParams) -> Box<dyn ErasedSampler> {
801            (self.0)(AsyncBencher {
802                params,
803                runtime: self.1,
804            })
805        }
806    }
807
808    pub struct AsyncBencher<R> {
809        params: BenchmarkParams,
810        runtime: R,
811    }
812
813    impl<R: AsyncRuntime + 'static> AsyncBencher<R> {
814        pub fn iter<O, Fut, F>(self, func: F) -> Box<dyn ErasedSampler>
815        where
816            O: 'static,
817            Fut: Future<Output = O>,
818            F: FnMut() -> Fut + Copy + 'static,
819        {
820            Box::new(Sampler(move || self.runtime.block_on(func)))
821        }
822    }
823
824    impl<R> Deref for AsyncBencher<R> {
825        type Target = BenchmarkParams;
826
827        fn deref(&self) -> &Self::Target {
828            &self.params
829        }
830    }
831
832    pub trait AsyncRuntime: Copy {
833        fn block_on<O, Fut: Future<Output = O>, F: FnMut() -> Fut>(&self, f: F) -> O;
834    }
835
836    #[cfg(feature = "async-tokio")]
837    pub mod tokio {
838        use super::*;
839        use ::tokio::runtime::Builder;
840
841        #[derive(Copy, Clone)]
842        pub struct TokioRuntime;
843
844        impl AsyncRuntime for TokioRuntime {
845            fn block_on<O, Fut: Future<Output = O>, F: FnMut() -> Fut>(&self, mut f: F) -> O {
846                let runtime = Builder::new_current_thread().build().unwrap();
847                runtime.block_on(f())
848            }
849        }
850    }
851}
852
853#[cfg(test)]
854mod tests {
855    use super::*;
856    use rand::{rngs::SmallRng, Rng, RngCore, SeedableRng};
857    use std::{
858        iter::Sum,
859        ops::{Add, Div},
860        thread,
861        time::Duration,
862    };
863
864    #[test]
865    fn check_iqr_variance_thresholds() {
866        let mut rng = SmallRng::from_entropy();
867
868        // Generate 20 random values in range [-50, 50]
869        // and add 10 outliers in each of two ranges [-1000, -200] and [200, 1000]
870        // This way IQR is no more than 100 and thresholds should be within [-50, 50] range
871        let mut values = vec![];
872        values.extend((0..20).map(|_| rng.gen_range(-50.0..=50.)));
873        values.extend((0..10).map(|_| rng.gen_range(-1000.0..=-200.0)));
874        values.extend((0..10).map(|_| rng.gen_range(200.0..=1000.0)));
875
876        let thresholds = iqr_variance_thresholds(values).unwrap();
877
878        assert!(
879            -50. <= *thresholds.start() && *thresholds.end() <= 50.,
880            "Invalid range: {:?}",
881            thresholds
882        );
883    }
884
885    /// This tests checks that the algorithm is stable in case of zero difference between 25 and 75 percentiles
886    #[test]
887    fn check_outliers_zero_iqr() {
888        let mut rng = SmallRng::from_entropy();
889
890        let mut values = vec![];
891        values.extend(std::iter::repeat(0.).take(20));
892        values.extend((0..10).map(|_| rng.gen_range(-1000.0..=-200.0)));
893        values.extend((0..10).map(|_| rng.gen_range(200.0..=1000.0)));
894
895        let thresholds = iqr_variance_thresholds(values).unwrap();
896
897        assert!(
898            0. <= *thresholds.start() && *thresholds.end() <= 0.,
899            "Invalid range: {:?}",
900            thresholds
901        );
902    }
903
904    #[test]
905    fn check_summary_statistics() {
906        for i in 2u32..100 {
907            let range = 1..=i;
908            let values = range.collect::<Vec<_>>();
909            let stat = Summary::from(&values).unwrap();
910
911            let sum = (i * (i + 1)) as f64 / 2.;
912            let expected_mean = sum / i as f64;
913            let expected_variance = naive_variance(values.as_slice());
914
915            assert_eq!(stat.min, 1);
916            assert_eq!(stat.n, i as usize);
917            assert_eq!(stat.max, i);
918            assert!(
919                (stat.mean - expected_mean).abs() < 1e-5,
920                "Expected close to: {}, given: {}",
921                expected_mean,
922                stat.mean
923            );
924            assert!(
925                (stat.variance - expected_variance).abs() < 1e-5,
926                "Expected close to: {}, given: {}",
927                expected_variance,
928                stat.variance
929            );
930        }
931    }
932
933    #[test]
934    fn check_summary_statistics_types() {
935        Summary::from(<&[i64]>::default());
936        Summary::from(<&[u32]>::default());
937        Summary::from(&Vec::<i64>::default());
938    }
939
940    #[test]
941    fn check_naive_variance() {
942        assert_eq!(naive_variance(&[1, 2, 3]), 1.0);
943        assert_eq!(naive_variance(&[1, 2, 3, 4, 5]), 2.5);
944    }
945
946    #[test]
947    fn check_running_variance() {
948        let input = [1i64, 2, 3, 4, 5, 6, 7];
949        let variances = Summary::running(input.into_iter())
950            .map(|s| s.variance)
951            .collect::<Vec<_>>();
952        let expected = &[0., 0.5, 1., 1.6666, 2.5, 3.5, 4.6666];
953
954        assert_eq!(variances.len(), expected.len());
955
956        for (value, expected_value) in variances.iter().zip(expected) {
957            assert!(
958                (value - expected_value).abs() < 1e-3,
959                "Expected close to: {}, given: {}",
960                expected_value,
961                value
962            );
963        }
964    }
965
966    #[test]
967    fn check_running_variance_stress_test() {
968        let rng = RngIterator(SmallRng::seed_from_u64(0)).map(|i| i as i64);
969        let mut variances = Summary::running(rng).map(|s| s.variance);
970
971        assert!(variances.nth(1_000_000).unwrap() > 0.)
972    }
973
974    /// Basic check of measurement code
975    ///
976    /// This test is quite brittle. There is no guarantee the OS scheduler will wake up the thread
977    /// soon enough to meet measurement target. We try to mitigate this possibility using several strategies:
978    /// 1. repeating test several times and taking median as target measurement.
979    /// 2. using more liberal checking condition (allowing 1 order of magnitude error in measurement)
980    #[test]
981    fn check_measure_time() {
982        let expected_delay = 1;
983        let mut target = benchmark_fn("foo", move |b| {
984            b.iter(move || thread::sleep(Duration::from_millis(expected_delay)))
985        });
986        target.prepare_state(0);
987
988        let median = median_execution_time(&mut target, 10).as_millis() as u64;
989        assert!(median < expected_delay * 10);
990    }
991
992    struct RngIterator<T>(T);
993
994    impl<T: RngCore> Iterator for RngIterator<T> {
995        type Item = u32;
996
997        fn next(&mut self) -> Option<Self::Item> {
998            Some(self.0.next_u32())
999        }
1000    }
1001
1002    fn naive_variance<T>(values: &[T]) -> f64
1003    where
1004        T: Sum + Copy,
1005        f64: From<T>,
1006    {
1007        let n = values.len() as f64;
1008        let mean = f64::from(values.iter().copied().sum::<T>()) / n;
1009        let mut sum_of_squares = 0.;
1010        for value in values.iter().copied() {
1011            sum_of_squares += (f64::from(value) - mean).powi(2);
1012        }
1013        sum_of_squares / (n - 1.)
1014    }
1015
1016    fn median_execution_time(target: &mut Benchmark, iterations: u32) -> Duration {
1017        assert!(iterations >= 1);
1018        let mut state = target.prepare_state(0);
1019        let measures: Vec<_> = (0..iterations).map(|_| state.measure(1)).collect();
1020        let time = median(measures).max(1);
1021        Duration::from_nanos(time)
1022    }
1023
1024    fn median<T: Copy + Ord + Add<Output = T> + Div<Output = T>>(mut measures: Vec<T>) -> T {
1025        assert!(!measures.is_empty(), "Vec is empty");
1026        measures.sort_unstable();
1027        measures[measures.len() / 2]
1028    }
1029}