// static_init 1.0.4

pub use inner::TickCounter;

#[cfg(target_arch = "x86_64")]
mod inner {

    // The `cpuid` / `rdtsc` / `rdtscp` instructions are used for benchmarking
    // because they execute very quickly, which increases the chance that
    // several threads try to take the lock at the same time.
    //
    // `cpuid` is used to serialize instructions, see:
    //https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
    use core::arch::x86_64::{__cpuid, __rdtscp, _rdtsc};
    use core::sync::atomic::{compiler_fence, Ordering};
    use criterion::black_box;
    use std::time::{Duration, Instant};

    /// Time-stamp-counter based stopwatch.
    ///
    /// Field `0` is the measured overhead, in ticks, of an empty
    /// `raw_start` / `raw_end` pair; field `1` is the estimated number of
    /// nanoseconds per tick. Both are computed once by [`TickCounter::new`].
    #[derive(Copy, Clone)]
    pub struct TickCounter(u64, f64);

    impl TickCounter {
        /// Calibrates a new counter.
        ///
        /// First measures the tick overhead of an empty measurement, then
        /// estimates the duration of one tick against [`Instant`], and prints
        /// the reciprocal of the nanoseconds-per-tick estimate (rounded to two
        /// decimals) to stdout.
        pub fn new() -> TickCounter {
            let zero = Self::measure_overhead();
            let ns_per_tick = Self::estimate_ns_per_tick(zero);
            println!(
                "Estimated processor frequency: {}",
                (100f64 / ns_per_tick).round() / 100f64
            );
            TickCounter(zero, ns_per_tick)
        }

        /// Returns the winsorized mean tick count of an empty
        /// `raw_start` / `raw_end` pair.
        fn measure_overhead() -> u64 {
            const SAMPLES: usize = 10000;
            let mut ticks = Vec::with_capacity(SAMPLES);

            // Warm-up rounds: the results are discarded. `wrapping_sub` is
            // used so that a non-monotonic reading cannot trigger an overflow
            // panic in debug builds.
            for _ in 1..1000 {
                let start = Self::raw_start();
                let end = Self::raw_end();
                black_box(end.wrapping_sub(start));
            }

            // NOTE(review): presumably yields so that the sampling below
            // starts on a fresh scheduler time slice.
            std::thread::yield_now();

            for _ in 1..10 {
                let start = Self::raw_start();
                let end = Self::raw_end();
                black_box(end.wrapping_sub(start));
            }

            for _ in 0..SAMPLES {
                let start = Self::raw_start();
                let end = Self::raw_end();
                // A reading that went backwards is recorded as 0; it lands in
                // the lowest decile and is clamped away below.
                ticks.push(end.saturating_sub(start));
            }

            // Winsorize: clamp the lowest and highest 10% of the samples to
            // the nearest retained value before averaging.
            ticks.sort_unstable();
            let cut = SAMPLES / 10;
            let (low, high) = (ticks[cut], ticks[SAMPLES - cut - 1]);
            let total: u64 = ticks.iter().map(|&t| t.max(low).min(high)).sum();
            total / SAMPLES as u64
        }

        /// Estimates the number of nanoseconds per tick.
        ///
        /// `zero` is the measurement overhead, in ticks, subtracted from each
        /// sample.
        fn estimate_ns_per_tick(zero: u64) -> f64 {
            const SAMPLES: usize = 200;
            let mut samples: Vec<(u64, Duration)> = Vec::with_capacity(SAMPLES);

            // Warm up the `Instant` code path.
            for _ in 1..100 {
                let _ = Instant::now().elapsed();
            }

            std::thread::yield_now();

            for _ in 1..10 {
                let _ = Instant::now().elapsed();
            }

            // Sample `round` measures `round + 1` start/end pairs, so the
            // samples cover a range of tick counts.
            let mut round = 0;
            while round < SAMPLES {
                let wall = Instant::now();
                let first = black_box(Self::raw_start());
                for _ in 0..round + 1 {
                    black_box(Self::raw_start());
                    black_box(Self::raw_end());
                }
                let last = black_box(Self::raw_start());
                let elapsed = wall.elapsed();
                if last < first {
                    // The tick counter went backwards: retry this round.
                    continue;
                }
                round += 1;
                // Clamp to at least one tick so the ratio computed below is
                // always finite (no division by zero, hence no NaN).
                let ticks = (last - first).saturating_sub(zero).max(1);
                samples.push((ticks, elapsed));
            }

            // Regularize: pair each tick count with its nanoseconds-per-tick ratio.
            let mut ratios: Vec<(f64, f64)> = samples
                .into_iter()
                .map(|(ticks, elapsed)| {
                    let ticks = ticks as f64;
                    (ticks, elapsed.as_nanos() as f64 / ticks)
                })
                .collect();

            // Winsorize the ratios: clamp the lowest and highest 10%.
            ratios.sort_unstable_by(|a, b| {
                a.1.partial_cmp(&b.1)
                    .expect("ratios are finite because tick counts are clamped to >= 1")
            });
            let cut = SAMPLES / 10;
            let (low, high) = (ratios[cut].1, ratios[SAMPLES - cut - 1].1);

            // The line minimizing the squared error goes through the mean
            // point, so the slope is taken as sum(nanos) / sum(ticks), where
            // each sample's nanos is ticks * (winsorized ratio).
            let tick_sum: f64 = ratios.iter().map(|r| r.0).sum();
            let nanos_sum: f64 = ratios
                .iter()
                .map(|r| r.0 * r.1.max(low).min(high))
                .sum();

            nanos_sum / tick_sum
        }

        /// Runs `f` and returns how long it took.
        ///
        /// Returns `None` when the end tick reading is lower than the start
        /// reading. Otherwise the calibrated overhead is subtracted, the tick
        /// count is converted to nanoseconds, and a negative result is
        /// reported as a zero duration.
        #[inline(always)]
        pub fn time<R, F: FnOnce() -> R>(&self, f: F) -> Option<Duration> {
            let start = Self::raw_start();
            black_box(f());
            let end = Self::raw_end();
            let ticks = end.checked_sub(start)? as f64;
            let nanos = ((ticks - self.0 as f64) * self.1).round();
            if nanos >= 0f64 {
                Some(Duration::from_nanos(nanos as u64))
            } else {
                Some(Duration::from_nanos(0))
            }
        }

        /// Reads the time-stamp counter with `rdtsc`, between two compiler
        /// fences.
        ///
        /// No serializing `cpuid` is issued before the read here (unlike the
        /// `cpuid` + `rdtsc` sequence described in the paper linked at the
        /// top of this module).
        #[inline(always)]
        fn raw_start() -> u64 {
            compiler_fence(Ordering::AcqRel);
            // SAFETY: `rdtsc` takes no operands and only reads the counter.
            let ticks = unsafe { _rdtsc() };
            compiler_fence(Ordering::AcqRel);
            ticks
        }

        /// Reads the time-stamp counter with `rdtscp`, followed by a
        /// `cpuid`, between two compiler fences.
        #[inline(always)]
        fn raw_end() -> u64 {
            // Receives the auxiliary value written by `rdtscp`; it is unused.
            let mut aux = 0;
            compiler_fence(Ordering::AcqRel);
            // SAFETY: `aux` is a live local, so the reference passed to
            // `__rdtscp` is valid for the write; `__cpuid(0)` only queries
            // leaf 0. Assumes the CPU supports `rdtscp`.
            let ticks = unsafe {
                let ticks = __rdtscp(&mut aux);
                __cpuid(0);
                ticks
            };
            compiler_fence(Ordering::AcqRel);
            ticks
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
mod inner {
    use criterion::black_box;
    use std::time::{Duration, Instant};

    /// Stopwatch based on [`Instant`], used on targets other than x86_64.
    ///
    /// The wrapped duration is the measured overhead of an empty
    /// `raw_start` / `raw_end` pair; it is subtracted from every measurement.
    #[derive(Copy, Clone)]
    pub struct TickCounter(Duration);

    impl TickCounter {
        /// Calibrates a new counter by measuring the winsorized mean duration
        /// of an empty `raw_start` / `raw_end` pair.
        pub fn new() -> TickCounter {
            const SAMPLES: usize = 10000;
            const CUT: usize = SAMPLES / 10;

            let mut samples = [Duration::from_secs(0); SAMPLES];

            // Warm-up rounds: the results are discarded.
            for _ in 1..1000 {
                let start = Self::raw_start();
                let end = Self::raw_end();
                black_box(end - start);
            }

            for slot in samples.iter_mut() {
                let start = Self::raw_start();
                let end = Self::raw_end();
                *slot = end - start;
            }

            // Winsorize: on the sorted samples, clamping every value to the
            // range [samples[CUT], samples[SAMPLES - CUT - 1]] replaces the
            // lowest and highest 10% by the nearest retained value.
            samples.sort_unstable();
            let low = samples[CUT];
            let high = samples[SAMPLES - CUT - 1];
            let total = samples
                .iter()
                .fold(Duration::from_secs(0), |acc, d| acc + (*d).max(low).min(high));

            TickCounter(total / SAMPLES as u32)
        }

        /// Runs `f` and returns how long it took.
        ///
        /// Returns `None` when the end instant is earlier than the start
        /// instant. Otherwise the calibrated overhead is subtracted, and a
        /// measurement shorter than the overhead is reported as zero.
        #[inline(always)]
        pub fn time<R, F: FnOnce() -> R>(&self, f: F) -> Option<Duration> {
            let start = Self::raw_start();
            black_box(f());
            let end = Self::raw_end();
            if end < start {
                return None;
            }
            let elapsed = end - start;
            Some(
                elapsed
                    .checked_sub(self.0)
                    .unwrap_or(Duration::from_nanos(0)),
            )
        }

        /// Takes the timestamp that opens a measurement.
        #[inline(always)]
        fn raw_start() -> Instant {
            Instant::now()
        }

        /// Takes the timestamp that closes a measurement.
        #[inline(always)]
        fn raw_end() -> Instant {
            Instant::now()
        }
    }
}