#[cfg(feature = "async")]
pub use asynchronous::async_benchmark_fn;
use core::ptr;
use dylib::ffi::TANGO_API_VERSION;
use metrics::WallClock;
use num_traits::ToPrimitive;
use rand::{rngs::SmallRng, Rng, SeedableRng};
use std::{
cmp::Ordering,
hint::black_box,
io,
marker::PhantomData,
mem,
ops::{Deref, RangeInclusive},
str::Utf8Error,
time::Duration,
};
use thiserror::Error;
pub mod cli;
pub mod dylib;
#[cfg(target_os = "linux")]
pub mod linux;
pub mod platform;
#[cfg(target_os = "windows")]
pub mod windows;
#[derive(Debug, Error)]
pub enum Error {
#[error("No measurements given")]
NoMeasurements,
#[error("Invalid string pointer from FFI")]
InvalidFFIString(Utf8Error),
#[error("Spi::self() was already called")]
SpiSelfWasMoved,
#[error("Benchmark is missing")]
BenchmarkNotFound,
#[error("Unable to load benchmark")]
UnableToLoadBenchmark(#[source] libloading::Error),
#[cfg(target_os = "windows")]
#[error("Unable to patch IAT")]
UnableToPatchIat(#[source] windows::Error),
#[error("Unable to load library symbol: {0}")]
UnableToLoadSymbol(String, #[source] libloading::Error),
#[error("Unknown sampler type. Available options are: flat and linear")]
UnknownSamplerType,
#[error("Invalid test name given")]
InvalidTestName,
#[error("IO Error")]
IOError(#[from] io::Error),
#[error("FFI Error: {0}")]
FFIError(String),
#[error("Unknown FFI Error")]
UnknownFFIError,
#[error(
"Non matching tango version. Expected: {}, got: {0}",
TANGO_API_VERSION
)]
IncorrectVersion(u32),
}
#[macro_export]
macro_rules! tango_benchmarks {
($($func_expr:expr),+) => {
const TANGO_INIT: $crate::dylib::ffi::InitFn = tango_init;
#[no_mangle]
unsafe extern "C" fn tango_init() {
let mut benchmarks = vec![];
$(benchmarks.extend($crate::IntoBenchmarks::into_benchmarks($func_expr));)*
$crate::dylib::__tango_init(benchmarks)
}
};
}
#[macro_export]
macro_rules! tango_main {
($settings:expr) => {
fn main() -> $crate::cli::Result<std::process::ExitCode> {
unsafe { tango_init() };
$crate::cli::run($settings)
}
};
() => {
tango_main! {$crate::MeasurementSettings::default()}
};
}
pub struct BenchmarkParams {
pub seed: u64,
}
pub struct Bencher<M> {
params: BenchmarkParams,
metric: PhantomData<M>,
}
impl<M> Deref for Bencher<M> {
type Target = BenchmarkParams;
fn deref(&self) -> &Self::Target {
&self.params
}
}
impl<M: Metric + 'static> Bencher<M> {
pub fn metric<T: Metric>(self) -> Bencher<T> {
Bencher {
params: self.params,
metric: PhantomData::<T>,
}
}
pub fn iter<O: 'static, F: FnMut() -> O + 'static>(self, func: F) -> Box<dyn ErasedSampler> {
Box::new(Sampler::<_, M>::new(func))
}
}
struct Sampler<F, M> {
func: F,
metric: PhantomData<M>,
}
impl<F, M> Sampler<F, M> {
fn new(func: F) -> Self {
Self {
func,
metric: PhantomData::<M>,
}
}
}
pub trait ErasedSampler {
fn measure(&mut self, iterations: usize) -> u64;
fn estimate_iterations(&mut self, time_ms: u32) -> usize {
let mut iters = 1;
let time_ns = Duration::from_millis(time_ms as u64).as_nanos() as u64;
for _ in 0..5 {
let time = self.measure(iters).max(1_000);
let time_per_iteration = (time / iters as u64).max(1);
let new_iters = (time_ns / time_per_iteration) as usize;
if new_iters < 2 * iters {
return new_iters;
}
iters = new_iters;
}
iters
}
}
impl<O, F: FnMut() -> O, M: Metric> ErasedSampler for Sampler<F, M> {
fn measure(&mut self, iterations: usize) -> u64 {
M::measure_fn(|| {
for _ in 0..iterations {
black_box((self.func)());
}
})
}
}
pub struct Benchmark {
name: String,
sampler_factory: Box<dyn SamplerFactory>,
}
pub fn benchmark_fn<F: FnMut(Bencher<WallClock>) -> Box<dyn ErasedSampler> + 'static>(
name: impl Into<String>,
sampler_factory: F,
) -> Benchmark {
let name = name.into();
assert!(!name.is_empty());
Benchmark {
name,
sampler_factory: Box::new(SyncSampleFactory(sampler_factory)),
}
}
pub trait SamplerFactory {
fn create_sampler(&mut self, params: BenchmarkParams) -> Box<dyn ErasedSampler>;
}
struct SyncSampleFactory<F>(F);
impl<F: FnMut(Bencher<WallClock>) -> Box<dyn ErasedSampler>> SamplerFactory
for SyncSampleFactory<F>
{
fn create_sampler(&mut self, params: BenchmarkParams) -> Box<dyn ErasedSampler> {
(self.0)(Bencher {
params,
metric: PhantomData::<WallClock>,
})
}
}
impl Benchmark {
pub fn prepare_state(&mut self, seed: u64) -> Box<dyn ErasedSampler> {
self.sampler_factory
.create_sampler(BenchmarkParams { seed })
}
pub fn name(&self) -> &str {
self.name.as_str()
}
}
pub trait IntoBenchmarks {
fn into_benchmarks(self) -> Vec<Benchmark>;
}
impl<const N: usize> IntoBenchmarks for [Benchmark; N] {
fn into_benchmarks(self) -> Vec<Benchmark> {
self.into_iter().collect()
}
}
impl IntoBenchmarks for Vec<Benchmark> {
fn into_benchmarks(self) -> Vec<Benchmark> {
self
}
}
#[derive(Clone, Copy, Debug)]
pub struct MeasurementSettings {
pub filter_outliers: bool,
pub samples_per_haystack: usize,
pub min_iterations_per_sample: usize,
pub max_iterations_per_sample: usize,
pub sampler_type: SampleLengthKind,
pub warmup_enabled: bool,
pub cache_firewall: Option<usize>,
pub yield_before_sample: bool,
pub randomize_stack: Option<usize>,
}
#[derive(Clone, Copy, Debug)]
pub enum SampleLengthKind {
Flat,
Linear,
Random,
}
struct CacheFirewall {
cache_lines: Vec<CacheLine>,
}
impl CacheFirewall {
fn new(bytes: usize) -> Self {
let n = bytes / mem::size_of::<CacheLine>();
let cache_lines = vec![CacheLine::default(); n];
Self { cache_lines }
}
fn issue_read(&self) {
for line in &self.cache_lines {
unsafe { ptr::read_volatile(&line.0[0]) };
}
}
}
#[repr(C)]
#[repr(align(64))]
#[derive(Default, Clone, Copy)]
struct CacheLine([u16; 32]);
pub const DEFAULT_SETTINGS: MeasurementSettings = MeasurementSettings {
filter_outliers: false,
samples_per_haystack: 1,
min_iterations_per_sample: 1,
max_iterations_per_sample: 5000,
sampler_type: SampleLengthKind::Random,
cache_firewall: None,
yield_before_sample: false,
warmup_enabled: true,
randomize_stack: None,
};
impl Default for MeasurementSettings {
fn default() -> Self {
DEFAULT_SETTINGS
}
}
trait SampleLength {
fn next_sample_iterations(&mut self, iteration_no: usize, estimate: usize) -> usize;
}
struct FlatSampleLength {
min: usize,
max: usize,
}
impl FlatSampleLength {
fn new(settings: &MeasurementSettings) -> Self {
FlatSampleLength {
min: settings.min_iterations_per_sample.max(1),
max: settings.max_iterations_per_sample,
}
}
}
impl SampleLength for FlatSampleLength {
fn next_sample_iterations(&mut self, _iteration_no: usize, estimate: usize) -> usize {
estimate.clamp(self.min, self.max)
}
}
struct LinearSampleLength {
min: usize,
max: usize,
}
impl LinearSampleLength {
fn new(settings: &MeasurementSettings) -> Self {
Self {
min: settings.min_iterations_per_sample.max(1),
max: settings.max_iterations_per_sample,
}
}
}
impl SampleLength for LinearSampleLength {
fn next_sample_iterations(&mut self, iteration_no: usize, estimate: usize) -> usize {
let estimate = estimate.clamp(self.min, self.max);
(iteration_no % estimate) + 1
}
}
struct RandomSampleLength {
rng: SmallRng,
min: usize,
max: usize,
}
impl RandomSampleLength {
pub fn new(settings: &MeasurementSettings, seed: u64) -> Self {
Self {
rng: SmallRng::seed_from_u64(seed),
min: settings.min_iterations_per_sample.max(1),
max: settings.max_iterations_per_sample,
}
}
}
impl SampleLength for RandomSampleLength {
fn next_sample_iterations(&mut self, _iteration_no: usize, estimate: usize) -> usize {
let estimate = estimate.clamp(self.min, self.max);
self.rng.gen_range(1..=estimate)
}
}
pub(crate) fn calculate_run_result<N: Into<String>>(
name: N,
baseline: &[u64],
candidate: &[u64],
iterations_per_sample: &[usize],
filter_outliers: bool,
) -> Option<RunResult> {
assert!(baseline.len() == candidate.len());
assert!(baseline.len() == iterations_per_sample.len());
let mut iterations_per_sample = iterations_per_sample.to_vec();
let mut diff = candidate
.iter()
.zip(baseline.iter())
.map(|(&c, &b)| c as f64 - b as f64)
.zip(iterations_per_sample.iter())
.map(|(diff, &iters)| diff / iters as f64)
.collect::<Vec<_>>();
let n = diff.len();
let mut baseline = baseline
.iter()
.zip(iterations_per_sample.iter())
.map(|(&v, &iters)| (v as f64) / (iters as f64))
.collect::<Vec<_>>();
let mut candidate = candidate
.iter()
.zip(iterations_per_sample.iter())
.map(|(&v, &iters)| (v as f64) / (iters as f64))
.collect::<Vec<_>>();
let range = if filter_outliers {
iqr_variance_thresholds(diff.to_vec())
} else {
None
};
if let Some(range) = range {
assert_eq!(diff.len(), baseline.len());
assert_eq!(diff.len(), candidate.len());
let mut i = 0;
while i < diff.len() {
if range.contains(&diff[i]) {
i += 1;
} else {
diff.swap_remove(i);
iterations_per_sample.swap_remove(i);
baseline.swap_remove(i);
candidate.swap_remove(i);
}
}
};
let diff_summary = Summary::from(&diff)?;
let baseline_summary = Summary::from(&baseline)?;
let candidate_summary = Summary::from(&candidate)?;
let diff_estimate = DiffEstimate::build(&baseline_summary, &diff_summary);
Some(RunResult {
baseline: baseline_summary,
candidate: candidate_summary,
diff: diff_summary,
name: name.into(),
diff_estimate,
outliers: n - diff_summary.n,
})
}
pub(crate) struct DiffEstimate {
pct: f64,
significant: bool,
}
impl DiffEstimate {
fn build(baseline: &Summary<f64>, diff: &Summary<f64>) -> Self {
let std_dev = diff.variance.sqrt();
let std_err = std_dev / (diff.n as f64).sqrt();
let z_score = diff.mean / std_err;
let significant = z_score.abs() >= 2.6 && (diff.mean / baseline.mean).abs() > 0.005;
let pct = diff.mean / baseline.mean * 100.0;
Self { pct, significant }
}
}
pub(crate) struct RunResult {
name: String,
baseline: Summary<f64>,
candidate: Summary<f64>,
diff: Summary<f64>,
diff_estimate: DiffEstimate,
outliers: usize,
}
#[derive(Clone, Copy)]
pub struct Summary<T> {
pub n: usize,
pub min: T,
pub max: T,
pub mean: f64,
pub variance: f64,
}
impl<T: PartialOrd> Summary<T> {
pub fn from<'a, C>(values: C) -> Option<Self>
where
C: IntoIterator<Item = &'a T>,
T: ToPrimitive + Copy + Default + 'a,
{
Self::running(values.into_iter().copied()).last()
}
pub fn running<I>(iter: I) -> impl Iterator<Item = Summary<T>>
where
T: ToPrimitive + Copy + Default,
I: Iterator<Item = T>,
{
RunningSummary {
iter,
n: 0,
min: T::default(),
max: T::default(),
mean: 0.,
s: 0.,
}
}
}
struct RunningSummary<T, I> {
iter: I,
n: usize,
min: T,
max: T,
mean: f64,
s: f64,
}
impl<T, I> Iterator for RunningSummary<T, I>
where
T: Copy + PartialOrd,
I: Iterator<Item = T>,
T: ToPrimitive,
{
type Item = Summary<T>;
fn next(&mut self) -> Option<Self::Item> {
let value = self.iter.next()?;
let fvalue = value.to_f64().expect("f64 overflow detected");
if self.n == 0 {
self.min = value;
self.max = value;
}
if let Some(Ordering::Less) = value.partial_cmp(&self.min) {
self.min = value;
}
if let Some(Ordering::Greater) = value.partial_cmp(&self.max) {
self.max = value;
}
self.n += 1;
let mean_p = self.mean;
self.mean += (fvalue - self.mean) / self.n as f64;
self.s += (fvalue - mean_p) * (fvalue - self.mean);
let variance = if self.n > 1 {
self.s / (self.n - 1) as f64
} else {
0.
};
Some(Summary {
n: self.n,
min: self.min,
max: self.max,
mean: self.mean,
variance,
})
}
}
pub fn iqr_variance_thresholds(mut input: Vec<f64>) -> Option<RangeInclusive<f64>> {
const MINIMUM_IQR: f64 = 1.;
input.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
let (q1, q3) = (input.len() / 4, input.len() * 3 / 4 - 1);
if q1 >= q3 || q3 >= input.len() {
return None;
}
let iqr = (input[q3] - input[q1]).max(MINIMUM_IQR);
let low_threshold = input[q1] - iqr * 1.5;
let high_threshold = input[q3] + iqr * 1.5;
let low_threshold_idx =
match input[0..q1].binary_search_by(|probe| probe.total_cmp(&low_threshold)) {
Ok(idx) => idx,
Err(idx) => idx,
};
let high_threshold_idx =
match input[q3..].binary_search_by(|probe| probe.total_cmp(&high_threshold)) {
Ok(idx) => idx,
Err(idx) => idx,
};
if low_threshold_idx == 0 || high_threshold_idx >= input.len() {
return None;
}
let outliers_cnt = low_threshold_idx.min(input.len() - high_threshold_idx);
Some(input[outliers_cnt]..=(input[input.len() - outliers_cnt - 1]))
}
pub trait Metric {
fn measure_fn(f: impl FnMut()) -> u64;
}
pub mod metrics {
use crate::Metric;
pub struct WallClock;
impl Metric for WallClock {
#[cfg(not(all(feature = "hw-timer", target_arch = "x86_64")))]
fn measure_fn(mut f: impl FnMut()) -> u64 {
use std::time::Instant;
let start = Instant::now();
f();
start.elapsed().as_nanos() as u64
}
#[cfg(all(feature = "hw-timer", target_arch = "x86_64"))]
fn measure_fn(mut f: impl FnMut()) -> u64 {
use std::arch::x86_64::{__rdtscp, _mm_mfence};
let start = unsafe {
_mm_mfence();
__rdtscp(&mut 0)
};
f();
unsafe {
let end = __rdtscp(&mut 0);
_mm_mfence();
end - start
}
}
}
}
#[cfg(feature = "async")]
pub mod asynchronous {
use crate::metrics::WallClock;
use super::{Benchmark, BenchmarkParams, ErasedSampler, Sampler, SamplerFactory};
use std::{future::Future, ops::Deref};
pub fn async_benchmark_fn<R, F>(
name: impl Into<String>,
runtime: R,
sampler_factory: F,
) -> Benchmark
where
R: AsyncRuntime + 'static,
F: FnMut(AsyncBencher<R>) -> Box<dyn ErasedSampler> + 'static,
{
let name = name.into();
assert!(!name.is_empty());
Benchmark {
name,
sampler_factory: Box::new(AsyncSampleFactory(sampler_factory, runtime)),
}
}
pub struct AsyncSampleFactory<F, R>(pub F, pub R);
impl<R: AsyncRuntime, F: FnMut(AsyncBencher<R>) -> Box<dyn ErasedSampler>> SamplerFactory
for AsyncSampleFactory<F, R>
{
fn create_sampler(&mut self, params: BenchmarkParams) -> Box<dyn ErasedSampler> {
(self.0)(AsyncBencher {
params,
runtime: self.1,
})
}
}
pub struct AsyncBencher<R> {
params: BenchmarkParams,
runtime: R,
}
impl<R: AsyncRuntime + 'static> AsyncBencher<R> {
pub fn iter<O, Fut, F>(self, func: F) -> Box<dyn ErasedSampler>
where
O: 'static,
Fut: Future<Output = O>,
F: FnMut() -> Fut + Copy + 'static,
{
Box::new(Sampler::<_, WallClock>::new(move || {
self.runtime.block_on(func)
}))
}
}
impl<R> Deref for AsyncBencher<R> {
type Target = BenchmarkParams;
fn deref(&self) -> &Self::Target {
&self.params
}
}
pub trait AsyncRuntime: Copy {
fn block_on<O, Fut: Future<Output = O>, F: FnMut() -> Fut>(&self, f: F) -> O;
}
#[cfg(feature = "async-tokio")]
pub mod tokio {
use super::*;
use ::tokio::runtime::Builder;
#[derive(Copy, Clone, Default)]
pub struct TokioRuntime;
impl AsyncRuntime for TokioRuntime {
fn block_on<O, Fut: Future<Output = O>, F: FnMut() -> Fut>(&self, mut f: F) -> O {
let mut builder = Builder::new_current_thread();
#[cfg(feature = "async-tokio-all-drivers")]
builder.enable_all();
let runtime = builder.build().unwrap();
runtime.block_on(f())
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use rand::{rngs::SmallRng, Rng, RngCore, SeedableRng};
use std::{
iter::Sum,
ops::{Add, Div},
thread,
time::Duration,
};
#[test]
fn check_iqr_variance_thresholds() {
let mut rng = SmallRng::from_entropy();
let mut values = vec![];
values.extend((0..20).map(|_| rng.gen_range(-50.0..=50.)));
values.extend((0..10).map(|_| rng.gen_range(-1000.0..=-200.0)));
values.extend((0..10).map(|_| rng.gen_range(200.0..=1000.0)));
let thresholds = iqr_variance_thresholds(values).unwrap();
assert!(
-50. <= *thresholds.start() && *thresholds.end() <= 50.,
"Invalid range: {:?}",
thresholds
);
}
#[test]
fn check_outliers_zero_iqr() {
let mut rng = SmallRng::from_entropy();
let mut values = vec![];
values.extend([0.; 20]);
values.extend((0..10).map(|_| rng.gen_range(-1000.0..=-200.0)));
values.extend((0..10).map(|_| rng.gen_range(200.0..=1000.0)));
let thresholds = iqr_variance_thresholds(values).unwrap();
assert!(
0. <= *thresholds.start() && *thresholds.end() <= 0.,
"Invalid range: {:?}",
thresholds
);
}
#[test]
fn check_summary_statistics() {
for i in 2u32..100 {
let range = 1..=i;
let values = range.collect::<Vec<_>>();
let stat = Summary::from(&values).unwrap();
let sum = (i * (i + 1)) as f64 / 2.;
let expected_mean = sum / i as f64;
let expected_variance = naive_variance(values.as_slice());
assert_eq!(stat.min, 1);
assert_eq!(stat.n, i as usize);
assert_eq!(stat.max, i);
assert!(
(stat.mean - expected_mean).abs() < 1e-5,
"Expected close to: {}, given: {}",
expected_mean,
stat.mean
);
assert!(
(stat.variance - expected_variance).abs() < 1e-5,
"Expected close to: {}, given: {}",
expected_variance,
stat.variance
);
}
}
#[test]
fn check_summary_statistics_types() {
Summary::from(<&[i64]>::default());
Summary::from(<&[u32]>::default());
Summary::from(&Vec::<i64>::default());
}
#[test]
fn check_naive_variance() {
assert_eq!(naive_variance(&[1, 2, 3]), 1.0);
assert_eq!(naive_variance(&[1, 2, 3, 4, 5]), 2.5);
}
#[test]
fn check_running_variance() {
let input = [1i64, 2, 3, 4, 5, 6, 7];
let variances = Summary::running(input.into_iter())
.map(|s| s.variance)
.collect::<Vec<_>>();
let expected = &[0., 0.5, 1., 1.6666, 2.5, 3.5, 4.6666];
assert_eq!(variances.len(), expected.len());
for (value, expected_value) in variances.iter().zip(expected) {
assert!(
(value - expected_value).abs() < 1e-3,
"Expected close to: {}, given: {}",
expected_value,
value
);
}
}
#[test]
fn check_running_variance_stress_test() {
let rng = RngIterator(SmallRng::seed_from_u64(0)).map(|i| i as i64);
let mut variances = Summary::running(rng).map(|s| s.variance);
assert!(variances.nth(1_000_000).unwrap() > 0.)
}
#[test]
fn check_measure_time() {
let expected_delay = 100;
let mut target = benchmark_fn("foo", move |b| {
b.metric::<WallClock>()
.iter(move || thread::sleep(Duration::from_millis(expected_delay)))
});
target.prepare_state(0);
let median = median_execution_time(&mut target, 10).as_millis() as u64;
assert!(median < expected_delay * 10, "Median {median} is too large");
}
struct RngIterator<T>(T);
impl<T: RngCore> Iterator for RngIterator<T> {
type Item = u32;
fn next(&mut self) -> Option<Self::Item> {
Some(self.0.next_u32())
}
}
fn naive_variance<T>(values: &[T]) -> f64
where
T: Sum + Copy,
f64: From<T>,
{
let n = values.len() as f64;
let mean = f64::from(values.iter().copied().sum::<T>()) / n;
let mut sum_of_squares = 0.;
for value in values.iter().copied() {
sum_of_squares += (f64::from(value) - mean).powi(2);
}
sum_of_squares / (n - 1.)
}
fn median_execution_time(target: &mut Benchmark, iterations: u32) -> Duration {
assert!(iterations >= 1);
let mut state = target.prepare_state(0);
let measures: Vec<_> = (0..iterations).map(|_| state.measure(1)).collect();
let time = median(measures).max(1);
Duration::from_nanos(time)
}
fn median<T: Copy + Ord + Add<Output = T> + Div<Output = T>>(mut measures: Vec<T>) -> T {
assert!(!measures.is_empty(), "Vec is empty");
measures.sort_unstable();
measures[measures.len() / 2]
}
}