use crate::{Alignment, BenchmarkKind, BenchmarkReport, BenchmarkResult, TableFormatter};
use crate::session::BenchmarkSession;
use crate::threading::{
DetectionResult, detect_performance_cores, pin_current_thread_to_core,
};
use std::time::Instant;
use std::{
hint::black_box,
io::{self, IsTerminal, Write},
sync::atomic::{AtomicBool, Ordering},
time::Duration,
};
#[cfg(target_os = "linux")]
use perf_event::{Builder, Group, events::Hardware};
#[cfg(target_os = "linux")]
use std::sync::{Mutex, OnceLock};
/// Smallest chunk size calibration will settle on; also the starting guess.
const MIN_CHUNK_SIZE: usize = 100_000;
/// Largest chunk size calibration is allowed to pick.
const MAX_CHUNK_SIZE: usize = 50_000_000;
/// Wall-clock time a single chunk should take once calibrated.
const TARGET_CHUNK_DURATION: Duration = Duration::from_millis(50);
/// How long the warm-up loop keeps running before calibration returns.
const WARM_UP_DURATION: Duration = Duration::from_secs(1);
/// Time budget divided by the estimated chunk duration to derive the sample count.
const MIN_BENCHMARK_DURATION: Duration = Duration::from_secs(5);
/// Lower bound on the number of samples per benchmark.
const MIN_SAMPLES: usize = 20;
/// Upper bound on the number of samples per benchmark.
const MAX_SAMPLES: usize = 100;
/// Below this running/enabled percentage a PMU multiplexing warning is printed.
const MIN_PMU_ACTIVE_PERCENT: f64 = 90.0;
/// Signature of a benchmarked function.
///
/// Within this file it is always invoked as `f(&mut prepared, chunk_size, chunk_num)`:
/// the prepared context, the chunk size chosen by calibration, and a running
/// chunk/sample index.
type BenchFunction<T> = fn(&mut T, usize, usize);
/// Flushes stdout, deliberately ignoring any I/O error (progress output is best effort).
fn flush_stdout() {
    io::stdout().flush().ok();
}
/// Replaces the current terminal line with `text`.
///
/// Emits a carriage return plus the ANSI "erase entire line" sequence before
/// the text, then flushes so the update is visible immediately.
fn rewrite_line(text: &str) {
    print!("\r\x1b[2K{}", text);
    flush_stdout();
}
/// Erases the current terminal line and leaves the cursor at column zero.
fn clear_line() {
    // Carriage return, ANSI erase-line, carriage return.
    const ERASE_LINE: &str = "\r\x1b[2K\r";
    print!("{}", ERASE_LINE);
    flush_stdout();
}
/// Wraps a table label in an ANSI colour chosen from the label text.
///
/// Labels containing "Throughput" use colour code 32; labels containing
/// "Latency" (or exactly "P95" / "MAD") use 33; everything else uses 36.
/// When stdout is not a terminal the text is returned unchanged.
fn colorize_label(text: &str) -> String {
    if std::io::stdout().is_terminal() {
        let latency_like = text.contains("Latency") || matches!(text, "P95" | "MAD");
        // "Throughput" takes precedence over the latency rule.
        let color = match (text.contains("Throughput"), latency_like) {
            (true, _) => "32",
            (false, true) => "33",
            (false, false) => "36",
        };
        format!("\x1b[{}m{}\x1b[0m", color, text)
    } else {
        text.to_string()
    }
}
/// Wraps a table value in ANSI colour code 97 when stdout is a terminal;
/// otherwise returns the text unchanged.
fn colorize_value(text: &str) -> String {
    if std::io::stdout().is_terminal() {
        format!("\x1b[97m{}\x1b[0m", text)
    } else {
        text.to_string()
    }
}
/// Divides `numerator` by `denominator`, returning `0.0` instead of a
/// non-finite or meaningless result.
///
/// The division is only performed when both operands are finite and the
/// denominator is strictly positive.
fn safe_ratio_f64(numerator: f64, denominator: f64) -> f64 {
    let usable = numerator.is_finite() && denominator.is_finite() && denominator > 0.0;
    if !usable {
        return 0.0;
    }
    numerator / denominator
}
/// Operations per second for one sample, or `None` when the sample has no
/// iterations or its duration is too small / not finite to divide by.
fn throughput_ops_per_sec(result: &Results) -> Option<f64> {
    let seconds = result.duration.as_secs_f64();
    let measurable = seconds > f64::EPSILON && seconds.is_finite() && result.iterations != 0;
    if measurable {
        Some(result.iterations as f64 / seconds)
    } else {
        None
    }
}
/// Coefficient of variation (population standard deviation / mean) of the
/// per-sample throughput, expressed as a percentage.
///
/// Samples without a measurable throughput are ignored. Returns `0.0` when no
/// samples remain, or when the mean or variance is degenerate.
fn coefficient_of_variation_percent(samples: &[Results]) -> f64 {
    let rates: Vec<f64> = samples.iter().filter_map(throughput_ops_per_sec).collect();
    if rates.is_empty() {
        return 0.0;
    }
    let count = rates.len() as f64;

    let mean = rates.iter().sum::<f64>() / count;
    if !mean.is_finite() || mean <= f64::EPSILON {
        return 0.0;
    }

    // Population variance: divide by N, not N - 1.
    let squared_deviation_sum = rates
        .iter()
        .map(|&rate| (rate - mean).powi(2))
        .sum::<f64>();
    let variance = squared_deviation_sum / count;
    if variance < 0.0 || !variance.is_finite() {
        return 0.0;
    }

    (variance.sqrt() / mean) * 100.0
}
fn sample_mops_per_sec(samples: &[Results]) -> Vec<f64> {
samples.iter().filter_map(throughput_ops_per_sec).map(|v| v / 1_000_000.0).collect()
}
/// Per-sample latency in nanoseconds per operation.
///
/// Samples with zero iterations are dropped, as is any non-finite ratio.
fn sample_ns_per_op(samples: &[Results]) -> Vec<f64> {
    samples
        .iter()
        .filter(|sample| sample.iterations != 0)
        .map(|sample| {
            safe_ratio_f64(sample.duration.as_nanos() as f64, sample.iterations as f64)
        })
        .filter(|ns| ns.is_finite())
        .collect()
}
/// Linearly interpolated percentile of an ascending-sorted slice.
///
/// `percentile` is a fraction and is clamped to `[0.0, 1.0]`. The fractional
/// rank is `percentile * (len - 1)`; when it falls between two elements the
/// result is their weighted average. Returns `0.0` for an empty slice.
fn percentile(sorted_values: &[f64], percentile: f64) -> f64 {
    let Some(last_index) = sorted_values.len().checked_sub(1) else {
        return 0.0;
    };
    let fraction = percentile.clamp(0.0, 1.0);
    let position = fraction * last_index as f64;
    let lower = position.floor() as usize;
    let upper = position.ceil() as usize;
    if lower != upper {
        let weight = position - lower as f64;
        sorted_values[lower] * (1.0 - weight) + sorted_values[upper] * weight
    } else {
        // The rank landed exactly on an element.
        sorted_values[lower]
    }
}
/// Median of an ascending-sorted slice, computed as the 0.5 percentile.
fn median(sorted_values: &[f64]) -> f64 {
    const MIDPOINT: f64 = 0.5;
    percentile(sorted_values, MIDPOINT)
}
/// Median absolute deviation of `values` around `median_value`.
///
/// Builds the absolute deviations, sorts them by total order and takes their
/// median. Returns `0.0` for an empty slice.
fn median_absolute_deviation(values: &[f64], median_value: f64) -> f64 {
    if values.is_empty() {
        return 0.0;
    }
    let mut distances = Vec::with_capacity(values.len());
    for value in values {
        distances.push((value - median_value).abs());
    }
    distances.sort_by(f64::total_cmp);
    median(&distances)
}
/// Counts values lying outside the Tukey fences `[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`.
///
/// Expects an ascending-sorted slice; fewer than four values yields zero.
fn tukey_outlier_count(sorted_values: &[f64]) -> usize {
    if sorted_values.len() < 4 {
        return 0;
    }
    let first_quartile = percentile(sorted_values, 0.25);
    let third_quartile = percentile(sorted_values, 0.75);
    let spread = third_quartile - first_quartile;
    let low_fence = first_quartile - 1.5 * spread;
    let high_fence = third_quartile + 1.5 * spread;

    let mut outliers = 0;
    for &value in sorted_values {
        if value < low_fence || value > high_fence {
            outliers += 1;
        }
    }
    outliers
}
/// Extrapolates a counter value that was only running for part of the time it
/// was enabled.
///
/// The result is `raw * enabled_ns / running_ns`, computed in 128-bit
/// arithmetic and saturated at `u64::MAX`. The raw value is returned
/// unchanged when it is zero, when either time is zero, or when the counter
/// ran for at least as long as it was enabled.
fn scale_multiplexed_count(raw: u64, enabled_ns: u64, running_ns: u64) -> u64 {
    // running_ns < enabled_ns already implies enabled_ns != 0.
    let needs_scaling = raw != 0 && running_ns != 0 && running_ns < enabled_ns;
    if !needs_scaling {
        return raw;
    }
    let scaled = u128::from(raw) * u128::from(enabled_ns) / u128::from(running_ns);
    if scaled > u128::from(u64::MAX) {
        u64::MAX
    } else {
        scaled as u64
    }
}
/// Percentage of the enabled time during which the PMU counters were actually
/// running; `0.0` when the enabled time is zero.
fn pmu_active_percent(results: &Results) -> f64 {
    let running = results.pmu_time_running_ns as f64;
    let enabled = results.pmu_time_enabled_ns as f64;
    safe_ratio_f64(running, enabled) * 100.0
}
/// Prints a warning to stderr when the PMU counters for benchmark `name` were
/// running for less than `MIN_PMU_ACTIVE_PERCENT` of their enabled time.
///
/// Does nothing when no perf counters were available or when either timing
/// value is zero.
fn enforce_pmu_quality(name: &str, has_perf_counters: bool, results: &Results) {
    let timing_reported = results.pmu_time_enabled_ns != 0 && results.pmu_time_running_ns != 0;
    if !(has_perf_counters && timing_reported) {
        return;
    }
    let active_percent = pmu_active_percent(results);
    if active_percent < MIN_PMU_ACTIVE_PERCENT {
        eprintln!(
            "⚠️ PMU counters were multiplexed too heavily for benchmark '{name}': active {active_percent:.1}% < {MIN_PMU_ACTIVE_PERCENT:.1}%"
        );
    }
}
/// Process-wide, lazily created list of perf-related diagnostics.
#[cfg(target_os = "linux")]
fn perf_issues() -> &'static Mutex<Vec<String>> {
    static ISSUE_LOG: OnceLock<Mutex<Vec<String>>> = OnceLock::new();
    ISSUE_LOG.get_or_init(Mutex::default)
}
/// Appends a diagnostic to the shared perf issue list.
///
/// Duplicate messages are ignored and the list is capped at six entries. A
/// poisoned lock is recovered rather than propagated.
#[cfg(target_os = "linux")]
fn record_perf_issue(message: impl Into<String>) {
    let message = message.into();
    let mut issues = perf_issues()
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    let already_recorded = issues.contains(&message);
    if !already_recorded && issues.len() < 6 {
        issues.push(message);
    }
}
/// Empties the shared perf issue list, recovering from a poisoned lock.
#[cfg(target_os = "linux")]
fn clear_perf_issues() {
    perf_issues()
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner())
        .clear();
}
/// Returns a copy of the shared perf issue list, recovering from a poisoned lock.
#[cfg(target_os = "linux")]
fn current_perf_issues() -> Vec<String> {
    let issues = perf_issues()
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    issues.clone()
}
/// Builds a human-readable hint explaining why PMU counters are unavailable.
///
/// Returns `None` when counters are available, or when none of the recorded
/// issues looks like a permission/access problem. Otherwise reads
/// `/proc/sys/kernel/perf_event_paranoid` and tailors the message to its
/// value (or to the fact that it could not be read or parsed).
#[cfg(target_os = "linux")]
fn linux_perf_hint(has_perf_counters: bool, issues: &[String]) -> Option<String> {
    // Substrings that indicate the kernel refused or crippled perf access.
    const ACCESS_MARKERS: [&str; 3] = [
        "unusable timing window",
        "Operation not permitted",
        "Permission denied",
    ];
    let access_problem = issues
        .iter()
        .any(|issue| ACCESS_MARKERS.iter().any(|marker| issue.contains(*marker)));
    if has_perf_counters || !access_problem {
        return None;
    }

    let paranoid_level = match std::fs::read_to_string("/proc/sys/kernel/perf_event_paranoid") {
        Ok(contents) => contents.trim().parse::<i32>().ok(),
        Err(_) => None,
    };
    let hint = match paranoid_level {
        Some(value) if value > 2 => format!(
            "kernel.perf_event_paranoid={value}; lower it to 2 or less (or grant CAP_PERFMON/CAP_SYS_ADMIN) to enable PMU counters"
        ),
        Some(value) => format!(
            "kernel.perf_event_paranoid={value}; PMU still unavailable, likely due to missing CAP_PERFMON/CAP_SYS_ADMIN or container perf_event restrictions"
        ),
        None => {
            "PMU still unavailable; check /proc/sys/kernel/perf_event_paranoid and container capabilities (CAP_PERFMON/CAP_SYS_ADMIN)".to_string()
        }
    };
    Some(hint)
}
/// Prints an affinity warning to stderr the first time it is called in the
/// process; every later call is silently dropped, whatever its message.
fn warn_affinity_once(message: impl Into<String>) {
    static ALREADY_WARNED: AtomicBool = AtomicBool::new(false);
    let already_warned = ALREADY_WARNED.swap(true, Ordering::Relaxed);
    if already_warned {
        return;
    }
    eprintln!("⚠️ {}", message.into());
}
/// Probes whether an instruction counter can be opened and read on `core_id`.
///
/// Side effect: pins the current thread to `core_id` and does not restore the
/// previous affinity. Returns `false` if pinning, building, enabling or
/// reading the counter fails.
#[cfg(target_os = "linux")]
fn core_has_usable_pmu(core_id: usize) -> bool {
    if pin_current_thread_to_core(core_id).is_err() {
        return false;
    }
    let Ok(mut probe) = Builder::new(Hardware::INSTRUCTIONS).build() else {
        return false;
    };
    if probe.enable().is_err() {
        return false;
    }

    // A little busy work so the counter has something to observe.
    let mut acc = 0_u64;
    for i in 0..100_000 {
        acc = acc.wrapping_add(i);
    }
    black_box(acc);

    let _ = probe.disable();
    probe
        .read_count_and_time()
        .map(|reading| reading.count > 0 || reading.time_running > 0)
        .unwrap_or(false)
}
/// Picks the core to pin the benchmark thread to when none was requested.
///
/// Candidates are ordered with detected performance cores first (restricted
/// to `allowed_core_ids`), followed by the remaining allowed cores. The first
/// candidate with a usable PMU wins; if none qualifies, the first candidate is
/// returned. `None` only when there are no candidates at all.
///
/// Note: probing goes through `core_has_usable_pmu`, which pins the current
/// thread to each probed core.
#[cfg(target_os = "linux")]
fn choose_default_pin_core(allowed_core_ids: &[usize]) -> Option<usize> {
    let mut ordered: Vec<usize> = Vec::new();

    if let Ok(DetectionResult::PerformanceCores(selection)) = detect_performance_cores() {
        for core_id in selection.logical_processor_ids {
            let eligible = allowed_core_ids.contains(&core_id) && !ordered.contains(&core_id);
            if eligible {
                ordered.push(core_id);
            }
        }
    }
    for &core_id in allowed_core_ids {
        if !ordered.contains(&core_id) {
            ordered.push(core_id);
        }
    }

    ordered
        .iter()
        .copied()
        .find(|&core_id| core_has_usable_pmu(core_id))
        .or_else(|| ordered.first().copied())
}
/// Reads the CPU affinity mask of the calling thread.
///
/// # Errors
/// Returns the OS error built from the code `pthread_getaffinity_np` returns.
#[cfg(target_os = "linux")]
fn capture_current_thread_affinity() -> io::Result<libc::cpu_set_t> {
    // SAFETY: the zeroed value is only used as an output buffer for the call below.
    let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
    let mask_size = std::mem::size_of::<libc::cpu_set_t>();
    // SAFETY: `cpuset` is a valid, writable cpu_set_t of exactly `mask_size` bytes.
    let status = unsafe { libc::pthread_getaffinity_np(libc::pthread_self(), mask_size, &mut cpuset) };
    match status {
        0 => Ok(cpuset),
        code => Err(io::Error::from_raw_os_error(code)),
    }
}
/// Applies `mask` as the CPU affinity of the calling thread.
///
/// # Errors
/// Returns the OS error built from the code `pthread_setaffinity_np` returns.
#[cfg(target_os = "linux")]
fn restore_current_thread_affinity(mask: &libc::cpu_set_t) -> io::Result<()> {
    let mask_size = std::mem::size_of::<libc::cpu_set_t>();
    // SAFETY: `mask` points to a valid cpu_set_t of exactly `mask_size` bytes.
    let status = unsafe { libc::pthread_setaffinity_np(libc::pthread_self(), mask_size, mask) };
    match status {
        0 => Ok(()),
        code => Err(io::Error::from_raw_os_error(code)),
    }
}
/// Lists, in ascending order, every CPU index that is set in `mask`.
#[cfg(target_os = "linux")]
fn core_ids_from_mask(mask: &libc::cpu_set_t) -> Vec<usize> {
    (0..(libc::CPU_SETSIZE as usize))
        // SAFETY: every probed index is below CPU_SETSIZE, the capacity of a cpu_set_t.
        .filter(|&core_id| unsafe { libc::CPU_ISSET(core_id, mask) })
        .collect()
}
/// Scope guard that pins the benchmark thread to one core on Linux and puts
/// the previous affinity mask back when dropped. On other platforms it is an
/// empty marker.
struct BenchAffinityGuard {
    /// Affinity mask captured before pinning; `None` if it could not be read
    /// or if pinning was abandoned.
    #[cfg(target_os = "linux")]
    restore_mask: Option<libc::cpu_set_t>,
    /// Whether this guard actually pinned the thread (and so must restore).
    #[cfg(target_os = "linux")]
    did_pin: bool,
}

impl BenchAffinityGuard {
    /// Pins the current thread to a single core (Linux only).
    ///
    /// The core is taken from the `BENCH_UTILS_PIN_CORE` environment variable
    /// when it parses as an allowed core id; otherwise
    /// `choose_default_pin_core` selects one. Every failure is reported once
    /// through `warn_affinity_once` and leaves the benchmark unpinned.
    fn acquire() -> Self {
        #[cfg(target_os = "linux")]
        {
            let restore_mask = match capture_current_thread_affinity() {
                Ok(mask) => Some(mask),
                Err(error) => {
                    warn_affinity_once(format!(
                        "Could not capture existing benchmark thread affinity: {error}. Continuing with best effort pinning"
                    ));
                    None
                }
            };

            // Fall back to 0..available_parallelism when the mask is missing or empty.
            let allowed_core_ids = restore_mask
                .as_ref()
                .map(core_ids_from_mask)
                .filter(|core_ids| !core_ids.is_empty())
                .unwrap_or_else(|| {
                    let count = std::thread::available_parallelism()
                        .map(|p| p.get())
                        .unwrap_or(1);
                    (0..count).collect()
                });

            let requested_core = std::env::var("BENCH_UTILS_PIN_CORE")
                .ok()
                .and_then(|value| value.parse::<usize>().ok());
            let core_to_pin = requested_core
                .filter(|core_id| allowed_core_ids.contains(core_id))
                .or_else(|| choose_default_pin_core(&allowed_core_ids));

            let Some(core_id) = core_to_pin else {
                warn_affinity_once(
                    "No logical cores detected for pinning; benchmark will run without CPU pinning",
                );
                return Self {
                    restore_mask: None,
                    did_pin: false,
                };
            };

            if let Err(error) = pin_current_thread_to_core(core_id) {
                warn_affinity_once(format!(
                    "Could not pin benchmark thread to core {core_id}: {error}. Continuing without pinning"
                ));
                // Probing inside `choose_default_pin_core` pins the thread to
                // each candidate it tests, so the thread may no longer carry
                // its original mask. Put it back now (best effort): this guard
                // reports `did_pin: false`, so Drop will not do it later.
                if let Some(mask) = &restore_mask {
                    let _ = restore_current_thread_affinity(mask);
                }
                return Self {
                    restore_mask: None,
                    did_pin: false,
                };
            }

            Self {
                restore_mask,
                did_pin: true,
            }
        }
        #[cfg(not(target_os = "linux"))]
        {
            Self {}
        }
    }
}

impl Drop for BenchAffinityGuard {
    /// Restores the affinity mask captured in `acquire`, if the guard pinned
    /// the thread and a mask is available. Failures are only warned about.
    fn drop(&mut self) {
        #[cfg(target_os = "linux")]
        {
            if !self.did_pin {
                return;
            }
            let Some(mask) = &self.restore_mask else {
                return;
            };
            if let Err(error) = restore_current_thread_affinity(mask) {
                warn_affinity_once(format!(
                    "Could not restore benchmark thread affinity after run: {error}"
                ));
            }
        }
    }
}
/// A perf event group together with the member counters that were added to it.
///
/// Each counter field is `None` when the corresponding event could not be
/// added to the group.
#[cfg(target_os = "linux")]
struct PerfGroupCounters {
// Group handle used to enable, disable and read all members together.
group: Group,
// Hardware::INSTRUCTIONS member, if available.
instructions: Option<perf_event::Counter>,
// Hardware::BRANCH_INSTRUCTIONS member, if available.
branches: Option<perf_event::Counter>,
// Hardware::BRANCH_MISSES member, if available.
branch_misses: Option<perf_event::Counter>,
// Hardware::CACHE_MISSES member, if available.
cache_misses: Option<perf_event::Counter>,
}
/// Tries to add `event` to `group`.
///
/// On failure the error is recorded as a perf issue under `name` and `None`
/// is returned.
#[cfg(target_os = "linux")]
fn try_add_group_counter(
    group: &mut Group,
    event: Hardware,
    name: &str,
) -> Option<perf_event::Counter> {
    let added = group.add(&Builder::new(event));
    if let Err(error) = &added {
        record_perf_issue(format!("perf event '{name}' unavailable: {error}"));
    }
    added.ok()
}
#[cfg(target_os = "linux")]
fn build_perf_counter_group() -> Option<PerfGroupCounters> {
let mut group = match Group::new() {
Ok(group) => group,
Err(error) => {
record_perf_issue(format!("perf group unavailable: {error}"));
return None;
}
};
let instructions = try_add_group_counter(&mut group, Hardware::INSTRUCTIONS, "instructions");
let branches = try_add_group_counter(&mut group, Hardware::BRANCH_INSTRUCTIONS, "branches");
let branch_misses = try_add_group_counter(&mut group, Hardware::BRANCH_MISSES, "branch-misses");
let cache_misses = try_add_group_counter(&mut group, Hardware::CACHE_MISSES, "cache-misses");
if instructions.is_none()
&& branches.is_none()
&& branch_misses.is_none()
&& cache_misses.is_none()
{
record_perf_issue("no perf events could be added to perf group".to_string());
return None;
}
Some(PerfGroupCounters {
group,
instructions,
branches,
branch_misses,
cache_misses,
})
}
/// Measurements collected for one benchmark sample, or an aggregate of several.
#[derive(Clone, Default)]
pub struct Results {
    /// Retired instruction count (scaled for multiplexing where applicable).
    pub instructions: u64,
    /// Branch instruction count.
    pub branches: u64,
    /// Branch misprediction count.
    pub branch_misses: u64,
    /// Cache miss count.
    pub cache_misses: u64,
    /// Whether an instruction counter was available.
    pub has_instructions: bool,
    /// Whether a branch counter was available.
    pub has_branches: bool,
    /// Whether a branch-miss counter was available.
    pub has_branch_misses: bool,
    /// Whether a cache-miss counter was available.
    pub has_cache_misses: bool,
    /// Time the PMU counters were enabled, in nanoseconds.
    pub pmu_time_enabled_ns: u64,
    /// Time the PMU counters were actually running, in nanoseconds.
    pub pmu_time_running_ns: u64,
    /// Wall-clock time spent in the benchmarked function.
    pub duration: Duration,
    /// Number of operations credited to the measurement.
    pub iterations: u64,
    /// Number of chunks executed.
    pub chunks_executed: u64,
}

impl Results {
    /// Accumulates `other` into `self`: numeric fields and the duration are
    /// summed, availability flags are OR-ed.
    pub fn add(&mut self, other: &Results) {
        self.instructions += other.instructions;
        self.branches += other.branches;
        self.branch_misses += other.branch_misses;
        self.cache_misses += other.cache_misses;
        self.has_instructions |= other.has_instructions;
        self.has_branches |= other.has_branches;
        self.has_branch_misses |= other.has_branch_misses;
        self.has_cache_misses |= other.has_cache_misses;
        self.pmu_time_enabled_ns += other.pmu_time_enabled_ns;
        self.pmu_time_running_ns += other.pmu_time_running_ns;
        self.duration += other.duration;
        self.iterations += other.iterations;
        self.chunks_executed += other.chunks_executed;
    }

    /// Divides every numeric field and the duration by `divisor` (integer
    /// division); availability flags are left untouched. A zero divisor is a
    /// no-op.
    pub fn divide(&mut self, divisor: u64) {
        if divisor == 0 {
            return;
        }
        self.instructions /= divisor;
        self.branches /= divisor;
        self.branch_misses /= divisor;
        self.cache_misses /= divisor;
        self.pmu_time_enabled_ns /= divisor;
        self.pmu_time_running_ns /= divisor;
        // `Duration` only divides by `u32`. A plain `divisor as u32` would
        // truncate: multiples of 2^32 become 0 (panic on division by zero) and
        // other large divisors silently divide by the wrong value.
        if divisor <= u64::from(u32::MAX) {
            self.duration /= divisor as u32;
        } else {
            // The quotient always fits in u64: the largest Duration is below
            // 2^64 * 10^9 ns and the divisor here is at least 2^32.
            let nanos = self.duration.as_nanos() / u128::from(divisor);
            self.duration = Duration::from_nanos(nanos as u64);
        }
        self.iterations /= divisor;
        self.chunks_executed /= divisor;
    }
}
/// Outcome of calibration: how each sample is run and how many samples to take.
pub struct BenchmarkConfig {
// Chunk size handed to the benchmarked function for every sample.
pub chunk_size: usize,
// Number of samples to collect.
pub target_samples: usize,
// Throughput estimated during calibration; 0.0 when the context supplies a
// fixed chunk size and no estimate is made.
pub estimated_ops_per_sec: f64,
}
/// A set of independently opened (ungrouped) perf counters plus a wall clock.
///
/// NOTE(review): `l1i_misses`, `stalled_frontend` and `stalled_backend` are
/// currently all opened as `Hardware::CACHE_MISSES` events in `try_new`, so
/// they report cache misses rather than what their names suggest. This looks
/// like a placeholder; confirm before relying on these values.
#[cfg(target_os = "linux")]
pub struct PerfCounters {
    pub instructions_counter: perf_event::Counter,
    pub cycles_counter: perf_event::Counter,
    pub branch_counter: perf_event::Counter,
    pub branch_misses: perf_event::Counter,
    pub cache_misses: perf_event::Counter,
    pub l1i_misses: perf_event::Counter,
    pub stalled_frontend: perf_event::Counter,
    pub stalled_backend: perf_event::Counter,
    /// Set by `start`; `None` until the first call.
    pub start_time: Option<Instant>,
}

#[cfg(target_os = "linux")]
impl Default for PerfCounters {
    /// Same as [`PerfCounters::new`], including its panic on failure.
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(target_os = "linux")]
impl PerfCounters {
    /// Opens all counters, returning the first error encountered.
    pub fn try_new() -> io::Result<Self> {
        let instructions_counter = Builder::new(Hardware::INSTRUCTIONS).build()?;
        let cycles_counter = Builder::new(Hardware::CPU_CYCLES).build()?;
        let branch_counter = Builder::new(Hardware::BRANCH_INSTRUCTIONS).build()?;
        let branch_misses = Builder::new(Hardware::BRANCH_MISSES).build()?;
        let cache_misses = Builder::new(Hardware::CACHE_MISSES).build()?;
        // NOTE(review): the next three all use CACHE_MISSES — see the struct docs.
        let l1i_misses = Builder::new(Hardware::CACHE_MISSES).build()?;
        let stalled_frontend = Builder::new(Hardware::CACHE_MISSES).build()?;
        let stalled_backend = Builder::new(Hardware::CACHE_MISSES).build()?;
        Ok(Self {
            instructions_counter,
            cycles_counter,
            branch_counter,
            branch_misses,
            cache_misses,
            l1i_misses,
            stalled_frontend,
            stalled_backend,
            start_time: None,
        })
    }

    /// Opens all counters.
    ///
    /// # Panics
    /// Panics if any counter cannot be opened.
    pub fn new() -> Self {
        Self::try_new().expect("failed to initialize perf counters")
    }

    /// Records the start time and enables every counter, ignoring enable errors.
    pub fn start(&mut self) {
        self.start_time = Some(Instant::now());
        for counter in [
            &mut self.instructions_counter,
            &mut self.cycles_counter,
            &mut self.branch_counter,
            &mut self.branch_misses,
            &mut self.cache_misses,
            &mut self.l1i_misses,
            &mut self.stalled_frontend,
            &mut self.stalled_backend,
        ] {
            let _ = counter.enable();
        }
    }

    /// Disables every counter and returns the elapsed time followed by the
    /// counter values in the order: instructions, cycles, branches, branch
    /// misses, cache misses, l1i misses, stalled frontend, stalled backend.
    ///
    /// Disable errors are ignored; a failed read yields 0. The duration is
    /// zero if `start` was never called.
    pub fn stop(&mut self) -> (Duration, u64, u64, u64, u64, u64, u64, u64, u64) {
        for counter in [
            &mut self.instructions_counter,
            &mut self.cycles_counter,
            &mut self.branch_counter,
            &mut self.branch_misses,
            &mut self.cache_misses,
            &mut self.l1i_misses,
            &mut self.stalled_frontend,
            &mut self.stalled_backend,
        ] {
            let _ = counter.disable();
        }

        let duration = match self.start_time {
            Some(start) => start.elapsed(),
            None => Duration::from_secs(0),
        };

        let read_or_zero = |counter: &mut perf_event::Counter| counter.read().unwrap_or(0);
        (
            duration,
            read_or_zero(&mut self.instructions_counter),
            read_or_zero(&mut self.cycles_counter),
            read_or_zero(&mut self.branch_counter),
            read_or_zero(&mut self.branch_misses),
            read_or_zero(&mut self.cache_misses),
            read_or_zero(&mut self.l1i_misses),
            read_or_zero(&mut self.stalled_frontend),
            read_or_zero(&mut self.stalled_backend),
        )
    }
}
/// State prepared before a benchmarked function runs and handed to it by
/// mutable reference.
pub trait BenchContext {
/// Builds a fresh context.
///
/// NOTE(review): `BenchmarkRunner::run` calls this with `MIN_CHUNK_SIZE` as
/// `num_chunks`; confirm what the argument is meant to represent.
fn prepare(num_chunks: usize) -> Self;
/// Fixed chunk size to use instead of calibrating one. When this returns
/// `Some`, calibration only warms up with that size. Defaults to `None`.
fn chunk_size() -> Option<usize> {
None
}
/// Number of operations credited to each executed chunk. When `None`
/// (the default) the runner credits the chunk size itself.
fn operations_per_chunk() -> Option<u64> {
None
}
}
/// Context for benchmarks that need no prepared state.
pub struct NoContext;

impl BenchContext for NoContext {
    /// Ignores `_num_chunks`; there is nothing to prepare.
    fn prepare(_num_chunks: usize) -> Self {
        Self
    }
}
/// Chooses a chunk size and sample count for a benchmark and warms it up.
///
/// If the context type supplies a fixed chunk size, only the warm-up runs and
/// the configuration uses `MIN_SAMPLES` with no throughput estimate.
/// Otherwise up to 15 timing passes scale the chunk size towards
/// `TARGET_CHUNK_DURATION` (stopping early once a pass lands within ±20% of
/// it), followed by the warm-up. The sample count is
/// `MIN_BENCHMARK_DURATION` divided by the estimated chunk duration, clamped
/// to `[MIN_SAMPLES, MAX_SAMPLES]`.
///
/// A fresh context is obtained from `factory` for every invocation of `f`.
/// Progress is written to stdout on a single, continuously rewritten line.
fn calibrate_engine<T: BenchContext, F: Fn() -> T + ?Sized>(
    f: &BenchFunction<T>,
    factory: &F,
) -> BenchmarkConfig {
    rewrite_line("🔥 calibrating benchmark");

    // Fixed chunk size requested by the context: warm up only.
    if let Some(preferred_chunk_size) = T::chunk_size() {
        let deadline = Instant::now() + WARM_UP_DURATION;
        let mut round = 0;
        while Instant::now() < deadline {
            let mut prepared = factory();
            black_box(|| f(&mut prepared, preferred_chunk_size, round))();
            round += 1;
            let remaining_ms = deadline
                .saturating_duration_since(Instant::now())
                .as_millis();
            rewrite_line(&format!(
                "🔥 calibrating benchmark warmup remaining: {remaining_ms:>4} ms chunk: {preferred_chunk_size}"
            ));
        }
        clear_line();
        return BenchmarkConfig {
            chunk_size: preferred_chunk_size,
            target_samples: MIN_SAMPLES,
            estimated_ops_per_sec: 0.0,
        };
    }

    let mut chunk_size = MIN_CHUNK_SIZE;
    let mut best_chunk_size = chunk_size;
    let mut ops_per_sec = 0.0;

    for pass in 1..=15 {
        let mut prepared = factory();
        let started = Instant::now();
        black_box(|| f(&mut prepared, chunk_size, 0))();
        let elapsed = started.elapsed();
        let elapsed_secs = elapsed.as_secs_f64();

        if elapsed_secs < 0.0001 {
            // Too fast to time reliably: grow the chunk tenfold and retry.
            chunk_size = (chunk_size * 10).min(MAX_CHUNK_SIZE);
        } else {
            ops_per_sec = chunk_size as f64 / elapsed_secs;
            let acceptable =
                TARGET_CHUNK_DURATION.mul_f64(0.8)..=TARGET_CHUNK_DURATION.mul_f64(1.2);
            if acceptable.contains(&elapsed) {
                best_chunk_size = chunk_size;
                break;
            }
            // Scale the chunk proportionally towards the target duration.
            let scaling_factor = TARGET_CHUNK_DURATION.as_secs_f64() / elapsed_secs;
            let rescaled = ((chunk_size as f64) * scaling_factor) as usize;
            chunk_size = rescaled.clamp(MIN_CHUNK_SIZE, MAX_CHUNK_SIZE);
        }
        best_chunk_size = chunk_size;

        rewrite_line(&format!(
            "🔥 calibrating benchmark pass: {:>2}/15 chunk: {:>9} est: {:>8.2} Mops/s",
            pass,
            chunk_size,
            ops_per_sec / 1_000_000.0
        ));
    }

    let deadline = Instant::now() + WARM_UP_DURATION;
    let mut round = 0;
    while Instant::now() < deadline {
        let mut prepared = factory();
        black_box(|| f(&mut prepared, best_chunk_size, round))();
        round += 1;
        let remaining_ms = deadline
            .saturating_duration_since(Instant::now())
            .as_millis();
        rewrite_line(&format!(
            "🔥 calibrating benchmark warmup remaining: {remaining_ms:>4} ms chunk: {best_chunk_size:>9}"
        ));
    }

    // Without a throughput estimate, assume a chunk takes the target duration.
    let estimated_chunk_duration_secs = if ops_per_sec > 0.0 {
        best_chunk_size as f64 / ops_per_sec
    } else {
        TARGET_CHUNK_DURATION.as_secs_f64()
    };
    let wanted_samples =
        (MIN_BENCHMARK_DURATION.as_secs_f64() / estimated_chunk_duration_secs) as usize;
    let target_samples = wanted_samples.clamp(MIN_SAMPLES, MAX_SAMPLES);

    clear_line();
    BenchmarkConfig {
        chunk_size: best_chunk_size,
        target_samples,
        estimated_ops_per_sec: ops_per_sec,
    }
}
/// Runs one chunk of the benchmark and measures wall-clock time only.
///
/// The result carries the elapsed duration, `ops` as the iteration count and
/// a chunk count of one; every counter field keeps its default value.
fn execute_timing_only<T: BenchContext>(
    f: &BenchFunction<T>,
    prepared: &mut T,
    chunk_size: usize,
    chunk_num: usize,
    ops: u64,
) -> Results {
    let started = Instant::now();
    black_box(|| f(prepared, chunk_size, chunk_num))();
    let elapsed = started.elapsed();

    let mut outcome = Results::default();
    outcome.duration = elapsed;
    outcome.iterations = ops;
    outcome.chunks_executed = 1;
    outcome
}
/// Tries to open a standalone (ungrouped) counter for `event`.
///
/// On failure the error is recorded as a perf issue under `name` and `None`
/// is returned.
#[cfg(target_os = "linux")]
fn try_build_individual_counter(event: Hardware, name: &str) -> Option<perf_event::Counter> {
    let built = Builder::new(event).build();
    if let Err(error) = &built {
        record_perf_issue(format!("perf event '{name}' unavailable: {error}"));
    }
    built.ok()
}
/// Reads a counter and returns `(scaled_count, time_enabled, time_running)`.
///
/// The count is extrapolated with `scale_multiplexed_count`. A missing
/// counter or a failed read yields `(0, 0, 0)`; failures and missing timing
/// metadata are recorded as perf issues under `name`.
#[cfg(target_os = "linux")]
fn read_scaled_counter(counter: &mut Option<perf_event::Counter>, name: &str) -> (u64, u64, u64) {
    let reading = match counter.as_mut() {
        Some(active) => active.read_count_and_time(),
        None => return (0, 0, 0),
    };
    let cat = match reading {
        Ok(cat) => cat,
        Err(error) => {
            record_perf_issue(format!("perf event '{name}' read failed: {error}"));
            return (0, 0, 0);
        }
    };

    let timing_missing = cat.time_enabled == 0 || cat.time_running == 0;
    if cat.count > 0 && timing_missing {
        record_perf_issue(format!(
            "perf event '{name}' missing timing metadata (enabled/running); using raw count"
        ));
    }

    let scaled = scale_multiplexed_count(cat.count, cat.time_enabled, cat.time_running);
    (scaled, cat.time_enabled, cat.time_running)
}
/// Enables the counter if present.
///
/// If enabling fails, the failure is recorded as a perf issue under `name`
/// and the slot is cleared to `None`, so later code treats the counter as
/// unavailable.
#[cfg(target_os = "linux")]
fn enable_counter(counter: &mut Option<perf_event::Counter>, name: &str) {
    let failure = match counter.as_mut() {
        Some(active) => active.enable().err(),
        None => return,
    };
    if let Some(error) = failure {
        record_perf_issue(format!("perf event '{name}' enable failed: {error}"));
        *counter = None;
    }
}
/// Disables the counter if present, recording any failure as a perf issue
/// under `name`. The counter stays in its slot either way.
#[cfg(target_os = "linux")]
fn disable_counter(counter: &mut Option<perf_event::Counter>, name: &str) {
    if let Some(active) = counter.as_mut() {
        if let Err(error) = active.disable() {
            record_perf_issue(format!("perf event '{name}' disable failed: {error}"));
        }
    }
}
/// Runs one chunk while measuring with standalone (ungrouped) perf counters.
///
/// Falls back to `execute_timing_only` when none of the four counters can be
/// opened. Otherwise records that the ungrouped fallback is in use, enables
/// the counters, times the call, then disables and reads them. The PMU timing
/// window is taken from the first counter reporting a non-zero running time,
/// else the first with a non-zero enabled time, else `(0, 0)`.
#[cfg(target_os = "linux")]
fn execute_with_individual_counters<T: BenchContext>(
    f: &BenchFunction<T>,
    prepared: &mut T,
    chunk_size: usize,
    chunk_num: usize,
    ops: u64,
) -> Results {
    let mut insn = try_build_individual_counter(Hardware::INSTRUCTIONS, "instructions");
    let mut branch = try_build_individual_counter(Hardware::BRANCH_INSTRUCTIONS, "branches");
    let mut branch_miss = try_build_individual_counter(Hardware::BRANCH_MISSES, "branch-misses");
    let mut cache_miss = try_build_individual_counter(Hardware::CACHE_MISSES, "cache-misses");

    let none_available = [&insn, &branch, &branch_miss, &cache_miss]
        .iter()
        .all(|counter| counter.is_none());
    if none_available {
        return execute_timing_only(f, prepared, chunk_size, chunk_num, ops);
    }
    record_perf_issue("using ungrouped perf counters fallback".to_string());

    enable_counter(&mut insn, "instructions");
    enable_counter(&mut branch, "branches");
    enable_counter(&mut branch_miss, "branch-misses");
    enable_counter(&mut cache_miss, "cache-misses");

    let started = Instant::now();
    black_box(|| f(prepared, chunk_size, chunk_num))();
    let duration = started.elapsed();

    disable_counter(&mut insn, "instructions");
    disable_counter(&mut branch, "branches");
    disable_counter(&mut branch_miss, "branch-misses");
    disable_counter(&mut cache_miss, "cache-misses");

    let insn_reading = read_scaled_counter(&mut insn, "instructions");
    let branch_reading = read_scaled_counter(&mut branch, "branches");
    let branch_miss_reading = read_scaled_counter(&mut branch_miss, "branch-misses");
    let cache_miss_reading = read_scaled_counter(&mut cache_miss, "cache-misses");

    // (enabled, running) pairs in counter order.
    let windows = [
        (insn_reading.1, insn_reading.2),
        (branch_reading.1, branch_reading.2),
        (branch_miss_reading.1, branch_miss_reading.2),
        (cache_miss_reading.1, cache_miss_reading.2),
    ];
    let with_running = windows.iter().copied().find(|&(_, running)| running > 0);
    let (pmu_time_enabled_ns, pmu_time_running_ns) = match with_running {
        Some(window) => window,
        None => windows
            .iter()
            .copied()
            .find(|&(enabled, _)| enabled > 0)
            .unwrap_or((0, 0)),
    };

    Results {
        instructions: insn_reading.0,
        branches: branch_reading.0,
        branch_misses: branch_miss_reading.0,
        cache_misses: cache_miss_reading.0,
        // A counter whose enable failed has been cleared to None by now.
        has_instructions: insn.is_some(),
        has_branches: branch.is_some(),
        has_branch_misses: branch_miss.is_some(),
        has_cache_misses: cache_miss.is_some(),
        pmu_time_enabled_ns,
        pmu_time_running_ns,
        duration,
        iterations: ops,
        chunks_executed: 1,
    }
}
#[cfg(target_os = "linux")]
fn execute_with_perf_group<T: BenchContext>(
f: &BenchFunction<T>,
prepared: &mut T,
chunk_size: usize,
chunk_num: usize,
ops: u64,
) -> Results {
let Some(mut perf) = build_perf_counter_group() else {
return execute_with_individual_counters(f, prepared, chunk_size, chunk_num, ops);
};
if let Err(error) = perf.group.enable() {
record_perf_issue(format!("perf group enable failed: {error}"));
return execute_with_individual_counters(f, prepared, chunk_size, chunk_num, ops);
}
let start_time = Instant::now();
black_box(|| f(prepared, chunk_size, chunk_num))();
let duration = start_time.elapsed();
if let Err(error) = perf.group.disable() {
record_perf_issue(format!("perf group disable failed: {error}"));
}
let counts = match perf.group.read() {
Ok(counts) => counts,
Err(error) => {
record_perf_issue(format!("perf group read failed: {error}"));
return execute_with_individual_counters(f, prepared, chunk_size, chunk_num, ops);
}
};
let enabled_ns = counts
.time_enabled()
.map(|duration| duration.as_nanos().min(u64::MAX as u128) as u64)
.unwrap_or(0);
let running_ns = counts
.time_running()
.map(|duration| duration.as_nanos().min(u64::MAX as u128) as u64)
.unwrap_or(0);
let instructions_raw = perf
.instructions
.as_ref()
.and_then(|counter| counts.get(counter).map(|entry| entry.value()))
.unwrap_or(0);
let branches_raw = perf
.branches
.as_ref()
.and_then(|counter| counts.get(counter).map(|entry| entry.value()))
.unwrap_or(0);
let branch_misses_raw = perf
.branch_misses
.as_ref()
.and_then(|counter| counts.get(counter).map(|entry| entry.value()))
.unwrap_or(0);
let cache_misses_raw = perf
.cache_misses
.as_ref()
.and_then(|counter| counts.get(counter).map(|entry| entry.value()))
.unwrap_or(0);
if enabled_ns == 0 || running_ns == 0 {
record_perf_issue(
"perf counters reported unusable timing window (enabled/running)".to_string(),
);
if instructions_raw == 0
&& branches_raw == 0
&& branch_misses_raw == 0
&& cache_misses_raw == 0
{
return execute_with_individual_counters(f, prepared, chunk_size, chunk_num, ops);
}
}
Results {
instructions: scale_multiplexed_count(instructions_raw, enabled_ns, running_ns),
branches: scale_multiplexed_count(branches_raw, enabled_ns, running_ns),
branch_misses: scale_multiplexed_count(branch_misses_raw, enabled_ns, running_ns),
cache_misses: scale_multiplexed_count(cache_misses_raw, enabled_ns, running_ns),
has_instructions: perf.instructions.is_some(),
has_branches: perf.branches.is_some(),
has_branch_misses: perf.branch_misses.is_some(),
has_cache_misses: perf.cache_misses.is_some(),
pmu_time_enabled_ns: enabled_ns,
pmu_time_running_ns: running_ns,
duration,
iterations: ops,
chunks_executed: 1,
}
}
/// Redraws the single-line progress bar on stdout.
///
/// The bar is 40 cells wide. Filled cells are `=`, except that the last
/// filled cell is `>` while the run is still in progress. `current_throughput`
/// is shown in Mops/s (no decimals above 1000, one decimal otherwise), or as
/// "Calculating..." when it is not a positive finite number.
fn update_progress_bar(current: usize, total: usize, current_throughput: f64) {
    let width = 40;
    let safe_total = total.max(1);
    let filled = (current * width / safe_total).min(width);
    let percentage = (current * 100 / safe_total).min(100);

    let mut bar = String::with_capacity(width);
    for cell in 0..width {
        let glyph = if cell >= filled {
            ' '
        } else if cell + 1 == filled && current < total {
            '>'
        } else {
            '='
        };
        bar.push(glyph);
    }

    let throughput_display = if !(current_throughput.is_finite() && current_throughput > 0.0) {
        "Calculating...".to_string()
    } else if current_throughput > 1000.0 {
        format!("{current_throughput:.0} Mops/s")
    } else {
        format!("{current_throughput:.1} Mops/s")
    };

    print!("\r\x1b[2K⚡ running [{bar}] {percentage}% ({current}/{total}) {throughput_display}");
    flush_stdout();
}
/// Entry point for running benchmarks and collecting their results.
pub struct BenchmarkRunner {
// Shared session that stores results and produces the report.
session: std::sync::Arc<BenchmarkSession>,
// Optional filter consulted by `should_run`; `None` runs everything.
filter: Option<String>,
}
impl Default for BenchmarkRunner {
/// Equivalent to `BenchmarkRunner::new()`.
fn default() -> Self {
Self::new()
}
}
impl BenchmarkRunner {
/// Creates a runner with a fresh session and no filter.
pub fn new() -> Self {
    let session = std::sync::Arc::new(BenchmarkSession::new());
    Self {
        session,
        filter: None,
    }
}
/// Replaces the runner's session with a new one created for `suite`.
pub fn with_suite(mut self, suite: impl Into<String>) -> Self {
    let session = BenchmarkSession::new_with_suite(suite);
    self.session = std::sync::Arc::new(session);
    self
}
/// Sets (or clears, with `None`) the filter used to select benchmarks.
pub fn with_filter(mut self, filter: Option<&str>) -> Self {
    self.filter = filter.map(str::to_owned);
    self
}
/// Builds a `BenchmarkGroup` named `name` that borrows this runner and passes
/// it to `f`.
pub fn group<T: BenchContext>(&self, name: &'static str, f: impl FnOnce(&BenchmarkGroup<T>)) {
    f(&BenchmarkGroup {
        runner: self,
        name,
        _marker: std::marker::PhantomData,
    });
}
/// Returns the report produced by the underlying session.
pub fn report(&self) -> BenchmarkReport {
self.session.report()
}
/// Returns the results held by the underlying session.
pub fn results(&self) -> Vec<BenchmarkResult> {
self.session.get_results()
}
/// Decides whether a benchmark passes the configured filter.
///
/// With no filter everything runs. The literal filter `"all"` also matches
/// everything; any other filter matches when it is a substring of the
/// benchmark name or of the group name.
fn should_run(&self, name: &str, group: &str) -> bool {
    match self.filter.as_deref() {
        None | Some("all") => true,
        Some(wanted) => name.contains(wanted) || group.contains(wanted) || wanted == group,
    }
}
/// Runs a benchmark using the context type's own `prepare` as the factory.
///
/// NOTE(review): `prepare` is declared with a `num_chunks` parameter but is
/// called here with `MIN_CHUNK_SIZE`; confirm this is the intended argument.
pub fn run<T: BenchContext>(&self, name: &str, group: &str, f: BenchFunction<T>) {
    let default_factory = || T::prepare(MIN_CHUNK_SIZE);
    self.run_with_factory(name, group, f, &default_factory);
}
pub fn run_with_factory<T: BenchContext, F: Fn() -> T + ?Sized>(
&self,
name: &str,
group: &str,
f: BenchFunction<T>,
factory: &F,
) {
if !self.should_run(name, group) {
return;
}
#[cfg(target_os = "linux")]
clear_perf_issues();
let _affinity_guard = BenchAffinityGuard::acquire();
println!("\nBenchmark: {name}");
let config = calibrate_engine(&f, factory);
let estimated_mops = if config.estimated_ops_per_sec > 0.0 {
format!("{:.2} Mops/s", config.estimated_ops_per_sec / 1_000_000.0)
} else {
"n/a".to_string()
};
println!(
" calibrated: chunk={} samples={} estimate={estimated_mops}",
config.chunk_size, config.target_samples
);
rewrite_line(&format!("⚡ running 0/{} samples", config.target_samples));
let mut all_results: Vec<Results> = Vec::new();
let mut summed_results = Results::default();
let mut running_throughput = if config.estimated_ops_per_sec > 0.0 {
config.estimated_ops_per_sec / 1_000_000.0
} else {
0.0
};
for sample in 0..config.target_samples {
let mut prepared = factory();
let ops = T::operations_per_chunk().unwrap_or(config.chunk_size as u64);
#[cfg(target_os = "linux")]
let sample_result = execute_with_perf_group(&f, &mut prepared, config.chunk_size, sample, ops);
#[cfg(not(target_os = "linux"))]
let sample_result = execute_timing_only(&f, &mut prepared, config.chunk_size, sample, ops);
let duration_secs = sample_result.duration.as_secs_f64();
let sample_throughput_mops =
safe_ratio_f64(sample_result.iterations as f64, duration_secs) / 1_000_000.0;
if sample_throughput_mops > 0.0 {
running_throughput = running_throughput * 0.9 + sample_throughput_mops * 0.1;
}
summed_results.add(&sample_result);
all_results.push(sample_result);
if sample % 2 == 0 || sample == config.target_samples - 1 {
update_progress_bar(sample + 1, config.target_samples, running_throughput);
}
}
clear_line();
println!(" samples complete: {}", config.target_samples);
let mut results = summed_results.clone();
results.divide(config.target_samples as u64);
let ops_per_sec = safe_ratio_f64(results.iterations as f64, results.duration.as_secs_f64());
let ns_per_op = safe_ratio_f64(
results.duration.as_nanos() as f64,
results.iterations as f64,
);
let instructions_per_op =
safe_ratio_f64(results.instructions as f64, results.iterations as f64);
let branches_per_op = safe_ratio_f64(results.branches as f64, results.iterations as f64);
let branch_miss_rate =
safe_ratio_f64(results.branch_misses as f64, results.branches as f64) * 100.0;
let branch_misses_per_op =
safe_ratio_f64(results.branch_misses as f64, results.iterations as f64);
let cache_miss_rate_per_op =
safe_ratio_f64(results.cache_misses as f64, results.iterations as f64);
let cv_percent = coefficient_of_variation_percent(&all_results);
let mut throughput_samples = sample_mops_per_sec(&all_results);
throughput_samples.sort_by(|a, b| a.total_cmp(b));
let median_mops_per_sec = median(&throughput_samples);
let mut latency_samples = sample_ns_per_op(&all_results);
latency_samples.sort_by(|a, b| a.total_cmp(b));
let median_ns_per_op = median(&latency_samples);
let p95_ns_per_op = percentile(&latency_samples, 0.95);
let mad_ns_per_op = median_absolute_deviation(&latency_samples, median_ns_per_op);
let outlier_count = tukey_outlier_count(&latency_samples);
println!(" results:");
let has_perf_counters = results.has_instructions
|| results.has_branches
|| results.has_branch_misses
|| results.has_cache_misses;
let has_full_perf_counters = results.has_instructions
&& results.has_branches
&& results.has_branch_misses
&& results.has_cache_misses;
#[cfg(target_os = "linux")]
{
let issues = current_perf_issues();
if !has_perf_counters {
warn_perf_unavailable_once(&issues);
} else if !has_full_perf_counters {
warn_partial_perf_once();
}
}
#[cfg(not(target_os = "linux"))]
if !has_perf_counters {
warn_perf_unavailable_once_non_linux();
}
enforce_pmu_quality(name, has_perf_counters, &results);
let mut table = TableFormatter::new(
vec!["Stat", "Value", "Stat", "Value"],
vec![22, 18, 22, 18],
)
.with_alignments(vec![
Alignment::Left,
Alignment::Right,
Alignment::Left,
Alignment::Right,
])
.with_group_split_after(1);
table.add_row(vec![
&colorize_label("Throughput"),
&colorize_value(&format!("{:.2} Mops/s", ops_per_sec / 1_000_000.0)),
&colorize_label("Median Throughput"),
&colorize_value(&format!("{median_mops_per_sec:.2} Mops/s")),
]);
table.add_row(vec![
&colorize_label("Mean Latency"),
&colorize_value(&format!("{ns_per_op:.2} ns/op")),
&colorize_label("Median Latency"),
&colorize_value(&format!("{median_ns_per_op:.2} ns/op")),
]);
table.add_row(vec![
&colorize_label("P95 Latency"),
&colorize_value(&format!("{p95_ns_per_op:.2} ns/op")),
&colorize_label("MAD Latency"),
&colorize_value(&format!("{mad_ns_per_op:.2} ns/op")),
]);
table.add_row(vec![
&colorize_label("Samples"),
&colorize_value(&format!("{}", config.target_samples)),
&colorize_label("Outliers"),
&colorize_value(&format!("{outlier_count}")),
]);
table.add_row(vec![
&colorize_label("Operations"),
&colorize_value(&format!("{}", results.iterations)),
&colorize_label("Total Duration"),
&colorize_value(&format!("{:.3}s", summed_results.duration.as_secs_f64())),
]);
table.add_row(vec![
&colorize_label("Coefficient Var."),
&colorize_value(&format!("{cv_percent:.2}%")),
&colorize_label("Measurement"),
&colorize_value(if has_perf_counters { "timing + PMU" } else { "timing only" }),
]);
let mut pmu_byline = None;
if has_perf_counters {
let active_percent = pmu_active_percent(&results);
let pmu_avg_running_sec = results.pmu_time_running_ns as f64 / 1_000_000_000.0;
let pmu_avg_enabled_sec = results.pmu_time_enabled_ns as f64 / 1_000_000_000.0;
let pmu_total_running_sec = summed_results.pmu_time_running_ns as f64 / 1_000_000_000.0;
let pmu_total_enabled_sec = summed_results.pmu_time_enabled_ns as f64 / 1_000_000_000.0;
if results.has_instructions {
let branches_value = if results.has_branches {
format!("{branches_per_op:.1}")
} else {
String::new()
};
table.add_row(vec![
&colorize_label("Instructions / op"),
&colorize_value(&format!("{instructions_per_op:.1}")),
&colorize_label("Branches / op"),
&colorize_value(&branches_value),
]);
} else if results.has_branches {
table.add_row(vec![
&colorize_label("Branches / op"),
&colorize_value(&format!("{branches_per_op:.1}")),
&colorize_label("Branch Count"),
&colorize_value(&format!("{:.1}M", results.branches as f64 / 1_000_000.0)),
]);
}
if results.has_branches && results.has_branch_misses {
table.add_row(vec![
&colorize_label("Branch Miss Rate"),
&colorize_value(&format!("{branch_miss_rate:.4}%")),
&colorize_label("Branch Misses / op"),
&colorize_value(&format!("{branch_misses_per_op:.4}")),
]);
}
if results.has_cache_misses {
table.add_row(vec![
&colorize_label("Cache Misses / op"),
&colorize_value(&format!("{cache_miss_rate_per_op:.4}")),
"",
"",
]);
}
pmu_byline = Some(format!(
" PMU: coverage={} avg_running={:.3}s avg_enabled={:.3}s total_running={:.3}s total_enabled={:.3}s",
colorize_value(&format!("{active_percent:.1}%")),
pmu_avg_running_sec,
pmu_avg_enabled_sec,
pmu_total_running_sec,
pmu_total_enabled_sec,
));
}
table.print();
if let Some(pmu_byline) = pmu_byline {
println!("{pmu_byline}");
}
let benchmark_result = BenchmarkResult {
name: name.to_string(),
group: group.to_string(),
kind: BenchmarkKind::Standard,
mops_per_sec: ops_per_sec / 1_000_000.0,
median_mops_per_sec,
ns_per_op,
median_ns_per_op,
p95_ns_per_op,
mad_ns_per_op,
instructions_per_op,
branches_per_op,
branch_miss_rate,
branch_misses_per_op,
cache_miss_rate: cache_miss_rate_per_op,
cv_percent,
outlier_count,
samples: config.target_samples,
operations: results.iterations,
total_duration_sec: summed_results.duration.as_secs_f64(),
sample_throughput_mops_per_sec: throughput_samples,
sample_latency_ns_per_op: latency_samples,
};
self.session.add_result(benchmark_result);
}
}
/// Handle for running benchmarks that all share one group name.
///
/// The handle borrows a `BenchmarkRunner` and forwards every benchmark to it,
/// tagging each one with `name`. `T` is the context type that the benchmark
/// functions (`BenchFunction<T>`) receive by mutable reference.
pub struct BenchmarkGroup<'a, T: BenchContext> {
/// Runner that `bench` and `bench_with_factory` delegate to.
runner: &'a BenchmarkRunner,
/// Group name passed to the runner alongside each benchmark's own name.
name: &'static str,
/// Ties the handle to `T` without storing a value of that type.
_marker: std::marker::PhantomData<T>,
}
impl<'a, T: BenchContext> BenchmarkGroup<'a, T> {
    /// Runs the benchmark function `f` under the label `name`.
    ///
    /// Delegates to the borrowed runner's `run`, passing this group's name as
    /// the group argument.
    pub fn bench(&self, name: &str, f: BenchFunction<T>) {
        let group_name = self.name;
        self.runner.run(name, group_name, f);
    }

    /// Runs the benchmark function `f` under the label `name`, handing the
    /// runner `factory` as the source of `T` values.
    ///
    /// Delegates to the borrowed runner's `run_with_factory`, passing this
    /// group's name as the group argument.
    pub fn bench_with_factory<F: Fn() -> T + ?Sized>(&self, name: &str, factory: &F, f: BenchFunction<T>) {
        let group_name = self.name;
        self.runner.run_with_factory(name, group_name, f, factory);
    }
}
/// Reports on stderr, at most once per process, that no PMU counters were
/// available and that results are timing-only.
///
/// `issues` is forwarded to `linux_perf_hint`; when that returns a hint it is
/// printed on the following line.
#[cfg(target_os = "linux")]
fn warn_perf_unavailable_once(issues: &[String]) {
    static ALREADY_WARNED: AtomicBool = AtomicBool::new(false);
    // `swap` returns the previous flag value, so only the first caller sees `false`.
    let first_call = !ALREADY_WARNED.swap(true, Ordering::Relaxed);
    if first_call {
        eprintln!("⚠️ PMU counters unavailable; continuing with timing-only results.");
        if let Some(hint) = linux_perf_hint(false, issues) {
            eprintln!(" {hint}");
        }
    }
}
/// Reports on stderr, at most once per process, that only some PMU counters
/// were available and that the missing metrics are left out of the output.
#[cfg(target_os = "linux")]
fn warn_partial_perf_once() {
    static ALREADY_WARNED: AtomicBool = AtomicBool::new(false);
    // `swap` returns the previous flag value, so only the first caller sees `false`.
    let first_call = !ALREADY_WARNED.swap(true, Ordering::Relaxed);
    if first_call {
        eprintln!("⚠️ Some PMU counters are unavailable; omitted metrics will not be shown.");
    }
}
/// Reports on stderr, at most once per process, that PMU counters are not
/// available on this (non-Linux) platform and that results are timing-only.
#[cfg(not(target_os = "linux"))]
fn warn_perf_unavailable_once_non_linux() {
    static ALREADY_WARNED: AtomicBool = AtomicBool::new(false);
    // `swap` returns the previous flag value, so only the first caller sees `false`.
    let first_call = !ALREADY_WARNED.swap(true, Ordering::Relaxed);
    if first_call {
        eprintln!("⚠️ PMU counters are unavailable on this platform; continuing with timing-only results.");
    }
}
#[cfg(test)]
mod tests {
    use super::{median, median_absolute_deviation, percentile, tukey_outlier_count};

    /// Checks `percentile` at both ends and at the midpoint of a sorted slice.
    #[test]
    fn percentile_interpolates_sorted_values() {
        let sorted = [1.0, 2.0, 3.0, 4.0];
        // (requested fraction, expected value)
        let cases = [(0.0, 1.0), (0.5, 2.5), (1.0, 4.0)];
        for (fraction, expected) in cases {
            assert_eq!(percentile(&sorted, fraction), expected);
        }
    }

    /// Identical samples have a median equal to that value and no deviation from it.
    #[test]
    fn mad_is_zero_for_uniform_values() {
        let uniform = [5.0; 3];
        let center = median(&uniform);
        assert_eq!(center, 5.0);
        let spread = median_absolute_deviation(&uniform, 5.0);
        assert_eq!(spread, 0.0);
    }

    /// A single value far from a tight cluster is counted as exactly one outlier.
    #[test]
    fn tukey_outlier_count_flags_far_values() {
        let flagged = tukey_outlier_count(&[10.0, 10.0, 11.0, 11.0, 12.0, 100.0]);
        assert_eq!(flagged, 1);
    }
}