Skip to main content

scirs2_core/validation/
benchmark_harness.rs

1//! Benchmark harness for performance regression detection.
2//!
3//! This module provides a lightweight, self-contained micro-benchmark harness
4//! that records wall-clock timings and can compare a new run against a stored
5//! baseline to detect performance regressions.
6//!
7//! ## Design goals
8//!
9//! - **No heavy framework dependency** — uses only `std::time::Instant`.
//! - **Serialisable** — with the `serde` feature enabled, baselines can be
//!   persisted as JSON files and loaded back for comparison across CI runs.
12//! - **Composable** — individual `BenchmarkResult`s are collected into a
13//!   `BenchmarkBaseline`, which can be diffed by the `RegressionDetector`.
14//!
15//! ## Example
16//!
17//! ```rust
18//! use scirs2_core::validation::benchmark_harness::{
19//!     BenchmarkHarness, BenchmarkBaseline, RegressionDetector,
20//! };
21//!
22//! // Record timings for a function.
23//! let mut harness = BenchmarkHarness::new("vec_sum");
24//! harness.run(1000, || {
25//!     let v: Vec<f64> = (0..1000).map(|i| i as f64).collect();
26//!     let _: f64 = v.iter().sum();
27//! });
28//! let result = harness.finish();
29//! println!("mean: {:.0} ns", result.mean_ns);
30//!
31//! // Build a baseline and check for regressions.
32//! let mut baseline = BenchmarkBaseline::new("my_crate");
33//! baseline.add(result.clone());
34//!
35//! let mut current = BenchmarkBaseline::new("my_crate");
36//! // Simulate a 5 % slower result.
37//! let mut slower = result.clone();
38//! slower.mean_ns *= 1.05;
39//! current.add(slower);
40//!
41//! let detector = RegressionDetector::new(0.10); // 10 % threshold
42//! assert!(!detector.has_regressions(&baseline, &current));
43//! ```
44
45use std::time::Instant;
46
47#[cfg(feature = "serde")]
48use serde::{Deserialize, Serialize};
49
50// ---------------------------------------------------------------------------
51// BenchmarkResult
52// ---------------------------------------------------------------------------
53
/// Summary timing statistics gathered for one named benchmark.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct BenchmarkResult {
    /// Label identifying the benchmark.
    pub name: String,
    /// Average wall-clock duration of one iteration, in nanoseconds.
    pub mean_ns: f64,
    /// Standard deviation of the per-iteration durations, in nanoseconds.
    pub std_dev_ns: f64,
    /// Shortest observed iteration, in nanoseconds.
    pub min_ns: u64,
    /// Longest observed iteration, in nanoseconds.
    pub max_ns: u64,
    /// How many iterations the statistics were computed from.
    pub num_iterations: usize,
}

impl BenchmarkResult {
    /// Coefficient of variation, i.e. `std_dev_ns / mean_ns`.
    ///
    /// A mean of exactly zero yields `0.0` instead of dividing by zero.
    pub fn cv(&self) -> f64 {
        if self.mean_ns != 0.0 {
            self.std_dev_ns / self.mean_ns
        } else {
            0.0
        }
    }
}

impl std::fmt::Display for BenchmarkResult {
    /// Render the result on one line: name, mean ± standard deviation (one
    /// decimal place each), then min, max and the iteration count.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            name,
            mean_ns,
            std_dev_ns,
            min_ns,
            max_ns,
            num_iterations,
        } = self;
        write!(
            f,
            "{}: mean={:.1}ns ±{:.1}ns (min={}, max={}, n={})",
            name, mean_ns, std_dev_ns, min_ns, max_ns, num_iterations,
        )
    }
}
93
94// ---------------------------------------------------------------------------
95// BenchmarkBaseline
96// ---------------------------------------------------------------------------
97
98/// A collection of [`BenchmarkResult`]s representing either a stored baseline
99/// or a fresh measurement run.
100#[derive(Debug, Clone)]
101#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
102pub struct BenchmarkBaseline {
103    /// Name of the crate or component being benchmarked.
104    pub crate_name: String,
105    /// Git commit hash at the time of recording, if available.
106    pub git_hash: Option<String>,
107    /// ISO 8601 timestamp string (e.g. `"2026-01-01T00:00:00Z"`).
108    pub timestamp: String,
109    /// Individual benchmark results.
110    pub results: Vec<BenchmarkResult>,
111}
112
113impl BenchmarkBaseline {
114    /// Create an empty baseline for `crate_name`.
115    ///
116    /// `git_hash` is populated from the `GIT_HASH` environment variable when
117    /// present; otherwise `None` is stored.
118    pub fn new(crate_name: impl Into<String>) -> Self {
119        let git_hash = std::env::var("GIT_HASH").ok();
120        // Use a simple static timestamp fallback when chrono is unavailable.
121        let timestamp = chrono_timestamp_or_placeholder();
122        Self {
123            crate_name: crate_name.into(),
124            git_hash,
125            timestamp,
126            results: Vec::new(),
127        }
128    }
129
130    /// Append a [`BenchmarkResult`] to this baseline.
131    pub fn add(&mut self, result: BenchmarkResult) {
132        self.results.push(result);
133    }
134
135    /// Return a reference to the [`BenchmarkResult`] with the given name, or
136    /// `None` if not present.
137    pub fn find(&self, name: &str) -> Option<&BenchmarkResult> {
138        self.results.iter().find(|r| r.name == name)
139    }
140
141    /// Persist the baseline to a JSON file at `path`.
142    ///
143    /// # Errors
144    ///
145    /// Propagates I/O and serialisation errors as [`std::io::Error`].
146    #[cfg(feature = "serde")]
147    pub fn save(&self, path: &std::path::Path) -> std::io::Result<()> {
148        let json = serde_json::to_string_pretty(self).map_err(|e| std::io::Error::other(e))?;
149        std::fs::write(path, json)
150    }
151
152    /// Load a baseline from a JSON file at `path`.
153    ///
154    /// # Errors
155    ///
156    /// Propagates I/O and deserialisation errors as [`std::io::Error`].
157    #[cfg(feature = "serde")]
158    pub fn load(path: &std::path::Path) -> std::io::Result<Self> {
159        let json = std::fs::read_to_string(path)?;
160        serde_json::from_str(&json).map_err(|e| std::io::Error::other(e))
161    }
162
163    /// Return a summary line for each result in Markdown table form.
164    pub fn to_markdown(&self) -> String {
165        let mut md = String::new();
166        md.push_str(&format!("## Benchmark Baseline: `{}`\n\n", self.crate_name));
167        if let Some(ref h) = self.git_hash {
168            md.push_str(&format!("Git hash: `{h}`  \n"));
169        }
170        md.push_str(&format!("Timestamp: {}  \n\n", self.timestamp));
171        md.push_str("| Benchmark | Mean (ns) | Std Dev (ns) | Min (ns) | Max (ns) | N |\n");
172        md.push_str("|-----------|-----------|--------------|----------|----------|---|\n");
173        for r in &self.results {
174            md.push_str(&format!(
175                "| {} | {:.1} | {:.1} | {} | {} | {} |\n",
176                r.name, r.mean_ns, r.std_dev_ns, r.min_ns, r.max_ns, r.num_iterations
177            ));
178        }
179        md
180    }
181}
182
183// ---------------------------------------------------------------------------
184// RegressionEntry
185// ---------------------------------------------------------------------------
186
/// One benchmark whose slowdown exceeded the configured threshold.
#[derive(Debug, Clone)]
pub struct RegressionEntry {
    /// Benchmark name.
    pub name: String,
    /// Mean time recorded in the baseline run, in nanoseconds.
    pub baseline_mean_ns: f64,
    /// Mean time recorded in the current run, in nanoseconds.
    pub current_mean_ns: f64,
    /// Relative change, computed as `(current − baseline) / baseline`.
    ///
    /// For example, `0.10` means the benchmark became 10 % slower.
    pub relative_change: f64,
}

impl RegressionEntry {
    /// Format [`Self::relative_change`] as an explicitly signed percentage
    /// with one decimal place (e.g. `"+12.3%"`).
    pub fn change_pct_str(&self) -> String {
        let percent = self.relative_change * 100.0;
        format!("{:+.1}%", percent)
    }
}
209
210// ---------------------------------------------------------------------------
211// RegressionDetector
212// ---------------------------------------------------------------------------
213
214/// Compares a fresh benchmark run against a stored baseline and identifies
215/// benchmarks that regressed beyond the configured relative threshold.
216#[derive(Debug, Clone)]
217pub struct RegressionDetector {
218    /// Minimum relative slowdown before a result is considered a regression.
219    ///
220    /// E.g. `0.05` means a 5 % slow-down triggers a regression report.
221    pub threshold: f64,
222}
223
224impl RegressionDetector {
225    /// Create a new [`RegressionDetector`] with the given threshold.
226    pub fn new(threshold: f64) -> Self {
227        Self { threshold }
228    }
229
230    /// Return all benchmarks present in both `baseline` and `current` whose
231    /// mean time increased by more than `self.threshold`.
232    pub fn find_regressions(
233        &self,
234        baseline: &BenchmarkBaseline,
235        current: &BenchmarkBaseline,
236    ) -> Vec<RegressionEntry> {
237        let mut entries = Vec::new();
238        for cur in &current.results {
239            if let Some(base) = baseline.find(&cur.name) {
240                if base.mean_ns > 0.0 {
241                    let rel = (cur.mean_ns - base.mean_ns) / base.mean_ns;
242                    if rel > self.threshold {
243                        entries.push(RegressionEntry {
244                            name: cur.name.clone(),
245                            baseline_mean_ns: base.mean_ns,
246                            current_mean_ns: cur.mean_ns,
247                            relative_change: rel,
248                        });
249                    }
250                }
251            }
252        }
253        entries
254    }
255
256    /// Returns `true` when at least one regression is found.
257    pub fn has_regressions(
258        &self,
259        baseline: &BenchmarkBaseline,
260        current: &BenchmarkBaseline,
261    ) -> bool {
262        !self.find_regressions(baseline, current).is_empty()
263    }
264
265    /// Format the regression list as a Markdown table.
266    pub fn report(&self, regressions: &[RegressionEntry]) -> String {
267        if regressions.is_empty() {
268            return "No performance regressions detected.\n".to_string();
269        }
270        let mut md = String::new();
271        md.push_str("## Performance Regressions\n\n");
272        md.push_str(&format!("Threshold: {:.1}%\n\n", self.threshold * 100.0));
273        md.push_str("| Benchmark | Baseline (ns) | Current (ns) | Change |\n");
274        md.push_str("|-----------|---------------|--------------|--------|\n");
275        for r in regressions {
276            md.push_str(&format!(
277                "| {} | {:.1} | {:.1} | {} |\n",
278                r.name,
279                r.baseline_mean_ns,
280                r.current_mean_ns,
281                r.change_pct_str(),
282            ));
283        }
284        md
285    }
286}
287
288// ---------------------------------------------------------------------------
289// BenchmarkHarness
290// ---------------------------------------------------------------------------
291
292/// Simple timing harness.
293///
294/// Call [`BenchmarkHarness::run`] with a closure to record per-iteration
295/// nanosecond timings, then call [`BenchmarkHarness::finish`] to get a
296/// [`BenchmarkResult`] with mean, standard deviation, min, and max.
297pub struct BenchmarkHarness {
298    name: String,
299    /// Raw per-iteration timings in nanoseconds.
300    timings: Vec<u64>,
301}
302
303impl BenchmarkHarness {
304    /// Create a new harness for a benchmark named `name`.
305    pub fn new(name: impl Into<String>) -> Self {
306        Self {
307            name: name.into(),
308            timings: Vec::new(),
309        }
310    }
311
312    /// Execute `f` for exactly `iterations` iterations, recording the
313    /// wall-clock time of each call.
314    ///
315    /// Calling [`Self::run`] multiple times discards previous timings.
316    pub fn run(&mut self, iterations: usize, f: impl Fn()) -> &mut Self {
317        self.timings.clear();
318        self.timings.reserve(iterations);
319        for _ in 0..iterations {
320            let start = Instant::now();
321            f();
322            self.timings.push(start.elapsed().as_nanos() as u64);
323        }
324        self
325    }
326
327    /// Compute summary statistics from the recorded timings and return a
328    /// [`BenchmarkResult`].
329    ///
330    /// Returns a zeroed result when no timings have been recorded.
331    pub fn finish(&self) -> BenchmarkResult {
332        if self.timings.is_empty() {
333            return BenchmarkResult {
334                name: self.name.clone(),
335                mean_ns: 0.0,
336                std_dev_ns: 0.0,
337                min_ns: 0,
338                max_ns: 0,
339                num_iterations: 0,
340            };
341        }
342
343        let n = self.timings.len();
344        let sum: u64 = self.timings.iter().sum();
345        let mean = sum as f64 / n as f64;
346
347        let variance = self
348            .timings
349            .iter()
350            .map(|&t| {
351                let diff = t as f64 - mean;
352                diff * diff
353            })
354            .sum::<f64>()
355            / n as f64;
356        let std_dev = variance.sqrt();
357
358        let min = *self.timings.iter().min().unwrap_or(&0);
359        let max = *self.timings.iter().max().unwrap_or(&0);
360
361        BenchmarkResult {
362            name: self.name.clone(),
363            mean_ns: mean,
364            std_dev_ns: std_dev,
365            min_ns: min,
366            max_ns: max,
367            num_iterations: n,
368        }
369    }
370}
371
372// ---------------------------------------------------------------------------
373// Internal helpers
374// ---------------------------------------------------------------------------
375
376/// Return an ISO 8601-like timestamp using [`chrono`] when it is compiled in,
377/// or a static placeholder otherwise.
378fn chrono_timestamp_or_placeholder() -> String {
379    // chrono is always available as a workspace dep in scirs2-core.
380    use chrono::Utc;
381    Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string()
382}
383
384// ---------------------------------------------------------------------------
385// Tests
386// ---------------------------------------------------------------------------
387
#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // BenchmarkHarness
    // -----------------------------------------------------------------------

    /// A run of N iterations yields a result carrying the harness name, N
    /// samples, and consistent min/max.
    #[test]
    fn test_harness_records_timings() {
        let mut h = BenchmarkHarness::new("noop");
        h.run(50, || {});
        let r = h.finish();
        assert_eq!(r.name, "noop");
        assert_eq!(r.num_iterations, 50);
        assert!(r.mean_ns >= 0.0);
        assert!(r.min_ns <= r.max_ns);
    }

    /// Finishing without ever running produces the all-zero result.
    #[test]
    fn test_harness_empty_finish() {
        let h = BenchmarkHarness::new("empty");
        let r = h.finish();
        assert_eq!(r.num_iterations, 0);
        assert_eq!(r.mean_ns, 0.0);
        assert_eq!(r.min_ns, 0);
        assert_eq!(r.max_ns, 0);
    }

    /// A second `run` replaces, rather than extends, the earlier timings.
    #[test]
    fn test_harness_run_resets_previous_timings() {
        let mut h = BenchmarkHarness::new("reset");
        h.run(200, || {});
        h.run(10, || {});
        let r = h.finish();
        assert_eq!(r.num_iterations, 10);
    }

    #[test]
    fn test_harness_std_dev_nonnegative() {
        let mut h = BenchmarkHarness::new("std_dev");
        h.run(100, || {
            // a tiny bit of work so timings aren't all zero
            let _: u64 = (0_u64..100).sum();
        });
        let r = h.finish();
        assert!(r.std_dev_ns >= 0.0);
    }

    /// `cv` must not divide by a zero mean.
    #[test]
    fn test_harness_cv_zero_mean() {
        let r = BenchmarkResult {
            name: "x".to_string(),
            mean_ns: 0.0,
            std_dev_ns: 0.0,
            min_ns: 0,
            max_ns: 0,
            num_iterations: 1,
        };
        assert_eq!(r.cv(), 0.0);
    }

    // -----------------------------------------------------------------------
    // BenchmarkBaseline
    // -----------------------------------------------------------------------

    #[test]
    fn test_baseline_add_and_find() {
        let mut bl = BenchmarkBaseline::new("my_crate");
        bl.add(BenchmarkResult {
            name: "foo".to_string(),
            mean_ns: 100.0,
            std_dev_ns: 5.0,
            min_ns: 90,
            max_ns: 120,
            num_iterations: 50,
        });
        let found = bl.find("foo").expect("result added under `foo` must be found");
        assert!((found.mean_ns - 100.0).abs() < 1e-9);
    }

    #[test]
    fn test_baseline_find_missing() {
        let bl = BenchmarkBaseline::new("x");
        assert!(bl.find("nonexistent").is_none());
    }

    #[test]
    fn test_baseline_markdown_format() {
        let mut bl = BenchmarkBaseline::new("scirs2-test");
        bl.add(BenchmarkResult {
            name: "bench_a".to_string(),
            mean_ns: 250.5,
            std_dev_ns: 12.3,
            min_ns: 220,
            max_ns: 310,
            num_iterations: 100,
        });
        let md = bl.to_markdown();
        assert!(md.contains("## Benchmark Baseline: `scirs2-test`"));
        assert!(md.contains("bench_a"));
        assert!(md.contains("250.5"));
    }

    // -----------------------------------------------------------------------
    // Save/load roundtrip (requires serde feature)
    // -----------------------------------------------------------------------

    #[cfg(feature = "serde")]
    #[test]
    fn test_baseline_save_load_roundtrip() {
        let mut bl = BenchmarkBaseline::new("roundtrip_crate");
        bl.git_hash = Some("abc123".to_string());
        bl.add(BenchmarkResult {
            name: "test_bench".to_string(),
            mean_ns: 42.0,
            std_dev_ns: 1.5,
            min_ns: 38,
            max_ns: 50,
            num_iterations: 200,
        });

        // Include the process id so that concurrent test processes sharing
        // the same temp directory do not overwrite each other's file.
        let path = std::env::temp_dir().join(format!(
            "scirs2_core_benchmark_baseline_test_{}.json",
            std::process::id()
        ));
        bl.save(&path).expect("save failed");

        let loaded = BenchmarkBaseline::load(&path);
        // Clean up before asserting so a failing assertion does not leave the
        // file behind.
        let _ = std::fs::remove_file(&path);
        let loaded = loaded.expect("load failed");

        assert_eq!(loaded.crate_name, "roundtrip_crate");
        assert_eq!(loaded.git_hash.as_deref(), Some("abc123"));
        assert_eq!(loaded.results.len(), 1);
        assert!((loaded.results[0].mean_ns - 42.0).abs() < 1e-9);
    }

    // -----------------------------------------------------------------------
    // RegressionDetector
    // -----------------------------------------------------------------------

    /// Build a one-entry baseline whose single benchmark has the given name
    /// and mean; the remaining statistics are derived filler values.
    fn make_baseline(name: &str, mean_ns: f64) -> BenchmarkBaseline {
        let mut bl = BenchmarkBaseline::new("test");
        bl.add(BenchmarkResult {
            name: name.to_string(),
            mean_ns,
            std_dev_ns: 1.0,
            min_ns: (mean_ns * 0.9) as u64,
            max_ns: (mean_ns * 1.1) as u64,
            num_iterations: 100,
        });
        bl
    }

    #[test]
    fn test_regression_none_when_no_change() {
        let baseline = make_baseline("bench", 100.0);
        let current = make_baseline("bench", 100.0);
        let det = RegressionDetector::new(0.05);
        assert!(!det.has_regressions(&baseline, &current));
        assert!(det.find_regressions(&baseline, &current).is_empty());
    }

    #[test]
    fn test_regression_detected_above_threshold() {
        let baseline = make_baseline("bench", 100.0);
        let current = make_baseline("bench", 115.0); // +15 %
        let det = RegressionDetector::new(0.10); // 10 % threshold
        assert!(det.has_regressions(&baseline, &current));
        let regs = det.find_regressions(&baseline, &current);
        assert_eq!(regs.len(), 1);
        assert!((regs[0].relative_change - 0.15).abs() < 1e-9);
    }

    #[test]
    fn test_regression_not_detected_below_threshold() {
        let baseline = make_baseline("bench", 100.0);
        let current = make_baseline("bench", 104.0); // +4 %
        let det = RegressionDetector::new(0.10); // 10 % threshold
        assert!(!det.has_regressions(&baseline, &current));
    }

    #[test]
    fn test_regression_improvement_not_flagged() {
        // A benchmark that got faster should never be flagged.
        let baseline = make_baseline("bench", 100.0);
        let current = make_baseline("bench", 80.0); // 20 % faster
        let det = RegressionDetector::new(0.05);
        assert!(!det.has_regressions(&baseline, &current));
    }

    #[test]
    fn test_regression_missing_benchmark_skipped() {
        let baseline = make_baseline("bench_a", 100.0);
        let current = make_baseline("bench_b", 200.0); // different name
        let det = RegressionDetector::new(0.05);
        assert!(!det.has_regressions(&baseline, &current));
    }

    #[test]
    fn test_regression_report_empty() {
        let det = RegressionDetector::new(0.05);
        let report = det.report(&[]);
        assert!(report.contains("No performance regressions"));
    }

    #[test]
    fn test_regression_report_markdown() {
        let det = RegressionDetector::new(0.10);
        let entries = vec![RegressionEntry {
            name: "slow_bench".to_string(),
            baseline_mean_ns: 100.0,
            current_mean_ns: 150.0,
            relative_change: 0.5,
        }];
        let report = det.report(&entries);
        assert!(report.contains("## Performance Regressions"));
        assert!(report.contains("slow_bench"));
        assert!(report.contains("+50.0%"));
    }
}