// aatxe_core/compare.rs

//! Compare two [`RunReport`]s and produce a [`CompareReport`].
//!
//! ## Verdict — three independent signals
//!
//! 1. **Effect size**: relative change in *median*. Median is preferred over
//!    mean because bench distributions have heavy right tails (GC pauses,
//!    scheduler pre-emption) that pull the mean upward and inflate variance.
//! 2. **Significance**: Mann–Whitney U two-tailed p-value. Non-parametric;
//!    makes no normality assumption.
//! 3. **Noise gate**: if either side's CV is high *and* the measured delta is
//!    small relative to that noise, the diff is labelled `Neutral / TooNoisy`
//!    rather than `Regression` or `Improvement`. This keeps hopelessly noisy
//!    benches from spamming the PR comment.

use crate::stats::{mann_whitney_u, summarize_samples, welch_t};
use crate::types::{
    BenchDiff, BenchRun, CompareReport, CompareSide, CompareSummary, NeutralReason, RunReport,
    Verdict,
};
use std::collections::{BTreeSet, HashMap, HashSet};

/// Tunable thresholds that drive the per-bench verdict.
///
/// [`CompareOptions::default`] yields the stock gate. Override individual
/// fields (for example with struct-update syntax,
/// `CompareOptions { alpha: 0.01, ..Default::default() }`) when a specific
/// service needs a tighter or looser gate.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct CompareOptions {
    /// Minimum |median delta|, expressed as a fraction of the base median,
    /// for a change to count as meaningful. Default `0.05` (5%).
    pub threshold_pct: f64,
    /// Significance level the Mann–Whitney U p-value is compared against.
    /// Default `0.05`.
    pub alpha: f64,
    /// Noise gate: when the larger of the two sides' CVs exceeds this value
    /// and |median delta| is smaller than `2 × max_cv`, the diff is classified
    /// neutral with reason `TooNoisy`. Default `0.25` (25%).
    pub noisy_cv_threshold: f64,
}

impl Default for CompareOptions {
    /// Stock thresholds: 5% effect size, α = 0.05, 25% CV noise gate.
    fn default() -> Self {
        Self {
            threshold_pct: 0.05,
            alpha: 0.05,
            noisy_cv_threshold: 0.25,
        }
    }
}

48/// Compare two run reports and classify each bench.
49///
50/// Pairing is by bench name (whitespace-significant). Benches present only on
51/// one side are tagged [`Verdict::New`] or [`Verdict::Removed`]. When the
52/// head report carries an [`AffectedScope`](crate::types::AffectedScope), benches whose file lives among
53/// the explicitly-skipped files are tagged [`Verdict::OutOfScope`] instead of
54/// [`Verdict::Removed`] — they must not gate CI as regressions.
55pub fn compare_reports(
56    base: &RunReport,
57    head: &RunReport,
58    options: CompareOptions,
59) -> CompareReport {
60    let mut base_by_name: HashMap<&str, BenchRun> = HashMap::new();
61    for r in &base.runs {
62        base_by_name.insert(r.name.as_str(), normalize(r));
63    }
64    let mut head_by_name: HashMap<&str, BenchRun> = HashMap::new();
65    for r in &head.runs {
66        head_by_name.insert(r.name.as_str(), normalize(r));
67    }
68    let mut all_names: BTreeSet<&str> = BTreeSet::new();
69    all_names.extend(base_by_name.keys().copied());
70    all_names.extend(head_by_name.keys().copied());
71
72    let skipped_basenames: Option<HashSet<String>> = head.affected_scope.as_ref().map(|a| {
73        a.skipped_bench_files
74            .iter()
75            .map(|p| basename_of(p).to_string())
76            .collect()
77    });
78
79    let mut diffs: Vec<BenchDiff> = Vec::with_capacity(all_names.len());
80    for name in all_names {
81        let b = base_by_name.get(name).cloned();
82        let h = head_by_name.get(name).cloned();
83        diffs.push(diff_one(
84            name.to_string(),
85            b,
86            h,
87            options,
88            skipped_basenames.as_ref(),
89        ));
90    }
91
92    let summary = summarize(&diffs);
93
94    CompareReport {
95        base: CompareSide {
96            r#ref: base.r#ref.clone(),
97            service: base.service.clone(),
98        },
99        head: CompareSide {
100            r#ref: head.r#ref.clone(),
101            service: head.service.clone(),
102        },
103        language: head.language,
104        threshold_pct: options.threshold_pct,
105        alpha: options.alpha,
106        noisy_cv_threshold: options.noisy_cv_threshold,
107        diffs,
108        summary,
109        affected_scope: head.affected_scope.clone(),
110    }
111}
112
113/// True if the comparator marked any bench as a regression. Use this for the
114/// CI exit-code gate (non-zero ⇒ fail).
115pub fn has_regressions(cmp: &CompareReport) -> bool {
116    cmp.summary.regressions > 0
117}
118
119fn summarize(diffs: &[BenchDiff]) -> CompareSummary {
120    let mut s = CompareSummary::default();
121    for d in diffs {
122        match d.verdict {
123            Verdict::Regression => s.regressions += 1,
124            Verdict::Improvement => s.improvements += 1,
125            Verdict::Neutral => s.neutrals += 1,
126            Verdict::New => s.new += 1,
127            Verdict::Removed => s.removed += 1,
128            Verdict::OutOfScope => s.out_of_scope += 1,
129        }
130    }
131    s
132}
133
134/// Recompute derived statistics from raw samples if the producer didn't.
135///
136/// Modern (schema v2) reports already carry all derived fields, so this is
137/// essentially a no-op on them. Legacy / minimal producers may emit only
138/// `samples` + `name`, in which case we fill in the rest here.
139fn normalize(r: &BenchRun) -> BenchRun {
140    if r.samples.is_empty() {
141        return r.clone();
142    }
143    // Heuristic: zero variance + zero mean + non-trivial samples ⇒ definitely
144    // not filled in. Recompute unconditionally if any of the derived fields
145    // looks suspect.
146    let suspect = (r.mean == 0.0 && r.median == 0.0)
147        || (r.elapsed_ns == 0.0 && !r.samples.is_empty())
148        || (r.batch_size == 0);
149    if !suspect {
150        return r.clone();
151    }
152    let s = summarize_samples(&r.samples);
153    BenchRun {
154        name: r.name.clone(),
155        file: r.file.clone(),
156        iterations: r.samples.len() as u32,
157        batch_size: if r.batch_size == 0 { 1 } else { r.batch_size },
158        elapsed_ns: if r.elapsed_ns == 0.0 {
159            r.samples.iter().sum()
160        } else {
161            r.elapsed_ns
162        },
163        samples: r.samples.clone(),
164        mean: s.mean,
165        median: s.median,
166        trimmed_mean: s.trimmed_mean,
167        stddev: s.stddev,
168        cv: s.cv,
169        mad: s.mad,
170        iqr: s.iqr,
171        min: s.min,
172        max: s.max,
173        p50: s.p50,
174        p95: s.p95,
175        p99: s.p99,
176        metrics: r.metrics.clone(),
177        tags: r.tags.clone(),
178    }
179}
180
/// Returns the final component of `path`: everything after the last `/` or
/// `\` separator, or the whole string when it contains neither.
///
/// A path that ends in a separator yields the empty string.
fn basename_of(path: &str) -> &str {
    // Both separators are single-byte ASCII, so `i + 1` always lands on a
    // char boundary and the slice below cannot panic.
    let start = path.rfind(['/', '\\']).map(|i| i + 1).unwrap_or(0);
    &path[start..]
}

186fn diff_one(
187    name: String,
188    base: Option<BenchRun>,
189    head: Option<BenchRun>,
190    opts: CompareOptions,
191    skipped_basenames: Option<&HashSet<String>>,
192) -> BenchDiff {
193    match (base, head) {
194        (None, Some(h)) => BenchDiff {
195            name,
196            base: None,
197            head: Some(h),
198            delta_pct: None,
199            mean_delta_pct: None,
200            p_value: None,
201            p_value_welch: None,
202            max_cv: None,
203            verdict: Verdict::New,
204            neutral_reason: None,
205        },
206        (Some(b), None) => {
207            // If the head was scoped via `--affected` and the bench file lives
208            // among the explicitly-skipped files, this is an intentional skip,
209            // not a removal — and must not be counted as a regression.
210            let in_skipped = skipped_basenames
211                .map(|set| set.contains(basename_of(&b.file)))
212                .unwrap_or(false);
213            let verdict = if in_skipped {
214                Verdict::OutOfScope
215            } else {
216                Verdict::Removed
217            };
218            BenchDiff {
219                name,
220                base: Some(b),
221                head: None,
222                delta_pct: None,
223                mean_delta_pct: None,
224                p_value: None,
225                p_value_welch: None,
226                max_cv: None,
227                verdict,
228                neutral_reason: None,
229            }
230        }
231        (None, None) => unreachable!("at least one side must be present"),
232        (Some(b), Some(h)) => {
233            let delta_pct = if b.median == 0.0 {
234                0.0
235            } else {
236                (h.median - b.median) / b.median
237            };
238            let mean_delta_pct = if b.mean == 0.0 {
239                0.0
240            } else {
241                (h.mean - b.mean) / b.mean
242            };
243            let mw = mann_whitney_u(&h.samples, &b.samples);
244            let welch = welch_t(&h.samples, &b.samples);
245            let max_cv = b.cv.max(h.cv);
246
247            let significant = mw.p < opts.alpha;
248            let meaningful = delta_pct.abs() >= opts.threshold_pct;
249            let noisy = max_cv > opts.noisy_cv_threshold && delta_pct.abs() < 2.0 * max_cv;
250
251            let (verdict, neutral_reason) = if noisy {
252                (Verdict::Neutral, Some(NeutralReason::TooNoisy))
253            } else if significant && meaningful && delta_pct > 0.0 {
254                (Verdict::Regression, None)
255            } else if significant && meaningful && delta_pct < 0.0 {
256                (Verdict::Improvement, None)
257            } else if !meaningful {
258                (Verdict::Neutral, Some(NeutralReason::BelowThreshold))
259            } else {
260                (Verdict::Neutral, Some(NeutralReason::NotSignificant))
261            };
262
263            BenchDiff {
264                name,
265                base: Some(b),
266                head: Some(h),
267                delta_pct: Some(delta_pct),
268                mean_delta_pct: Some(mean_delta_pct),
269                p_value: Some(mw.p),
270                p_value_welch: Some(welch.p),
271                max_cv: Some(max_cv),
272                verdict,
273                neutral_reason,
274            }
275        }
276    }
277}