cgp/analysis/
regression.rs

//! Regression detection using bootstrap confidence intervals.
//! Methodology from Hoefler & Belli (2015) [8]: "Scientific Benchmarking
//! of Parallel Computing Systems."
//! Also supports PELT changepoint detection [43].
5
6use serde::{Deserialize, Serialize};
7
8/// Result of a regression comparison.
9#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
10pub enum Verdict {
11    Regression,
12    Improvement,
13    NoChange,
14}
15
16impl std::fmt::Display for Verdict {
17    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
18        match self {
19            Verdict::Regression => write!(f, "REGRESSION"),
20            Verdict::Improvement => write!(f, "IMPROVED"),
21            Verdict::NoChange => write!(f, "NO_CHANGE"),
22        }
23    }
24}
25
26/// Full regression comparison result.
27#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct RegressionResult {
29    pub verdict: Verdict,
30    /// Change from baseline (positive = slower/regression, negative = faster/improvement)
31    pub change_pct: f64,
32    /// Statistical significance
33    pub p_value: f64,
34    /// Cohen's d effect size (|d| >= 0.8 = large effect)
35    pub effect_size_cohens_d: f64,
36    /// Bootstrap 99% CI lower bound for ratio (current/baseline)
37    pub ci_lower: f64,
38    /// Bootstrap 99% CI upper bound for ratio (current/baseline)
39    pub ci_upper: f64,
40}
41
/// Performance regression detector.
/// Uses bootstrap confidence intervals per Hoefler & Belli [8].
///
/// All fields are public so a detector can be built with struct-update syntax,
/// e.g. `RegressionDetector { threshold: 0.10, ..Default::default() }`.
#[derive(Debug, Clone, PartialEq)]
pub struct RegressionDetector {
    /// Minimum number of samples for statistical significance.
    // NOTE(review): no code visible in this file reads this field; confirm where
    // (or whether) it is enforced.
    pub min_samples: usize,
    /// Confidence level for the bootstrap CI, as a fraction (e.g., 0.99 = 99%).
    pub confidence: f64,
    /// Regression threshold (fraction, e.g., 0.05 = 5%).
    pub threshold: f64,
    /// Require a large effect size (|Cohen's d| >= 0.8) in addition to the CI test.
    pub require_large_effect: bool,
    /// Number of bootstrap resamples.
    pub bootstrap_iterations: usize,
}
56
57impl Default for RegressionDetector {
58    fn default() -> Self {
59        Self {
60            min_samples: 30,
61            confidence: 0.99,
62            threshold: 0.05,
63            require_large_effect: true,
64            bootstrap_iterations: 10_000,
65        }
66    }
67}
68
69impl RegressionDetector {
70    pub fn new() -> Self {
71        Self::default()
72    }
73
74    /// Compare baseline and current samples.
75    /// Returns the regression verdict with statistical details.
76    pub fn compare(&self, baseline: &[f64], current: &[f64]) -> RegressionResult {
77        if baseline.is_empty() || current.is_empty() {
78            return RegressionResult {
79                verdict: Verdict::NoChange,
80                change_pct: 0.0,
81                p_value: 1.0,
82                effect_size_cohens_d: 0.0,
83                ci_lower: 1.0,
84                ci_upper: 1.0,
85            };
86        }
87
88        let baseline_mean = mean(baseline);
89        let current_mean = mean(current);
90
91        if baseline_mean == 0.0 {
92            return RegressionResult {
93                verdict: Verdict::NoChange,
94                change_pct: 0.0,
95                p_value: 1.0,
96                effect_size_cohens_d: 0.0,
97                ci_lower: 1.0,
98                ci_upper: 1.0,
99            };
100        }
101
102        let ratio = current_mean / baseline_mean;
103        let change_pct = (ratio - 1.0) * 100.0;
104
105        // Cohen's d effect size
106        let cohens_d = compute_cohens_d(baseline, current);
107
108        // Bootstrap CI for the ratio of means
109        let (ci_lower, ci_upper) = self.bootstrap_ratio_ci(baseline, current);
110
111        // p-value: fraction of bootstrap samples where ratio crosses 1.0
112        let p_value = self.bootstrap_p_value(baseline, current);
113
114        // Determine verdict
115        let verdict = if ci_lower > 1.0 + self.threshold {
116            // Entire CI above 1+threshold: regression (slower)
117            if !self.require_large_effect || cohens_d.abs() >= 0.8 {
118                Verdict::Regression
119            } else {
120                Verdict::NoChange
121            }
122        } else if ci_upper < 1.0 - self.threshold {
123            // Entire CI below 1-threshold: improvement (faster)
124            if !self.require_large_effect || cohens_d.abs() >= 0.8 {
125                Verdict::Improvement
126            } else {
127                Verdict::NoChange
128            }
129        } else {
130            Verdict::NoChange
131        };
132
133        RegressionResult {
134            verdict,
135            change_pct,
136            p_value,
137            effect_size_cohens_d: cohens_d,
138            ci_lower,
139            ci_upper,
140        }
141    }
142
143    /// Bootstrap confidence interval for the ratio of means.
144    fn bootstrap_ratio_ci(&self, baseline: &[f64], current: &[f64]) -> (f64, f64) {
145        let mut ratios = Vec::with_capacity(self.bootstrap_iterations);
146        let alpha = 1.0 - self.confidence;
147
148        // Simple LCG PRNG for reproducibility (no external dependency)
149        let mut rng_state: u64 = 42;
150        let lcg_next = |state: &mut u64| -> usize {
151            *state = state
152                .wrapping_mul(6_364_136_223_846_793_005)
153                .wrapping_add(1);
154            (*state >> 33) as usize
155        };
156
157        for _ in 0..self.bootstrap_iterations {
158            let b_mean = bootstrap_mean(baseline, &mut rng_state, &lcg_next);
159            let c_mean = bootstrap_mean(current, &mut rng_state, &lcg_next);
160            if b_mean > 0.0 {
161                ratios.push(c_mean / b_mean);
162            }
163        }
164
165        ratios.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
166
167        if ratios.is_empty() {
168            return (1.0, 1.0);
169        }
170
171        let lower_idx = ((alpha / 2.0) * ratios.len() as f64) as usize;
172        let upper_idx = ((1.0 - alpha / 2.0) * ratios.len() as f64) as usize;
173
174        let lower = ratios[lower_idx.min(ratios.len() - 1)];
175        let upper = ratios[upper_idx.min(ratios.len() - 1)];
176        (lower, upper)
177    }
178
179    /// Bootstrap p-value: fraction of bootstrap resamples where the null hypothesis
180    /// (no difference) would produce the observed ratio or more extreme.
181    fn bootstrap_p_value(&self, baseline: &[f64], current: &[f64]) -> f64 {
182        let observed_ratio = mean(current) / mean(baseline).max(f64::EPSILON);
183
184        // Pool samples under null hypothesis
185        let mut pooled = Vec::with_capacity(baseline.len() + current.len());
186        pooled.extend_from_slice(baseline);
187        pooled.extend_from_slice(current);
188
189        let mut rng_state: u64 = 123;
190        let lcg_next = |state: &mut u64| -> usize {
191            *state = state
192                .wrapping_mul(6_364_136_223_846_793_005)
193                .wrapping_add(1);
194            (*state >> 33) as usize
195        };
196
197        let mut extreme_count = 0;
198        for _ in 0..self.bootstrap_iterations {
199            let b_mean =
200                bootstrap_mean_from_pool(&pooled, baseline.len(), &mut rng_state, &lcg_next);
201            let c_mean =
202                bootstrap_mean_from_pool(&pooled, current.len(), &mut rng_state, &lcg_next);
203            if b_mean > 0.0 {
204                let null_ratio = c_mean / b_mean;
205                if (null_ratio - 1.0).abs() >= (observed_ratio - 1.0).abs() {
206                    extreme_count += 1;
207                }
208            }
209        }
210
211        extreme_count as f64 / self.bootstrap_iterations as f64
212    }
213}
214
/// Arithmetic mean of `data`; an empty slice yields `0.0`.
fn mean(data: &[f64]) -> f64 {
    match data.len() {
        0 => 0.0,
        count => data.iter().sum::<f64>() / count as f64,
    }
}
221
222fn variance(data: &[f64]) -> f64 {
223    if data.len() < 2 {
224        return 0.0;
225    }
226    let m = mean(data);
227    data.iter().map(|x| (x - m).powi(2)).sum::<f64>() / (data.len() - 1) as f64
228}
229
230/// Cohen's d = (mean1 - mean2) / pooled_std_dev
231fn compute_cohens_d(baseline: &[f64], current: &[f64]) -> f64 {
232    let m1 = mean(baseline);
233    let m2 = mean(current);
234    let v1 = variance(baseline);
235    let v2 = variance(current);
236    let n1 = baseline.len() as f64;
237    let n2 = current.len() as f64;
238
239    // Pooled standard deviation
240    let pooled_var = ((n1 - 1.0) * v1 + (n2 - 1.0) * v2) / (n1 + n2 - 2.0);
241    let pooled_sd = pooled_var.sqrt();
242
243    if pooled_sd == 0.0 {
244        return 0.0;
245    }
246
247    (m2 - m1) / pooled_sd
248}
249
/// Resample with replacement and compute mean.
///
/// Draws `data.len()` indices by calling `lcg_next(rng_state)` and reducing
/// each result modulo the slice length. An empty slice yields `0.0` (matching
/// `mean`) without advancing the generator, rather than the NaN that `0.0 / 0.0`
/// would produce.
fn bootstrap_mean(data: &[f64], rng_state: &mut u64, lcg_next: &dyn Fn(&mut u64) -> usize) -> f64 {
    let n = data.len();
    if n == 0 {
        return 0.0;
    }

    let total: f64 = (0..n)
        .map(|_| data[lcg_next(&mut *rng_state) % n])
        .sum();
    total / n as f64
}
260
/// Resample from a pooled set with replacement.
///
/// Draws `sample_size` values from `pool` (indices come from
/// `lcg_next(rng_state)` reduced modulo the pool length) and returns their mean.
/// Returns `0.0` without advancing the generator when `pool` is empty or
/// `sample_size` is zero; otherwise the index computation would panic on a
/// remainder by zero, or the final division would produce NaN.
fn bootstrap_mean_from_pool(
    pool: &[f64],
    sample_size: usize,
    rng_state: &mut u64,
    lcg_next: &dyn Fn(&mut u64) -> usize,
) -> f64 {
    let n = pool.len();
    if n == 0 || sample_size == 0 {
        return 0.0;
    }

    let total: f64 = (0..sample_size)
        .map(|_| pool[lcg_next(&mut *rng_state) % n])
        .sum();
    total / sample_size as f64
}
276
#[cfg(test)]
mod tests {
    use super::*;

    /// FALSIFY-CGP-030: Must detect deliberate 10% regression.
    #[test]
    fn test_detect_10pct_regression() {
        let detector = RegressionDetector {
            min_samples: 10,
            bootstrap_iterations: 5_000,
            require_large_effect: false,
            ..Default::default()
        };

        // Baseline: mean ~100us
        let baseline: Vec<f64> = (0..50).map(|i| 100.0 + (i as f64 * 0.1) - 2.5).collect();
        // Current: mean ~112us (12% slower, comfortably above the 10% target)
        let current: Vec<f64> = (0..50).map(|i| 112.0 + (i as f64 * 0.1) - 2.5).collect();

        let result = detector.compare(&baseline, &current);
        assert_eq!(result.verdict, Verdict::Regression);
        assert!(result.change_pct > 10.0);
    }

    /// FALSIFY-CGP-031: Must NOT false-positive on noise (<2% variation).
    #[test]
    fn test_no_false_positive_on_noise() {
        let detector = RegressionDetector {
            min_samples: 10,
            bootstrap_iterations: 5_000,
            ..Default::default()
        };

        // Same shape of noise in both groups; the means differ by only 0.5us (~0.5%).
        let baseline: Vec<f64> = (0..50).map(|i| 100.0 + (i as f64 % 3.0) - 1.0).collect();
        let current: Vec<f64> = (0..50).map(|i| 100.5 + (i as f64 % 3.0) - 1.0).collect();

        let result = detector.compare(&baseline, &current);
        assert_eq!(result.verdict, Verdict::NoChange);
    }

    /// FALSIFY-CGP-032: Must detect improvement.
    #[test]
    fn test_detect_improvement() {
        let detector = RegressionDetector {
            min_samples: 10,
            bootstrap_iterations: 5_000,
            require_large_effect: false,
            ..Default::default()
        };

        // Baseline: ~35.7us
        let baseline: Vec<f64> = (0..50).map(|i| 35.7 + (i as f64 * 0.01) - 0.25).collect();
        // Current: ~23.2us (35% faster)
        let current: Vec<f64> = (0..50).map(|i| 23.2 + (i as f64 * 0.01) - 0.25).collect();

        let result = detector.compare(&baseline, &current);
        assert_eq!(result.verdict, Verdict::Improvement);
        assert!(result.change_pct < -30.0);
    }

    #[test]
    fn test_cohens_d_large_effect() {
        // With zero variance in each group the pooled standard deviation is 0,
        // and the effect size is defined as 0 rather than infinite.
        let baseline: Vec<f64> = vec![10.0; 30];
        let current: Vec<f64> = vec![15.0; 30];
        assert_eq!(compute_cohens_d(&baseline, &current), 0.0);

        // With slight variation the same 5-unit shift is a large effect.
        let baseline: Vec<f64> = (0..30).map(|i| 10.0 + (i as f64 * 0.1)).collect();
        let current: Vec<f64> = (0..30).map(|i| 15.0 + (i as f64 * 0.1)).collect();
        let d = compute_cohens_d(&baseline, &current);
        assert!(d.abs() > 0.8, "Cohen's d = {d:.2} should be large effect");
    }

    #[test]
    fn test_empty_samples() {
        let detector = RegressionDetector::new();

        // Empty baseline.
        let result = detector.compare(&[], &[1.0, 2.0]);
        assert_eq!(result.verdict, Verdict::NoChange);
        assert_eq!(result.change_pct, 0.0);
        assert_eq!(result.p_value, 1.0);

        // Empty current sample.
        let result = detector.compare(&[1.0, 2.0], &[]);
        assert_eq!(result.verdict, Verdict::NoChange);
        assert_eq!(result.change_pct, 0.0);
        assert_eq!(result.p_value, 1.0);
    }
}
cgp/analysis/regression.rs

cgp/analysis/
regression.rs