// fluxbench_stats/comparison.rs
1//! A/B Comparison Statistics
2//!
3//! Provides statistical comparison between baseline and candidate distributions
4//! using bootstrap resampling to compute probability of regression.
5
6use crate::outliers::OutlierMethod;
7use crate::summary::{SummaryStatistics, compute_summary};
8use rand::prelude::*;
9use rayon::prelude::*;
10
/// Result of comparing two distributions
#[derive(Debug, Clone)]
pub struct ComparisonResult {
    /// Summary statistics for baseline
    pub baseline_stats: SummaryStatistics,
    /// Summary statistics for candidate
    pub candidate_stats: SummaryStatistics,
    /// Relative change in percent: (candidate - baseline) / baseline * 100
    pub relative_change: f64,
    /// Absolute change of the means (candidate mean - baseline mean), in nanoseconds
    pub absolute_change: f64,
    /// Probability that candidate is slower than baseline (0.0 to 1.0),
    /// estimated as the fraction of bootstrap resamples with a positive difference
    pub probability_regression: f64,
    /// Lower bound of the bootstrap percentile confidence interval of the difference
    pub difference_ci_lower: f64,
    /// Upper bound of the bootstrap percentile confidence interval of the difference
    pub difference_ci_upper: f64,
    /// Whether the difference is statistically significant
    /// (CI excludes zero AND |relative change| exceeds the configured threshold)
    pub is_significant: bool,
    /// Effect size (Cohen's d), signed: positive means candidate is slower
    pub effect_size: f64,
    /// Effect size interpretation
    pub effect_interpretation: EffectInterpretation,
}
35
/// Interpretation of effect size magnitude, per Cohen's conventional thresholds
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EffectInterpretation {
    /// |d| < 0.2 - negligible difference
    Negligible,
    /// 0.2 <= |d| < 0.5 - small difference
    Small,
    /// 0.5 <= |d| < 0.8 - medium difference
    Medium,
    /// |d| >= 0.8 - large difference
    Large,
}
48
49impl std::fmt::Display for EffectInterpretation {
50    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51        match self {
52            EffectInterpretation::Negligible => write!(f, "negligible"),
53            EffectInterpretation::Small => write!(f, "small"),
54            EffectInterpretation::Medium => write!(f, "medium"),
55            EffectInterpretation::Large => write!(f, "large"),
56        }
57    }
58}
59
/// Configuration for comparison
#[derive(Debug, Clone)]
pub struct ComparisonConfig {
    /// Number of bootstrap iterations
    pub bootstrap_iterations: usize,
    /// Confidence level for the difference CI (e.g., 0.95 for 95%)
    pub confidence_level: f64,
    /// Practical-significance threshold, as a relative change in percent
    /// (e.g., 5.0 means a change smaller than 5% is never flagged significant)
    pub significance_threshold: f64,
    /// Outlier detection method applied when computing summary statistics
    pub outlier_method: OutlierMethod,
}
72
73impl Default for ComparisonConfig {
74    fn default() -> Self {
75        Self {
76            bootstrap_iterations: 10_000,
77            confidence_level: 0.95,
78            significance_threshold: 5.0, // 5% change threshold
79            outlier_method: OutlierMethod::default(),
80        }
81    }
82}
83
84/// Compare two distributions using bootstrap resampling
85///
86/// Returns the probability that `candidate` is slower than `baseline`,
87/// along with effect size and confidence intervals for the difference.
88///
89/// # Examples
90///
91/// ```ignore
92/// # use fluxbench_stats::{compare_distributions, ComparisonConfig};
93/// let baseline = vec![100.0, 102.0, 98.0, 101.0, 99.0];
94/// let candidate = vec![105.0, 107.0, 103.0, 106.0, 104.0];
95/// let config = ComparisonConfig::default();
96/// let result = compare_distributions(&baseline, &candidate, &config).unwrap();
97/// println!("Relative change: {:.2}%", result.relative_change);
98/// println!("P(regression): {:.2}", result.probability_regression);
99/// println!("Effect size: {:.2}", result.effect_size);
100/// ```
101pub fn compare_distributions(
102    baseline: &[f64],
103    candidate: &[f64],
104    config: &ComparisonConfig,
105) -> Result<ComparisonResult, ComparisonError> {
106    // Validate inputs
107    if baseline.is_empty() {
108        return Err(ComparisonError::EmptyBaseline);
109    }
110    if candidate.is_empty() {
111        return Err(ComparisonError::EmptyCandidate);
112    }
113    if baseline.len() < 2 {
114        return Err(ComparisonError::InsufficientBaseline);
115    }
116    if candidate.len() < 2 {
117        return Err(ComparisonError::InsufficientCandidate);
118    }
119
120    // Compute summary statistics for both
121    let baseline_stats = compute_summary(baseline, config.outlier_method);
122    let candidate_stats = compute_summary(candidate, config.outlier_method);
123
124    // Compute observed difference
125    let observed_diff = candidate_stats.mean - baseline_stats.mean;
126    let relative_change = if baseline_stats.mean > 0.0 {
127        (observed_diff / baseline_stats.mean) * 100.0
128    } else {
129        0.0
130    };
131
132    // Bootstrap the difference of means
133    let bootstrap_diffs: Vec<f64> = (0..config.bootstrap_iterations)
134        .into_par_iter()
135        .map_init(thread_rng, |rng, _| {
136            // Resample baseline
137            let baseline_mean: f64 = (0..baseline.len())
138                .map(|_| baseline[rng.gen_range(0..baseline.len())])
139                .sum::<f64>()
140                / baseline.len() as f64;
141
142            // Resample candidate
143            let candidate_mean: f64 = (0..candidate.len())
144                .map(|_| candidate[rng.gen_range(0..candidate.len())])
145                .sum::<f64>()
146                / candidate.len() as f64;
147
148            candidate_mean - baseline_mean
149        })
150        .collect();
151
152    // Probability of regression (candidate slower = positive difference)
153    let regressions = bootstrap_diffs.iter().filter(|&&d| d > 0.0).count();
154    let probability_regression = regressions as f64 / config.bootstrap_iterations as f64;
155
156    // Confidence interval of the difference
157    let mut sorted_diffs = bootstrap_diffs.clone();
158    sorted_diffs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
159
160    let alpha = 1.0 - config.confidence_level;
161    let lower_idx = (alpha / 2.0 * config.bootstrap_iterations as f64) as usize;
162    let upper_idx = ((1.0 - alpha / 2.0) * config.bootstrap_iterations as f64) as usize;
163    let difference_ci_lower = sorted_diffs[lower_idx];
164    let difference_ci_upper = sorted_diffs[upper_idx.min(sorted_diffs.len() - 1)];
165
166    // Effect size (Cohen's d)
167    // Pooled standard deviation
168    let n1 = baseline.len() as f64;
169    let n2 = candidate.len() as f64;
170    let var1 = baseline_stats.std_dev.powi(2);
171    let var2 = candidate_stats.std_dev.powi(2);
172    let pooled_std = ((((n1 - 1.0) * var1) + ((n2 - 1.0) * var2)) / (n1 + n2 - 2.0)).sqrt();
173
174    let effect_size = if pooled_std > 0.0 {
175        observed_diff / pooled_std
176    } else {
177        0.0
178    };
179
180    let effect_interpretation = interpret_effect_size(effect_size);
181
182    // Significance: CI doesn't include zero AND change exceeds threshold
183    let ci_excludes_zero = (difference_ci_lower > 0.0) || (difference_ci_upper < 0.0);
184    let exceeds_threshold = relative_change.abs() >= config.significance_threshold;
185    let is_significant = ci_excludes_zero && exceeds_threshold;
186
187    Ok(ComparisonResult {
188        baseline_stats,
189        candidate_stats,
190        relative_change,
191        absolute_change: observed_diff,
192        probability_regression,
193        difference_ci_lower,
194        difference_ci_upper,
195        is_significant,
196        effect_size,
197        effect_interpretation,
198    })
199}
200
201/// Interpret effect size magnitude using Cohen's conventions
202fn interpret_effect_size(d: f64) -> EffectInterpretation {
203    let abs_d = d.abs();
204    if abs_d < 0.2 {
205        EffectInterpretation::Negligible
206    } else if abs_d < 0.5 {
207        EffectInterpretation::Small
208    } else if abs_d < 0.8 {
209        EffectInterpretation::Medium
210    } else {
211        EffectInterpretation::Large
212    }
213}
214
/// Errors from comparison operations
///
/// Marked `#[non_exhaustive]` so new variants can be added without a
/// breaking change; callers must include a wildcard arm when matching.
#[derive(Debug, Clone, thiserror::Error)]
#[non_exhaustive]
pub enum ComparisonError {
    /// Baseline samples are empty
    #[error("Baseline samples are empty")]
    EmptyBaseline,
    /// Candidate samples are empty
    #[error("Candidate samples are empty")]
    EmptyCandidate,
    /// Baseline needs at least 2 samples
    #[error("Baseline needs at least 2 samples")]
    InsufficientBaseline,
    /// Candidate needs at least 2 samples
    #[error("Candidate needs at least 2 samples")]
    InsufficientCandidate,
}
232
#[cfg(test)]
mod tests {
    use super::*;

    // Comparing a distribution against itself: the bootstrap difference should
    // straddle zero, so P(regression) hovers near 0.5 and nothing is significant.
    #[test]
    fn test_compare_identical() {
        let samples = vec![100.0, 102.0, 98.0, 101.0, 99.0, 100.0, 101.0, 99.0];
        let config = ComparisonConfig {
            bootstrap_iterations: 1000,
            ..Default::default()
        };

        let result = compare_distributions(&samples, &samples, &config).unwrap();

        // Should have ~50% probability of regression (random noise)
        // Wide 0.3..0.7 band keeps the test robust to bootstrap randomness.
        assert!(result.probability_regression > 0.3 && result.probability_regression < 0.7);
        assert!(result.relative_change.abs() < 1.0);
        assert!(!result.is_significant);
        assert_eq!(
            result.effect_interpretation,
            EffectInterpretation::Negligible
        );
    }

    // Candidate is ~2x the baseline: an unambiguous regression.
    #[test]
    fn test_compare_clear_regression() {
        let baseline = vec![100.0, 102.0, 98.0, 101.0, 99.0, 100.0, 101.0, 99.0];
        let candidate = vec![200.0, 202.0, 198.0, 201.0, 199.0, 200.0, 201.0, 199.0];
        let config = ComparisonConfig {
            bootstrap_iterations: 1000,
            ..Default::default()
        };

        let result = compare_distributions(&baseline, &candidate, &config).unwrap();

        // Clear regression - probability should be very high
        assert!(result.probability_regression > 0.95);
        assert!(result.relative_change > 90.0); // ~100% regression
        assert!(result.is_significant);
        assert_eq!(result.effect_interpretation, EffectInterpretation::Large);
    }

    // Candidate is ~half the baseline: an unambiguous improvement.
    #[test]
    fn test_compare_clear_improvement() {
        let baseline = vec![200.0, 202.0, 198.0, 201.0, 199.0, 200.0, 201.0, 199.0];
        let candidate = vec![100.0, 102.0, 98.0, 101.0, 99.0, 100.0, 101.0, 99.0];
        let config = ComparisonConfig {
            bootstrap_iterations: 1000,
            ..Default::default()
        };

        let result = compare_distributions(&baseline, &candidate, &config).unwrap();

        // Clear improvement - probability should be very low
        assert!(result.probability_regression < 0.05);
        assert!(result.relative_change < -40.0); // ~50% improvement
        assert!(result.is_significant);
        assert_eq!(result.effect_interpretation, EffectInterpretation::Large);
    }

    // One value per Cohen bucket, plus a negative d to check |d| is used.
    #[test]
    fn test_effect_size_interpretation() {
        assert_eq!(interpret_effect_size(0.1), EffectInterpretation::Negligible);
        assert_eq!(interpret_effect_size(0.3), EffectInterpretation::Small);
        assert_eq!(interpret_effect_size(0.6), EffectInterpretation::Medium);
        assert_eq!(interpret_effect_size(1.0), EffectInterpretation::Large);
        assert_eq!(interpret_effect_size(-0.5), EffectInterpretation::Medium);
    }

    // Empty slices on either side map to the matching Empty* error variant.
    #[test]
    fn test_empty_samples() {
        let config = ComparisonConfig::default();

        assert!(matches!(
            compare_distributions(&[], &[1.0, 2.0], &config),
            Err(ComparisonError::EmptyBaseline)
        ));
        assert!(matches!(
            compare_distributions(&[1.0, 2.0], &[], &config),
            Err(ComparisonError::EmptyCandidate)
        ));
    }

    // Single-sample slices map to the matching Insufficient* error variant.
    #[test]
    fn test_insufficient_samples() {
        let config = ComparisonConfig::default();

        assert!(matches!(
            compare_distributions(&[1.0], &[1.0, 2.0], &config),
            Err(ComparisonError::InsufficientBaseline)
        ));
        assert!(matches!(
            compare_distributions(&[1.0, 2.0], &[1.0], &config),
            Err(ComparisonError::InsufficientCandidate)
        ));
    }
}