Skip to main content

perfgate_paired/
lib.rs

1//! Paired benchmarking statistics for A/B comparison.
2//!
3//! This crate provides statistical functions for analyzing paired benchmark data,
4//! where each measurement consists of a baseline and current observation from the
5//! same experimental unit (e.g., same input, same machine configuration).
6//!
7//! Part of the [perfgate](https://github.com/EffortlessMetrics/perfgate) workspace.
8//!
9//! # Overview
10//!
11//! The crate provides:
12//! - [`compute_paired_stats`] — Compute summary statistics from paired samples
13//! - [`compare_paired_stats`] — Compare paired statistics with confidence intervals
14//! - [`PairedComparison`] — Result struct with significance testing
15//! - [`summarize_paired_diffs`] — Summarize the distribution of differences
16//!
17//! # Statistical Methodology
18//!
19//! ## Paired t-test
20//!
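//! The comparison uses a paired t-test approach with fixed critical values:
//! - For n >= 30 samples: uses t-value of 1.96 (normal approximation)
//! - For n < 30 samples: uses t-value of 2.0 (a rough approximation, not a
//!   conservative one: the exact two-sided 95% Student-t critical value is
//!   larger for small n, e.g. 2.776 for n = 5, so small-sample intervals are
//!   narrower than a true 95% interval)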
24//!
25//! ## Confidence Intervals
26//!
27//! 95% confidence intervals are computed as:
28//! ```text
29//! CI = mean ± t_value × (std_dev / sqrt(n))
30//! ```
31//!
32//! A result is considered statistically significant if the confidence interval
33//! does not span zero (i.e., `ci_lower > 0` or `ci_upper < 0`).
34//!
35//! # Example
36//!
37//! ```
38//! use perfgate_paired::{compute_paired_stats, compare_paired_stats, PairedError};
39//! use perfgate_types::{PairedSample, PairedSampleHalf};
40//!
41//! fn make_half(wall_ms: u64) -> PairedSampleHalf {
42//!     PairedSampleHalf {
43//!         wall_ms,
44//!         exit_code: 0,
45//!         timed_out: false,
46//!         max_rss_kb: None,
47//!         stdout: None,
48//!         stderr: None,
49//!     }
50//! }
51//!
52//! fn make_sample(idx: u32, baseline_ms: u64, current_ms: u64) -> PairedSample {
53//!     PairedSample {
54//!         pair_index: idx,
55//!         warmup: false,
56//!         baseline: make_half(baseline_ms),
57//!         current: make_half(current_ms),
58//!         wall_diff_ms: current_ms as i64 - baseline_ms as i64,
59//!         rss_diff_kb: None,
60//!     }
61//! }
62//!
63//! let samples = vec![
64//!     make_sample(0, 100, 95),   // 5ms improvement
65//!     make_sample(1, 105, 100),  // 5ms improvement
66//!     make_sample(2, 110, 103),  // 7ms improvement
67//! ];
68//!
69//! let stats = compute_paired_stats(&samples, None, None)?;
70//! let comparison = compare_paired_stats(&stats);
71//!
72//! println!("Mean diff: {:.2}ms", comparison.mean_diff_ms);
73//! println!("% change: {:.2}%", comparison.pct_change * 100.0);
74//! println!("Significant: {}", comparison.is_significant);
75//! # Ok::<(), PairedError>(())
76//! ```
77
78use perfgate_stats::{summarize_f64, summarize_u64};
79use perfgate_types::{
80    PairedDiffSummary, PairedSample, PairedStats, Significance, SignificancePolicy,
81};
82
83pub use perfgate_error::PairedError;
84
85/// Compute summary statistics from paired benchmark samples.
86///
87/// Filters out warmup samples, then computes wall-time, RSS, and
88/// throughput summaries for both baseline and current runs, as well
89/// as the distribution of their paired differences.
90pub fn compute_paired_stats(
91    samples: &[PairedSample],
92    work_units: Option<u64>,
93    significance_policy: Option<&SignificancePolicy>,
94) -> Result<PairedStats, PairedError> {
95    let measured: Vec<&PairedSample> = samples.iter().filter(|s| !s.warmup).collect();
96    if measured.is_empty() {
97        return Err(PairedError::NoSamples);
98    }
99
100    let baseline_wall: Vec<u64> = measured.iter().map(|s| s.baseline.wall_ms).collect();
101    let current_wall: Vec<u64> = measured.iter().map(|s| s.current.wall_ms).collect();
102    let wall_diffs: Vec<f64> = measured.iter().map(|s| s.wall_diff_ms as f64).collect();
103
104    let baseline_wall_ms = summarize_u64(&baseline_wall).map_err(|_| PairedError::NoSamples)?;
105    let current_wall_ms = summarize_u64(&current_wall).map_err(|_| PairedError::NoSamples)?;
106    let wall_diff_ms = summarize_paired_diffs(&wall_diffs, significance_policy)?;
107
108    let baseline_rss: Vec<u64> = measured
109        .iter()
110        .filter_map(|s| s.baseline.max_rss_kb)
111        .collect();
112    let current_rss: Vec<u64> = measured
113        .iter()
114        .filter_map(|s| s.current.max_rss_kb)
115        .collect();
116    let rss_diffs: Vec<f64> = measured
117        .iter()
118        .filter_map(|s| s.rss_diff_kb)
119        .map(|d| d as f64)
120        .collect();
121
122    let baseline_max_rss_kb = if baseline_rss.is_empty() {
123        None
124    } else {
125        Some(summarize_u64(&baseline_rss).map_err(|_| PairedError::NoSamples)?)
126    };
127    let current_max_rss_kb = if current_rss.is_empty() {
128        None
129    } else {
130        Some(summarize_u64(&current_rss).map_err(|_| PairedError::NoSamples)?)
131    };
132    let rss_diff_kb = if rss_diffs.is_empty() {
133        None
134    } else {
135        Some(summarize_paired_diffs(&rss_diffs, significance_policy)?)
136    };
137
138    let (baseline_throughput_per_s, current_throughput_per_s, throughput_diff_per_s) =
139        match work_units {
140            Some(work) => {
141                let baseline_thr: Vec<f64> = measured
142                    .iter()
143                    .map(|s| {
144                        let secs = s.baseline.wall_ms as f64 / 1000.0;
145                        if secs <= 0.0 { 0.0 } else { work as f64 / secs }
146                    })
147                    .collect();
148                let current_thr: Vec<f64> = measured
149                    .iter()
150                    .map(|s| {
151                        let secs = s.current.wall_ms as f64 / 1000.0;
152                        if secs <= 0.0 { 0.0 } else { work as f64 / secs }
153                    })
154                    .collect();
155                let thr_diffs: Vec<f64> = baseline_thr
156                    .iter()
157                    .zip(current_thr.iter())
158                    .map(|(b, c)| c - b)
159                    .collect();
160                (
161                    Some(summarize_f64(&baseline_thr).map_err(|_| PairedError::NoSamples)?),
162                    Some(summarize_f64(&current_thr).map_err(|_| PairedError::NoSamples)?),
163                    Some(summarize_paired_diffs(&thr_diffs, significance_policy)?),
164                )
165            }
166            None => (None, None, None),
167        };
168
169    Ok(PairedStats {
170        baseline_wall_ms,
171        current_wall_ms,
172        wall_diff_ms,
173        baseline_max_rss_kb,
174        current_max_rss_kb,
175        rss_diff_kb,
176        baseline_throughput_per_s,
177        current_throughput_per_s,
178        throughput_diff_per_s,
179    })
180}
181
182/// Summarize the distribution of paired differences.
183pub fn summarize_paired_diffs(
184    diffs: &[f64],
185    policy: Option<&SignificancePolicy>,
186) -> Result<PairedDiffSummary, PairedError> {
187    if diffs.is_empty() {
188        return Err(PairedError::NoSamples);
189    }
190    let count = diffs.len() as u32;
191    let mean = diffs.iter().sum::<f64>() / count as f64;
192    let mut sorted = diffs.to_vec();
193    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
194    let median = if count % 2 == 1 {
195        sorted[(count / 2) as usize]
196    } else {
197        (sorted[(count / 2 - 1) as usize] + sorted[(count / 2) as usize]) / 2.0
198    };
199    let min = *sorted.first().unwrap();
200    let max = *sorted.last().unwrap();
201    let variance = diffs.iter().map(|d| (d - mean).powi(2)).sum::<f64>() / count as f64;
202    let std_dev = variance.sqrt();
203
204    let significance = policy.map(|p| {
205        let n = count as f64;
206        let std_error = if n > 1.0 { std_dev / n.sqrt() } else { 0.0 };
207        let alpha = p.alpha.unwrap_or(0.05);
208        let min_samples = p.min_samples.unwrap_or(3);
209
210        let t_value = if n >= 30.0 { 1.96 } else { 2.0 };
211        let ci_lower = mean - t_value * std_error;
212        let ci_upper = mean + t_value * std_error;
213
214        let significant = n >= min_samples as f64 && (ci_lower > 0.0 || ci_upper < 0.0);
215
216        Significance {
217            test: perfgate_types::SignificanceTest::WelchT,
218            significant,
219            alpha,
220            p_value: None, // Paired t-test p-value could be added here
221            ci_lower: Some(ci_lower),
222            ci_upper: Some(ci_upper),
223            baseline_samples: count,
224            current_samples: count,
225        }
226    });
227
228    Ok(PairedDiffSummary {
229        mean,
230        median,
231        std_dev,
232        min,
233        max,
234        count,
235        significance,
236    })
237}
238
239/// Compute the coefficient of variation (CV) of the wall-time differences
240/// from a set of paired samples (excluding warmups).
241///
242/// CV = std_dev / |mean|. Returns 0.0 if mean is zero (no variation detectable).
243pub fn compute_paired_cv(samples: &[PairedSample]) -> f64 {
244    let measured: Vec<f64> = samples
245        .iter()
246        .filter(|s| !s.warmup)
247        .map(|s| s.wall_diff_ms as f64)
248        .collect();
249    if measured.is_empty() {
250        return 0.0;
251    }
252    let n = measured.len() as f64;
253    let mean = measured.iter().sum::<f64>() / n;
254    if mean.abs() < f64::EPSILON {
255        return 0.0;
256    }
257    let variance = measured.iter().map(|d| (d - mean).powi(2)).sum::<f64>() / n;
258    variance.sqrt() / mean.abs()
259}
260
/// Outcome of comparing the baseline and current sides of a paired benchmark.
///
/// Produced by [`compare_paired_stats`]; all timing values are in
/// milliseconds and describe `current - baseline` wall-time differences.
///
/// # Examples
///
/// ```
/// use perfgate_paired::{compare_paired_stats, PairedComparison};
/// use perfgate_types::{PairedStats, PairedDiffSummary, U64Summary};
///
/// let stats = PairedStats {
///     baseline_wall_ms: U64Summary::new(100, 100, 100),
///     current_wall_ms: U64Summary::new(120, 120, 120),
///     wall_diff_ms: PairedDiffSummary {
///         mean: 20.0, median: 20.0, std_dev: 3.0,
///         min: 17.0, max: 23.0, count: 10,
///         significance: None,
///     },
///     baseline_max_rss_kb: None,
///     current_max_rss_kb: None,
///     rss_diff_kb: None,
///     baseline_throughput_per_s: None,
///     current_throughput_per_s: None,
///     throughput_diff_per_s: None,
/// };
///
/// let cmp: PairedComparison = compare_paired_stats(&stats);
/// assert!(cmp.is_significant);
/// assert_eq!(cmp.mean_diff_ms, 20.0);
/// ```
#[derive(Clone, Debug, PartialEq)]
pub struct PairedComparison {
    /// Mean of the per-pair wall-time differences.
    pub mean_diff_ms: f64,
    /// Median of the per-pair wall-time differences.
    pub median_diff_ms: f64,
    /// Mean difference relative to the baseline, as a fraction
    /// (0.1 means +10 %); 0.0 when the baseline value is zero.
    pub pct_change: f64,
    /// Standard error of the mean difference.
    pub std_error: f64,
    /// Lower bound of the 95 % confidence interval around the mean difference.
    pub ci_95_lower: f64,
    /// Upper bound of the 95 % confidence interval around the mean difference.
    pub ci_95_upper: f64,
    /// Whether the difference is deemed statistically significant
    /// (see [`compare_paired_stats`] for the exact rule).
    pub is_significant: bool,
}
299
300/// Compare paired statistics and compute a confidence interval.
301///
302/// Uses a paired t-test approach: t = 1.96 for n ≥ 30, t = 2.0 otherwise.
303/// A result is significant when the 95 % CI does not span zero.
304///
305/// # Examples
306///
307/// ```
308/// use perfgate_paired::compare_paired_stats;
309/// use perfgate_types::{PairedStats, PairedDiffSummary, U64Summary};
310///
311/// let stats = PairedStats {
312///     baseline_wall_ms: U64Summary::new(100, 90, 110 ),
313///     current_wall_ms: U64Summary::new(110, 100, 120 ),
314///     wall_diff_ms: PairedDiffSummary {
315///         mean: 10.0, median: 10.0, std_dev: 2.0,
316///         min: 8.0, max: 12.0, count: 5,
317///         significance: None,
318///     },
319///     baseline_max_rss_kb: None,
320///     current_max_rss_kb: None,
321///     rss_diff_kb: None,
322///     baseline_throughput_per_s: None,
323///     current_throughput_per_s: None,
324///     throughput_diff_per_s: None,
325/// };
326///
327/// let cmp = compare_paired_stats(&stats);
328/// assert!(cmp.is_significant);
329/// assert_eq!(cmp.mean_diff_ms, 10.0);
330/// assert!(cmp.pct_change > 0.0);
331/// ```
332pub fn compare_paired_stats(stats: &PairedStats) -> PairedComparison {
333    let diff = &stats.wall_diff_ms;
334    let n = diff.count as f64;
335    let std_error = if n > 1.0 {
336        diff.std_dev / n.sqrt()
337    } else {
338        0.0
339    };
340    let t_value = if n >= 30.0 { 1.96 } else { 2.0 };
341    let ci_95_lower = diff.mean - t_value * std_error;
342    let ci_95_upper = diff.mean + t_value * std_error;
343    let is_significant = ci_95_lower > 0.0 || ci_95_upper < 0.0;
344    let baseline_mean = stats.baseline_wall_ms.median as f64;
345    let pct_change = if baseline_mean > 0.0 {
346        diff.mean / baseline_mean
347    } else {
348        0.0
349    };
350    PairedComparison {
351        mean_diff_ms: diff.mean,
352        median_diff_ms: diff.median,
353        pct_change,
354        std_error,
355        ci_95_lower,
356        ci_95_upper,
357        is_significant,
358    }
359}
360
361#[cfg(test)]
362mod tests {
363    use super::*;
364    use perfgate_types::{PairedSampleHalf, U64Summary};
365
366    fn sample_half(wall_ms: u64) -> PairedSampleHalf {
367        PairedSampleHalf {
368            wall_ms,
369            exit_code: 0,
370            timed_out: false,
371            max_rss_kb: None,
372            stdout: None,
373            stderr: None,
374        }
375    }
376
377    fn sample_half_with_rss(wall_ms: u64, max_rss_kb: u64) -> PairedSampleHalf {
378        PairedSampleHalf {
379            wall_ms,
380            exit_code: 0,
381            timed_out: false,
382            max_rss_kb: Some(max_rss_kb),
383            stdout: None,
384            stderr: None,
385        }
386    }
387
388    fn paired_sample(
389        pair_index: u32,
390        warmup: bool,
391        baseline_wall_ms: u64,
392        current_wall_ms: u64,
393    ) -> PairedSample {
394        PairedSample {
395            pair_index,
396            warmup,
397            baseline: sample_half(baseline_wall_ms),
398            current: sample_half(current_wall_ms),
399            wall_diff_ms: current_wall_ms as i64 - baseline_wall_ms as i64,
400            rss_diff_kb: None,
401        }
402    }
403
404    fn paired_sample_with_rss(
405        pair_index: u32,
406        warmup: bool,
407        baseline_wall_ms: u64,
408        current_wall_ms: u64,
409        baseline_rss: u64,
410        current_rss: u64,
411    ) -> PairedSample {
412        PairedSample {
413            pair_index,
414            warmup,
415            baseline: sample_half_with_rss(baseline_wall_ms, baseline_rss),
416            current: sample_half_with_rss(current_wall_ms, current_rss),
417            wall_diff_ms: current_wall_ms as i64 - baseline_wall_ms as i64,
418            rss_diff_kb: Some(current_rss as i64 - baseline_rss as i64),
419        }
420    }
421
422    #[test]
423    fn test_compute_paired_stats_basic() {
424        let samples = vec![
425            paired_sample(0, false, 100, 90),
426            paired_sample(1, false, 110, 100),
427            paired_sample(2, false, 120, 110),
428        ];
429
430        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");
431
432        assert_eq!(stats.baseline_wall_ms.median, 110);
433        assert_eq!(stats.baseline_wall_ms.min, 100);
434        assert_eq!(stats.baseline_wall_ms.max, 120);
435
436        assert_eq!(stats.current_wall_ms.median, 100);
437        assert_eq!(stats.current_wall_ms.min, 90);
438        assert_eq!(stats.current_wall_ms.max, 110);
439
440        assert_eq!(stats.wall_diff_ms.mean, -10.0);
441        assert_eq!(stats.wall_diff_ms.median, -10.0);
442        assert_eq!(stats.wall_diff_ms.std_dev, 0.0);
443        assert_eq!(stats.wall_diff_ms.min, -10.0);
444        assert_eq!(stats.wall_diff_ms.max, -10.0);
445        assert_eq!(stats.wall_diff_ms.count, 3);
446    }
447
448    #[test]
449    fn test_compute_paired_stats_with_variance() {
450        let samples = vec![
451            paired_sample(0, false, 100, 110),
452            paired_sample(1, false, 100, 120),
453            paired_sample(2, false, 100, 130),
454        ];
455
456        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");
457
458        assert_eq!(stats.wall_diff_ms.mean, 20.0);
459        assert_eq!(stats.wall_diff_ms.median, 20.0);
460        assert_eq!(stats.wall_diff_ms.min, 10.0);
461        assert_eq!(stats.wall_diff_ms.max, 30.0);
462        assert_eq!(stats.wall_diff_ms.count, 3);
463
464        let expected_std_dev = (200.0_f64 / 3.0).sqrt();
465        assert!(
466            (stats.wall_diff_ms.std_dev - expected_std_dev).abs() < 0.001,
467            "std_dev should be ~8.165, got {}",
468            stats.wall_diff_ms.std_dev
469        );
470    }
471
472    #[test]
473    fn test_compute_paired_stats_filters_warmup() {
474        let samples = vec![
475            paired_sample(0, true, 1000, 2000),
476            paired_sample(1, true, 1000, 2000),
477            paired_sample(2, false, 100, 110),
478            paired_sample(3, false, 100, 120),
479        ];
480
481        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");
482
483        assert_eq!(stats.wall_diff_ms.count, 2);
484        assert_eq!(stats.baseline_wall_ms.median, 100);
485        assert_eq!(stats.current_wall_ms.median, 115);
486    }
487
488    #[test]
489    fn test_compute_paired_stats_empty_after_warmup_filter() {
490        let samples = vec![
491            paired_sample(0, true, 100, 110),
492            paired_sample(1, true, 100, 120),
493        ];
494
495        let result = compute_paired_stats(&samples, None, None);
496        assert!(result.is_err(), "should error with no measured samples");
497        assert!(matches!(result.unwrap_err(), PairedError::NoSamples));
498    }
499
500    #[test]
501    fn test_compute_paired_stats_empty_samples() {
502        let samples: Vec<PairedSample> = vec![];
503
504        let result = compute_paired_stats(&samples, None, None);
505        assert!(result.is_err(), "should error with empty samples");
506        assert!(matches!(result.unwrap_err(), PairedError::NoSamples));
507    }
508
509    #[test]
510    fn test_compute_paired_stats_single_sample() {
511        let samples = vec![paired_sample(0, false, 100, 150)];
512
513        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");
514
515        assert_eq!(stats.baseline_wall_ms.median, 100);
516        assert_eq!(stats.baseline_wall_ms.min, 100);
517        assert_eq!(stats.baseline_wall_ms.max, 100);
518
519        assert_eq!(stats.current_wall_ms.median, 150);
520
521        assert_eq!(stats.wall_diff_ms.mean, 50.0);
522        assert_eq!(stats.wall_diff_ms.median, 50.0);
523        assert_eq!(stats.wall_diff_ms.std_dev, 0.0);
524        assert_eq!(stats.wall_diff_ms.count, 1);
525    }
526
527    #[test]
528    fn test_compute_paired_stats_with_rss() {
529        let samples = vec![
530            paired_sample_with_rss(0, false, 100, 110, 1000, 1100),
531            paired_sample_with_rss(1, false, 100, 120, 1000, 1200),
532            paired_sample_with_rss(2, false, 100, 130, 1000, 1300),
533        ];
534
535        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");
536
537        let baseline_rss = stats.baseline_max_rss_kb.expect("should have baseline RSS");
538        assert_eq!(baseline_rss.median, 1000);
539
540        let current_rss = stats.current_max_rss_kb.expect("should have current RSS");
541        assert_eq!(current_rss.median, 1200);
542
543        let rss_diff = stats.rss_diff_kb.expect("should have RSS diff");
544        assert_eq!(rss_diff.mean, 200.0);
545        assert_eq!(rss_diff.count, 3);
546    }
547
548    #[test]
549    fn test_compute_paired_stats_with_work_units() {
550        let samples = vec![
551            paired_sample(0, false, 1000, 500),
552            paired_sample(1, false, 1000, 500),
553        ];
554
555        let stats = compute_paired_stats(&samples, Some(100), None).expect("should compute stats");
556
557        let baseline_thr = stats
558            .baseline_throughput_per_s
559            .expect("should have baseline throughput");
560        assert_eq!(baseline_thr.median, 100.0);
561
562        let current_thr = stats
563            .current_throughput_per_s
564            .expect("should have current throughput");
565        assert_eq!(current_thr.median, 200.0);
566
567        let thr_diff = stats
568            .throughput_diff_per_s
569            .expect("should have throughput diff");
570        assert_eq!(thr_diff.mean, 100.0);
571    }
572
573    #[test]
574    fn test_compute_paired_stats_no_throughput_without_work_units() {
575        let samples = vec![paired_sample(0, false, 100, 110)];
576
577        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");
578
579        assert!(stats.baseline_throughput_per_s.is_none());
580        assert!(stats.current_throughput_per_s.is_none());
581        assert!(stats.throughput_diff_per_s.is_none());
582    }
583
584    #[test]
585    fn test_compute_paired_stats_negative_diffs() {
586        let samples = vec![
587            paired_sample(0, false, 200, 100),
588            paired_sample(1, false, 200, 100),
589        ];
590
591        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");
592
593        assert_eq!(stats.wall_diff_ms.mean, -100.0);
594        assert_eq!(stats.wall_diff_ms.median, -100.0);
595    }
596
597    #[test]
598    fn test_compute_paired_stats_even_count_median() {
599        let samples = vec![
600            paired_sample(0, false, 100, 110),
601            paired_sample(1, false, 100, 120),
602            paired_sample(2, false, 100, 130),
603            paired_sample(3, false, 100, 140),
604        ];
605
606        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");
607
608        assert_eq!(stats.wall_diff_ms.median, 25.0);
609        assert_eq!(stats.wall_diff_ms.mean, 25.0);
610    }
611
612    #[test]
613    fn test_compare_paired_stats_basic() {
614        let stats = PairedStats {
615            baseline_wall_ms: U64Summary::new(100, 90, 110),
616            current_wall_ms: U64Summary::new(110, 100, 120),
617            wall_diff_ms: PairedDiffSummary {
618                mean: 10.0,
619                median: 10.0,
620                std_dev: 5.0,
621                min: 5.0,
622                max: 15.0,
623                count: 10,
624                significance: None,
625            },
626            baseline_max_rss_kb: None,
627            current_max_rss_kb: None,
628            rss_diff_kb: None,
629            baseline_throughput_per_s: None,
630            current_throughput_per_s: None,
631            throughput_diff_per_s: None,
632        };
633
634        let comparison = compare_paired_stats(&stats);
635
636        assert_eq!(comparison.mean_diff_ms, 10.0);
637        assert_eq!(comparison.median_diff_ms, 10.0);
638        assert_eq!(comparison.pct_change, 0.1);
639
640        let expected_std_error = 5.0 / (10.0_f64).sqrt();
641        assert!(
642            (comparison.std_error - expected_std_error).abs() < 0.01,
643            "std_error should be ~1.58, got {}",
644            comparison.std_error
645        );
646    }
647
648    #[test]
649    fn test_compare_paired_stats_ci_calculation() {
650        let stats = PairedStats {
651            baseline_wall_ms: U64Summary::new(100, 100, 100),
652            current_wall_ms: U64Summary::new(110, 110, 110),
653            wall_diff_ms: PairedDiffSummary {
654                mean: 10.0,
655                median: 10.0,
656                std_dev: 2.0,
657                min: 8.0,
658                max: 12.0,
659                count: 5,
660                significance: None,
661            },
662            baseline_max_rss_kb: None,
663            current_max_rss_kb: None,
664            rss_diff_kb: None,
665            baseline_throughput_per_s: None,
666            current_throughput_per_s: None,
667            throughput_diff_per_s: None,
668        };
669
670        let comparison = compare_paired_stats(&stats);
671
672        let expected_std_error = 2.0 / (5.0_f64).sqrt();
673        let expected_ci_lower = 10.0 - 2.0 * expected_std_error;
674        let expected_ci_upper = 10.0 + 2.0 * expected_std_error;
675
676        assert!(
677            (comparison.ci_95_lower - expected_ci_lower).abs() < 0.01,
678            "ci_95_lower should be ~{}, got {}",
679            expected_ci_lower,
680            comparison.ci_95_lower
681        );
682        assert!(
683            (comparison.ci_95_upper - expected_ci_upper).abs() < 0.01,
684            "ci_95_upper should be ~{}, got {}",
685            expected_ci_upper,
686            comparison.ci_95_upper
687        );
688
689        assert!(
690            comparison.is_significant,
691            "result should be significant when CI doesn't span zero"
692        );
693    }
694
695    #[test]
696    fn test_compare_paired_stats_large_sample_t_value() {
697        let stats = PairedStats {
698            baseline_wall_ms: U64Summary::new(100, 100, 100),
699            current_wall_ms: U64Summary::new(110, 110, 110),
700            wall_diff_ms: PairedDiffSummary {
701                mean: 10.0,
702                median: 10.0,
703                std_dev: 5.0,
704                min: 0.0,
705                max: 20.0,
706                count: 30,
707                significance: None,
708            },
709            baseline_max_rss_kb: None,
710            current_max_rss_kb: None,
711            rss_diff_kb: None,
712            baseline_throughput_per_s: None,
713            current_throughput_per_s: None,
714            throughput_diff_per_s: None,
715        };
716
717        let comparison = compare_paired_stats(&stats);
718
719        let expected_std_error = 5.0 / (30.0_f64).sqrt();
720        let expected_ci_lower = 10.0 - 1.96 * expected_std_error;
721
722        assert!(
723            (comparison.ci_95_lower - expected_ci_lower).abs() < 0.01,
724            "ci_95_lower with n>=30 should use t_value=1.96"
725        );
726    }
727
728    #[test]
729    fn test_compare_paired_stats_not_significant() {
730        let stats = PairedStats {
731            baseline_wall_ms: U64Summary::new(100, 100, 100),
732            current_wall_ms: U64Summary::new(101, 101, 101),
733            wall_diff_ms: PairedDiffSummary {
734                mean: 1.0,
735                median: 1.0,
736                std_dev: 10.0,
737                min: -15.0,
738                max: 15.0,
739                count: 5,
740                significance: None,
741            },
742            baseline_max_rss_kb: None,
743            current_max_rss_kb: None,
744            rss_diff_kb: None,
745            baseline_throughput_per_s: None,
746            current_throughput_per_s: None,
747            throughput_diff_per_s: None,
748        };
749
750        let comparison = compare_paired_stats(&stats);
751
752        assert!(
753            !comparison.is_significant,
754            "result should not be significant when CI spans zero: [{}, {}]",
755            comparison.ci_95_lower, comparison.ci_95_upper
756        );
757        assert!(
758            comparison.ci_95_lower < 0.0 && comparison.ci_95_upper > 0.0,
759            "CI should span zero"
760        );
761    }
762
763    #[test]
764    fn test_compare_paired_stats_single_sample() {
765        let stats = PairedStats {
766            baseline_wall_ms: U64Summary::new(100, 100, 100),
767            current_wall_ms: U64Summary::new(110, 110, 110),
768            wall_diff_ms: PairedDiffSummary {
769                mean: 10.0,
770                median: 10.0,
771                std_dev: 0.0,
772                min: 10.0,
773                max: 10.0,
774                count: 1,
775                significance: None,
776            },
777            baseline_max_rss_kb: None,
778            current_max_rss_kb: None,
779            rss_diff_kb: None,
780            baseline_throughput_per_s: None,
781            current_throughput_per_s: None,
782            throughput_diff_per_s: None,
783        };
784
785        let comparison = compare_paired_stats(&stats);
786
787        assert_eq!(comparison.std_error, 0.0);
788        assert_eq!(comparison.ci_95_lower, 10.0);
789        assert_eq!(comparison.ci_95_upper, 10.0);
790    }
791
792    #[test]
793    fn test_compare_paired_stats_zero_baseline() {
794        let stats = PairedStats {
795            baseline_wall_ms: U64Summary::new(0, 0, 0),
796            current_wall_ms: U64Summary::new(10, 10, 10),
797            wall_diff_ms: PairedDiffSummary {
798                mean: 10.0,
799                median: 10.0,
800                std_dev: 0.0,
801                min: 10.0,
802                max: 10.0,
803                count: 1,
804                significance: None,
805            },
806            baseline_max_rss_kb: None,
807            current_max_rss_kb: None,
808            rss_diff_kb: None,
809            baseline_throughput_per_s: None,
810            current_throughput_per_s: None,
811            throughput_diff_per_s: None,
812        };
813
814        let comparison = compare_paired_stats(&stats);
815
816        assert_eq!(
817            comparison.pct_change, 0.0,
818            "pct_change should be 0 when baseline is 0"
819        );
820    }
821
822    #[test]
823    fn test_compare_paired_stats_negative_improvement() {
824        let stats = PairedStats {
825            baseline_wall_ms: U64Summary::new(100, 100, 100),
826            current_wall_ms: U64Summary::new(80, 80, 80),
827            wall_diff_ms: PairedDiffSummary {
828                mean: -20.0,
829                median: -20.0,
830                std_dev: 2.0,
831                min: -22.0,
832                max: -18.0,
833                count: 5,
834                significance: None,
835            },
836            baseline_max_rss_kb: None,
837            current_max_rss_kb: None,
838            rss_diff_kb: None,
839            baseline_throughput_per_s: None,
840            current_throughput_per_s: None,
841            throughput_diff_per_s: None,
842        };
843
844        let comparison = compare_paired_stats(&stats);
845
846        assert_eq!(comparison.mean_diff_ms, -20.0);
847        assert_eq!(comparison.pct_change, -0.2);
848        assert!(
849            comparison.is_significant,
850            "significant improvement should be detected"
851        );
852        assert!(
853            comparison.ci_95_upper < 0.0,
854            "CI upper bound should be negative for improvement"
855        );
856    }
857
858    #[test]
859    fn test_summarize_paired_diffs_empty() {
860        let result = summarize_paired_diffs(&[], None);
861        assert!(matches!(result, Err(PairedError::NoSamples)));
862    }
863
864    #[test]
865    fn test_summarize_paired_diffs_single() {
866        let summary = summarize_paired_diffs(&[5.0], None).unwrap();
867        assert_eq!(summary.mean, 5.0);
868        assert_eq!(summary.median, 5.0);
869        assert_eq!(summary.std_dev, 0.0);
870        assert_eq!(summary.min, 5.0);
871        assert_eq!(summary.max, 5.0);
872        assert_eq!(summary.count, 1);
873    }
874
/// Four identical differences: the mean is that value and the spread is exactly zero.
#[test]
fn test_summarize_paired_diffs_zero_variance() {
    let constant_diffs = [10.0; 4];
    let flat = summarize_paired_diffs(&constant_diffs, None).unwrap();

    assert_eq!(flat.count, 4);
    assert_eq!(flat.std_dev, 0.0);
    assert_eq!(flat.mean, 10.0);
}
882
/// Summarizes the 1000 differences 0.0, 1.0, ..., 999.0 and checks count,
/// extremes, and that the mean sits at the midpoint of the range.
#[test]
fn test_summarize_paired_diffs_large_sample() {
    let ramp: Vec<f64> = (0..1000u32).map(f64::from).collect();
    let summary = summarize_paired_diffs(&ramp, None).unwrap();

    // Midpoint of an evenly spaced ramp from 0 to 999.
    let midpoint = (0.0 + 999.0) / 2.0;
    assert!((summary.mean - midpoint).abs() < 0.1);

    assert_eq!(summary.min, 0.0);
    assert_eq!(summary.max, 999.0);
    assert_eq!(summary.count, 1000);
}
895
896    mod edge_cases {
897        use super::*;
898
899        #[test]
900        fn test_ci_bounds_with_zero_std_dev() {
901            let stats = PairedStats {
902                baseline_wall_ms: U64Summary::new(100, 100, 100),
903                current_wall_ms: U64Summary::new(110, 110, 110),
904                wall_diff_ms: PairedDiffSummary {
905                    mean: 10.0,
906                    median: 10.0,
907                    std_dev: 0.0,
908                    min: 10.0,
909                    max: 10.0,
910                    count: 10,
911                    significance: None,
912                },
913                baseline_max_rss_kb: None,
914                current_max_rss_kb: None,
915                rss_diff_kb: None,
916                baseline_throughput_per_s: None,
917                current_throughput_per_s: None,
918                throughput_diff_per_s: None,
919            };
920
921            let comparison = compare_paired_stats(&stats);
922            assert_eq!(comparison.std_error, 0.0);
923            assert_eq!(comparison.ci_95_lower, 10.0);
924            assert_eq!(comparison.ci_95_upper, 10.0);
925            assert!(comparison.is_significant);
926        }
927
928        #[test]
929        fn test_large_positive_diff() {
930            let stats = PairedStats {
931                baseline_wall_ms: U64Summary::new(100, 100, 100),
932                current_wall_ms: U64Summary::new(100000, 100000, 100000),
933                wall_diff_ms: PairedDiffSummary {
934                    mean: 99900.0,
935                    median: 99900.0,
936                    std_dev: 100.0,
937                    min: 99800.0,
938                    max: 100000.0,
939                    count: 50,
940                    significance: None,
941                },
942                baseline_max_rss_kb: None,
943                current_max_rss_kb: None,
944                rss_diff_kb: None,
945                baseline_throughput_per_s: None,
946                current_throughput_per_s: None,
947                throughput_diff_per_s: None,
948            };
949
950            let comparison = compare_paired_stats(&stats);
951            assert_eq!(comparison.mean_diff_ms, 99900.0);
952            assert!((comparison.pct_change - 999.0).abs() < 0.01);
953            assert!(comparison.is_significant);
954        }
955
956        #[test]
957        fn test_very_small_diffs() {
958            let stats = PairedStats {
959                baseline_wall_ms: U64Summary::new(100000, 100000, 100000),
960                current_wall_ms: U64Summary::new(100001, 100001, 100001),
961                wall_diff_ms: PairedDiffSummary {
962                    mean: 1.0,
963                    median: 1.0,
964                    std_dev: 0.5,
965                    min: 0.0,
966                    max: 2.0,
967                    count: 30,
968                    significance: None,
969                },
970                baseline_max_rss_kb: None,
971                current_max_rss_kb: None,
972                rss_diff_kb: None,
973                baseline_throughput_per_s: None,
974                current_throughput_per_s: None,
975                throughput_diff_per_s: None,
976            };
977
978            let comparison = compare_paired_stats(&stats);
979            assert!((comparison.pct_change - 0.00001).abs() < 0.000001);
980        }
981    }
982
/// With no samples at all, the coefficient of variation is reported as 0.0.
#[test]
fn test_compute_paired_cv_empty_samples() {
    let no_samples: Vec<PairedSample> = Vec::new();
    let cv = compute_paired_cv(&no_samples);
    assert_eq!(cv, 0.0);
}
988
/// Three identical (100, 110) pairs have no spread in their diffs, so the CV is 0.0.
#[test]
fn test_compute_paired_cv_no_variance() {
    let samples: Vec<PairedSample> = (0..3)
        .map(|idx| paired_sample(idx, false, 100, 110))
        .collect();
    let cv = compute_paired_cv(&samples);
    assert_eq!(cv, 0.0);
}
998
/// Diffs of 10, 20 and 30 ms should give a CV of roughly 0.408.
#[test]
fn test_compute_paired_cv_with_variance() {
    // Diffs: 10, 20, 30 => mean = 20, stddev = sqrt(200/3) ~= 8.165,
    // so CV = 8.165 / 20 ~= 0.408.
    let first = paired_sample(0, false, 100, 110);
    let second = paired_sample(1, false, 100, 120);
    let third = paired_sample(2, false, 100, 130);
    let samples = vec![first, second, third];

    let cv = compute_paired_cv(&samples);
    let deviation = (cv - 0.4082).abs();
    assert!(deviation < 0.01, "expected CV ~0.408, got {}", cv);
}
1011
/// A wildly different warmup pair must not influence the CV: the two
/// remaining measured pairs are identical, so the CV is 0.0.
#[test]
fn test_compute_paired_cv_skips_warmup() {
    // Start with the warmup pair (should be ignored), then add the measured ones.
    let mut samples = vec![paired_sample(0, true, 100, 1000)];
    samples.extend((1..3).map(|idx| paired_sample(idx, false, 100, 110)));

    let cv = compute_paired_cv(&samples);
    assert_eq!(cv, 0.0);
}
1021
/// Diffs of -10 and +10 average to zero; the CV must then be 0.0 rather than
/// the result of dividing by a zero mean.
#[test]
fn test_compute_paired_cv_zero_mean() {
    let got_faster = paired_sample(0, false, 110, 100); // diff = -10
    let got_slower = paired_sample(1, false, 100, 110); // diff = +10
    let samples = vec![got_faster, got_slower];

    let cv = compute_paired_cv(&samples);
    assert_eq!(cv, 0.0);
}
1031
/// Diffs of -50 and +60 have a small mean (5) but a large spread, so the CV
/// is expected to exceed 1.0.
#[test]
fn test_compute_paired_cv_high_noise() {
    let big_improvement = paired_sample(0, false, 200, 150); // diff = -50
    let big_regression = paired_sample(1, false, 100, 160); // diff = +60
    let samples = vec![big_improvement, big_regression];

    let cv = compute_paired_cv(&samples);
    assert!(cv > 1.0, "expected very high CV, got {}", cv);
}
1042}
1043
#[cfg(test)]
mod property_tests {
    use super::*;
    use perfgate_types::{PairedSample, PairedSampleHalf};
    use proptest::prelude::*;

    /// Strategy producing `f64` values in the range `-1e100..1e100`.
    fn finite_f64_strategy() -> impl Strategy<Value = f64> {
        -1e100f64..1e100f64
    }

    /// Strategy producing between 1 and 99 differences drawn from
    /// [`finite_f64_strategy`].
    fn diffs_strategy() -> impl Strategy<Value = Vec<f64>> {
        prop::collection::vec(finite_f64_strategy(), 1..100)
    }

    /// Strategy producing a vector of wall times in `1..10000` ms whose length
    /// lies in `min_len..50`.
    fn wall_ms_vec(min_len: usize) -> impl Strategy<Value = Vec<u64>> {
        prop::collection::vec(1u64..10000u64, min_len..50)
    }

    /// Builds one sample half with the given wall time, exit code 0, no
    /// timeout, and no RSS or captured output.
    fn make_half(wall_ms: u64) -> PairedSampleHalf {
        PairedSampleHalf {
            wall_ms,
            exit_code: 0,
            timed_out: false,
            max_rss_kb: None,
            stdout: None,
            stderr: None,
        }
    }

    /// Pairs up `baseline[i]` with `current[i]` as non-warmup samples.
    ///
    /// The slices are zipped, so the result has as many samples as the
    /// shorter slice; callers do not need to truncate the inputs first.
    fn make_paired_samples(baseline: &[u64], current: &[u64]) -> Vec<PairedSample> {
        baseline
            .iter()
            .zip(current.iter())
            .enumerate()
            .map(|(i, (&b, &c))| PairedSample {
                pair_index: i as u32,
                warmup: false,
                baseline: make_half(b),
                current: make_half(c),
                wall_diff_ms: c as i64 - b as i64,
                rss_diff_kb: None,
            })
            .collect()
    }

    /// Builds a `PairedStats` fixture with constant 100 ms wall-time summaries,
    /// a wall-diff summary described by `mean`, `std_dev` and `count`
    /// (median = mean, min/max = mean -/+ std_dev), and every RSS and
    /// throughput field set to `None`.
    fn synthetic_stats(mean: f64, std_dev: f64, count: u32) -> PairedStats {
        let flat_wall = || perfgate_types::U64Summary::new(100, 100, 100);
        PairedStats {
            baseline_wall_ms: flat_wall(),
            current_wall_ms: flat_wall(),
            wall_diff_ms: PairedDiffSummary {
                mean,
                median: mean,
                std_dev,
                min: mean - std_dev,
                max: mean + std_dev,
                count,
                significance: None,
            },
            baseline_max_rss_kb: None,
            current_max_rss_kb: None,
            rss_diff_kb: None,
            baseline_throughput_per_s: None,
            current_throughput_per_s: None,
            throughput_diff_per_s: None,
        }
    }

    proptest! {
        /// The summary's count equals the number of input differences.
        #[test]
        fn prop_summarize_paired_diffs_count_matches(diffs in diffs_strategy()) {
            let summary = summarize_paired_diffs(&diffs, None).unwrap();
            prop_assert_eq!(summary.count, diffs.len() as u32);
        }

        /// The summary's mean matches a straightforward sum / len computation.
        #[test]
        fn prop_summarize_paired_diffs_mean_correct(diffs in diffs_strategy()) {
            let summary = summarize_paired_diffs(&diffs, None).unwrap();
            let expected_mean: f64 = diffs.iter().sum::<f64>() / diffs.len() as f64;
            prop_assert!((summary.mean - expected_mean).abs() < 1e-10 || expected_mean.abs() < 1e-10);
        }

        /// The summary's min and max match a fold over the inputs.
        #[test]
        fn prop_summarize_paired_diffs_min_max_bounds(diffs in diffs_strategy()) {
            let summary = summarize_paired_diffs(&diffs, None).unwrap();
            let expected_min = diffs.iter().cloned().fold(f64::INFINITY, f64::min);
            let expected_max = diffs.iter().cloned().fold(f64::NEG_INFINITY, f64::max);

            if expected_min.is_finite() {
                prop_assert!((summary.min - expected_min).abs() < 1e-10);
            }
            if expected_max.is_finite() {
                prop_assert!((summary.max - expected_max).abs() < 1e-10);
            }
        }

        /// Whenever all three are finite, min <= median <= max.
        #[test]
        fn prop_summarize_paired_diffs_ordering(diffs in diffs_strategy()) {
            let summary = summarize_paired_diffs(&diffs, None).unwrap();
            if summary.min.is_finite() && summary.median.is_finite() && summary.max.is_finite() {
                prop_assert!(summary.min <= summary.median);
                prop_assert!(summary.median <= summary.max);
            }
        }

        /// The standard deviation is never a finite negative number.
        #[test]
        fn prop_std_dev_non_negative(diffs in diffs_strategy()) {
            let summary = summarize_paired_diffs(&diffs, None).unwrap();
            prop_assert!(summary.std_dev >= 0.0 || !summary.std_dev.is_finite());
        }

        /// The 95% confidence interval always brackets the mean difference.
        #[test]
        fn prop_ci_contains_mean(
            mean in -1000.0f64..1000.0,
            std_dev in 0.1f64..100.0,
            count in 2u32..100
        ) {
            let comparison = compare_paired_stats(&synthetic_stats(mean, std_dev, count));

            prop_assert!(comparison.ci_95_lower <= mean);
            prop_assert!(comparison.ci_95_upper >= mean);
        }

        /// For the same mean and spread, 100 samples give a narrower CI than 10.
        #[test]
        fn prop_ci_width_decreases_with_sample_size(
            mean in 0.0f64..100.0,
            std_dev in 1.0f64..10.0,
        ) {
            let ci_width = |count: u32| {
                let cmp = compare_paired_stats(&synthetic_stats(mean, std_dev, count));
                cmp.ci_95_upper - cmp.ci_95_lower
            };

            let width_small = ci_width(10);
            let width_large = ci_width(100);

            prop_assert!(width_large < width_small, "CI should narrow with more samples");
        }

        /// With zero standard deviation the standard error is zero and the CI
        /// collapses onto the mean.
        #[test]
        fn prop_zero_variance_zero_std_error(
            mean in -1000.0f64..1000.0,
            count in 2u32..100
        ) {
            let comparison = compare_paired_stats(&synthetic_stats(mean, 0.0, count));

            prop_assert_eq!(comparison.std_error, 0.0);
            prop_assert_eq!(comparison.ci_95_lower, mean);
            prop_assert_eq!(comparison.ci_95_upper, mean);
        }

        /// `is_significant` is exactly "the CI does not span zero".
        #[test]
        fn prop_significance_deterministic(
            mean in -100.0f64..100.0,
            std_dev in 0.0f64..50.0,
            count in 1u32..50
        ) {
            let comparison = compare_paired_stats(&synthetic_stats(mean, std_dev, count));

            let is_significant = comparison.ci_95_lower > 0.0 || comparison.ci_95_upper < 0.0;
            prop_assert_eq!(comparison.is_significant, is_significant);
        }

        /// Computing stats twice from the same samples gives the same outcome.
        #[test]
        fn prop_paired_stats_deterministic(
            baseline in wall_ms_vec(5),
            current in wall_ms_vec(5),
        ) {
            let samples = make_paired_samples(&baseline, &current);
            let r1 = compute_paired_stats(&samples, None, None);
            let r2 = compute_paired_stats(&samples, None, None);
            match (r1, r2) {
                (Ok(s1), Ok(s2)) => {
                    prop_assert_eq!(s1.wall_diff_ms.mean, s2.wall_diff_ms.mean);
                    prop_assert_eq!(s1.wall_diff_ms.median, s2.wall_diff_ms.median);
                    prop_assert_eq!(s1.wall_diff_ms.std_dev, s2.wall_diff_ms.std_dev);
                    prop_assert_eq!(s1.wall_diff_ms.count, s2.wall_diff_ms.count);
                }
                (Err(_), Err(_)) => {}
                _ => prop_assert!(false, "both calls must produce the same result"),
            }
        }

        /// For stats computed from real samples, the CI brackets the mean diff.
        #[test]
        fn prop_ci_contains_mean_diff_from_samples(
            baseline in wall_ms_vec(5),
            current in wall_ms_vec(5),
        ) {
            let samples = make_paired_samples(&baseline, &current);
            if let Ok(stats) = compute_paired_stats(&samples, None, None) {
                let cmp = compare_paired_stats(&stats);
                prop_assert!(
                    cmp.ci_95_lower <= cmp.mean_diff_ms,
                    "CI lower {} must be <= mean {}",
                    cmp.ci_95_lower, cmp.mean_diff_ms
                );
                prop_assert!(
                    cmp.ci_95_upper >= cmp.mean_diff_ms,
                    "CI upper {} must be >= mean {}",
                    cmp.ci_95_upper, cmp.mean_diff_ms
                );
            }
        }

        /// Swapping baseline and current negates the mean difference.
        #[test]
        fn prop_reversing_negates_mean_diff(
            baseline in wall_ms_vec(5),
            current in wall_ms_vec(5),
        ) {
            let fwd = make_paired_samples(&baseline, &current);
            let rev = make_paired_samples(&current, &baseline);
            if let (Ok(fwd_stats), Ok(rev_stats)) =
                (compute_paired_stats(&fwd, None, None), compute_paired_stats(&rev, None, None))
            {
                let fwd_cmp = compare_paired_stats(&fwd_stats);
                let rev_cmp = compare_paired_stats(&rev_stats);
                prop_assert!(
                    (fwd_cmp.mean_diff_ms + rev_cmp.mean_diff_ms).abs() < 1e-10,
                    "reversing must negate mean diff: {} vs {}",
                    fwd_cmp.mean_diff_ms, rev_cmp.mean_diff_ms
                );
            }
        }

        /// Full pipeline determinism: compute + compare yields identical results.
        #[test]
        fn prop_full_pipeline_determinism(
            baseline in wall_ms_vec(5),
            current in wall_ms_vec(5),
        ) {
            let samples = make_paired_samples(&baseline, &current);
            let stats1 = compute_paired_stats(&samples, None, None).unwrap();
            let stats2 = compute_paired_stats(&samples, None, None).unwrap();
            let cmp1 = compare_paired_stats(&stats1);
            let cmp2 = compare_paired_stats(&stats2);
            prop_assert_eq!(cmp1, cmp2, "identical inputs must produce identical comparisons");
        }

        /// Sample count in output matches number of non-warmup input samples.
        #[test]
        fn prop_sample_count_preserved(
            baseline in wall_ms_vec(2),
            current in wall_ms_vec(2),
        ) {
            let samples = make_paired_samples(&baseline, &current);
            let non_warmup = samples.iter().filter(|s| !s.warmup).count() as u32;
            let stats = compute_paired_stats(&samples, None, None).unwrap();
            prop_assert_eq!(
                stats.wall_diff_ms.count, non_warmup,
                "output count {} must equal non-warmup input count {}",
                stats.wall_diff_ms.count, non_warmup
            );
        }

        /// Swapping baseline/current flips the sign of the mean difference and
        /// negates the median difference.
        #[test]
        fn prop_diff_symmetry_direction_flips(
            baseline in wall_ms_vec(5),
            current in wall_ms_vec(5),
        ) {
            let fwd = make_paired_samples(&baseline, &current);
            let rev = make_paired_samples(&current, &baseline);
            if let (Ok(fwd_stats), Ok(rev_stats)) =
                (compute_paired_stats(&fwd, None, None), compute_paired_stats(&rev, None, None))
            {
                let fwd_cmp = compare_paired_stats(&fwd_stats);
                let rev_cmp = compare_paired_stats(&rev_stats);
                // If forward shows regression (positive diff), reverse must show improvement.
                if fwd_cmp.mean_diff_ms > 0.0 {
                    prop_assert!(
                        rev_cmp.mean_diff_ms < 0.0,
                        "if forward is regression ({:.2}), reverse must be improvement ({:.2})",
                        fwd_cmp.mean_diff_ms, rev_cmp.mean_diff_ms
                    );
                } else if fwd_cmp.mean_diff_ms < 0.0 {
                    prop_assert!(
                        rev_cmp.mean_diff_ms > 0.0,
                        "if forward is improvement ({:.2}), reverse must be regression ({:.2})",
                        fwd_cmp.mean_diff_ms, rev_cmp.mean_diff_ms
                    );
                }
                // Median must also flip.
                prop_assert!(
                    (fwd_cmp.median_diff_ms + rev_cmp.median_diff_ms).abs() < 1e-10,
                    "median diff must negate: {:.2} vs {:.2}",
                    fwd_cmp.median_diff_ms, rev_cmp.median_diff_ms
                );
            }
        }

        /// Mean difference is bounded by min and max of individual pair diffs.
        #[test]
        fn prop_mean_bounded_by_min_max(
            baseline in wall_ms_vec(2),
            current in wall_ms_vec(2),
        ) {
            let samples = make_paired_samples(&baseline, &current);
            if let Ok(stats) = compute_paired_stats(&samples, None, None) {
                let diff = &stats.wall_diff_ms;
                prop_assert!(
                    diff.mean >= diff.min,
                    "mean {:.2} must be >= min {:.2}",
                    diff.mean, diff.min
                );
                prop_assert!(
                    diff.mean <= diff.max,
                    "mean {:.2} must be <= max {:.2}",
                    diff.mean, diff.max
                );
            }
        }
    }
}