1use perfgate_stats::{summarize_f64, summarize_u64};
79use perfgate_types::{
80 PairedDiffSummary, PairedSample, PairedStats, Significance, SignificancePolicy,
81};
82
83pub use perfgate_error::PairedError;
84
85pub fn compute_paired_stats(
91 samples: &[PairedSample],
92 work_units: Option<u64>,
93 significance_policy: Option<&SignificancePolicy>,
94) -> Result<PairedStats, PairedError> {
95 let measured: Vec<&PairedSample> = samples.iter().filter(|s| !s.warmup).collect();
96 if measured.is_empty() {
97 return Err(PairedError::NoSamples);
98 }
99
100 let baseline_wall: Vec<u64> = measured.iter().map(|s| s.baseline.wall_ms).collect();
101 let current_wall: Vec<u64> = measured.iter().map(|s| s.current.wall_ms).collect();
102 let wall_diffs: Vec<f64> = measured.iter().map(|s| s.wall_diff_ms as f64).collect();
103
104 let baseline_wall_ms = summarize_u64(&baseline_wall).map_err(|_| PairedError::NoSamples)?;
105 let current_wall_ms = summarize_u64(¤t_wall).map_err(|_| PairedError::NoSamples)?;
106 let wall_diff_ms = summarize_paired_diffs(&wall_diffs, significance_policy)?;
107
108 let baseline_rss: Vec<u64> = measured
109 .iter()
110 .filter_map(|s| s.baseline.max_rss_kb)
111 .collect();
112 let current_rss: Vec<u64> = measured
113 .iter()
114 .filter_map(|s| s.current.max_rss_kb)
115 .collect();
116 let rss_diffs: Vec<f64> = measured
117 .iter()
118 .filter_map(|s| s.rss_diff_kb)
119 .map(|d| d as f64)
120 .collect();
121
122 let baseline_max_rss_kb = if baseline_rss.is_empty() {
123 None
124 } else {
125 Some(summarize_u64(&baseline_rss).map_err(|_| PairedError::NoSamples)?)
126 };
127 let current_max_rss_kb = if current_rss.is_empty() {
128 None
129 } else {
130 Some(summarize_u64(¤t_rss).map_err(|_| PairedError::NoSamples)?)
131 };
132 let rss_diff_kb = if rss_diffs.is_empty() {
133 None
134 } else {
135 Some(summarize_paired_diffs(&rss_diffs, significance_policy)?)
136 };
137
138 let (baseline_throughput_per_s, current_throughput_per_s, throughput_diff_per_s) =
139 match work_units {
140 Some(work) => {
141 let baseline_thr: Vec<f64> = measured
142 .iter()
143 .map(|s| {
144 let secs = s.baseline.wall_ms as f64 / 1000.0;
145 if secs <= 0.0 { 0.0 } else { work as f64 / secs }
146 })
147 .collect();
148 let current_thr: Vec<f64> = measured
149 .iter()
150 .map(|s| {
151 let secs = s.current.wall_ms as f64 / 1000.0;
152 if secs <= 0.0 { 0.0 } else { work as f64 / secs }
153 })
154 .collect();
155 let thr_diffs: Vec<f64> = baseline_thr
156 .iter()
157 .zip(current_thr.iter())
158 .map(|(b, c)| c - b)
159 .collect();
160 (
161 Some(summarize_f64(&baseline_thr).map_err(|_| PairedError::NoSamples)?),
162 Some(summarize_f64(¤t_thr).map_err(|_| PairedError::NoSamples)?),
163 Some(summarize_paired_diffs(&thr_diffs, significance_policy)?),
164 )
165 }
166 None => (None, None, None),
167 };
168
169 Ok(PairedStats {
170 baseline_wall_ms,
171 current_wall_ms,
172 wall_diff_ms,
173 baseline_max_rss_kb,
174 current_max_rss_kb,
175 rss_diff_kb,
176 baseline_throughput_per_s,
177 current_throughput_per_s,
178 throughput_diff_per_s,
179 })
180}
181
182pub fn summarize_paired_diffs(
184 diffs: &[f64],
185 policy: Option<&SignificancePolicy>,
186) -> Result<PairedDiffSummary, PairedError> {
187 if diffs.is_empty() {
188 return Err(PairedError::NoSamples);
189 }
190 let count = diffs.len() as u32;
191 let mean = diffs.iter().sum::<f64>() / count as f64;
192 let mut sorted = diffs.to_vec();
193 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
194 let median = if count % 2 == 1 {
195 sorted[(count / 2) as usize]
196 } else {
197 (sorted[(count / 2 - 1) as usize] + sorted[(count / 2) as usize]) / 2.0
198 };
199 let min = *sorted.first().unwrap();
200 let max = *sorted.last().unwrap();
201 let variance = diffs.iter().map(|d| (d - mean).powi(2)).sum::<f64>() / count as f64;
202 let std_dev = variance.sqrt();
203
204 let significance = policy.map(|p| {
205 let n = count as f64;
206 let std_error = if n > 1.0 { std_dev / n.sqrt() } else { 0.0 };
207 let alpha = p.alpha.unwrap_or(0.05);
208 let min_samples = p.min_samples.unwrap_or(3);
209
210 let t_value = if n >= 30.0 { 1.96 } else { 2.0 };
211 let ci_lower = mean - t_value * std_error;
212 let ci_upper = mean + t_value * std_error;
213
214 let significant = n >= min_samples as f64 && (ci_lower > 0.0 || ci_upper < 0.0);
215
216 Significance {
217 test: perfgate_types::SignificanceTest::WelchT,
218 significant,
219 alpha,
220 p_value: None, ci_lower: Some(ci_lower),
222 ci_upper: Some(ci_upper),
223 baseline_samples: count,
224 current_samples: count,
225 }
226 });
227
228 Ok(PairedDiffSummary {
229 mean,
230 median,
231 std_dev,
232 min,
233 max,
234 count,
235 significance,
236 })
237}
238
239pub fn compute_paired_cv(samples: &[PairedSample]) -> f64 {
244 let measured: Vec<f64> = samples
245 .iter()
246 .filter(|s| !s.warmup)
247 .map(|s| s.wall_diff_ms as f64)
248 .collect();
249 if measured.is_empty() {
250 return 0.0;
251 }
252 let n = measured.len() as f64;
253 let mean = measured.iter().sum::<f64>() / n;
254 if mean.abs() < f64::EPSILON {
255 return 0.0;
256 }
257 let variance = measured.iter().map(|d| (d - mean).powi(2)).sum::<f64>() / n;
258 variance.sqrt() / mean.abs()
259}
260
/// Result of comparing the wall-time differences of a paired run, as built by
/// [`compare_paired_stats`].
#[derive(Debug, Clone, PartialEq)]
pub struct PairedComparison {
    /// Mean of the per-pair wall-time differences, in milliseconds.
    pub mean_diff_ms: f64,
    /// Median of the per-pair wall-time differences, in milliseconds.
    pub median_diff_ms: f64,
    /// Mean difference divided by the baseline median wall time.
    ///
    /// This is a fraction, not a percentage (`0.1` means 10%). It is `0.0`
    /// when the baseline median is zero.
    pub pct_change: f64,
    /// Standard deviation of the differences divided by the square root of
    /// the sample count; `0.0` when there is at most one sample.
    pub std_error: f64,
    /// Lower bound of the interval `mean - t * std_error`, where `t` is
    /// `1.96` for 30 or more samples and `2.0` otherwise.
    pub ci_95_lower: f64,
    /// Upper bound of the interval `mean + t * std_error`, with the same `t`
    /// as the lower bound.
    pub ci_95_upper: f64,
    /// `true` when the interval lies entirely above or entirely below zero.
    pub is_significant: bool,
}
299
300pub fn compare_paired_stats(stats: &PairedStats) -> PairedComparison {
333 let diff = &stats.wall_diff_ms;
334 let n = diff.count as f64;
335 let std_error = if n > 1.0 {
336 diff.std_dev / n.sqrt()
337 } else {
338 0.0
339 };
340 let t_value = if n >= 30.0 { 1.96 } else { 2.0 };
341 let ci_95_lower = diff.mean - t_value * std_error;
342 let ci_95_upper = diff.mean + t_value * std_error;
343 let is_significant = ci_95_lower > 0.0 || ci_95_upper < 0.0;
344 let baseline_mean = stats.baseline_wall_ms.median as f64;
345 let pct_change = if baseline_mean > 0.0 {
346 diff.mean / baseline_mean
347 } else {
348 0.0
349 };
350 PairedComparison {
351 mean_diff_ms: diff.mean,
352 median_diff_ms: diff.median,
353 pct_change,
354 std_error,
355 ci_95_lower,
356 ci_95_upper,
357 is_significant,
358 }
359}
360
#[cfg(test)]
mod tests {
    use super::*;
    use perfgate_types::{PairedSampleHalf, U64Summary};

    // ---------------------------------------------------------------------
    // Fixture builders
    // ---------------------------------------------------------------------

    /// One successful half of a pair with the given wall time and optional RSS.
    fn half(wall_ms: u64, max_rss_kb: Option<u64>) -> PairedSampleHalf {
        PairedSampleHalf {
            wall_ms,
            exit_code: 0,
            timed_out: false,
            max_rss_kb,
            stdout: None,
            stderr: None,
        }
    }

    /// Half of a pair without any RSS reading.
    fn sample_half(wall_ms: u64) -> PairedSampleHalf {
        half(wall_ms, None)
    }

    /// Half of a pair that also reports a peak RSS.
    fn sample_half_with_rss(wall_ms: u64, max_rss_kb: u64) -> PairedSampleHalf {
        half(wall_ms, Some(max_rss_kb))
    }

    /// Pair without RSS data; the wall diff is current minus baseline.
    fn paired_sample(
        pair_index: u32,
        warmup: bool,
        baseline_wall_ms: u64,
        current_wall_ms: u64,
    ) -> PairedSample {
        let wall_diff_ms = current_wall_ms as i64 - baseline_wall_ms as i64;
        PairedSample {
            pair_index,
            warmup,
            baseline: sample_half(baseline_wall_ms),
            current: sample_half(current_wall_ms),
            wall_diff_ms,
            rss_diff_kb: None,
        }
    }

    /// Pair with RSS data; both diffs are current minus baseline.
    fn paired_sample_with_rss(
        pair_index: u32,
        warmup: bool,
        baseline_wall_ms: u64,
        current_wall_ms: u64,
        baseline_rss: u64,
        current_rss: u64,
    ) -> PairedSample {
        let wall_diff_ms = current_wall_ms as i64 - baseline_wall_ms as i64;
        let rss_diff_kb = current_rss as i64 - baseline_rss as i64;
        PairedSample {
            pair_index,
            warmup,
            baseline: sample_half_with_rss(baseline_wall_ms, baseline_rss),
            current: sample_half_with_rss(current_wall_ms, current_rss),
            wall_diff_ms,
            rss_diff_kb: Some(rss_diff_kb),
        }
    }

    /// Non-warmup pairs, indexed from zero, built from `(baseline, current)`
    /// wall times.
    fn measured_pairs(walls: &[(u64, u64)]) -> Vec<PairedSample> {
        walls
            .iter()
            .enumerate()
            .map(|(index, &(baseline, current))| {
                paired_sample(index as u32, false, baseline, current)
            })
            .collect()
    }

    /// Summary whose median, min and max are all `value`.
    fn flat(value: u64) -> U64Summary {
        U64Summary::new(value, value, value)
    }

    /// Difference summary without significance information.
    fn diff_summary(
        mean: f64,
        median: f64,
        std_dev: f64,
        (min, max): (f64, f64),
        count: u32,
    ) -> PairedDiffSummary {
        PairedDiffSummary {
            mean,
            median,
            std_dev,
            min,
            max,
            count,
            significance: None,
        }
    }

    /// `PairedStats` carrying only wall-time data.
    fn wall_stats(
        baseline_wall_ms: U64Summary,
        current_wall_ms: U64Summary,
        wall_diff_ms: PairedDiffSummary,
    ) -> PairedStats {
        PairedStats {
            baseline_wall_ms,
            current_wall_ms,
            wall_diff_ms,
            baseline_max_rss_kb: None,
            current_max_rss_kb: None,
            rss_diff_kb: None,
            baseline_throughput_per_s: None,
            current_throughput_per_s: None,
            throughput_diff_per_s: None,
        }
    }

    // ---------------------------------------------------------------------
    // compute_paired_stats
    // ---------------------------------------------------------------------

    #[test]
    fn test_compute_paired_stats_basic() {
        let samples = measured_pairs(&[(100, 90), (110, 100), (120, 110)]);

        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");

        let baseline = &stats.baseline_wall_ms;
        assert_eq!(baseline.median, 110);
        assert_eq!(baseline.min, 100);
        assert_eq!(baseline.max, 120);

        let current = &stats.current_wall_ms;
        assert_eq!(current.median, 100);
        assert_eq!(current.min, 90);
        assert_eq!(current.max, 110);

        let diff = &stats.wall_diff_ms;
        assert_eq!(diff.mean, -10.0);
        assert_eq!(diff.median, -10.0);
        assert_eq!(diff.std_dev, 0.0);
        assert_eq!(diff.min, -10.0);
        assert_eq!(diff.max, -10.0);
        assert_eq!(diff.count, 3);
    }

    #[test]
    fn test_compute_paired_stats_with_variance() {
        let samples = measured_pairs(&[(100, 110), (100, 120), (100, 130)]);

        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");
        let diff = &stats.wall_diff_ms;

        assert_eq!(diff.mean, 20.0);
        assert_eq!(diff.median, 20.0);
        assert_eq!(diff.min, 10.0);
        assert_eq!(diff.max, 30.0);
        assert_eq!(diff.count, 3);

        // Population standard deviation of {10, 20, 30}.
        let expected_std_dev = (200.0_f64 / 3.0).sqrt();
        assert!(
            (diff.std_dev - expected_std_dev).abs() < 0.001,
            "std_dev should be ~8.165, got {}",
            diff.std_dev
        );
    }

    #[test]
    fn test_compute_paired_stats_filters_warmup() {
        let samples = vec![
            paired_sample(0, true, 1000, 2000),
            paired_sample(1, true, 1000, 2000),
            paired_sample(2, false, 100, 110),
            paired_sample(3, false, 100, 120),
        ];

        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");

        assert_eq!(stats.wall_diff_ms.count, 2);
        assert_eq!(stats.baseline_wall_ms.median, 100);
        assert_eq!(stats.current_wall_ms.median, 115);
    }

    #[test]
    fn test_compute_paired_stats_empty_after_warmup_filter() {
        let samples = vec![
            paired_sample(0, true, 100, 110),
            paired_sample(1, true, 100, 120),
        ];

        let result = compute_paired_stats(&samples, None, None);

        assert!(result.is_err(), "should error with no measured samples");
        assert!(matches!(result, Err(PairedError::NoSamples)));
    }

    #[test]
    fn test_compute_paired_stats_empty_samples() {
        let samples: Vec<PairedSample> = Vec::new();

        let result = compute_paired_stats(&samples, None, None);

        assert!(result.is_err(), "should error with empty samples");
        assert!(matches!(result, Err(PairedError::NoSamples)));
    }

    #[test]
    fn test_compute_paired_stats_single_sample() {
        let samples = measured_pairs(&[(100, 150)]);

        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");

        let baseline = &stats.baseline_wall_ms;
        assert_eq!(baseline.median, 100);
        assert_eq!(baseline.min, 100);
        assert_eq!(baseline.max, 100);

        assert_eq!(stats.current_wall_ms.median, 150);

        let diff = &stats.wall_diff_ms;
        assert_eq!(diff.mean, 50.0);
        assert_eq!(diff.median, 50.0);
        assert_eq!(diff.std_dev, 0.0);
        assert_eq!(diff.count, 1);
    }

    #[test]
    fn test_compute_paired_stats_with_rss() {
        let samples: Vec<PairedSample> = [(110, 1100), (120, 1200), (130, 1300)]
            .iter()
            .enumerate()
            .map(|(index, &(current_wall, current_rss))| {
                paired_sample_with_rss(index as u32, false, 100, current_wall, 1000, current_rss)
            })
            .collect();

        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");

        let baseline_rss = stats.baseline_max_rss_kb.expect("should have baseline RSS");
        assert_eq!(baseline_rss.median, 1000);

        let current_rss = stats.current_max_rss_kb.expect("should have current RSS");
        assert_eq!(current_rss.median, 1200);

        let rss_diff = stats.rss_diff_kb.expect("should have RSS diff");
        assert_eq!(rss_diff.mean, 200.0);
        assert_eq!(rss_diff.count, 3);
    }

    #[test]
    fn test_compute_paired_stats_with_work_units() {
        let samples = measured_pairs(&[(1000, 500), (1000, 500)]);

        let stats = compute_paired_stats(&samples, Some(100), None).expect("should compute stats");

        // 100 units in 1 s, then 100 units in 0.5 s.
        let baseline_thr = stats
            .baseline_throughput_per_s
            .expect("should have baseline throughput");
        assert_eq!(baseline_thr.median, 100.0);

        let current_thr = stats
            .current_throughput_per_s
            .expect("should have current throughput");
        assert_eq!(current_thr.median, 200.0);

        let thr_diff = stats
            .throughput_diff_per_s
            .expect("should have throughput diff");
        assert_eq!(thr_diff.mean, 100.0);
    }

    #[test]
    fn test_compute_paired_stats_no_throughput_without_work_units() {
        let samples = measured_pairs(&[(100, 110)]);

        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");

        assert!(stats.baseline_throughput_per_s.is_none());
        assert!(stats.current_throughput_per_s.is_none());
        assert!(stats.throughput_diff_per_s.is_none());
    }

    #[test]
    fn test_compute_paired_stats_negative_diffs() {
        let samples = measured_pairs(&[(200, 100), (200, 100)]);

        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");

        assert_eq!(stats.wall_diff_ms.mean, -100.0);
        assert_eq!(stats.wall_diff_ms.median, -100.0);
    }

    #[test]
    fn test_compute_paired_stats_even_count_median() {
        let samples = measured_pairs(&[(100, 110), (100, 120), (100, 130), (100, 140)]);

        let stats = compute_paired_stats(&samples, None, None).expect("should compute stats");

        assert_eq!(stats.wall_diff_ms.median, 25.0);
        assert_eq!(stats.wall_diff_ms.mean, 25.0);
    }

    // ---------------------------------------------------------------------
    // compare_paired_stats
    // ---------------------------------------------------------------------

    #[test]
    fn test_compare_paired_stats_basic() {
        let stats = wall_stats(
            U64Summary::new(100, 90, 110),
            U64Summary::new(110, 100, 120),
            diff_summary(10.0, 10.0, 5.0, (5.0, 15.0), 10),
        );

        let comparison = compare_paired_stats(&stats);

        assert_eq!(comparison.mean_diff_ms, 10.0);
        assert_eq!(comparison.median_diff_ms, 10.0);
        assert_eq!(comparison.pct_change, 0.1);

        let expected_std_error = 5.0 / (10.0_f64).sqrt();
        assert!(
            (comparison.std_error - expected_std_error).abs() < 0.01,
            "std_error should be ~1.58, got {}",
            comparison.std_error
        );
    }

    #[test]
    fn test_compare_paired_stats_ci_calculation() {
        let stats = wall_stats(
            flat(100),
            flat(110),
            diff_summary(10.0, 10.0, 2.0, (8.0, 12.0), 5),
        );

        let comparison = compare_paired_stats(&stats);

        // Fewer than 30 samples, so the critical value is 2.0.
        let expected_std_error = 2.0 / (5.0_f64).sqrt();
        let expected_ci_lower = 10.0 - 2.0 * expected_std_error;
        let expected_ci_upper = 10.0 + 2.0 * expected_std_error;

        assert!(
            (comparison.ci_95_lower - expected_ci_lower).abs() < 0.01,
            "ci_95_lower should be ~{}, got {}",
            expected_ci_lower,
            comparison.ci_95_lower
        );
        assert!(
            (comparison.ci_95_upper - expected_ci_upper).abs() < 0.01,
            "ci_95_upper should be ~{}, got {}",
            expected_ci_upper,
            comparison.ci_95_upper
        );

        assert!(
            comparison.is_significant,
            "result should be significant when CI doesn't span zero"
        );
    }

    #[test]
    fn test_compare_paired_stats_large_sample_t_value() {
        let stats = wall_stats(
            flat(100),
            flat(110),
            diff_summary(10.0, 10.0, 5.0, (0.0, 20.0), 30),
        );

        let comparison = compare_paired_stats(&stats);

        let expected_std_error = 5.0 / (30.0_f64).sqrt();
        let expected_ci_lower = 10.0 - 1.96 * expected_std_error;

        assert!(
            (comparison.ci_95_lower - expected_ci_lower).abs() < 0.01,
            "ci_95_lower with n>=30 should use t_value=1.96"
        );
    }

    #[test]
    fn test_compare_paired_stats_not_significant() {
        let stats = wall_stats(
            flat(100),
            flat(101),
            diff_summary(1.0, 1.0, 10.0, (-15.0, 15.0), 5),
        );

        let comparison = compare_paired_stats(&stats);

        assert!(
            !comparison.is_significant,
            "result should not be significant when CI spans zero: [{}, {}]",
            comparison.ci_95_lower, comparison.ci_95_upper
        );
        assert!(
            comparison.ci_95_lower < 0.0 && comparison.ci_95_upper > 0.0,
            "CI should span zero"
        );
    }

    #[test]
    fn test_compare_paired_stats_single_sample() {
        let stats = wall_stats(
            flat(100),
            flat(110),
            diff_summary(10.0, 10.0, 0.0, (10.0, 10.0), 1),
        );

        let comparison = compare_paired_stats(&stats);

        assert_eq!(comparison.std_error, 0.0);
        assert_eq!(comparison.ci_95_lower, 10.0);
        assert_eq!(comparison.ci_95_upper, 10.0);
    }

    #[test]
    fn test_compare_paired_stats_zero_baseline() {
        let stats = wall_stats(
            flat(0),
            flat(10),
            diff_summary(10.0, 10.0, 0.0, (10.0, 10.0), 1),
        );

        let comparison = compare_paired_stats(&stats);

        assert_eq!(
            comparison.pct_change, 0.0,
            "pct_change should be 0 when baseline is 0"
        );
    }

    #[test]
    fn test_compare_paired_stats_negative_improvement() {
        let stats = wall_stats(
            flat(100),
            flat(80),
            diff_summary(-20.0, -20.0, 2.0, (-22.0, -18.0), 5),
        );

        let comparison = compare_paired_stats(&stats);

        assert_eq!(comparison.mean_diff_ms, -20.0);
        assert_eq!(comparison.pct_change, -0.2);
        assert!(
            comparison.is_significant,
            "significant improvement should be detected"
        );
        assert!(
            comparison.ci_95_upper < 0.0,
            "CI upper bound should be negative for improvement"
        );
    }

    // ---------------------------------------------------------------------
    // summarize_paired_diffs
    // ---------------------------------------------------------------------

    #[test]
    fn test_summarize_paired_diffs_empty() {
        let outcome = summarize_paired_diffs(&[], None);
        assert!(matches!(outcome, Err(PairedError::NoSamples)));
    }

    #[test]
    fn test_summarize_paired_diffs_single() {
        let summary = summarize_paired_diffs(&[5.0], None).unwrap();

        assert_eq!(summary.mean, 5.0);
        assert_eq!(summary.median, 5.0);
        assert_eq!(summary.std_dev, 0.0);
        assert_eq!(summary.min, 5.0);
        assert_eq!(summary.max, 5.0);
        assert_eq!(summary.count, 1);
    }

    #[test]
    fn test_summarize_paired_diffs_zero_variance() {
        let constant = [10.0; 4];
        let summary = summarize_paired_diffs(&constant, None).unwrap();

        assert_eq!(summary.mean, 10.0);
        assert_eq!(summary.std_dev, 0.0);
        assert_eq!(summary.count, 4);
    }

    #[test]
    fn test_summarize_paired_diffs_large_sample() {
        let diffs: Vec<f64> = (0..1000).map(|value| value as f64).collect();

        let summary = summarize_paired_diffs(&diffs, None).unwrap();

        assert_eq!(summary.count, 1000);
        assert_eq!(summary.min, 0.0);
        assert_eq!(summary.max, 999.0);

        let expected_mean = (0.0 + 999.0) / 2.0;
        assert!((summary.mean - expected_mean).abs() < 0.1);
    }

    mod edge_cases {
        use super::*;

        #[test]
        fn test_ci_bounds_with_zero_std_dev() {
            let stats = wall_stats(
                flat(100),
                flat(110),
                diff_summary(10.0, 10.0, 0.0, (10.0, 10.0), 10),
            );

            let comparison = compare_paired_stats(&stats);

            assert_eq!(comparison.std_error, 0.0);
            assert_eq!(comparison.ci_95_lower, 10.0);
            assert_eq!(comparison.ci_95_upper, 10.0);
            assert!(comparison.is_significant);
        }

        #[test]
        fn test_large_positive_diff() {
            let stats = wall_stats(
                flat(100),
                flat(100000),
                diff_summary(99900.0, 99900.0, 100.0, (99800.0, 100000.0), 50),
            );

            let comparison = compare_paired_stats(&stats);

            assert_eq!(comparison.mean_diff_ms, 99900.0);
            assert!((comparison.pct_change - 999.0).abs() < 0.01);
            assert!(comparison.is_significant);
        }

        #[test]
        fn test_very_small_diffs() {
            let stats = wall_stats(
                flat(100000),
                flat(100001),
                diff_summary(1.0, 1.0, 0.5, (0.0, 2.0), 30),
            );

            let comparison = compare_paired_stats(&stats);

            assert!((comparison.pct_change - 0.00001).abs() < 0.000001);
        }
    }

    // ---------------------------------------------------------------------
    // compute_paired_cv
    // ---------------------------------------------------------------------

    #[test]
    fn test_compute_paired_cv_empty_samples() {
        let samples: Vec<PairedSample> = Vec::new();
        assert_eq!(compute_paired_cv(&samples), 0.0);
    }

    #[test]
    fn test_compute_paired_cv_no_variance() {
        let samples = measured_pairs(&[(100, 110), (100, 110), (100, 110)]);
        assert_eq!(compute_paired_cv(&samples), 0.0);
    }

    #[test]
    fn test_compute_paired_cv_with_variance() {
        // Differences of 10, 20 and 30.
        let samples = measured_pairs(&[(100, 110), (100, 120), (100, 130)]);

        let cv = compute_paired_cv(&samples);

        assert!((cv - 0.4082).abs() < 0.01, "expected CV ~0.408, got {}", cv);
    }

    #[test]
    fn test_compute_paired_cv_skips_warmup() {
        // The warmup pair has a very different diff and must be ignored.
        let samples = vec![
            paired_sample(0, true, 100, 1000),
            paired_sample(1, false, 100, 110),
            paired_sample(2, false, 100, 110),
        ];
        assert_eq!(compute_paired_cv(&samples), 0.0);
    }

    #[test]
    fn test_compute_paired_cv_zero_mean() {
        // Differences of -10 and +10 cancel to a zero mean.
        let samples = measured_pairs(&[(110, 100), (100, 110)]);
        assert_eq!(compute_paired_cv(&samples), 0.0);
    }

    #[test]
    fn test_compute_paired_cv_high_noise() {
        // Differences of -50 and +60: small mean, large spread.
        let samples = measured_pairs(&[(200, 150), (100, 160)]);

        let cv = compute_paired_cv(&samples);

        assert!(cv > 1.0, "expected very high CV, got {}", cv);
    }
}
1043
1044#[cfg(test)]
1045mod property_tests {
1046 use super::*;
1047 use perfgate_types::{PairedSample, PairedSampleHalf};
1048 use proptest::prelude::*;
1049
1050 fn finite_f64_strategy() -> impl Strategy<Value = f64> {
1051 -1e100f64..1e100f64
1052 }
1053
1054 fn make_paired_samples(baseline: &[u64], current: &[u64]) -> Vec<PairedSample> {
1055 baseline
1056 .iter()
1057 .zip(current.iter())
1058 .enumerate()
1059 .map(|(i, (&b, &c))| PairedSample {
1060 pair_index: i as u32,
1061 warmup: false,
1062 baseline: PairedSampleHalf {
1063 wall_ms: b,
1064 exit_code: 0,
1065 timed_out: false,
1066 max_rss_kb: None,
1067 stdout: None,
1068 stderr: None,
1069 },
1070 current: PairedSampleHalf {
1071 wall_ms: c,
1072 exit_code: 0,
1073 timed_out: false,
1074 max_rss_kb: None,
1075 stdout: None,
1076 stderr: None,
1077 },
1078 wall_diff_ms: c as i64 - b as i64,
1079 rss_diff_kb: None,
1080 })
1081 .collect()
1082 }
1083
1084 proptest! {
1085 #[test]
1086 fn prop_summarize_paired_diffs_count_matches(diffs in prop::collection::vec(finite_f64_strategy(), 1..100)) {
1087 let summary = summarize_paired_diffs(&diffs, None).unwrap();
1088 prop_assert_eq!(summary.count, diffs.len() as u32);
1089 }
1090
1091 #[test]
1092 fn prop_summarize_paired_diffs_mean_correct(diffs in prop::collection::vec(finite_f64_strategy(), 1..100)) {
1093 let summary = summarize_paired_diffs(&diffs, None).unwrap();
1094 let expected_mean: f64 = diffs.iter().sum::<f64>() / diffs.len() as f64;
1095 prop_assert!((summary.mean - expected_mean).abs() < 1e-10 || expected_mean.abs() < 1e-10);
1096 }
1097
1098 #[test]
1099 fn prop_summarize_paired_diffs_min_max_bounds(diffs in prop::collection::vec(finite_f64_strategy(), 1..100)) {
1100 let summary = summarize_paired_diffs(&diffs, None).unwrap();
1101 let expected_min = diffs.iter().cloned().fold(f64::INFINITY, f64::min);
1102 let expected_max = diffs.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
1103
1104 if expected_min.is_finite() {
1105 prop_assert!((summary.min - expected_min).abs() < 1e-10);
1106 }
1107 if expected_max.is_finite() {
1108 prop_assert!((summary.max - expected_max).abs() < 1e-10);
1109 }
1110 }
1111
1112 #[test]
1113 fn prop_summarize_paired_diffs_ordering(diffs in prop::collection::vec(finite_f64_strategy(), 1..100)) {
1114 let summary = summarize_paired_diffs(&diffs, None).unwrap();
1115 if summary.min.is_finite() && summary.median.is_finite() && summary.max.is_finite() {
1116 prop_assert!(summary.min <= summary.median);
1117 prop_assert!(summary.median <= summary.max);
1118 }
1119 }
1120
1121 #[test]
1122 fn prop_std_dev_non_negative(diffs in prop::collection::vec(finite_f64_strategy(), 1..100)) {
1123 let summary = summarize_paired_diffs(&diffs, None).unwrap();
1124 prop_assert!(summary.std_dev >= 0.0 || !summary.std_dev.is_finite());
1125 }
1126
1127 #[test]
1128 fn prop_ci_contains_mean(
1129 mean in -1000.0f64..1000.0,
1130 std_dev in 0.1f64..100.0,
1131 count in 2u32..100
1132 ) {
1133 let stats = PairedStats {
1134 baseline_wall_ms: perfgate_types::U64Summary::new(100, 100, 100 ),
1135 current_wall_ms: perfgate_types::U64Summary::new(100, 100, 100 ),
1136 wall_diff_ms: PairedDiffSummary {
1137 mean,
1138 median: mean,
1139 std_dev,
1140 min: mean - std_dev,
1141 max: mean + std_dev,
1142 count,
1143 significance: None,
1144 },
1145 baseline_max_rss_kb: None,
1146 current_max_rss_kb: None,
1147 rss_diff_kb: None,
1148 baseline_throughput_per_s: None,
1149 current_throughput_per_s: None,
1150 throughput_diff_per_s: None,
1151 };
1152
1153 let comparison = compare_paired_stats(&stats);
1154
1155 prop_assert!(comparison.ci_95_lower <= mean);
1156 prop_assert!(comparison.ci_95_upper >= mean);
1157 }
1158
1159 #[test]
1160 fn prop_ci_width_decreases_with_sample_size(
1161 mean in 0.0f64..100.0,
1162 std_dev in 1.0f64..10.0,
1163 ) {
1164 let make_comparison = |count: u32| {
1165 let stats = PairedStats {
1166 baseline_wall_ms: perfgate_types::U64Summary::new(100, 100, 100 ),
1167 current_wall_ms: perfgate_types::U64Summary::new(100, 100, 100 ),
1168 wall_diff_ms: PairedDiffSummary {
1169 mean,
1170 median: mean,
1171 std_dev,
1172 min: mean - std_dev,
1173 max: mean + std_dev,
1174 count,
1175 significance: None,
1176 },
1177 baseline_max_rss_kb: None,
1178 current_max_rss_kb: None,
1179 rss_diff_kb: None,
1180 baseline_throughput_per_s: None,
1181 current_throughput_per_s: None,
1182 throughput_diff_per_s: None,
1183 };
1184 compare_paired_stats(&stats)
1185 };
1186
1187 let small = make_comparison(10);
1188 let large = make_comparison(100);
1189
1190 let width_small = small.ci_95_upper - small.ci_95_lower;
1191 let width_large = large.ci_95_upper - large.ci_95_lower;
1192
1193 prop_assert!(width_large < width_small, "CI should narrow with more samples");
1194 }
1195
1196 #[test]
1197 fn prop_zero_variance_zero_std_error(
1198 mean in -1000.0f64..1000.0,
1199 count in 2u32..100
1200 ) {
1201 let stats = PairedStats {
1202 baseline_wall_ms: perfgate_types::U64Summary::new(100, 100, 100 ),
1203 current_wall_ms: perfgate_types::U64Summary::new(100, 100, 100 ),
1204 wall_diff_ms: PairedDiffSummary {
1205 mean,
1206 median: mean,
1207 std_dev: 0.0,
1208 min: mean,
1209 max: mean,
1210 count,
1211 significance: None,
1212 },
1213 baseline_max_rss_kb: None,
1214 current_max_rss_kb: None,
1215 rss_diff_kb: None,
1216 baseline_throughput_per_s: None,
1217 current_throughput_per_s: None,
1218 throughput_diff_per_s: None,
1219 };
1220
1221 let comparison = compare_paired_stats(&stats);
1222
1223 prop_assert_eq!(comparison.std_error, 0.0);
1224 prop_assert_eq!(comparison.ci_95_lower, mean);
1225 prop_assert_eq!(comparison.ci_95_upper, mean);
1226 }
1227
1228 #[test]
1229 fn prop_significance_deterministic(
1230 mean in -100.0f64..100.0,
1231 std_dev in 0.0f64..50.0,
1232 count in 1u32..50
1233 ) {
1234 let stats = PairedStats {
1235 baseline_wall_ms: perfgate_types::U64Summary::new(100, 100, 100 ),
1236 current_wall_ms: perfgate_types::U64Summary::new(100, 100, 100 ),
1237 wall_diff_ms: PairedDiffSummary {
1238 mean,
1239 median: mean,
1240 std_dev,
1241 min: mean - std_dev,
1242 max: mean + std_dev,
1243 count,
1244 significance: None,
1245 },
1246 baseline_max_rss_kb: None,
1247 current_max_rss_kb: None,
1248 rss_diff_kb: None,
1249 baseline_throughput_per_s: None,
1250 current_throughput_per_s: None,
1251 throughput_diff_per_s: None,
1252 };
1253
1254 let comparison = compare_paired_stats(&stats);
1255
1256 let is_significant = comparison.ci_95_lower > 0.0 || comparison.ci_95_upper < 0.0;
1257 prop_assert_eq!(comparison.is_significant, is_significant);
1258 }
1259
1260 #[test]
1261 fn prop_paired_stats_deterministic(
1262 baseline in prop::collection::vec(1u64..10000u64, 5..50),
1263 current in prop::collection::vec(1u64..10000u64, 5..50),
1264 ) {
1265 let len = baseline.len().min(current.len());
1266 let samples = make_paired_samples(&baseline[..len], ¤t[..len]);
1267 let r1 = compute_paired_stats(&samples, None, None);
1268 let r2 = compute_paired_stats(&samples, None, None);
1269 match (r1, r2) {
1270 (Ok(s1), Ok(s2)) => {
1271 prop_assert_eq!(s1.wall_diff_ms.mean, s2.wall_diff_ms.mean);
1272 prop_assert_eq!(s1.wall_diff_ms.median, s2.wall_diff_ms.median);
1273 prop_assert_eq!(s1.wall_diff_ms.std_dev, s2.wall_diff_ms.std_dev);
1274 prop_assert_eq!(s1.wall_diff_ms.count, s2.wall_diff_ms.count);
1275 }
1276 (Err(_), Err(_)) => {}
1277 _ => prop_assert!(false, "both calls must produce the same result"),
1278 }
1279 }
1280
1281 #[test]
1282 fn prop_ci_contains_mean_diff_from_samples(
1283 baseline in prop::collection::vec(1u64..10000u64, 5..50),
1284 current in prop::collection::vec(1u64..10000u64, 5..50),
1285 ) {
1286 let len = baseline.len().min(current.len());
1287 let samples = make_paired_samples(&baseline[..len], ¤t[..len]);
1288 if let Ok(stats) = compute_paired_stats(&samples, None, None) {
1289 let cmp = compare_paired_stats(&stats);
1290 prop_assert!(
1291 cmp.ci_95_lower <= cmp.mean_diff_ms,
1292 "CI lower {} must be <= mean {}",
1293 cmp.ci_95_lower, cmp.mean_diff_ms
1294 );
1295 prop_assert!(
1296 cmp.ci_95_upper >= cmp.mean_diff_ms,
1297 "CI upper {} must be >= mean {}",
1298 cmp.ci_95_upper, cmp.mean_diff_ms
1299 );
1300 }
1301 }
1302
1303 #[test]
1304 fn prop_reversing_negates_mean_diff(
1305 baseline in prop::collection::vec(1u64..10000u64, 5..50),
1306 current in prop::collection::vec(1u64..10000u64, 5..50),
1307 ) {
1308 let len = baseline.len().min(current.len());
1309 let fwd = make_paired_samples(&baseline[..len], ¤t[..len]);
1310 let rev = make_paired_samples(¤t[..len], &baseline[..len]);
1311 if let (Ok(fwd_stats), Ok(rev_stats)) =
1312 (compute_paired_stats(&fwd, None, None), compute_paired_stats(&rev, None, None))
1313 {
1314 let fwd_cmp = compare_paired_stats(&fwd_stats);
1315 let rev_cmp = compare_paired_stats(&rev_stats);
1316 prop_assert!(
1317 (fwd_cmp.mean_diff_ms + rev_cmp.mean_diff_ms).abs() < 1e-10,
1318 "reversing must negate mean diff: {} vs {}",
1319 fwd_cmp.mean_diff_ms, rev_cmp.mean_diff_ms
1320 );
1321 }
1322 }
1323
1324 #[test]
1326 fn prop_full_pipeline_determinism(
1327 baseline in prop::collection::vec(1u64..10000u64, 5..50),
1328 current in prop::collection::vec(1u64..10000u64, 5..50),
1329 ) {
1330 let len = baseline.len().min(current.len());
1331 let samples = make_paired_samples(&baseline[..len], ¤t[..len]);
1332 let stats1 = compute_paired_stats(&samples, None, None).unwrap();
1333 let stats2 = compute_paired_stats(&samples, None, None).unwrap();
1334 let cmp1 = compare_paired_stats(&stats1);
1335 let cmp2 = compare_paired_stats(&stats2);
1336 prop_assert_eq!(cmp1, cmp2, "identical inputs must produce identical comparisons");
1337 }
1338
1339 #[test]
1341 fn prop_sample_count_preserved(
1342 baseline in prop::collection::vec(1u64..10000u64, 2..50),
1343 current in prop::collection::vec(1u64..10000u64, 2..50),
1344 ) {
1345 let len = baseline.len().min(current.len());
1346 let samples = make_paired_samples(&baseline[..len], ¤t[..len]);
1347 let non_warmup = samples.iter().filter(|s| !s.warmup).count() as u32;
1348 let stats = compute_paired_stats(&samples, None, None).unwrap();
1349 prop_assert_eq!(
1350 stats.wall_diff_ms.count, non_warmup,
1351 "output count {} must equal non-warmup input count {}",
1352 stats.wall_diff_ms.count, non_warmup
1353 );
1354 }
1355
1356 #[test]
1358 fn prop_diff_symmetry_direction_flips(
1359 baseline in prop::collection::vec(1u64..10000u64, 5..50),
1360 current in prop::collection::vec(1u64..10000u64, 5..50),
1361 ) {
1362 let len = baseline.len().min(current.len());
1363 let fwd = make_paired_samples(&baseline[..len], ¤t[..len]);
1364 let rev = make_paired_samples(¤t[..len], &baseline[..len]);
1365 if let (Ok(fwd_stats), Ok(rev_stats)) =
1366 (compute_paired_stats(&fwd, None, None), compute_paired_stats(&rev, None, None))
1367 {
1368 let fwd_cmp = compare_paired_stats(&fwd_stats);
1369 let rev_cmp = compare_paired_stats(&rev_stats);
1370 if fwd_cmp.mean_diff_ms > 0.0 {
1372 prop_assert!(
1373 rev_cmp.mean_diff_ms < 0.0,
1374 "if forward is regression ({:.2}), reverse must be improvement ({:.2})",
1375 fwd_cmp.mean_diff_ms, rev_cmp.mean_diff_ms
1376 );
1377 } else if fwd_cmp.mean_diff_ms < 0.0 {
1378 prop_assert!(
1379 rev_cmp.mean_diff_ms > 0.0,
1380 "if forward is improvement ({:.2}), reverse must be regression ({:.2})",
1381 fwd_cmp.mean_diff_ms, rev_cmp.mean_diff_ms
1382 );
1383 }
1384 prop_assert!(
1386 (fwd_cmp.median_diff_ms + rev_cmp.median_diff_ms).abs() < 1e-10,
1387 "median diff must negate: {:.2} vs {:.2}",
1388 fwd_cmp.median_diff_ms, rev_cmp.median_diff_ms
1389 );
1390 }
1391 }
1392
1393 #[test]
1395 fn prop_mean_bounded_by_min_max(
1396 baseline in prop::collection::vec(1u64..10000u64, 2..50),
1397 current in prop::collection::vec(1u64..10000u64, 2..50),
1398 ) {
1399 let len = baseline.len().min(current.len());
1400 let samples = make_paired_samples(&baseline[..len], ¤t[..len]);
1401 if let Ok(stats) = compute_paired_stats(&samples, None, None) {
1402 let diff = &stats.wall_diff_ms;
1403 prop_assert!(
1404 diff.mean >= diff.min,
1405 "mean {:.2} must be >= min {:.2}",
1406 diff.mean, diff.min
1407 );
1408 prop_assert!(
1409 diff.mean <= diff.max,
1410 "mean {:.2} must be <= max {:.2}",
1411 diff.mean, diff.max
1412 );
1413 }
1414 }
1415 }
1416}