Skip to main content

perfgate_app/
paired.rs

1//! Paired benchmark execution for perfgate.
2
3use perfgate_adapters::{CommandSpec, HostProbe, HostProbeOptions, ProcessRunner};
4use perfgate_domain::{compute_paired_cv, compute_paired_stats};
5use perfgate_types::{
6    NoiseDiagnostics, NoiseLevel, PAIRED_SCHEMA_V1, PairedBenchMeta, PairedRunReceipt,
7    PairedSample, PairedSampleHalf, RunMeta, SignificancePolicy, ToolInfo,
8};
9use std::path::PathBuf;
10use std::time::Duration;
11
12use crate::Clock;
13
/// Configuration for a paired benchmark run: a baseline command and a current
/// command executed in alternating pairs so that per-pair differences cancel
/// out slow drift in machine load.
#[derive(Debug, Clone)]
pub struct PairedRunRequest {
    /// Benchmark name; also used to label the spawned baseline/current commands.
    pub name: String,
    /// Working directory for both commands (inherited from caller when `None`).
    pub cwd: Option<PathBuf>,
    /// argv for the baseline half of each pair.
    pub baseline_command: Vec<String>,
    /// argv for the current half of each pair.
    pub current_command: Vec<String>,
    /// Number of measured pairs to collect before any significance retries.
    pub repeat: u32,
    /// Number of warmup pairs; recorded in samples but never counted as failures.
    pub warmup: u32,
    /// Optional work-unit count forwarded to the stats computation.
    pub work_units: Option<u64>,
    /// Per-command timeout; stored in the receipt as milliseconds.
    pub timeout: Option<Duration>,
    /// Extra environment variables for both commands.
    pub env: Vec<(String, String)>,
    /// Cap on captured stdout/stderr bytes per command.
    pub output_cap_bytes: usize,
    /// When true, nonzero exit codes are not recorded as failure reasons.
    pub allow_nonzero: bool,
    /// When true, the host probe includes a hashed hostname in the receipt.
    pub include_hostname_hash: bool,
    /// Significance level forwarded to the stats computation (e.g. 0.05).
    pub significance_alpha: Option<f64>,
    /// Minimum sample count before significance is evaluated.
    pub significance_min_samples: Option<u32>,
    /// When true, keep collecting extra pairs (up to `max_retries` rounds)
    /// until the wall-time difference is statistically significant.
    pub require_significance: bool,
    /// Maximum number of adaptive retry rounds; also gates noise diagnostics.
    pub max_retries: u32,
    /// Fail the run when the wall-time regression (percent) exceeds this and
    /// the difference is significant.
    pub fail_on_regression: Option<f64>,
    /// CV threshold for early termination. If the coefficient of variation of
    /// the wall-time differences exceeds this value, retries are aborted because
    /// the benchmark is too noisy for significance to be achievable.
    pub cv_threshold: Option<f64>,
}
38
/// Result of a paired benchmark run: the full receipt plus pass/fail evaluation.
#[derive(Debug, Clone)]
pub struct PairedRunOutcome {
    /// Complete run receipt (metadata, samples, stats, noise diagnostics).
    pub receipt: PairedRunReceipt,
    /// True exactly when `reasons` is non-empty.
    pub failed: bool,
    /// Human-readable failure reasons: timeouts, nonzero exits, regression
    /// threshold breaches, CV early termination.
    pub reasons: Vec<String>,
}
45
/// Use case orchestrating paired benchmark execution. Generic over the process
/// runner, host probe and clock so tests can inject deterministic fakes.
pub struct PairedRunUseCase<R: ProcessRunner, H: HostProbe, C: Clock> {
    /// Executes the baseline/current commands.
    runner: R,
    /// Probes host information recorded in the receipt.
    host_probe: H,
    /// Supplies RFC 3339 timestamps for run start/end.
    clock: C,
    /// Tool identity stamped into every receipt.
    tool: ToolInfo,
}
52
53impl<R: ProcessRunner, H: HostProbe, C: Clock> PairedRunUseCase<R, H, C> {
54    pub fn new(runner: R, host_probe: H, clock: C, tool: ToolInfo) -> Self {
55        Self {
56            runner,
57            host_probe,
58            clock,
59            tool,
60        }
61    }
62
63    pub fn execute(&self, req: PairedRunRequest) -> anyhow::Result<PairedRunOutcome> {
64        let run_id = uuid::Uuid::new_v4().to_string();
65        let started_at = self.clock.now_rfc3339();
66        let host = self.host_probe.probe(&HostProbeOptions {
67            include_hostname_hash: req.include_hostname_hash,
68        });
69
70        let mut bench = PairedBenchMeta {
71            name: req.name.clone(),
72            cwd: req.cwd.as_ref().map(|p| p.to_string_lossy().to_string()),
73            baseline_command: req.baseline_command.clone(),
74            current_command: req.current_command.clone(),
75            repeat: req.repeat,
76            warmup: req.warmup,
77            work_units: req.work_units,
78            timeout_ms: req.timeout.map(|d| d.as_millis() as u64),
79        };
80
81        let mut samples = Vec::new();
82        let mut reasons = Vec::new();
83
84        // Run warmups first
85        for i in 0..req.warmup {
86            self.run_pair(i, true, &req, &mut samples, &mut reasons)?;
87        }
88
89        // Initial measurement run
90        let mut pairs_collected = 0;
91        for _ in 0..req.repeat {
92            self.run_pair(
93                req.warmup + pairs_collected,
94                false,
95                &req,
96                &mut samples,
97                &mut reasons,
98            )?;
99            pairs_collected += 1;
100        }
101
102        let significance_policy = SignificancePolicy {
103            alpha: req.significance_alpha,
104            min_samples: req.significance_min_samples,
105        };
106
107        // Retry logic for significance with adaptive sample sizing and CV-based early termination
108        let mut retries_done: u32 = 0;
109        let mut early_termination = false;
110        loop {
111            let stats = compute_paired_stats(&samples, req.work_units, Some(&significance_policy))?;
112            let significance_reached = stats
113                .wall_diff_ms
114                .significance
115                .as_ref()
116                .map(|s| s.significant)
117                .unwrap_or(true);
118
119            if !req.require_significance || significance_reached || retries_done >= req.max_retries
120            {
121                break;
122            }
123
124            // Check CV threshold for early termination
125            if let Some(cv_thresh) = req.cv_threshold {
126                let cv = compute_paired_cv(&samples);
127                if cv > cv_thresh {
128                    early_termination = true;
129                    reasons.push(format!(
130                        "early termination: CV {:.3} exceeds threshold {:.3}, benchmark too noisy for retries",
131                        cv, cv_thresh
132                    ));
133                    break;
134                }
135            }
136
137            // Adaptive sample sizing: each retry collects more pairs (1.5x growth)
138            // Retry 1: 1 pair, Retry 2: 2 pairs, Retry 3: 3 pairs, ...
139            let extra_pairs = ((retries_done as f64 + 1.0) * 1.5).ceil() as u32;
140            retries_done += 1;
141
142            for _ in 0..extra_pairs {
143                self.run_pair(
144                    req.warmup + pairs_collected,
145                    false,
146                    &req,
147                    &mut samples,
148                    &mut reasons,
149                )?;
150                pairs_collected += 1;
151            }
152        }
153
154        // Update bench metadata if we collected more samples than originally requested
155        bench.repeat = pairs_collected;
156
157        let stats = compute_paired_stats(&samples, req.work_units, Some(&significance_policy))?;
158        let ended_at = self.clock.now_rfc3339();
159
160        // Build noise diagnostics when retries were configured
161        let noise_diagnostics = if req.max_retries > 0 {
162            let cv = compute_paired_cv(&samples);
163            Some(NoiseDiagnostics {
164                cv,
165                noise_level: NoiseLevel::from_cv(cv),
166                retries_used: retries_done,
167                early_termination,
168            })
169        } else {
170            None
171        };
172
173        let receipt = PairedRunReceipt {
174            schema: PAIRED_SCHEMA_V1.to_string(),
175            tool: self.tool.clone(),
176            run: RunMeta {
177                id: run_id,
178                started_at,
179                ended_at,
180                host,
181            },
182            bench,
183            samples,
184            stats,
185            noise_diagnostics,
186        };
187
188        if let Some(threshold_pct) = req.fail_on_regression {
189            let comparison = perfgate_domain::compare_paired_stats(&receipt.stats);
190            let threshold_fraction = threshold_pct / 100.0;
191            if comparison.pct_change > threshold_fraction && comparison.is_significant {
192                reasons.push(format!(
193                    "wall time regression ({:.2}%) exceeded threshold ({:.2}%)",
194                    comparison.pct_change * 100.0,
195                    threshold_pct
196                ));
197            }
198        }
199
200        let failed = !reasons.is_empty();
201        Ok(PairedRunOutcome {
202            receipt,
203            failed,
204            reasons,
205        })
206    }
207
208    fn run_pair(
209        &self,
210        pair_index: u32,
211        is_warmup: bool,
212        req: &PairedRunRequest,
213        samples: &mut Vec<PairedSample>,
214        reasons: &mut Vec<String>,
215    ) -> anyhow::Result<()> {
216        let baseline_spec = CommandSpec {
217            name: format!("{}-baseline", req.name),
218            argv: req.baseline_command.clone(),
219            cwd: req.cwd.clone(),
220            env: req.env.clone(),
221            timeout: req.timeout,
222            output_cap_bytes: req.output_cap_bytes,
223        };
224        let baseline_run = self.runner.run(&baseline_spec).map_err(|e| match e {
225            perfgate_adapters::AdapterError::RunCommand { command, reason } => {
226                anyhow::anyhow!(
227                    "failed to run baseline pair {}: {}: {}",
228                    pair_index + 1,
229                    command,
230                    reason
231                )
232            }
233            _ => anyhow::anyhow!("failed to run baseline pair {}: {}", pair_index + 1, e),
234        })?;
235
236        let current_spec = CommandSpec {
237            name: format!("{}-current", req.name),
238            argv: req.current_command.clone(),
239            cwd: req.cwd.clone(),
240            env: req.env.clone(),
241            timeout: req.timeout,
242            output_cap_bytes: req.output_cap_bytes,
243        };
244        let current_run = self.runner.run(&current_spec).map_err(|e| match e {
245            perfgate_adapters::AdapterError::RunCommand { command, reason } => {
246                anyhow::anyhow!(
247                    "failed to run current pair {}: {}: {}",
248                    pair_index + 1,
249                    command,
250                    reason
251                )
252            }
253            _ => anyhow::anyhow!("failed to run current pair {}: {}", pair_index + 1, e),
254        })?;
255
256        let baseline = sample_half(&baseline_run);
257        let current = sample_half(&current_run);
258
259        let wall_diff_ms = current.wall_ms as i64 - baseline.wall_ms as i64;
260        let rss_diff_kb = match (baseline.max_rss_kb, current.max_rss_kb) {
261            (Some(b), Some(c)) => Some(c as i64 - b as i64),
262            _ => None,
263        };
264
265        if !is_warmup {
266            if baseline.timed_out {
267                reasons.push(format!("pair {} baseline timed out", pair_index + 1));
268            }
269            if baseline.exit_code != 0 && !req.allow_nonzero {
270                reasons.push(format!(
271                    "pair {} baseline exit {}",
272                    pair_index + 1,
273                    baseline.exit_code
274                ));
275            }
276            if current.timed_out {
277                reasons.push(format!("pair {} current timed out", pair_index + 1));
278            }
279            if current.exit_code != 0 && !req.allow_nonzero {
280                reasons.push(format!(
281                    "pair {} current exit {}",
282                    pair_index + 1,
283                    current.exit_code
284                ));
285            }
286        }
287
288        samples.push(PairedSample {
289            pair_index,
290            warmup: is_warmup,
291            baseline,
292            current,
293            wall_diff_ms,
294            rss_diff_kb,
295        });
296
297        Ok(())
298    }
299}
300
301fn sample_half(run: &perfgate_adapters::RunResult) -> PairedSampleHalf {
302    PairedSampleHalf {
303        wall_ms: run.wall_ms,
304        exit_code: run.exit_code,
305        timed_out: run.timed_out,
306        max_rss_kb: run.max_rss_kb,
307        stdout: if run.stdout.is_empty() {
308            None
309        } else {
310            Some(String::from_utf8_lossy(&run.stdout).to_string())
311        },
312        stderr: if run.stderr.is_empty() {
313            None
314        } else {
315            Some(String::from_utf8_lossy(&run.stderr).to_string())
316        },
317    }
318}
319
#[cfg(test)]
mod tests {
    use super::*;
    use perfgate_adapters::{AdapterError, RunResult};
    use perfgate_types::HostInfo;
    use std::sync::{Arc, Mutex};

    /// Fake runner that pops queued results in FIFO order; errors when empty.
    #[derive(Clone)]
    struct TestRunner {
        runs: Arc<Mutex<Vec<RunResult>>>,
    }

    impl TestRunner {
        fn new(runs: Vec<RunResult>) -> Self {
            Self {
                runs: Arc::new(Mutex::new(runs)),
            }
        }
    }

    impl ProcessRunner for TestRunner {
        fn run(&self, _spec: &CommandSpec) -> Result<RunResult, AdapterError> {
            let mut runs = self.runs.lock().expect("lock runs");
            if runs.is_empty() {
                return Err(AdapterError::Other("no more queued runs".to_string()));
            }
            Ok(runs.remove(0))
        }
    }

    /// Fake probe returning a fixed host and recording the options it saw.
    #[derive(Clone)]
    struct TestHostProbe {
        host: HostInfo,
        seen_include_hash: Arc<Mutex<Vec<bool>>>,
    }

    impl TestHostProbe {
        fn new(host: HostInfo) -> Self {
            Self {
                host,
                seen_include_hash: Arc::new(Mutex::new(Vec::new())),
            }
        }
    }

    impl HostProbe for TestHostProbe {
        fn probe(&self, options: &HostProbeOptions) -> HostInfo {
            self.seen_include_hash
                .lock()
                .expect("lock options")
                .push(options.include_hostname_hash);
            self.host.clone()
        }
    }

    /// Fake clock that always returns the same timestamp.
    #[derive(Clone)]
    struct TestClock {
        now: String,
    }

    impl TestClock {
        fn new(now: &str) -> Self {
            Self {
                now: now.to_string(),
            }
        }
    }

    impl Clock for TestClock {
        fn now_rfc3339(&self) -> String {
            self.now.clone()
        }
    }

    /// Builds a RunResult with the fields these tests care about; the rest None.
    fn run_result(
        wall_ms: u64,
        exit_code: i32,
        timed_out: bool,
        max_rss_kb: Option<u64>,
        stdout: &[u8],
        stderr: &[u8],
    ) -> RunResult {
        RunResult {
            wall_ms,
            exit_code,
            timed_out,
            cpu_ms: None,
            page_faults: None,
            ctx_switches: None,
            max_rss_kb,
            io_read_bytes: None,
            io_write_bytes: None,
            network_packets: None,
            energy_uj: None,
            binary_bytes: None,
            stdout: stdout.to_vec(),
            stderr: stderr.to_vec(),
        }
    }

    #[test]
    fn sample_half_maps_optional_output() {
        let run = run_result(10, 0, false, None, b"hello", b"");
        let sample = sample_half(&run);
        assert_eq!(sample.stdout.as_deref(), Some("hello"));
        assert!(sample.stderr.is_none());

        let run2 = run_result(10, 0, false, None, b"", b"err");
        let sample2 = sample_half(&run2);
        assert!(sample2.stdout.is_none());
        assert_eq!(sample2.stderr.as_deref(), Some("err"));
    }

    #[test]
    fn paired_run_collects_samples_and_reasons() {
        let runs = vec![
            // warmup baseline/current (current exits nonzero, should be ignored)
            run_result(100, 0, false, None, b"", b""),
            run_result(90, 1, false, None, b"", b""),
            // measured baseline/current (baseline times out + nonzero)
            run_result(110, 2, true, Some(2000), b"out", b""),
            run_result(105, 0, false, Some(2500), b"", b""),
        ];

        let runner = TestRunner::new(runs);
        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host.clone());
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe.clone(),
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let outcome = usecase
            .execute(PairedRunRequest {
                name: "bench".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 1,
                warmup: 1,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: true,
                significance_alpha: None,
                significance_min_samples: None,
                require_significance: false,
                max_retries: 0,
                fail_on_regression: None,
                cv_threshold: None,
            })
            .expect("paired run should succeed");

        assert_eq!(outcome.receipt.samples.len(), 2);
        assert!(outcome.receipt.samples[0].warmup);
        assert!(!outcome.receipt.samples[1].warmup);
        assert_eq!(outcome.receipt.samples[0].pair_index, 0);
        assert_eq!(outcome.receipt.samples[1].pair_index, 1);

        let measured = &outcome.receipt.samples[1];
        assert_eq!(measured.rss_diff_kb, Some(500));

        assert!(outcome.failed);
        assert!(
            outcome
                .reasons
                .iter()
                .any(|r| r.contains("baseline timed out")),
            "expected baseline timeout reason"
        );
        assert!(
            outcome.reasons.iter().any(|r| r.contains("baseline exit")),
            "expected baseline exit reason"
        );
        assert!(
            !outcome
                .reasons
                .iter()
                .any(|r| r.contains("pair 1 current exit")),
            "warmup errors should not be recorded"
        );

        let seen = host_probe.seen_include_hash.lock().expect("lock seen");
        assert_eq!(seen.as_slice(), &[true]);
        assert_eq!(outcome.receipt.run.host, host);
    }

    #[test]
    fn paired_run_no_warmup_all_samples_measured() {
        // 0 warmups, 2 measured → samples has 2 entries, none warmup, no failures
        let runs = vec![
            run_result(100, 0, false, None, b"", b""),
            run_result(90, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            run_result(95, 0, false, None, b"", b""),
        ];

        let runner = TestRunner::new(runs);
        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host);
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe,
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let outcome = usecase
            .execute(PairedRunRequest {
                name: "measured-only".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 2,
                warmup: 0,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: false,
                significance_alpha: None,
                significance_min_samples: None,
                require_significance: false,
                max_retries: 0,
                fail_on_regression: None,
                cv_threshold: None,
            })
            .expect("paired run should succeed");

        assert_eq!(outcome.receipt.samples.len(), 2);
        assert!(!outcome.failed);
        assert!(outcome.reasons.is_empty());
    }

    #[test]
    fn paired_run_runner_error_propagates() {
        // Runner that immediately fails
        let runner = TestRunner::new(vec![]);

        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host);
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe,
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let err = usecase
            .execute(PairedRunRequest {
                name: "fail-bench".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 1,
                warmup: 0,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: false,
                significance_alpha: None,
                significance_min_samples: None,
                require_significance: false,
                max_retries: 0,
                fail_on_regression: None,
                cv_threshold: None,
            })
            .unwrap_err();

        assert!(
            err.to_string().contains("no more queued runs")
                || err.to_string().contains("failed to run"),
            "expected runner error, got: {}",
            err
        );
    }

    #[test]
    fn paired_run_wall_diff_computed_correctly() {
        let runs = vec![
            // baseline: 200ms, current: 150ms → diff = -50
            run_result(200, 0, false, Some(1000), b"", b""),
            run_result(150, 0, false, Some(800), b"", b""),
        ];

        let runner = TestRunner::new(runs);
        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host);
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe,
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let outcome = usecase
            .execute(PairedRunRequest {
                name: "diff-bench".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 1,
                warmup: 0,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: false,
                significance_alpha: None,
                significance_min_samples: None,
                require_significance: false,
                max_retries: 0,
                fail_on_regression: None,
                cv_threshold: None,
            })
            .expect("paired run should succeed");

        assert_eq!(outcome.receipt.samples.len(), 1);
        let sample = &outcome.receipt.samples[0];
        assert_eq!(sample.wall_diff_ms, -50);
        assert_eq!(sample.rss_diff_kb, Some(-200));
        assert!(!outcome.failed);
    }

    #[test]
    fn paired_run_retries_until_significance() {
        // We want to simulate:
        // Initial run (2 pairs): not significant
        // Adaptive retries collect increasing pairs until significance

        // Wall diffs: initially ambiguous, later consistently positive.
        // Retry 1 collects ceil((0+1)*1.5) = 2 pairs, Retry 2 = 3 pairs, etc.
        // Provide enough runs for multiple retries.
        let runs = vec![
            // Pair 1 (diff 0)
            run_result(100, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            // Pair 2 (diff 10)
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            // Retry 1: 2 extra pairs (diff 10 each)
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            // Retry 2: 3 extra pairs (diff 10 each)
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            // Retry 3: 5 extra pairs (diff 10 each) - just in case
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
        ];

        let runner = TestRunner::new(runs);
        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host);
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe,
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let outcome = usecase
            .execute(PairedRunRequest {
                name: "retry-bench".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 2, // Initial 2 pairs
                warmup: 0,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: false,
                significance_alpha: Some(0.05),
                significance_min_samples: Some(2),
                require_significance: true,
                max_retries: 5, // Allow up to 5 retries
                fail_on_regression: None,
                cv_threshold: None,
            })
            .expect("paired run should succeed");

        // It should have at least 3 samples because it retried
        assert!(outcome.receipt.samples.len() > 2);
        assert_eq!(
            outcome.receipt.bench.repeat,
            outcome.receipt.samples.len() as u32
        );

        // Noise diagnostics should be present because max_retries > 0
        let diag = outcome
            .receipt
            .noise_diagnostics
            .expect("should have noise diagnostics");
        assert!(diag.retries_used > 0);
        assert!(!diag.early_termination);
    }

    #[test]
    fn paired_run_cv_threshold_early_termination() {
        // Simulate very noisy data: large variance in diffs
        // Pair 1: diff = -50, Pair 2: diff = +60 => high CV
        let runs = vec![
            // Pair 1
            run_result(200, 0, false, None, b"", b""),
            run_result(150, 0, false, None, b"", b""),
            // Pair 2
            run_result(100, 0, false, None, b"", b""),
            run_result(160, 0, false, None, b"", b""),
            // Extra pair should NOT be consumed because CV threshold triggers early exit
            // But we provide extras just in case the retry runs one more batch
            run_result(100, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
        ];

        let runner = TestRunner::new(runs);
        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host);
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe,
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let outcome = usecase
            .execute(PairedRunRequest {
                name: "noisy-bench".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 2,
                warmup: 0,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: false,
                significance_alpha: Some(0.05),
                significance_min_samples: Some(2),
                require_significance: true,
                max_retries: 5,
                fail_on_regression: None,
                cv_threshold: Some(0.5),
            })
            .expect("paired run should succeed");

        let diag = outcome
            .receipt
            .noise_diagnostics
            .expect("should have noise diagnostics");
        // Early termination should have kicked in
        assert!(diag.early_termination);
        assert!(diag.cv > 0.5);
        assert_eq!(diag.noise_level, perfgate_types::NoiseLevel::High);

        // Should have a reason about early termination
        assert!(
            outcome
                .reasons
                .iter()
                .any(|r| r.contains("early termination")),
            "expected early termination reason, got: {:?}",
            outcome.reasons
        );
    }

    #[test]
    fn paired_run_no_retries_no_noise_diagnostics() {
        let runs = vec![
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
        ];

        let runner = TestRunner::new(runs);
        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host);
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe,
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let outcome = usecase
            .execute(PairedRunRequest {
                name: "simple-bench".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 1,
                warmup: 0,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: false,
                significance_alpha: None,
                significance_min_samples: None,
                require_significance: false,
                max_retries: 0,
                fail_on_regression: None,
                cv_threshold: None,
            })
            .expect("paired run should succeed");

        assert!(
            outcome.receipt.noise_diagnostics.is_none(),
            "should not have noise diagnostics when max_retries=0"
        );
    }
}