// perfgate_app/paired.rs
1//! Paired benchmark execution for perfgate.
2
3use perfgate_adapters::{CommandSpec, HostProbe, HostProbeOptions, ProcessRunner};
4use perfgate_domain::compute_paired_stats;
5use perfgate_types::{
6    PAIRED_SCHEMA_V1, PairedBenchMeta, PairedRunReceipt, PairedSample, PairedSampleHalf, RunMeta,
7    SignificancePolicy, ToolInfo,
8};
9use std::path::PathBuf;
10use std::time::Duration;
11
12use crate::Clock;
13
/// Everything needed to execute one paired benchmark (baseline vs current).
#[derive(Debug, Clone)]
pub struct PairedRunRequest {
    /// Benchmark name; per-command spec names are derived as
    /// `"<name>-baseline"` / `"<name>-current"`.
    pub name: String,
    /// Working directory passed through to both command specs.
    pub cwd: Option<PathBuf>,
    /// argv of the baseline command.
    pub baseline_command: Vec<String>,
    /// argv of the current command.
    pub current_command: Vec<String>,
    /// Number of measured pairs to run initially.
    pub repeat: u32,
    /// Number of warmup pairs run before measurement; warmup failures are
    /// recorded in samples but never produce failure reasons.
    pub warmup: u32,
    /// Optional work-unit count forwarded to the paired-stats computation.
    pub work_units: Option<u64>,
    /// Per-command timeout, applied to both halves of every pair.
    pub timeout: Option<Duration>,
    /// Environment variables applied to both commands.
    pub env: Vec<(String, String)>,
    /// Cap on captured stdout/stderr bytes, forwarded to the command spec.
    pub output_cap_bytes: usize,
    /// When true, nonzero exit codes do not count as gate failures.
    pub allow_nonzero: bool,
    /// Forwarded to the host probe; controls hostname hashing.
    pub include_hostname_hash: bool,
    /// Significance level forwarded via `SignificancePolicy`, if any.
    pub significance_alpha: Option<f64>,
    /// Minimum sample count forwarded via `SignificancePolicy`, if any.
    pub significance_min_samples: Option<u32>,
    /// When true, keep collecting extra pairs until the wall-time diff is
    /// significant (bounded by `max_retries`).
    pub require_significance: bool,
    /// Maximum number of extra pairs to run while chasing significance.
    pub max_retries: u32,
    /// Regression gate threshold in percent; a statistically significant
    /// wall-time regression above this adds a failure reason.
    pub fail_on_regression: Option<f64>,
}
34
/// Result of a paired run: the full receipt plus gate evaluation.
#[derive(Debug, Clone)]
pub struct PairedRunOutcome {
    /// Complete run receipt (metadata, samples, stats).
    pub receipt: PairedRunReceipt,
    /// True when at least one failure reason was recorded.
    pub failed: bool,
    /// Human-readable gate-failure reasons (timeouts, nonzero exits,
    /// regression threshold breaches).
    pub reasons: Vec<String>,
}
41
/// Use case orchestrating paired benchmark execution. Generic over the
/// process runner, host probe, and clock adapters so tests can inject fakes.
pub struct PairedRunUseCase<R: ProcessRunner, H: HostProbe, C: Clock> {
    runner: R,
    host_probe: H,
    clock: C,
    tool: ToolInfo,
}
48
49impl<R: ProcessRunner, H: HostProbe, C: Clock> PairedRunUseCase<R, H, C> {
50    pub fn new(runner: R, host_probe: H, clock: C, tool: ToolInfo) -> Self {
51        Self {
52            runner,
53            host_probe,
54            clock,
55            tool,
56        }
57    }
58
59    pub fn execute(&self, req: PairedRunRequest) -> anyhow::Result<PairedRunOutcome> {
60        let run_id = uuid::Uuid::new_v4().to_string();
61        let started_at = self.clock.now_rfc3339();
62        let host = self.host_probe.probe(&HostProbeOptions {
63            include_hostname_hash: req.include_hostname_hash,
64        });
65
66        let mut bench = PairedBenchMeta {
67            name: req.name.clone(),
68            cwd: req.cwd.as_ref().map(|p| p.to_string_lossy().to_string()),
69            baseline_command: req.baseline_command.clone(),
70            current_command: req.current_command.clone(),
71            repeat: req.repeat,
72            warmup: req.warmup,
73            work_units: req.work_units,
74            timeout_ms: req.timeout.map(|d| d.as_millis() as u64),
75        };
76
77        let mut samples = Vec::new();
78        let mut reasons = Vec::new();
79
80        // Run warmups first
81        for i in 0..req.warmup {
82            self.run_pair(i, true, &req, &mut samples, &mut reasons)?;
83        }
84
85        // Initial measurement run
86        let mut pairs_collected = 0;
87        for _ in 0..req.repeat {
88            self.run_pair(
89                req.warmup + pairs_collected,
90                false,
91                &req,
92                &mut samples,
93                &mut reasons,
94            )?;
95            pairs_collected += 1;
96        }
97
98        let significance_policy = SignificancePolicy {
99            alpha: req.significance_alpha,
100            min_samples: req.significance_min_samples,
101        };
102
103        // Retry logic for significance
104        let mut retries_done = 0;
105        loop {
106            let stats = compute_paired_stats(&samples, req.work_units, Some(&significance_policy))?;
107            let significance_reached = stats
108                .wall_diff_ms
109                .significance
110                .as_ref()
111                .map(|s| s.significant)
112                .unwrap_or(true);
113
114            if !req.require_significance || significance_reached || retries_done >= req.max_retries
115            {
116                break;
117            }
118
119            // Not significant, and we have retries left - run one more pair
120            retries_done += 1;
121            self.run_pair(
122                req.warmup + pairs_collected,
123                false,
124                &req,
125                &mut samples,
126                &mut reasons,
127            )?;
128            pairs_collected += 1;
129        }
130
131        // Update bench metadata if we collected more samples than originally requested
132        bench.repeat = pairs_collected;
133
134        let stats = compute_paired_stats(&samples, req.work_units, Some(&significance_policy))?;
135        let ended_at = self.clock.now_rfc3339();
136
137        let receipt = PairedRunReceipt {
138            schema: PAIRED_SCHEMA_V1.to_string(),
139            tool: self.tool.clone(),
140            run: RunMeta {
141                id: run_id,
142                started_at,
143                ended_at,
144                host,
145            },
146            bench,
147            samples,
148            stats,
149        };
150
151        if let Some(threshold_pct) = req.fail_on_regression {
152            let comparison = perfgate_domain::compare_paired_stats(&receipt.stats);
153            let threshold_fraction = threshold_pct / 100.0;
154            if comparison.pct_change > threshold_fraction && comparison.is_significant {
155                reasons.push(format!(
156                    "wall time regression ({:.2}%) exceeded threshold ({:.2}%)",
157                    comparison.pct_change * 100.0,
158                    threshold_pct
159                ));
160            }
161        }
162
163        let failed = !reasons.is_empty();
164        Ok(PairedRunOutcome {
165            receipt,
166            failed,
167            reasons,
168        })
169    }
170
171    fn run_pair(
172        &self,
173        pair_index: u32,
174        is_warmup: bool,
175        req: &PairedRunRequest,
176        samples: &mut Vec<PairedSample>,
177        reasons: &mut Vec<String>,
178    ) -> anyhow::Result<()> {
179        let baseline_spec = CommandSpec {
180            name: format!("{}-baseline", req.name),
181            argv: req.baseline_command.clone(),
182            cwd: req.cwd.clone(),
183            env: req.env.clone(),
184            timeout: req.timeout,
185            output_cap_bytes: req.output_cap_bytes,
186        };
187        let baseline_run = self.runner.run(&baseline_spec).map_err(|e| match e {
188            perfgate_adapters::AdapterError::RunCommand { command, reason } => {
189                anyhow::anyhow!(
190                    "failed to run baseline pair {}: {}: {}",
191                    pair_index + 1,
192                    command,
193                    reason
194                )
195            }
196            _ => anyhow::anyhow!("failed to run baseline pair {}: {}", pair_index + 1, e),
197        })?;
198
199        let current_spec = CommandSpec {
200            name: format!("{}-current", req.name),
201            argv: req.current_command.clone(),
202            cwd: req.cwd.clone(),
203            env: req.env.clone(),
204            timeout: req.timeout,
205            output_cap_bytes: req.output_cap_bytes,
206        };
207        let current_run = self.runner.run(&current_spec).map_err(|e| match e {
208            perfgate_adapters::AdapterError::RunCommand { command, reason } => {
209                anyhow::anyhow!(
210                    "failed to run current pair {}: {}: {}",
211                    pair_index + 1,
212                    command,
213                    reason
214                )
215            }
216            _ => anyhow::anyhow!("failed to run current pair {}: {}", pair_index + 1, e),
217        })?;
218
219        let baseline = sample_half(&baseline_run);
220        let current = sample_half(&current_run);
221
222        let wall_diff_ms = current.wall_ms as i64 - baseline.wall_ms as i64;
223        let rss_diff_kb = match (baseline.max_rss_kb, current.max_rss_kb) {
224            (Some(b), Some(c)) => Some(c as i64 - b as i64),
225            _ => None,
226        };
227
228        if !is_warmup {
229            if baseline.timed_out {
230                reasons.push(format!("pair {} baseline timed out", pair_index + 1));
231            }
232            if baseline.exit_code != 0 && !req.allow_nonzero {
233                reasons.push(format!(
234                    "pair {} baseline exit {}",
235                    pair_index + 1,
236                    baseline.exit_code
237                ));
238            }
239            if current.timed_out {
240                reasons.push(format!("pair {} current timed out", pair_index + 1));
241            }
242            if current.exit_code != 0 && !req.allow_nonzero {
243                reasons.push(format!(
244                    "pair {} current exit {}",
245                    pair_index + 1,
246                    current.exit_code
247                ));
248            }
249        }
250
251        samples.push(PairedSample {
252            pair_index,
253            warmup: is_warmup,
254            baseline,
255            current,
256            wall_diff_ms,
257            rss_diff_kb,
258        });
259
260        Ok(())
261    }
262}
263
264fn sample_half(run: &perfgate_adapters::RunResult) -> PairedSampleHalf {
265    PairedSampleHalf {
266        wall_ms: run.wall_ms,
267        exit_code: run.exit_code,
268        timed_out: run.timed_out,
269        max_rss_kb: run.max_rss_kb,
270        stdout: if run.stdout.is_empty() {
271            None
272        } else {
273            Some(String::from_utf8_lossy(&run.stdout).to_string())
274        },
275        stderr: if run.stderr.is_empty() {
276            None
277        } else {
278            Some(String::from_utf8_lossy(&run.stderr).to_string())
279        },
280    }
281}
282
#[cfg(test)]
mod tests {
    use super::*;
    use perfgate_adapters::{AdapterError, RunResult};
    use perfgate_types::HostInfo;
    use std::sync::{Arc, Mutex};

    /// Runner fake that replays a queue of canned results in order.
    #[derive(Clone)]
    struct TestRunner {
        runs: Arc<Mutex<Vec<RunResult>>>,
    }

    impl TestRunner {
        fn new(runs: Vec<RunResult>) -> Self {
            Self {
                runs: Arc::new(Mutex::new(runs)),
            }
        }
    }

    impl ProcessRunner for TestRunner {
        fn run(&self, _spec: &CommandSpec) -> Result<RunResult, AdapterError> {
            let mut runs = self.runs.lock().expect("lock runs");
            if runs.is_empty() {
                return Err(AdapterError::Other("no more queued runs".to_string()));
            }
            Ok(runs.remove(0))
        }
    }

    /// Host probe fake that records the options it was called with.
    #[derive(Clone)]
    struct TestHostProbe {
        host: HostInfo,
        seen_include_hash: Arc<Mutex<Vec<bool>>>,
    }

    impl TestHostProbe {
        fn new(host: HostInfo) -> Self {
            Self {
                host,
                seen_include_hash: Arc::new(Mutex::new(Vec::new())),
            }
        }
    }

    impl HostProbe for TestHostProbe {
        fn probe(&self, options: &HostProbeOptions) -> HostInfo {
            self.seen_include_hash
                .lock()
                .expect("lock options")
                .push(options.include_hostname_hash);
            self.host.clone()
        }
    }

    /// Clock fake returning a fixed timestamp.
    #[derive(Clone)]
    struct TestClock {
        now: String,
    }

    impl TestClock {
        fn new(now: &str) -> Self {
            Self {
                now: now.to_string(),
            }
        }
    }

    impl Clock for TestClock {
        fn now_rfc3339(&self) -> String {
            self.now.clone()
        }
    }

    /// Builds a `RunResult` with only the fields these tests care about set.
    fn run_result(
        wall_ms: u64,
        exit_code: i32,
        timed_out: bool,
        max_rss_kb: Option<u64>,
        stdout: &[u8],
        stderr: &[u8],
    ) -> RunResult {
        RunResult {
            wall_ms,
            exit_code,
            timed_out,
            cpu_ms: None,
            page_faults: None,
            ctx_switches: None,
            max_rss_kb,
            io_read_bytes: None,
            io_write_bytes: None,
            network_packets: None,
            energy_uj: None,
            binary_bytes: None,
            stdout: stdout.to_vec(),
            stderr: stderr.to_vec(),
        }
    }

    #[test]
    fn sample_half_maps_optional_output() {
        let run = run_result(10, 0, false, None, b"hello", b"");
        let sample = sample_half(&run);
        assert_eq!(sample.stdout.as_deref(), Some("hello"));
        assert!(sample.stderr.is_none());

        let run2 = run_result(10, 0, false, None, b"", b"err");
        let sample2 = sample_half(&run2);
        assert!(sample2.stdout.is_none());
        assert_eq!(sample2.stderr.as_deref(), Some("err"));
    }

    #[test]
    fn paired_run_collects_samples_and_reasons() {
        let runs = vec![
            // warmup baseline/current (current exits nonzero, should be ignored)
            run_result(100, 0, false, None, b"", b""),
            run_result(90, 1, false, None, b"", b""),
            // measured baseline/current (baseline times out + nonzero)
            run_result(110, 2, true, Some(2000), b"out", b""),
            run_result(105, 0, false, Some(2500), b"", b""),
        ];

        let runner = TestRunner::new(runs);
        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host.clone());
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe.clone(),
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let outcome = usecase
            .execute(PairedRunRequest {
                name: "bench".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 1,
                warmup: 1,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: true,
                significance_alpha: None,
                significance_min_samples: None,
                require_significance: false,
                max_retries: 0,
                fail_on_regression: None,
            })
            .expect("paired run should succeed");

        assert_eq!(outcome.receipt.samples.len(), 2);
        assert!(outcome.receipt.samples[0].warmup);
        assert!(!outcome.receipt.samples[1].warmup);
        assert_eq!(outcome.receipt.samples[0].pair_index, 0);
        assert_eq!(outcome.receipt.samples[1].pair_index, 1);

        let measured = &outcome.receipt.samples[1];
        assert_eq!(measured.rss_diff_kb, Some(500));

        assert!(outcome.failed);
        assert!(
            outcome
                .reasons
                .iter()
                .any(|r| r.contains("baseline timed out")),
            "expected baseline timeout reason"
        );
        assert!(
            outcome.reasons.iter().any(|r| r.contains("baseline exit")),
            "expected baseline exit reason"
        );
        assert!(
            !outcome
                .reasons
                .iter()
                .any(|r| r.contains("pair 1 current exit")),
            "warmup errors should not be recorded"
        );

        let seen = host_probe.seen_include_hash.lock().expect("lock seen");
        assert_eq!(seen.as_slice(), &[true]);
        assert_eq!(outcome.receipt.run.host, host);
    }

    #[test]
    fn paired_run_measured_only_no_failures() {
        // 0 warmups, 2 measured pairs → samples has 2 entries, none of them
        // warmup, and clean runs record no failure reasons.
        // (This test previously claimed "all warmup" in its name/comment
        // while actually configuring warmup: 0, repeat: 2.)
        let runs = vec![
            run_result(100, 0, false, None, b"", b""),
            run_result(90, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            run_result(95, 0, false, None, b"", b""),
        ];

        let runner = TestRunner::new(runs);
        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host);
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe,
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let outcome = usecase
            .execute(PairedRunRequest {
                name: "warmup-only".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 2,
                warmup: 0,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: false,
                significance_alpha: None,
                significance_min_samples: None,
                require_significance: false,
                max_retries: 0,
                fail_on_regression: None,
            })
            .expect("paired run should succeed");

        assert_eq!(outcome.receipt.samples.len(), 2);
        assert!(outcome.receipt.samples.iter().all(|s| !s.warmup));
        assert!(!outcome.failed);
        assert!(outcome.reasons.is_empty());
    }

    #[test]
    fn paired_run_runner_error_propagates() {
        // Runner that immediately fails
        let runner = TestRunner::new(vec![]);

        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host);
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe,
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let err = usecase
            .execute(PairedRunRequest {
                name: "fail-bench".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 1,
                warmup: 0,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: false,
                significance_alpha: None,
                significance_min_samples: None,
                require_significance: false,
                max_retries: 0,
                fail_on_regression: None,
            })
            .unwrap_err();

        assert!(
            err.to_string().contains("no more queued runs")
                || err.to_string().contains("failed to run"),
            "expected runner error, got: {}",
            err
        );
    }

    #[test]
    fn paired_run_wall_diff_computed_correctly() {
        let runs = vec![
            // baseline: 200ms, current: 150ms → diff = -50
            run_result(200, 0, false, Some(1000), b"", b""),
            run_result(150, 0, false, Some(800), b"", b""),
        ];

        let runner = TestRunner::new(runs);
        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host);
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe,
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let outcome = usecase
            .execute(PairedRunRequest {
                name: "diff-bench".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 1,
                warmup: 0,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: false,
                significance_alpha: None,
                significance_min_samples: None,
                require_significance: false,
                max_retries: 0,
                fail_on_regression: None,
            })
            .expect("paired run should succeed");

        assert_eq!(outcome.receipt.samples.len(), 1);
        let sample = &outcome.receipt.samples[0];
        assert_eq!(sample.wall_diff_ms, -50);
        assert_eq!(sample.rss_diff_kb, Some(-200));
        assert!(!outcome.failed);
    }

    #[test]
    fn paired_run_retries_until_significance() {
        // The initial run collects 2 pairs whose wall diffs (0 and 10) are
        // noisy enough that the significance test cannot pass, forcing the
        // use case to retry and collect at least one more pair before it
        // either reaches significance or exhausts its retries.
        let runs = vec![
            // Pair 1 (diff 0)
            run_result(100, 0, false, None, b"", b""),
            run_result(100, 0, false, None, b"", b""),
            // Pair 2 (diff 10)
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
            // Pair 3 (diff 10) - should be collected because of retry
            run_result(100, 0, false, None, b"", b""),
            run_result(110, 0, false, None, b"", b""),
        ];

        let runner = TestRunner::new(runs);
        let host = HostInfo {
            os: "linux".to_string(),
            arch: "x86_64".to_string(),
            cpu_count: None,
            memory_bytes: None,
            hostname_hash: None,
        };
        let host_probe = TestHostProbe::new(host);
        let clock = TestClock::new("2024-01-01T00:00:00Z");

        let usecase = PairedRunUseCase::new(
            runner,
            host_probe,
            clock,
            ToolInfo {
                name: "perfgate".to_string(),
                version: "0.1.0".to_string(),
            },
        );

        let outcome = usecase
            .execute(PairedRunRequest {
                name: "retry-bench".to_string(),
                cwd: None,
                baseline_command: vec!["true".to_string()],
                current_command: vec!["true".to_string()],
                repeat: 2, // Initial 2 pairs
                warmup: 0,
                work_units: None,
                timeout: None,
                env: vec![],
                output_cap_bytes: 1024,
                allow_nonzero: false,
                include_hostname_hash: false,
                significance_alpha: Some(0.05),
                significance_min_samples: Some(2),
                require_significance: true,
                max_retries: 5, // Allow up to 5 retries
                fail_on_regression: None,
            })
            .expect("paired run should succeed");

        // It should have at least 3 samples because it retried, and the
        // receipt's repeat count must reflect the actual pairs collected.
        assert!(outcome.receipt.samples.len() > 2);
        assert_eq!(
            outcome.receipt.bench.repeat,
            outcome.receipt.samples.len() as u32
        );
    }
}