// ftui_harness/benchmark_gate.rs
1#![forbid(unsafe_code)]
2
3//! Benchmark gate enforcement with structured evidence.
4//!
5//! Loads baseline performance thresholds, compares measured values, and emits
6//! pass/fail evidence in JSONL format. This module provides the programmatic
7//! backbone for CI performance regression gating.
8//!
9//! # Design
10//!
11//! A [`BenchmarkGate`] is configured with a set of [`Threshold`]s (metric name,
12//! budget, tolerance). After collecting [`Measurement`]s, calling
13//! [`evaluate`](BenchmarkGate::evaluate) produces a [`GateResult`] with
14//! per-metric verdicts and an overall pass/fail.
15//!
16//! # Example
17//!
18//! ```ignore
19//! use ftui_harness::benchmark_gate::{BenchmarkGate, Measurement, Threshold};
20//!
21//! let gate = BenchmarkGate::new("render_perf")
22//!     .threshold(Threshold::new("frame_render_p99_us", 2000.0).tolerance_pct(10.0))
23//!     .threshold(Threshold::new("diff_compute_p99_us", 500.0).tolerance_pct(15.0));
24//!
25//! let measurements = vec![
26//!     Measurement::new("frame_render_p99_us", 1850.0),
27//!     Measurement::new("diff_compute_p99_us", 480.0),
28//! ];
29//!
30//! let result = gate.evaluate(&measurements);
31//! assert!(result.passed());
32//! ```
33//!
34//! # Baseline JSON Format
35//!
36//! Thresholds can be loaded from a JSON file matching the format used by
37//! `scripts/perf_regression_gate.sh`:
38//!
39//! ```json
40//! {
41//!   "frame_render_p99_us": { "budget": 2000.0, "tolerance_pct": 10.0 },
42//!   "diff_compute_p99_us": { "budget": 500.0 }
43//! }
44//! ```
45
46use std::collections::BTreeMap;
47
48use crate::determinism::{JsonValue, TestJsonlLogger};
49
50// ============================================================================
51// Threshold
52// ============================================================================
53
/// A single performance threshold for gating.
#[derive(Debug, Clone)]
pub struct Threshold {
    /// Metric name; compared measurements must use the same name.
    pub metric: String,
    /// Budget value — the upper bound for the metric.
    pub budget: f64,
    /// Tolerance as a percentage (0.0–100.0). A measurement may exceed
    /// `budget` by up to `budget * tolerance_pct / 100` and still pass.
    pub tolerance_pct: f64,
}

impl Threshold {
    /// Build a threshold whose budget is strict (zero tolerance allowance).
    pub fn new(metric: &str, budget: f64) -> Self {
        Self {
            tolerance_pct: 0.0,
            metric: metric.to_string(),
            budget,
        }
    }

    /// Builder-style setter for the tolerance percentage.
    #[must_use]
    pub fn tolerance_pct(mut self, pct: f64) -> Self {
        self.tolerance_pct = pct;
        self
    }

    /// Effective upper bound: `budget × (1 + tolerance_pct / 100)`.
    #[must_use]
    pub fn ceiling(&self) -> f64 {
        let slack_factor = 1.0 + self.tolerance_pct / 100.0;
        self.budget * slack_factor
    }
}
89
90// ============================================================================
91// Measurement
92// ============================================================================
93
/// A single performance measurement to check against a threshold.
#[derive(Debug, Clone)]
pub struct Measurement {
    /// Metric name; should match a [`Threshold`] metric to be gated.
    pub metric: String,
    /// The measured value.
    pub value: f64,
    /// Optional unit label used in evidence output (e.g., "μs", "bytes").
    pub unit: Option<String>,
}

impl Measurement {
    /// Record a measurement with no unit label attached.
    pub fn new(metric: &str, value: f64) -> Self {
        Self {
            unit: None,
            metric: metric.to_string(),
            value,
        }
    }

    /// Builder-style setter for the unit label.
    #[must_use]
    pub fn unit(mut self, unit: &str) -> Self {
        self.unit = Some(unit.to_string());
        self
    }
}
122
123// ============================================================================
124// MetricVerdict
125// ============================================================================
126
/// Verdict for a single metric check.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MetricVerdict {
    /// The measured value fits within budget plus tolerance.
    Pass,
    /// The measured value exceeds budget plus tolerance.
    Fail,
    /// No threshold was defined for this metric (informational only).
    Unchecked,
}

/// Detailed result for a single metric evaluation.
#[derive(Debug, Clone)]
pub struct MetricResult {
    /// Metric name.
    pub metric: String,
    /// The measured value.
    pub value: f64,
    /// Budget, when a threshold was defined for this metric.
    pub budget: Option<f64>,
    /// Effective ceiling (budget plus tolerance), when defined.
    pub ceiling: Option<f64>,
    /// Tolerance percentage that was applied, when defined.
    pub tolerance_pct: Option<f64>,
    /// Percentage by which the value exceeds the budget; negative values
    /// mean the measurement came in under budget.
    pub overshoot_pct: Option<f64>,
    /// Verdict for this metric.
    pub verdict: MetricVerdict,
    /// Unit label, when one was provided.
    pub unit: Option<String>,
}

// ============================================================================
// GateResult
// ============================================================================

/// Overall result of a benchmark gate evaluation.
#[derive(Debug, Clone)]
pub struct GateResult {
    /// Gate name.
    pub gate_name: String,
    /// Per-metric results, sorted by metric name.
    pub metrics: Vec<MetricResult>,
    /// How many metrics passed.
    pub pass_count: usize,
    /// How many metrics failed.
    pub fail_count: usize,
    /// How many metrics had no threshold (unchecked).
    pub unchecked_count: usize,
}

impl GateResult {
    /// True when no metric failed.
    #[must_use]
    pub fn passed(&self) -> bool {
        self.fail_count == 0
    }

    /// Collect references to the metrics whose verdict is [`MetricVerdict::Fail`].
    pub fn failures(&self) -> Vec<&MetricResult> {
        let mut failed = Vec::new();
        for metric in &self.metrics {
            if matches!(metric.verdict, MetricVerdict::Fail) {
                failed.push(metric);
            }
        }
        failed
    }

    /// Render a human-readable summary: a header line followed by one line
    /// per metric.
    #[must_use]
    pub fn summary(&self) -> String {
        let status = if self.passed() { "PASS" } else { "FAIL" };
        let mut out = format!(
            "Gate '{}': {} ({} passed, {} failed, {} unchecked)\n",
            self.gate_name, status, self.pass_count, self.fail_count, self.unchecked_count
        );
        for m in &self.metrics {
            let icon = match m.verdict {
                MetricVerdict::Pass => "  ok",
                MetricVerdict::Fail => "FAIL",
                MetricVerdict::Unchecked => "  --",
            };
            let unit = m.unit.as_deref().unwrap_or("");
            // Metrics with a threshold show budget/overshoot detail; others
            // are reported as informational.
            let line = match m.budget {
                Some(budget) => {
                    let overshoot = m.overshoot_pct.unwrap_or(0.0);
                    format!(
                        "  [{icon}] {}: {:.1}{unit} (budget: {:.1}{unit}, overshoot: {overshoot:+.1}%)\n",
                        m.metric, m.value, budget
                    )
                }
                None => format!(
                    "  [{icon}] {}: {:.1}{unit} (no threshold)\n",
                    m.metric, m.value
                ),
            };
            out.push_str(&line);
        }
        out
    }
}
225
226// ============================================================================
227// BenchmarkGate
228// ============================================================================
229
230/// Benchmark gate that compares measurements against thresholds.
231#[derive(Debug, Clone)]
232pub struct BenchmarkGate {
233    /// Gate name for evidence output.
234    gate_name: String,
235    /// Thresholds keyed by metric name.
236    thresholds: BTreeMap<String, Threshold>,
237}
238
239impl BenchmarkGate {
240    /// Create a new benchmark gate.
241    pub fn new(gate_name: &str) -> Self {
242        Self {
243            gate_name: gate_name.to_string(),
244            thresholds: BTreeMap::new(),
245        }
246    }
247
248    /// Add a threshold.
249    #[must_use]
250    pub fn threshold(mut self, threshold: Threshold) -> Self {
251        self.thresholds.insert(threshold.metric.clone(), threshold);
252        self
253    }
254
255    /// Load thresholds from a simple JSON map.
256    ///
257    /// Expected format:
258    /// ```json
259    /// {
260    ///   "metric_name": { "budget": 123.0, "tolerance_pct": 10.0 }
261    /// }
262    /// ```
263    ///
264    /// Returns `None` if parsing fails.
265    #[must_use]
266    pub fn load_json(gate_name: &str, json: &str) -> Option<Self> {
267        let parsed: serde_json::Value = serde_json::from_str(json).ok()?;
268        let obj = parsed.as_object()?;
269        let mut gate = Self::new(gate_name);
270        for (metric, value) in obj {
271            let budget = value.get("budget")?.as_f64()?;
272            let tolerance_pct = value
273                .get("tolerance_pct")
274                .and_then(|v| v.as_f64())
275                .unwrap_or(0.0);
276            gate.thresholds.insert(
277                metric.clone(),
278                Threshold {
279                    metric: metric.clone(),
280                    budget,
281                    tolerance_pct,
282                },
283            );
284        }
285        Some(gate)
286    }
287
288    /// Load thresholds from FrankenTUI's `tests/baseline.json` format.
289    ///
290    /// This format uses percentile budgets (`p99_ns`) and `threshold_pct`:
291    /// ```json
292    /// {
293    ///   "frame_render": {
294    ///     "p99_ns": 2000000,
295    ///     "threshold_pct": 10
296    ///   }
297    /// }
298    /// ```
299    ///
300    /// Entries whose keys start with `_` are skipped (metadata comments).
301    /// The `percentile` parameter selects which budget to use (e.g., `"p99_ns"`).
302    ///
303    /// Returns `None` if the JSON is malformed.
304    #[must_use]
305    pub fn load_baseline_json(gate_name: &str, json: &str, percentile: &str) -> Option<Self> {
306        let parsed: serde_json::Value = serde_json::from_str(json).ok()?;
307        let obj = parsed.as_object()?;
308        let mut gate = Self::new(gate_name);
309        for (metric, value) in obj {
310            // Skip metadata keys (e.g., _comment, _format)
311            if metric.starts_with('_') {
312                continue;
313            }
314            let budget = value.get(percentile).and_then(|v| v.as_f64())?;
315            let tolerance_pct = value
316                .get("threshold_pct")
317                .and_then(|v| v.as_f64())
318                .unwrap_or(0.0);
319            gate.thresholds.insert(
320                metric.clone(),
321                Threshold {
322                    metric: metric.clone(),
323                    budget,
324                    tolerance_pct,
325                },
326            );
327        }
328        Some(gate)
329    }
330
331    /// Evaluate measurements against thresholds.
332    ///
333    /// Metrics with no matching threshold get [`MetricVerdict::Unchecked`].
334    /// Emits structured JSONL evidence via [`TestJsonlLogger`].
335    pub fn evaluate(&self, measurements: &[Measurement]) -> GateResult {
336        let mut logger = TestJsonlLogger::new_with(&format!("{}_gate", self.gate_name), 0, true, 0);
337        logger.add_context_str("gate_name", &self.gate_name);
338
339        logger.log(
340            "gate.start",
341            &[
342                ("gate_name", JsonValue::str(&self.gate_name)),
343                (
344                    "threshold_count",
345                    JsonValue::u64(self.thresholds.len() as u64),
346                ),
347                (
348                    "measurement_count",
349                    JsonValue::u64(measurements.len() as u64),
350                ),
351            ],
352        );
353
354        let mut metrics = Vec::new();
355        let mut pass_count = 0usize;
356        let mut fail_count = 0usize;
357        let mut unchecked_count = 0usize;
358
359        for measurement in measurements {
360            let result = if let Some(threshold) = self.thresholds.get(&measurement.metric) {
361                let ceiling = threshold.ceiling();
362                let overshoot_pct = if threshold.budget > 0.0 {
363                    (measurement.value - threshold.budget) / threshold.budget * 100.0
364                } else {
365                    0.0
366                };
367                let verdict = if measurement.value <= ceiling {
368                    MetricVerdict::Pass
369                } else {
370                    MetricVerdict::Fail
371                };
372                MetricResult {
373                    metric: measurement.metric.clone(),
374                    value: measurement.value,
375                    budget: Some(threshold.budget),
376                    ceiling: Some(ceiling),
377                    tolerance_pct: Some(threshold.tolerance_pct),
378                    overshoot_pct: Some(overshoot_pct),
379                    verdict,
380                    unit: measurement.unit.clone(),
381                }
382            } else {
383                MetricResult {
384                    metric: measurement.metric.clone(),
385                    value: measurement.value,
386                    budget: None,
387                    ceiling: None,
388                    tolerance_pct: None,
389                    overshoot_pct: None,
390                    verdict: MetricVerdict::Unchecked,
391                    unit: measurement.unit.clone(),
392                }
393            };
394
395            // Log per-metric evidence
396            let verdict_str = match result.verdict {
397                MetricVerdict::Pass => "pass",
398                MetricVerdict::Fail => "fail",
399                MetricVerdict::Unchecked => "unchecked",
400            };
401
402            let mut fields: Vec<(&str, JsonValue)> = vec![
403                ("metric", JsonValue::str(&result.metric)),
404                ("value", JsonValue::raw(format!("{:.6}", result.value))),
405                ("verdict", JsonValue::str(verdict_str)),
406            ];
407            if let Some(budget) = result.budget {
408                fields.push(("budget", JsonValue::raw(format!("{budget:.6}"))));
409            }
410            if let Some(ceiling) = result.ceiling {
411                fields.push(("ceiling", JsonValue::raw(format!("{ceiling:.6}"))));
412            }
413            if let Some(overshoot) = result.overshoot_pct {
414                fields.push(("overshoot_pct", JsonValue::raw(format!("{overshoot:.2}"))));
415            }
416            logger.log("gate.metric", &fields);
417
418            match result.verdict {
419                MetricVerdict::Pass => pass_count += 1,
420                MetricVerdict::Fail => fail_count += 1,
421                MetricVerdict::Unchecked => unchecked_count += 1,
422            }
423
424            metrics.push(result);
425        }
426
427        // Sort by metric name for stable output
428        metrics.sort_by(|a, b| a.metric.cmp(&b.metric));
429
430        let overall = if fail_count == 0 { "pass" } else { "fail" };
431        logger.log(
432            "gate.result",
433            &[
434                ("gate_name", JsonValue::str(&self.gate_name)),
435                ("verdict", JsonValue::str(overall)),
436                ("pass_count", JsonValue::u64(pass_count as u64)),
437                ("fail_count", JsonValue::u64(fail_count as u64)),
438                ("unchecked_count", JsonValue::u64(unchecked_count as u64)),
439            ],
440        );
441
442        GateResult {
443            gate_name: self.gate_name.clone(),
444            metrics,
445            pass_count,
446            fail_count,
447            unchecked_count,
448        }
449    }
450}
451
#[cfg(test)]
mod tests {
    // Unit tests for threshold math, gate evaluation verdicts, the two JSON
    // loaders, and summary formatting. The runtime-gate tests at the bottom
    // additionally depend on the repository file `tests/baseline.json` via
    // `include_str!`, so they exercise the real baseline contents.
    use super::*;

    // --- Threshold math -----------------------------------------------------

    #[test]
    fn threshold_ceiling_with_tolerance() {
        // 2000 budget + 10% tolerance -> 2200 ceiling.
        let t = Threshold::new("render_p99", 2000.0).tolerance_pct(10.0);
        assert!((t.ceiling() - 2200.0).abs() < f64::EPSILON);
    }

    #[test]
    fn threshold_ceiling_zero_tolerance() {
        // With no tolerance the ceiling equals the budget exactly.
        let t = Threshold::new("render_p99", 1000.0);
        assert!((t.ceiling() - 1000.0).abs() < f64::EPSILON);
    }

    // --- Gate verdicts ------------------------------------------------------

    #[test]
    fn gate_pass_within_budget() {
        let gate = BenchmarkGate::new("test_gate")
            .threshold(Threshold::new("metric_a", 100.0).tolerance_pct(10.0));

        let result = gate.evaluate(&[Measurement::new("metric_a", 95.0)]);
        assert!(result.passed());
        assert_eq!(result.pass_count, 1);
        assert_eq!(result.fail_count, 0);
    }

    #[test]
    fn gate_pass_within_tolerance() {
        let gate = BenchmarkGate::new("test_gate")
            .threshold(Threshold::new("metric_a", 100.0).tolerance_pct(10.0));

        // 105 is above budget (100) but within tolerance (110)
        let result = gate.evaluate(&[Measurement::new("metric_a", 105.0)]);
        assert!(result.passed());
    }

    #[test]
    fn gate_fail_exceeds_tolerance() {
        let gate = BenchmarkGate::new("test_gate")
            .threshold(Threshold::new("metric_a", 100.0).tolerance_pct(10.0));

        // 115 exceeds ceiling of 110
        let result = gate.evaluate(&[Measurement::new("metric_a", 115.0)]);
        assert!(!result.passed());
        assert_eq!(result.fail_count, 1);
    }

    #[test]
    fn gate_unchecked_metric() {
        // metric_b has no threshold: it must be reported as unchecked and
        // must not affect the overall pass/fail verdict.
        let gate = BenchmarkGate::new("test_gate").threshold(Threshold::new("metric_a", 100.0));

        let result = gate.evaluate(&[
            Measurement::new("metric_a", 90.0),
            Measurement::new("metric_b", 999.0),
        ]);
        assert!(result.passed());
        assert_eq!(result.unchecked_count, 1);
    }

    #[test]
    fn gate_multiple_metrics_mixed() {
        let gate = BenchmarkGate::new("test_gate")
            .threshold(Threshold::new("fast", 100.0))
            .threshold(Threshold::new("slow", 200.0).tolerance_pct(5.0));

        let result = gate.evaluate(&[
            Measurement::new("fast", 80.0),
            Measurement::new("slow", 250.0), // exceeds 210 ceiling
        ]);
        assert!(!result.passed());
        assert_eq!(result.pass_count, 1);
        assert_eq!(result.fail_count, 1);

        let failures = result.failures();
        assert_eq!(failures.len(), 1);
        assert_eq!(failures[0].metric, "slow");
    }

    // --- JSON loaders -------------------------------------------------------

    #[test]
    fn gate_load_json() {
        let json = r#"{
            "render_p99": { "budget": 2000.0, "tolerance_pct": 10.0 },
            "diff_p99": { "budget": 500.0 }
        }"#;
        let gate = BenchmarkGate::load_json("perf_gate", json).expect("valid JSON");
        let result = gate.evaluate(&[
            Measurement::new("render_p99", 1800.0),
            Measurement::new("diff_p99", 480.0),
        ]);
        assert!(result.passed());
    }

    #[test]
    fn gate_load_json_invalid() {
        // Malformed input must yield None, not panic.
        assert!(BenchmarkGate::load_json("bad", "not json").is_none());
    }

    #[test]
    fn gate_load_baseline_json_format() {
        let json = r#"{
            "_comment": "Performance baseline",
            "_format": "p50/p95/p99/p999 in nanoseconds",
            "frame_render": {
                "p50_ns": 500000,
                "p95_ns": 1000000,
                "p99_ns": 2000000,
                "p999_ns": 5000000,
                "threshold_pct": 10
            },
            "diff_strategy": {
                "p50_ns": 50000,
                "p99_ns": 200000,
                "threshold_pct": 10
            }
        }"#;
        let gate = BenchmarkGate::load_baseline_json("perf_gate", json, "p99_ns")
            .expect("baseline JSON should parse");

        // Under budget
        let result = gate.evaluate(&[
            Measurement::new("frame_render", 1_800_000.0).unit("ns"),
            Measurement::new("diff_strategy", 190_000.0).unit("ns"),
        ]);
        assert!(result.passed(), "gate should pass: {}", result.summary());

        // Over budget + tolerance
        let result = gate.evaluate(&[
            Measurement::new("frame_render", 2_500_000.0).unit("ns"), // >2.2M ceiling
            Measurement::new("diff_strategy", 190_000.0).unit("ns"),
        ]);
        assert!(!result.passed(), "gate should fail on regression");
    }

    #[test]
    fn gate_load_baseline_json_skips_metadata() {
        let json = r#"{
            "_comment": "ignored",
            "metric_a": { "p99_ns": 100.0, "threshold_pct": 5 }
        }"#;
        let gate =
            BenchmarkGate::load_baseline_json("meta_test", json, "p99_ns").expect("should parse");
        let result = gate.evaluate(&[Measurement::new("metric_a", 95.0)]);
        assert!(result.passed());
        // The _comment entry should not appear as a threshold
        assert_eq!(result.metrics.len(), 1);
    }

    // --- Result reporting ---------------------------------------------------

    #[test]
    fn gate_summary_format() {
        let gate = BenchmarkGate::new("summary_test").threshold(Threshold::new("metric_a", 100.0));
        let result = gate.evaluate(&[Measurement::new("metric_a", 90.0).unit("μs")]);
        let summary = result.summary();
        assert!(summary.contains("PASS"));
        assert!(summary.contains("metric_a"));
        assert!(summary.contains("μs"));
    }

    #[test]
    fn gate_overshoot_pct_negative_when_under_budget() {
        // 80 against a budget of 100 should report -20% overshoot.
        let gate =
            BenchmarkGate::new("overshoot_test").threshold(Threshold::new("metric_a", 100.0));
        let result = gate.evaluate(&[Measurement::new("metric_a", 80.0)]);
        let m = &result.metrics[0];
        assert!(m.overshoot_pct.unwrap() < 0.0);
    }

    #[test]
    fn gate_empty_measurements() {
        // An empty measurement set passes trivially (no failures recorded),
        // even though a threshold is configured.
        let gate = BenchmarkGate::new("empty_test").threshold(Threshold::new("metric_a", 100.0));
        let result = gate.evaluate(&[]);
        assert!(result.passed());
        assert_eq!(result.pass_count, 0);
        assert_eq!(result.fail_count, 0);
    }

    // =========================================================================
    // Runtime benchmark gate tests (bd-1vb19)
    // NOTE(review): these assume tests/baseline.json contains runtime_* entries
    // with p99_ns budgets — verify against the repository baseline file.
    // =========================================================================

    #[test]
    fn load_baseline_includes_runtime_benchmarks() {
        let json = include_str!("../../../tests/baseline.json");
        let gate = BenchmarkGate::load_baseline_json("runtime_gate", json, "p99_ns")
            .expect("baseline.json should parse");

        // Verify runtime benchmarks were loaded
        let metrics: Vec<&str> = gate
            .thresholds
            .keys()
            .filter(|k| k.starts_with("runtime_"))
            .map(|k| k.as_str())
            .collect();
        assert!(
            metrics.contains(&"runtime_shutdown_latency"),
            "shutdown_latency baseline should be loaded"
        );
        assert!(
            metrics.contains(&"runtime_first_frame"),
            "first_frame baseline should be loaded"
        );
        assert!(
            metrics.contains(&"runtime_command_roundtrip"),
            "command_roundtrip baseline should be loaded"
        );
        assert!(
            metrics.contains(&"runtime_effect_queue_drain"),
            "effect_queue_drain baseline should be loaded"
        );
    }

    #[test]
    fn runtime_gate_passes_within_budget() {
        let json = include_str!("../../../tests/baseline.json");
        let gate = BenchmarkGate::load_baseline_json("runtime_gate", json, "p99_ns")
            .expect("baseline.json should parse");

        // Simulate measurements well within budget
        let measurements = vec![
            Measurement::new("runtime_shutdown_latency", 1_000_000.0).unit("ns"),
            Measurement::new("runtime_first_frame", 5_000_000.0).unit("ns"),
            Measurement::new("runtime_command_roundtrip", 100_000.0).unit("ns"),
            Measurement::new("runtime_effect_queue_drain", 500_000.0).unit("ns"),
        ];
        let result = gate.evaluate(&measurements);
        assert!(
            result.passed(),
            "all runtime metrics should pass: {}",
            result.summary()
        );
    }

    #[test]
    fn runtime_gate_fails_on_regression() {
        let json = include_str!("../../../tests/baseline.json");
        let gate = BenchmarkGate::load_baseline_json("runtime_gate", json, "p99_ns")
            .expect("baseline.json should parse");

        // Simulate a severe regression on shutdown latency
        let measurements = vec![
            Measurement::new("runtime_shutdown_latency", 100_000_000.0).unit("ns"), // 100ms, way over 5ms budget
            Measurement::new("runtime_first_frame", 5_000_000.0).unit("ns"),
        ];
        let result = gate.evaluate(&measurements);
        assert!(!result.passed(), "regression should fail the gate");
        assert!(result.fail_count >= 1);

        let failures = result.failures();
        assert!(
            failures
                .iter()
                .any(|f| f.metric == "runtime_shutdown_latency"),
            "shutdown latency should be the failing metric"
        );
    }

    #[test]
    fn runtime_gate_summary_readable() {
        let json = include_str!("../../../tests/baseline.json");
        let gate = BenchmarkGate::load_baseline_json("runtime_gate", json, "p99_ns")
            .expect("baseline.json should parse");

        let measurements =
            vec![Measurement::new("runtime_shutdown_latency", 4_000_000.0).unit("ns")];
        let result = gate.evaluate(&measurements);
        let summary = result.summary();
        assert!(summary.contains("runtime_shutdown_latency"));
        assert!(summary.contains("PASS") || summary.contains("ok"));
    }
}