perspt_sdk/
benchmark.rs

//! Benchmark harness and metrics (PSP-8 System 13).
//!
//! Perspt does not claim SRBN reliability from implementation alone; it ships
//! mechanism checks and benchmarks that can falsify the PSP's claims. A residual
//! certificate is a first-class outcome, not a discarded failure: **no benchmark
//! report omits failed runs**, and *false stability* (claiming success while a
//! required sensor was missing) is tracked as its own metric and must be zero.
8
9use serde::{Deserialize, Serialize};
10
11/// The terminal outcome of one benchmark case.
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
13#[serde(rename_all = "snake_case")]
14pub enum BenchmarkOutcome {
15    /// All required verifiers passed.
16    HardPass,
17    /// Honest non-convergence: terminated with a residual certificate.
18    ResidualCertified,
19    /// A regression appeared after commit.
20    Regression,
21    /// Claimed success while a required sensor was missing — a correctness bug
22    /// that must never occur.
23    FalseStability,
24}
25
26impl BenchmarkOutcome {
27    pub fn is_success(self) -> bool {
28        matches!(self, BenchmarkOutcome::HardPass)
29    }
30
31    /// Whether this outcome is a correctness violation (never acceptable).
32    pub fn is_correctness_violation(self) -> bool {
33        matches!(self, BenchmarkOutcome::FalseStability)
34    }
35}
36
37/// One benchmark case description.
38#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
39pub struct BenchmarkCase {
40    pub case_id: String,
41    pub domain: String,
42    pub description: String,
43}
44
45impl BenchmarkCase {
46    pub fn new(
47        case_id: impl Into<String>,
48        domain: impl Into<String>,
49        description: impl Into<String>,
50    ) -> Self {
51        Self {
52            case_id: case_id.into(),
53            domain: domain.into(),
54            description: description.into(),
55        }
56    }
57}
58
59/// The recorded result of running one case.
60#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
61pub struct BenchmarkResult {
62    pub case_id: String,
63    pub outcome: BenchmarkOutcome,
64    pub gate_decisions: u32,
65    pub energy_descents: u32,
66    pub graph_revisions: u32,
67    pub verifier_calls: u32,
68    pub capability_denials: u32,
69}
70
71impl BenchmarkResult {
72    pub fn new(case_id: impl Into<String>, outcome: BenchmarkOutcome) -> Self {
73        Self {
74            case_id: case_id.into(),
75            outcome,
76            gate_decisions: 0,
77            energy_descents: 0,
78            graph_revisions: 0,
79            verifier_calls: 0,
80            capability_denials: 0,
81        }
82    }
83}
84
85/// A full benchmark report (PSP-8 System 13 metrics).
86#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
87pub struct BenchmarkReport {
88    pub results: Vec<BenchmarkResult>,
89}
90
91impl BenchmarkReport {
92    pub fn new() -> Self {
93        Self::default()
94    }
95
96    pub fn add(&mut self, result: BenchmarkResult) {
97        self.results.push(result);
98    }
99
100    pub fn total(&self) -> usize {
101        self.results.len()
102    }
103
104    fn rate(&self, predicate: impl Fn(&BenchmarkResult) -> bool) -> f64 {
105        if self.results.is_empty() {
106            return 0.0;
107        }
108        self.results.iter().filter(|r| predicate(r)).count() as f64 / self.results.len() as f64
109    }
110
111    /// Final hard-pass rate.
112    pub fn hard_pass_rate(&self) -> f64 {
113        self.rate(|r| r.outcome == BenchmarkOutcome::HardPass)
114    }
115
116    /// Residual-certified termination rate.
117    pub fn residual_certified_rate(&self) -> f64 {
118        self.rate(|r| r.outcome == BenchmarkOutcome::ResidualCertified)
119    }
120
121    /// False-stability rate. This MUST be zero for a conformant implementation.
122    pub fn false_stability_rate(&self) -> f64 {
123        self.rate(|r| r.outcome == BenchmarkOutcome::FalseStability)
124    }
125
126    /// Regression-after-commit rate.
127    pub fn regression_rate(&self) -> f64 {
128        self.rate(|r| r.outcome == BenchmarkOutcome::Regression)
129    }
130
131    /// Whether the report preserves failures (a report of only successes that
132    /// hides certified/failed runs would violate System 13). True if the report
133    /// retains every case it was given — which it always does, since `add`
134    /// appends unconditionally. This predicate exists to assert the invariant in
135    /// tests and to document it.
136    pub fn preserves_failures(&self) -> bool {
137        // By construction nothing is filtered; the report is the source of truth.
138        true
139    }
140
141    /// Whether the implementation is correctness-conformant: no false-stability
142    /// outcomes occurred.
143    pub fn is_correctness_conformant(&self) -> bool {
144        self.results
145            .iter()
146            .all(|r| !r.outcome.is_correctness_violation())
147    }
148}
149
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a report holding one result per `(case_id, outcome)` pair, added
    /// in the order given.
    fn report_of(entries: &[(&str, BenchmarkOutcome)]) -> BenchmarkReport {
        let mut report = BenchmarkReport::new();
        for &(case_id, outcome) in entries {
            report.add(BenchmarkResult::new(case_id, outcome));
        }
        report
    }

    #[test]
    fn report_computes_rates_over_all_outcomes() {
        let report = report_of(&[
            ("c1", BenchmarkOutcome::HardPass),
            ("c2", BenchmarkOutcome::HardPass),
            ("c3", BenchmarkOutcome::ResidualCertified),
            ("c4", BenchmarkOutcome::Regression),
        ]);

        assert_eq!(report.total(), 4);
        assert_eq!(report.hard_pass_rate(), 0.5);
        assert_eq!(report.residual_certified_rate(), 0.25);
        assert_eq!(report.regression_rate(), 0.25);
    }

    #[test]
    fn failed_runs_are_preserved_not_omitted() {
        let report = report_of(&[
            ("ok", BenchmarkOutcome::HardPass),
            ("certified", BenchmarkOutcome::ResidualCertified),
        ]);

        // The certified (non-success) run must still be present in the report.
        let certified_kept = report
            .results
            .iter()
            .position(|r| r.outcome == BenchmarkOutcome::ResidualCertified)
            .is_some();
        assert!(certified_kept);
        assert!(report.preserves_failures());
    }

    #[test]
    fn false_stability_breaks_correctness_conformance() {
        // A report with only a hard pass is conformant.
        let mut report = report_of(&[("ok", BenchmarkOutcome::HardPass)]);
        assert!(report.is_correctness_conformant());

        // A single false-stability outcome breaks conformance.
        report.add(BenchmarkResult::new("bad", BenchmarkOutcome::FalseStability));
        assert!(!report.is_correctness_conformant());
        assert!(report.false_stability_rate() > 0.0);
    }
}
perspt_sdk/benchmark.rs

perspt_sdk/
benchmark.rs