1use serde::{Deserialize, Serialize};
10
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
13#[serde(rename_all = "snake_case")]
14pub enum BenchmarkOutcome {
15 HardPass,
17 ResidualCertified,
19 Regression,
21 FalseStability,
24}
25
26impl BenchmarkOutcome {
27 pub fn is_success(self) -> bool {
28 matches!(self, BenchmarkOutcome::HardPass)
29 }
30
31 pub fn is_correctness_violation(self) -> bool {
33 matches!(self, BenchmarkOutcome::FalseStability)
34 }
35}
36
37#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
39pub struct BenchmarkCase {
40 pub case_id: String,
41 pub domain: String,
42 pub description: String,
43}
44
45impl BenchmarkCase {
46 pub fn new(
47 case_id: impl Into<String>,
48 domain: impl Into<String>,
49 description: impl Into<String>,
50 ) -> Self {
51 Self {
52 case_id: case_id.into(),
53 domain: domain.into(),
54 description: description.into(),
55 }
56 }
57}
58
59#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
61pub struct BenchmarkResult {
62 pub case_id: String,
63 pub outcome: BenchmarkOutcome,
64 pub gate_decisions: u32,
65 pub energy_descents: u32,
66 pub graph_revisions: u32,
67 pub verifier_calls: u32,
68 pub capability_denials: u32,
69}
70
71impl BenchmarkResult {
72 pub fn new(case_id: impl Into<String>, outcome: BenchmarkOutcome) -> Self {
73 Self {
74 case_id: case_id.into(),
75 outcome,
76 gate_decisions: 0,
77 energy_descents: 0,
78 graph_revisions: 0,
79 verifier_calls: 0,
80 capability_denials: 0,
81 }
82 }
83}
84
/// Collection of benchmark results.
///
/// The derived `Default` yields a report with an empty `results` vector.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
pub struct BenchmarkReport {
    /// Recorded results; the field is public, so callers can read or modify it directly.
    pub results: Vec<BenchmarkResult>,
}
90
91impl BenchmarkReport {
92 pub fn new() -> Self {
93 Self::default()
94 }
95
96 pub fn add(&mut self, result: BenchmarkResult) {
97 self.results.push(result);
98 }
99
100 pub fn total(&self) -> usize {
101 self.results.len()
102 }
103
104 fn rate(&self, predicate: impl Fn(&BenchmarkResult) -> bool) -> f64 {
105 if self.results.is_empty() {
106 return 0.0;
107 }
108 self.results.iter().filter(|r| predicate(r)).count() as f64 / self.results.len() as f64
109 }
110
111 pub fn hard_pass_rate(&self) -> f64 {
113 self.rate(|r| r.outcome == BenchmarkOutcome::HardPass)
114 }
115
116 pub fn residual_certified_rate(&self) -> f64 {
118 self.rate(|r| r.outcome == BenchmarkOutcome::ResidualCertified)
119 }
120
121 pub fn false_stability_rate(&self) -> f64 {
123 self.rate(|r| r.outcome == BenchmarkOutcome::FalseStability)
124 }
125
126 pub fn regression_rate(&self) -> f64 {
128 self.rate(|r| r.outcome == BenchmarkOutcome::Regression)
129 }
130
131 pub fn preserves_failures(&self) -> bool {
137 true
139 }
140
141 pub fn is_correctness_conformant(&self) -> bool {
144 self.results
145 .iter()
146 .all(|r| !r.outcome.is_correctness_violation())
147 }
148}
149
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a report holding one zero-counter result per `(case_id, outcome)`
    /// pair, added in slice order.
    fn report_with(entries: &[(&str, BenchmarkOutcome)]) -> BenchmarkReport {
        let mut report = BenchmarkReport::new();
        for &(case_id, outcome) in entries {
            report.add(BenchmarkResult::new(case_id, outcome));
        }
        report
    }

    /// Rates are computed against the total number of results, whatever their outcome.
    #[test]
    fn report_computes_rates_over_all_outcomes() {
        let report = report_with(&[
            ("c1", BenchmarkOutcome::HardPass),
            ("c2", BenchmarkOutcome::HardPass),
            ("c3", BenchmarkOutcome::ResidualCertified),
            ("c4", BenchmarkOutcome::Regression),
        ]);

        assert_eq!(report.total(), 4);
        assert_eq!(report.hard_pass_rate(), 0.5);
        assert_eq!(report.residual_certified_rate(), 0.25);
        assert_eq!(report.regression_rate(), 0.25);
    }

    /// A non-passing result stays in the report after being added.
    #[test]
    fn failed_runs_are_preserved_not_omitted() {
        let report = report_with(&[
            ("ok", BenchmarkOutcome::HardPass),
            ("certified", BenchmarkOutcome::ResidualCertified),
        ]);

        let certified_kept = report
            .results
            .iter()
            .any(|entry| matches!(entry.outcome, BenchmarkOutcome::ResidualCertified));
        assert!(certified_kept);
        assert!(report.preserves_failures());
    }

    /// One `FalseStability` result is enough to make the report non-conformant.
    #[test]
    fn false_stability_breaks_correctness_conformance() {
        let mut report = report_with(&[("ok", BenchmarkOutcome::HardPass)]);
        assert!(report.is_correctness_conformant());

        report.add(BenchmarkResult::new("bad", BenchmarkOutcome::FalseStability));
        assert!(!report.is_correctness_conformant());
        assert!(report.false_stability_rate() > 0.0);
    }
}