1use std::collections::{BTreeSet, HashMap};
21
22use serde::{Deserialize, Serialize};
23
24use crate::hooks::decide::{
25 classify_command, evaluate, extract_file_paths, CommandClass, Decision, EnforcementInput,
26};
27
28const DETECTION_KNOWN_GOOD: &str = include_str!("../tests/fixtures/eval/detection/known_good.json");
30const DETECTION_BENIGN: &str = include_str!("../tests/fixtures/eval/detection/benign.json");
31const DETECTION_ADVERSARIAL: &str =
32 include_str!("../tests/fixtures/eval/detection/adversarial.json");
33const DECISION_CASES: &str = include_str!("../tests/fixtures/eval/decision/cases.json");
34const BASELINE: &str = include_str!("../tests/fixtures/eval/baseline.json");
35
36#[derive(Debug, Deserialize)]
39struct DetectionCase {
40 id: String,
41 cmd: String,
42 label: String,
45 expect_class: String,
47 #[serde(default)]
51 expect_paths: Vec<String>,
52 #[serde(default)]
53 #[allow(dead_code)]
54 note: Option<String>,
55}
56
57#[derive(Debug, Deserialize)]
58struct DecisionCase {
59 id: String,
60 label: String,
62 rel_path: String,
63 #[serde(default)]
64 file_record: Option<serde_json::Value>,
65 #[serde(default)]
66 gotcha_records: HashMap<String, serde_json::Value>,
67 #[serde(default)]
68 already_consulted: bool,
69 expect: String,
72 #[serde(default)]
73 #[allow(dead_code)]
74 note: Option<String>,
75}
76
77#[derive(Debug, Deserialize)]
78struct Baseline {
79 #[serde(default)]
80 detection: Vec<String>,
81 #[serde(default)]
82 decision: Vec<String>,
83}
84
85#[derive(Debug, Serialize)]
89pub struct LayerReport {
90 pub layer: &'static str,
91 pub cases: u32,
92 pub tp: u32,
93 #[serde(rename = "fn")]
94 pub fn_: u32,
95 pub tn: u32,
96 pub fp: u32,
97 pub recall: f64,
98 pub fp_rate: f64,
99 pub precision: f64,
100 pub failing: Vec<String>,
102 pub known_gaps: Vec<String>,
104 pub regressions: Vec<String>,
106 pub newly_fixed: Vec<String>,
108}
109
110#[derive(Debug, Serialize)]
111pub struct EvalReport {
112 pub detection: LayerReport,
113 pub decision: LayerReport,
114}
115
116impl EvalReport {
117 pub fn gate(&self) -> Result<(), String> {
121 let mut errs = Vec::new();
122 for l in [&self.detection, &self.decision] {
123 if !l.regressions.is_empty() {
124 errs.push(format!(
125 "[{}] REGRESSION — output now wrong for cases not in baseline: {}\n \
126 Fix the regression, or (if intended) add these ids to \
127 tests/fixtures/eval/baseline.json.",
128 l.layer,
129 l.regressions.join(", ")
130 ));
131 }
132 if !l.newly_fixed.is_empty() {
133 errs.push(format!(
134 "[{}] IMPROVEMENT — baseline gaps now PASS: {}\n \
135 Remove them from tests/fixtures/eval/baseline.json so the \
136 baseline stays honest and recall ratchets up.",
137 l.layer,
138 l.newly_fixed.join(", ")
139 ));
140 }
141 }
142 if errs.is_empty() {
143 Ok(())
144 } else {
145 Err(errs.join("\n"))
146 }
147 }
148}
149
150fn parse_class(s: &str) -> Option<CommandClass> {
153 match s {
154 "cat_like" => Some(CommandClass::CatLike),
155 "grep_like" => Some(CommandClass::GrepLike),
156 "none" => None,
157 other => panic!("corpus: bad expect_class {other:?}"),
158 }
159}
160
161fn detection_pass(c: &DetectionCase) -> bool {
162 let got_class = classify_command(&c.cmd);
163 if got_class != parse_class(&c.expect_class) {
164 return false;
165 }
166 let mut got_paths = match got_class {
169 Some(cl) => extract_file_paths(&c.cmd, cl),
170 None => Vec::new(),
171 };
172 let mut want = c.expect_paths.clone();
173 got_paths.sort();
174 want.sort();
175 got_paths == want
176}
177
178fn decision_variant(d: &Decision) -> &'static str {
181 match d {
182 Decision::Allow => "allow",
183 Decision::Deny { .. } => "deny",
184 Decision::AlreadyConsulted { .. } => "already_consulted",
185 Decision::Advisory { .. } => "advisory",
186 Decision::Liability { .. } => "liability",
187 Decision::Tombstone => "tombstone",
188 Decision::NoRecord => "no_record",
189 Decision::NotFileRead => "not_file_read",
190 }
191}
192
/// Every name `decision_variant` can return. `run` uses this list to reject
/// decision cases whose `expect` value is not a known decision name.
const DECISION_VARIANTS: &[&'static str] = &[
    "allow",
    "advisory",
    "deny",
    "already_consulted",
    "liability",
    "tombstone",
    "no_record",
    "not_file_read",
];
203
204fn decision_pass(c: &DecisionCase) -> bool {
205 let input = EnforcementInput {
206 rel_path: c.rel_path.clone(),
207 file_record: c.file_record.clone(),
208 gotcha_records: c.gotcha_records.clone(),
209 already_consulted: c.already_consulted,
210 };
211 decision_variant(&evaluate(&input).decision) == c.expect
212}
213
214fn score(
216 layer: &'static str,
217 rows: &[(String, bool, bool)],
218 known_gaps: Vec<String>,
219) -> LayerReport {
220 let (mut tp, mut fn_, mut tn, mut fp) = (0u32, 0u32, 0u32, 0u32);
221 let mut failing: BTreeSet<String> = BTreeSet::new();
222 for (id, is_violation, pass) in rows {
223 match (is_violation, pass) {
224 (true, true) => tp += 1,
225 (true, false) => fn_ += 1,
226 (false, true) => tn += 1,
227 (false, false) => fp += 1,
228 }
229 if !pass {
230 failing.insert(id.clone());
231 }
232 }
233 let recall = if tp + fn_ == 0 {
234 1.0
235 } else {
236 tp as f64 / (tp + fn_) as f64
237 };
238 let fp_rate = if fp + tn == 0 {
239 0.0
240 } else {
241 fp as f64 / (fp + tn) as f64
242 };
243 let precision = if tp + fp == 0 {
244 1.0
245 } else {
246 tp as f64 / (tp + fp) as f64
247 };
248 let known: BTreeSet<String> = known_gaps.iter().cloned().collect();
249 let regressions = failing.difference(&known).cloned().collect();
250 let newly_fixed = known.difference(&failing).cloned().collect();
251 LayerReport {
252 layer,
253 cases: rows.len() as u32,
254 tp,
255 fn_,
256 tn,
257 fp,
258 recall,
259 fp_rate,
260 precision,
261 failing: failing.into_iter().collect(),
262 known_gaps,
263 regressions,
264 newly_fixed,
265 }
266}
267
/// Asserts that no id occurs twice in `ids`.
///
/// # Panics
/// Panics at the first repeated id; the message names `layer` and the id.
fn assert_unique_ids<'a>(layer: &str, ids: impl Iterator<Item = &'a str>) {
    let mut seen = BTreeSet::new();
    let mut remaining = ids;
    // `insert` returns false for a value already present, so `find` stops at
    // the first duplicate.
    if let Some(id) = remaining.find(|candidate| !seen.insert(*candidate)) {
        panic!("{layer} corpus: duplicate case id {id:?}");
    }
}
274
275pub fn run() -> EvalReport {
281 let baseline: Baseline = serde_json::from_str(BASELINE).expect("parse baseline.json");
282
283 let mut detection: Vec<DetectionCase> = Vec::new();
285 for raw in [
286 DETECTION_KNOWN_GOOD,
287 DETECTION_BENIGN,
288 DETECTION_ADVERSARIAL,
289 ] {
290 detection.extend(serde_json::from_str::<Vec<DetectionCase>>(raw).expect("parse detection"));
291 }
292 assert_unique_ids("detection", detection.iter().map(|c| c.id.as_str()));
293 let det_rows: Vec<(String, bool, bool)> = detection
294 .iter()
295 .map(|c| {
296 assert!(
297 c.label == "violation" || c.label == "benign",
298 "detection {}: bad label {:?}",
299 c.id,
300 c.label
301 );
302 (c.id.clone(), c.label == "violation", detection_pass(c))
303 })
304 .collect();
305 let detection = score("detection", &det_rows, baseline.detection);
306
307 let decision: Vec<DecisionCase> =
309 serde_json::from_str(DECISION_CASES).expect("parse decision corpus");
310 assert_unique_ids("decision", decision.iter().map(|c| c.id.as_str()));
311 let dec_rows: Vec<(String, bool, bool)> = decision
312 .iter()
313 .map(|c| {
314 assert!(
315 c.label == "violation" || c.label == "benign",
316 "decision {}: bad label {:?}",
317 c.id,
318 c.label
319 );
320 assert!(
321 DECISION_VARIANTS.contains(&c.expect.as_str()),
322 "decision {}: bad expect {:?}",
323 c.id,
324 c.expect
325 );
326 assert_eq!(
329 c.label == "violation",
330 c.expect == "deny",
331 "decision {}: label/expect mismatch (violation iff expect==deny)",
332 c.id
333 );
334 (c.id.clone(), c.label == "violation", decision_pass(c))
335 })
336 .collect();
337 let decision = score("decision", &dec_rows, baseline.decision);
338
339 EvalReport {
340 detection,
341 decision,
342 }
343}
344
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a detection case for the fixed command `cat src/main.rs` with
    /// the given expectations.
    fn cat_case(expect_class: &str, expect_paths: &[&str]) -> DetectionCase {
        DetectionCase {
            id: "x".into(),
            cmd: "cat src/main.rs".into(),
            label: "violation".into(),
            expect_class: expect_class.into(),
            expect_paths: expect_paths.iter().map(|p| (*p).to_owned()).collect(),
            note: None,
        }
    }

    /// Guards against a scorer that passes everything: wrong paths and a
    /// wrong class must both be reported as failures.
    #[test]
    fn detection_pass_can_fail() {
        let matching = cat_case("cat_like", &["src/main.rs"]);
        let wrong_path = cat_case("cat_like", &["WRONG.rs"]);
        let wrong_class = cat_case("none", &[]);

        assert!(detection_pass(&matching));
        assert!(!detection_pass(&wrong_path));
        assert!(!detection_pass(&wrong_class));
    }

    /// Same guard for the decision layer: a case that passes with
    /// `expect = "deny"` must fail once `expect` is changed to `"allow"`.
    #[test]
    fn decision_pass_can_fail() {
        let file_record = serde_json::json!({
            "confidence": {"value": 0.9}, "quality": {"value": 0.8},
            "staleness": {"value": 0.1, "tier": "fresh"},
            "payload": {"gotcha_keys": ["g"]}
        });
        let gotcha = serde_json::json!({
            "value": "r", "confidence": {"value": 0.9}, "quality": {"value": 0.8},
            "payload": {"confirmed": true}
        });
        let mut gotcha_records = HashMap::new();
        gotcha_records.insert("g".to_string(), gotcha);

        let deny_case = DecisionCase {
            id: "x".into(),
            label: "violation".into(),
            rel_path: "src/a.rs".into(),
            file_record: Some(file_record),
            gotcha_records,
            already_consulted: false,
            expect: "deny".into(),
            note: None,
        };
        assert!(decision_pass(&deny_case), "real deny case must pass");

        // Identical input, but scored against the wrong expectation.
        let mismatched = DecisionCase {
            expect: "allow".into(),
            ..deny_case
        };
        assert!(
            !decision_pass(&mismatched),
            "a deny scored against expect=allow must fail"
        );
    }

    /// Runs the embedded corpus end to end; both layers must be non-empty and
    /// the report must pass the baseline gate.
    #[test]
    fn corpus_is_well_formed_and_gates() {
        let report = run();
        assert!(report.detection.cases > 0);
        assert!(report.decision.cases > 0);
        let gate_result = report.gate();
        gate_result.expect("embedded corpus must match its baseline");
    }
}