use std::collections::{BTreeSet, HashMap};
use serde::{Deserialize, Serialize};
use crate::hooks::decide::{
classify_command, evaluate, extract_file_paths, CommandClass, Decision, EnforcementInput,
};
const DETECTION_KNOWN_GOOD: &str = include_str!("../tests/fixtures/eval/detection/known_good.json");
const DETECTION_BENIGN: &str = include_str!("../tests/fixtures/eval/detection/benign.json");
const DETECTION_ADVERSARIAL: &str =
include_str!("../tests/fixtures/eval/detection/adversarial.json");
const DECISION_CASES: &str = include_str!("../tests/fixtures/eval/decision/cases.json");
const BASELINE: &str = include_str!("../tests/fixtures/eval/baseline.json");
#[derive(Debug, Deserialize)]
struct DetectionCase {
id: String,
cmd: String,
label: String,
expect_class: String,
#[serde(default)]
expect_paths: Vec<String>,
#[serde(default)]
#[allow(dead_code)]
note: Option<String>,
}
#[derive(Debug, Deserialize)]
struct DecisionCase {
id: String,
label: String,
rel_path: String,
#[serde(default)]
file_record: Option<serde_json::Value>,
#[serde(default)]
gotcha_records: HashMap<String, serde_json::Value>,
#[serde(default)]
already_consulted: bool,
expect: String,
#[serde(default)]
#[allow(dead_code)]
note: Option<String>,
}
#[derive(Debug, Deserialize)]
struct Baseline {
#[serde(default)]
detection: Vec<String>,
#[serde(default)]
decision: Vec<String>,
}
#[derive(Debug, Serialize)]
pub struct LayerReport {
pub layer: &'static str,
pub cases: u32,
pub tp: u32,
#[serde(rename = "fn")]
pub fn_: u32,
pub tn: u32,
pub fp: u32,
pub recall: f64,
pub fp_rate: f64,
pub precision: f64,
pub failing: Vec<String>,
pub known_gaps: Vec<String>,
pub regressions: Vec<String>,
pub newly_fixed: Vec<String>,
}
#[derive(Debug, Serialize)]
pub struct EvalReport {
pub detection: LayerReport,
pub decision: LayerReport,
}
impl EvalReport {
pub fn gate(&self) -> Result<(), String> {
let mut errs = Vec::new();
for l in [&self.detection, &self.decision] {
if !l.regressions.is_empty() {
errs.push(format!(
"[{}] REGRESSION — output now wrong for cases not in baseline: {}\n \
Fix the regression, or (if intended) add these ids to \
tests/fixtures/eval/baseline.json.",
l.layer,
l.regressions.join(", ")
));
}
if !l.newly_fixed.is_empty() {
errs.push(format!(
"[{}] IMPROVEMENT — baseline gaps now PASS: {}\n \
Remove them from tests/fixtures/eval/baseline.json so the \
baseline stays honest and recall ratchets up.",
l.layer,
l.newly_fixed.join(", ")
));
}
}
if errs.is_empty() {
Ok(())
} else {
Err(errs.join("\n"))
}
}
}
fn parse_class(s: &str) -> Option<CommandClass> {
match s {
"cat_like" => Some(CommandClass::CatLike),
"grep_like" => Some(CommandClass::GrepLike),
"none" => None,
other => panic!("corpus: bad expect_class {other:?}"),
}
}
fn detection_pass(c: &DetectionCase) -> bool {
let got_class = classify_command(&c.cmd);
if got_class != parse_class(&c.expect_class) {
return false;
}
let mut got_paths = match got_class {
Some(cl) => extract_file_paths(&c.cmd, cl),
None => Vec::new(),
};
let mut want = c.expect_paths.clone();
got_paths.sort();
want.sort();
got_paths == want
}
fn decision_variant(d: &Decision) -> &'static str {
match d {
Decision::Allow => "allow",
Decision::Deny { .. } => "deny",
Decision::AlreadyConsulted { .. } => "already_consulted",
Decision::Advisory { .. } => "advisory",
Decision::Liability { .. } => "liability",
Decision::Tombstone => "tombstone",
Decision::NoRecord => "no_record",
Decision::NotFileRead => "not_file_read",
}
}
const DECISION_VARIANTS: &[&str] = &[
"allow",
"advisory",
"deny",
"already_consulted",
"liability",
"tombstone",
"no_record",
"not_file_read",
];
fn decision_pass(c: &DecisionCase) -> bool {
let input = EnforcementInput {
rel_path: c.rel_path.clone(),
file_record: c.file_record.clone(),
gotcha_records: c.gotcha_records.clone(),
already_consulted: c.already_consulted,
};
decision_variant(&evaluate(&input).decision) == c.expect
}
fn score(
layer: &'static str,
rows: &[(String, bool, bool)],
known_gaps: Vec<String>,
) -> LayerReport {
let (mut tp, mut fn_, mut tn, mut fp) = (0u32, 0u32, 0u32, 0u32);
let mut failing: BTreeSet<String> = BTreeSet::new();
for (id, is_violation, pass) in rows {
match (is_violation, pass) {
(true, true) => tp += 1,
(true, false) => fn_ += 1,
(false, true) => tn += 1,
(false, false) => fp += 1,
}
if !pass {
failing.insert(id.clone());
}
}
let recall = if tp + fn_ == 0 {
1.0
} else {
tp as f64 / (tp + fn_) as f64
};
let fp_rate = if fp + tn == 0 {
0.0
} else {
fp as f64 / (fp + tn) as f64
};
let precision = if tp + fp == 0 {
1.0
} else {
tp as f64 / (tp + fp) as f64
};
let known: BTreeSet<String> = known_gaps.iter().cloned().collect();
let regressions = failing.difference(&known).cloned().collect();
let newly_fixed = known.difference(&failing).cloned().collect();
LayerReport {
layer,
cases: rows.len() as u32,
tp,
fn_,
tn,
fp,
recall,
fp_rate,
precision,
failing: failing.into_iter().collect(),
known_gaps,
regressions,
newly_fixed,
}
}
fn assert_unique_ids<'a>(layer: &str, ids: impl Iterator<Item = &'a str>) {
let mut seen = BTreeSet::new();
for id in ids {
assert!(seen.insert(id), "{layer} corpus: duplicate case id {id:?}");
}
}
pub fn run() -> EvalReport {
let baseline: Baseline = serde_json::from_str(BASELINE).expect("parse baseline.json");
let mut detection: Vec<DetectionCase> = Vec::new();
for raw in [
DETECTION_KNOWN_GOOD,
DETECTION_BENIGN,
DETECTION_ADVERSARIAL,
] {
detection.extend(serde_json::from_str::<Vec<DetectionCase>>(raw).expect("parse detection"));
}
assert_unique_ids("detection", detection.iter().map(|c| c.id.as_str()));
let det_rows: Vec<(String, bool, bool)> = detection
.iter()
.map(|c| {
assert!(
c.label == "violation" || c.label == "benign",
"detection {}: bad label {:?}",
c.id,
c.label
);
(c.id.clone(), c.label == "violation", detection_pass(c))
})
.collect();
let detection = score("detection", &det_rows, baseline.detection);
let decision: Vec<DecisionCase> =
serde_json::from_str(DECISION_CASES).expect("parse decision corpus");
assert_unique_ids("decision", decision.iter().map(|c| c.id.as_str()));
let dec_rows: Vec<(String, bool, bool)> = decision
.iter()
.map(|c| {
assert!(
c.label == "violation" || c.label == "benign",
"decision {}: bad label {:?}",
c.id,
c.label
);
assert!(
DECISION_VARIANTS.contains(&c.expect.as_str()),
"decision {}: bad expect {:?}",
c.id,
c.expect
);
assert_eq!(
c.label == "violation",
c.expect == "deny",
"decision {}: label/expect mismatch (violation iff expect==deny)",
c.id
);
(c.id.clone(), c.label == "violation", decision_pass(c))
})
.collect();
let decision = score("decision", &dec_rows, baseline.decision);
EvalReport {
detection,
decision,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn detection_pass_can_fail() {
let mk = |expect_class: &str, expect_paths: &[&str]| DetectionCase {
id: "x".into(),
cmd: "cat src/main.rs".into(),
label: "violation".into(),
expect_class: expect_class.into(),
expect_paths: expect_paths.iter().map(|s| s.to_string()).collect(),
note: None,
};
assert!(detection_pass(&mk("cat_like", &["src/main.rs"])));
assert!(!detection_pass(&mk("cat_like", &["WRONG.rs"])));
assert!(!detection_pass(&mk("none", &[])));
}
#[test]
fn decision_pass_can_fail() {
let deny_input = DecisionCase {
id: "x".into(),
label: "violation".into(),
rel_path: "src/a.rs".into(),
file_record: Some(serde_json::json!({
"confidence": {"value": 0.9}, "quality": {"value": 0.8},
"staleness": {"value": 0.1, "tier": "fresh"},
"payload": {"gotcha_keys": ["g"]}
})),
gotcha_records: HashMap::from([(
"g".to_string(),
serde_json::json!({
"value": "r", "confidence": {"value": 0.9}, "quality": {"value": 0.8},
"payload": {"confirmed": true}
}),
)]),
already_consulted: false,
expect: "deny".into(),
note: None,
};
assert!(decision_pass(&deny_input), "real deny case must pass");
let mut wrong = deny_input;
wrong.expect = "allow".into();
assert!(
!decision_pass(&wrong),
"a deny scored against expect=allow must fail"
);
}
#[test]
fn corpus_is_well_formed_and_gates() {
let report = run();
assert!(report.detection.cases > 0);
assert!(report.decision.cases > 0);
report
.gate()
.expect("embedded corpus must match its baseline");
}
}