mati_core/
eval.rs

1//! Eval / regression corpus runner (idea 4).
2//!
3//! Replays a labeled corpus through the REAL pure enforcement functions and
4//! scores a confusion matrix per layer:
//!   - **detection** — `classify_command` + `extract_file_paths`: which files a
//!     bash command reads (the read gate's first stage);
7//!   - **decision** — `evaluate()`: what enforcement does given a file/gotcha
8//!     state (Allow / Advisory / Deny / …).
9//!
10//! Ground truth is independent of current behavior; cases the engine currently
11//! mishandles are tracked in `baseline.json`. The gate asserts each layer's
12//! failing set equals its baseline exactly — a new miss is a regression, a
13//! fixed gap forces a baseline update (ratcheting recall up). That makes the
14//! "how do I know it doesn't miss?" number a measured, regression-gated fact.
15//!
16//! The corpus + baseline are embedded at compile time so `mati eval` runs the
17//! identical corpus in a shipped binary. Pure — no store, daemon, or network;
18//! the eval path stays inside mati's zero-network invariant.
19
20use std::collections::{BTreeSet, HashMap};
21
22use serde::{Deserialize, Serialize};
23
24use crate::hooks::decide::{
25    classify_command, evaluate, extract_file_paths, CommandClass, Decision, EnforcementInput,
26};
27
28// ── Embedded corpus (relative to this file, src/eval.rs) ─────────────────────
// Each constant holds the raw JSON text of one fixture file, baked into the
// binary by `include_str!` so no filesystem access is needed at run time.

/// Detection corpus: the `known_good.json` fixture.
const DETECTION_KNOWN_GOOD: &str = include_str!("../tests/fixtures/eval/detection/known_good.json");
/// Detection corpus: the `benign.json` fixture.
const DETECTION_BENIGN: &str = include_str!("../tests/fixtures/eval/detection/benign.json");
/// Detection corpus: the `adversarial.json` fixture.
const DETECTION_ADVERSARIAL: &str =
    include_str!("../tests/fixtures/eval/detection/adversarial.json");
/// Decision corpus: a single JSON array of decision cases.
const DECISION_CASES: &str = include_str!("../tests/fixtures/eval/decision/cases.json");
/// Accepted-gap lists per layer (see `Baseline`).
const BASELINE: &str = include_str!("../tests/fixtures/eval/baseline.json");
35
36// ── Corpus case types ────────────────────────────────────────────────────────
37
/// One labeled detection-layer corpus entry: a shell command together with the
/// ground-truth class and file set that detection is expected to report.
#[derive(Debug, Deserialize)]
struct DetectionCase {
    /// Case identifier. Must be unique within the detection corpus; failing
    /// ids are compared against the `detection` list in `baseline.json`.
    id: String,
    /// The command line handed to `classify_command` / `extract_file_paths`.
    cmd: String,
    /// "violation" = a real file-read enforcement must catch; "benign" = not a
    /// file-read.
    label: String,
    /// Ground-truth class: "cat_like" | "grep_like" | "none".
    expect_class: String,
    /// Ground-truth set of files the gate must check (order-independent). One
    /// entry for a single-file read, several for `cat a.rs b.rs`, empty when no
    /// file is read (benign, or `grep PATTERN` with no file).
    #[serde(default)]
    expect_paths: Vec<String>,
    /// Optional free-form annotation for corpus readers; never read by the
    /// runner (hence the `dead_code` allowance).
    #[serde(default)]
    #[allow(dead_code)]
    note: Option<String>,
}
56
/// One labeled decision-layer corpus entry: the inputs used to build an
/// `EnforcementInput` plus the decision variant `evaluate()` should produce.
#[derive(Debug, Deserialize)]
struct DecisionCase {
    /// Case identifier. Must be unique within the decision corpus; failing ids
    /// are compared against the `decision` list in `baseline.json`.
    id: String,
    /// "violation" = must Deny; "benign" = must NOT Deny.
    label: String,
    /// Copied into `EnforcementInput::rel_path`.
    rel_path: String,
    /// Copied into `EnforcementInput::file_record`; absent in JSON means `None`.
    #[serde(default)]
    file_record: Option<serde_json::Value>,
    /// Copied into `EnforcementInput::gotcha_records`; absent in JSON means an
    /// empty map.
    #[serde(default)]
    gotcha_records: HashMap<String, serde_json::Value>,
    /// Copied into `EnforcementInput::already_consulted`; defaults to `false`.
    #[serde(default)]
    already_consulted: bool,
    /// Ground-truth decision variant: "allow" | "advisory" | "deny" |
    /// "already_consulted" | "liability" | "tombstone" | "no_record" |
    /// "not_file_read".
    expect: String,
    /// Optional free-form annotation for corpus readers; never read by the
    /// runner (hence the `dead_code` allowance).
    #[serde(default)]
    #[allow(dead_code)]
    note: Option<String>,
}
76
/// Parsed `baseline.json`: per layer, the ids of cases that are accepted as
/// currently failing. A missing key deserializes to an empty list.
#[derive(Debug, Deserialize)]
struct Baseline {
    /// Accepted failing case ids for the detection layer.
    #[serde(default)]
    detection: Vec<String>,
    /// Accepted failing case ids for the decision layer.
    #[serde(default)]
    decision: Vec<String>,
}
84
85// ── Report types ─────────────────────────────────────────────────────────────
86
/// Per-layer confusion matrix and baseline comparison.
///
/// "Positive" means a case labeled as a violation; a case "passes" when the
/// engine's output equals the corpus ground truth.
#[derive(Debug, Serialize)]
pub struct LayerReport {
    /// Layer name, e.g. "detection" or "decision".
    pub layer: &'static str,
    /// Total number of corpus cases scored for this layer.
    pub cases: u32,
    /// Violation cases that passed.
    pub tp: u32,
    /// Violation cases that failed. Serialized as `fn` (a Rust keyword, hence
    /// the trailing underscore on the field name).
    #[serde(rename = "fn")]
    pub fn_: u32,
    /// Benign cases that passed.
    pub tn: u32,
    /// Benign cases that failed.
    pub fp: u32,
    /// `tp / (tp + fn)`; 1.0 when there are no violation cases.
    pub recall: f64,
    /// `fp / (fp + tn)`; 0.0 when there are no benign cases.
    pub fp_rate: f64,
    /// `tp / (tp + fp)`; 1.0 when the denominator is zero.
    pub precision: f64,
    /// Case ids whose current output != ground truth.
    pub failing: Vec<String>,
    /// Baseline-accepted gaps for this layer.
    pub known_gaps: Vec<String>,
    /// `failing` − `known_gaps`: new misses. Must be empty for a healthy gate.
    pub regressions: Vec<String>,
    /// `known_gaps` − `failing`: fixed cases that should leave the baseline.
    pub newly_fixed: Vec<String>,
}
109
/// Full result of one corpus run: one `LayerReport` per enforcement layer.
#[derive(Debug, Serialize)]
pub struct EvalReport {
    /// Scores for the command-detection layer.
    pub detection: LayerReport,
    /// Scores for the enforcement-decision layer.
    pub decision: LayerReport,
}
115
116impl EvalReport {
117    /// Ok iff every layer's failing set equals its baseline (no regressions,
118    /// no stale baseline entries). The single source of truth for both the CI
119    /// gate and `mati eval`'s exit code.
120    pub fn gate(&self) -> Result<(), String> {
121        let mut errs = Vec::new();
122        for l in [&self.detection, &self.decision] {
123            if !l.regressions.is_empty() {
124                errs.push(format!(
125                    "[{}] REGRESSION — output now wrong for cases not in baseline: {}\n  \
126                     Fix the regression, or (if intended) add these ids to \
127                     tests/fixtures/eval/baseline.json.",
128                    l.layer,
129                    l.regressions.join(", ")
130                ));
131            }
132            if !l.newly_fixed.is_empty() {
133                errs.push(format!(
134                    "[{}] IMPROVEMENT — baseline gaps now PASS: {}\n  \
135                     Remove them from tests/fixtures/eval/baseline.json so the \
136                     baseline stays honest and recall ratchets up.",
137                    l.layer,
138                    l.newly_fixed.join(", ")
139                ));
140            }
141        }
142        if errs.is_empty() {
143            Ok(())
144        } else {
145            Err(errs.join("\n"))
146        }
147    }
148}
149
150// ── Scoring ──────────────────────────────────────────────────────────────────
151
152fn parse_class(s: &str) -> Option<CommandClass> {
153    match s {
154        "cat_like" => Some(CommandClass::CatLike),
155        "grep_like" => Some(CommandClass::GrepLike),
156        "none" => None,
157        other => panic!("corpus: bad expect_class {other:?}"),
158    }
159}
160
161fn detection_pass(c: &DetectionCase) -> bool {
162    let got_class = classify_command(&c.cmd);
163    if got_class != parse_class(&c.expect_class) {
164        return false;
165    }
166    // The gate checks the SET of files a command reads, so compare
167    // order-independently. `cat a.rs b.rs` must yield {a.rs, b.rs}.
168    let mut got_paths = match got_class {
169        Some(cl) => extract_file_paths(&c.cmd, cl),
170        None => Vec::new(),
171    };
172    let mut want = c.expect_paths.clone();
173    got_paths.sort();
174    want.sort();
175    got_paths == want
176}
177
178/// Stable name for a `Decision` variant (ignores the inner context strings,
179/// which carry human-readable detail that is not part of the contract).
180fn decision_variant(d: &Decision) -> &'static str {
181    match d {
182        Decision::Allow => "allow",
183        Decision::Deny { .. } => "deny",
184        Decision::AlreadyConsulted { .. } => "already_consulted",
185        Decision::Advisory { .. } => "advisory",
186        Decision::Liability { .. } => "liability",
187        Decision::Tombstone => "tombstone",
188        Decision::NoRecord => "no_record",
189        Decision::NotFileRead => "not_file_read",
190    }
191}
192
/// Every string accepted in a decision case's `expect` field. `run` rejects
/// any other value as a malformed corpus entry. The entries are the same names
/// `decision_variant` produces, so the two must be kept in sync by hand.
const DECISION_VARIANTS: &[&str] = &[
    "allow",
    "advisory",
    "deny",
    "already_consulted",
    "liability",
    "tombstone",
    "no_record",
    "not_file_read",
];
203
204fn decision_pass(c: &DecisionCase) -> bool {
205    let input = EnforcementInput {
206        rel_path: c.rel_path.clone(),
207        file_record: c.file_record.clone(),
208        gotcha_records: c.gotcha_records.clone(),
209        already_consulted: c.already_consulted,
210    };
211    decision_variant(&evaluate(&input).decision) == c.expect
212}
213
214/// Build a `LayerReport` from `(id, is_violation, passed)` rows.
215fn score(
216    layer: &'static str,
217    rows: &[(String, bool, bool)],
218    known_gaps: Vec<String>,
219) -> LayerReport {
220    let (mut tp, mut fn_, mut tn, mut fp) = (0u32, 0u32, 0u32, 0u32);
221    let mut failing: BTreeSet<String> = BTreeSet::new();
222    for (id, is_violation, pass) in rows {
223        match (is_violation, pass) {
224            (true, true) => tp += 1,
225            (true, false) => fn_ += 1,
226            (false, true) => tn += 1,
227            (false, false) => fp += 1,
228        }
229        if !pass {
230            failing.insert(id.clone());
231        }
232    }
233    let recall = if tp + fn_ == 0 {
234        1.0
235    } else {
236        tp as f64 / (tp + fn_) as f64
237    };
238    let fp_rate = if fp + tn == 0 {
239        0.0
240    } else {
241        fp as f64 / (fp + tn) as f64
242    };
243    let precision = if tp + fp == 0 {
244        1.0
245    } else {
246        tp as f64 / (tp + fp) as f64
247    };
248    let known: BTreeSet<String> = known_gaps.iter().cloned().collect();
249    let regressions = failing.difference(&known).cloned().collect();
250    let newly_fixed = known.difference(&failing).cloned().collect();
251    LayerReport {
252        layer,
253        cases: rows.len() as u32,
254        tp,
255        fn_,
256        tn,
257        fp,
258        recall,
259        fp_rate,
260        precision,
261        failing: failing.into_iter().collect(),
262        known_gaps,
263        regressions,
264        newly_fixed,
265    }
266}
267
/// Panic if any id occurs more than once in `ids`.
///
/// `layer` only labels the panic message. The iterator is consumed up to and
/// including the first repeated id.
fn assert_unique_ids<'a>(layer: &str, ids: impl Iterator<Item = &'a str>) {
    let mut seen: BTreeSet<&'a str> = BTreeSet::new();
    let mut ids = ids;
    // `insert` returns false when the value was already present.
    if let Some(id) = ids.find(|candidate| !seen.insert(*candidate)) {
        panic!("{} corpus: duplicate case id {:?}", layer, id);
    }
}
274
275/// Run the embedded corpus through the real enforcement functions and score it.
276///
277/// Panics only on a malformed corpus (bad label/expect/class, duplicate id,
278/// or label↔expect inconsistency) — these are compile-embedded fixtures, so a
279/// panic is a developer error caught immediately by the test or `mati eval`.
280pub fn run() -> EvalReport {
281    let baseline: Baseline = serde_json::from_str(BASELINE).expect("parse baseline.json");
282
283    // Detection layer.
284    let mut detection: Vec<DetectionCase> = Vec::new();
285    for raw in [
286        DETECTION_KNOWN_GOOD,
287        DETECTION_BENIGN,
288        DETECTION_ADVERSARIAL,
289    ] {
290        detection.extend(serde_json::from_str::<Vec<DetectionCase>>(raw).expect("parse detection"));
291    }
292    assert_unique_ids("detection", detection.iter().map(|c| c.id.as_str()));
293    let det_rows: Vec<(String, bool, bool)> = detection
294        .iter()
295        .map(|c| {
296            assert!(
297                c.label == "violation" || c.label == "benign",
298                "detection {}: bad label {:?}",
299                c.id,
300                c.label
301            );
302            (c.id.clone(), c.label == "violation", detection_pass(c))
303        })
304        .collect();
305    let detection = score("detection", &det_rows, baseline.detection);
306
307    // Decision layer.
308    let decision: Vec<DecisionCase> =
309        serde_json::from_str(DECISION_CASES).expect("parse decision corpus");
310    assert_unique_ids("decision", decision.iter().map(|c| c.id.as_str()));
311    let dec_rows: Vec<(String, bool, bool)> = decision
312        .iter()
313        .map(|c| {
314            assert!(
315                c.label == "violation" || c.label == "benign",
316                "decision {}: bad label {:?}",
317                c.id,
318                c.label
319            );
320            assert!(
321                DECISION_VARIANTS.contains(&c.expect.as_str()),
322                "decision {}: bad expect {:?}",
323                c.id,
324                c.expect
325            );
326            // The confusion-matrix axis is "must Deny": keep label and expect
327            // consistent so the matrix can't silently mislabel.
328            assert_eq!(
329                c.label == "violation",
330                c.expect == "deny",
331                "decision {}: label/expect mismatch (violation iff expect==deny)",
332                c.id
333            );
334            (c.id.clone(), c.label == "violation", decision_pass(c))
335        })
336        .collect();
337    let decision = score("decision", &dec_rows, baseline.decision);
338
339    EvalReport {
340        detection,
341        decision,
342    }
343}
344
345// ── Tests ────────────────────────────────────────────────────────────────────
346
/// Unit tests for the scorers plus one end-to-end check of the embedded corpus.
#[cfg(test)]
mod tests {
    use super::*;

    /// The detection scorer must be able to return `false`; otherwise a green
    /// corpus run would prove nothing.
    #[test]
    fn detection_pass_can_fail() {
        // Same discipline as the 1.2 grep validation: prove the scorer trips.
        // `mk` builds a case for the fixed command `cat src/main.rs`, varying
        // only the ground truth it is scored against.
        let mk = |expect_class: &str, expect_paths: &[&str]| DetectionCase {
            id: "x".into(),
            cmd: "cat src/main.rs".into(),
            label: "violation".into(),
            expect_class: expect_class.into(),
            expect_paths: expect_paths.iter().map(|s| s.to_string()).collect(),
            note: None,
        };
        // Correct class and path: passes.
        assert!(detection_pass(&mk("cat_like", &["src/main.rs"])));
        // Correct class, wrong path: fails.
        assert!(!detection_pass(&mk("cat_like", &["WRONG.rs"])));
        // Wrong class: fails.
        assert!(!detection_pass(&mk("none", &[])));
    }

    /// The decision scorer must likewise distinguish a matching expectation
    /// from a mismatching one for the same input.
    #[test]
    fn decision_pass_can_fail() {
        // A file record referencing gotcha "g", plus a confirmed gotcha record
        // for "g" — an input the test expects `evaluate()` to deny.
        let deny_input = DecisionCase {
            id: "x".into(),
            label: "violation".into(),
            rel_path: "src/a.rs".into(),
            file_record: Some(serde_json::json!({
                "confidence": {"value": 0.9}, "quality": {"value": 0.8},
                "staleness": {"value": 0.1, "tier": "fresh"},
                "payload": {"gotcha_keys": ["g"]}
            })),
            gotcha_records: HashMap::from([(
                "g".to_string(),
                serde_json::json!({
                    "value": "r", "confidence": {"value": 0.9}, "quality": {"value": 0.8},
                    "payload": {"confirmed": true}
                }),
            )]),
            already_consulted: false,
            expect: "deny".into(),
            note: None,
        };
        assert!(decision_pass(&deny_input), "real deny case must pass");

        // Same input scored against the wrong expectation must not pass.
        let mut wrong = deny_input;
        wrong.expect = "allow".into();
        assert!(
            !decision_pass(&wrong),
            "a deny scored against expect=allow must fail"
        );
    }

    /// End-to-end: the embedded corpus parses, validates, is non-empty in both
    /// layers, and its failing sets equal the committed baseline.
    #[test]
    fn corpus_is_well_formed_and_gates() {
        // Loads + validates the embedded corpus (panics on malformed data) and
        // confirms the committed baseline matches current behavior.
        let report = run();
        assert!(report.detection.cases > 0);
        assert!(report.decision.cases > 0);
        report
            .gate()
            .expect("embedded corpus must match its baseline");
    }
}
mati_core/eval.rs

mati_core/
eval.rs