mati 0.1.2

An enforcement layer for codebase knowledge: confirmed gotchas gate what AI agents read and edit at the hook level. Not a passive memory store.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
//! Eval / regression corpus runner (idea 4).
//!
//! Replays a labeled corpus through the REAL pure enforcement functions and
//! scores a confusion matrix per layer:
//!   - **detection** — `classify_command` + `extract_file_paths`: which files a
//!     bash command reads (the read gate's first stage);
//!   - **decision** — `evaluate()`: what enforcement does given a file/gotcha
//!     state (Allow / Advisory / Deny / …).
//!
//! Ground truth is independent of current behavior; cases the engine currently
//! mishandles are tracked in `baseline.json`. The gate asserts each layer's
//! failing set equals its baseline exactly — a new miss is a regression, a
//! fixed gap forces a baseline update (ratcheting recall up). That makes the
//! "how do I know it doesn't miss?" number a measured, regression-gated fact.
//!
//! The corpus + baseline are embedded at compile time so `mati eval` runs the
//! identical corpus in a shipped binary. Pure — no store, daemon, or network;
//! the eval path stays inside mati's zero-network invariant.

use std::collections::{BTreeSet, HashMap};

use serde::{Deserialize, Serialize};

use crate::hooks::decide::{
    classify_command, evaluate, extract_file_paths, CommandClass, Decision, EnforcementInput,
};

// ── Embedded corpus (relative to this file, src/eval.rs) ─────────────────────
// Each constant holds the raw text of one fixture, pulled in by `include_str!`
// at compile time. `run` parses them with `serde_json`.
/// Detection fixture; `run` parses it as a JSON array of `DetectionCase`.
const DETECTION_KNOWN_GOOD: &str = include_str!("../tests/fixtures/eval/detection/known_good.json");
/// Detection fixture; `run` parses it as a JSON array of `DetectionCase`.
const DETECTION_BENIGN: &str = include_str!("../tests/fixtures/eval/detection/benign.json");
/// Detection fixture; `run` parses it as a JSON array of `DetectionCase`.
const DETECTION_ADVERSARIAL: &str =
    include_str!("../tests/fixtures/eval/detection/adversarial.json");
/// Decision fixture; `run` parses it as a JSON array of `DecisionCase`.
const DECISION_CASES: &str = include_str!("../tests/fixtures/eval/decision/cases.json");
/// Accepted-gap list; `run` parses it as a `Baseline`.
const BASELINE: &str = include_str!("../tests/fixtures/eval/baseline.json");

// ── Corpus case types ────────────────────────────────────────────────────────

/// One labeled command from the detection corpus, deserialized from JSON.
#[derive(Debug, Deserialize)]
struct DetectionCase {
    /// Case identifier; `run` asserts ids are unique across the detection layer.
    id: String,
    /// The command string handed to `classify_command` / `extract_file_paths`.
    cmd: String,
    /// "violation" = a real file-read enforcement must catch; "benign" = not a
    /// file-read.
    label: String,
    /// Ground-truth class: "cat_like" | "grep_like" | "none".
    expect_class: String,
    /// Ground-truth set of files the gate must check (order-independent). One
    /// entry for a single-file read, several for `cat a.rs b.rs`, empty when no
    /// file is read (benign, or `grep PATTERN` with no file).
    #[serde(default)]
    expect_paths: Vec<String>,
    /// Optional free-form annotation from the fixture.
    #[serde(default)]
    // Deserialized so fixtures may carry it, but the scoring code in this
    // module never reads it — hence the lint suppression.
    #[allow(dead_code)]
    note: Option<String>,
}

/// One labeled enforcement scenario from the decision corpus, deserialized
/// from JSON. `decision_pass` copies the state fields into an
/// `EnforcementInput` and compares the resulting decision with `expect`.
#[derive(Debug, Deserialize)]
struct DecisionCase {
    /// Case identifier; `run` asserts ids are unique across the decision layer.
    id: String,
    /// "violation" = must Deny; "benign" = must NOT Deny.
    label: String,
    /// Forwarded unchanged as `EnforcementInput::rel_path`.
    rel_path: String,
    /// Forwarded unchanged as `EnforcementInput::file_record`; absent → `None`.
    #[serde(default)]
    file_record: Option<serde_json::Value>,
    /// Forwarded unchanged as `EnforcementInput::gotcha_records`; absent →
    /// empty map.
    #[serde(default)]
    gotcha_records: HashMap<String, serde_json::Value>,
    /// Forwarded unchanged as `EnforcementInput::already_consulted`; absent →
    /// `false`.
    #[serde(default)]
    already_consulted: bool,
    /// Ground-truth decision variant: "allow" | "advisory" | "deny" |
    /// "already_consulted" | "liability" | "tombstone" | "no_record" |
    /// "not_file_read" (every name listed in `DECISION_VARIANTS` is accepted).
    expect: String,
    /// Optional free-form annotation from the fixture.
    #[serde(default)]
    // Deserialized so fixtures may carry it, but the scoring code in this
    // module never reads it — hence the lint suppression.
    #[allow(dead_code)]
    note: Option<String>,
}

/// Accepted-gap lists parsed from `baseline.json`, one list of case ids per
/// layer. A missing key deserializes to an empty list.
#[derive(Debug, Deserialize)]
struct Baseline {
    /// Detection case ids passed to `score` as that layer's known gaps.
    #[serde(default)]
    detection: Vec<String>,
    /// Decision case ids passed to `score` as that layer's known gaps.
    #[serde(default)]
    decision: Vec<String>,
}

// ── Report types ─────────────────────────────────────────────────────────────

/// Per-layer confusion matrix and baseline comparison.
///
/// Built by `score`. "Passed" below means the case's current output matched
/// its ground truth.
#[derive(Debug, Serialize)]
pub struct LayerReport {
    /// Layer name, e.g. "detection" or "decision".
    pub layer: &'static str,
    /// Number of corpus rows scored for this layer.
    pub cases: u32,
    /// Violation cases that passed.
    pub tp: u32,
    /// Violation cases that failed. Named `fn_` because `fn` is a keyword;
    /// serialized as "fn".
    #[serde(rename = "fn")]
    pub fn_: u32,
    /// Benign cases that passed.
    pub tn: u32,
    /// Benign cases that failed.
    pub fp: u32,
    /// `tp / (tp + fn)`; reported as 1.0 when there are no violation cases.
    pub recall: f64,
    /// `fp / (fp + tn)`; reported as 0.0 when there are no benign cases.
    pub fp_rate: f64,
    /// `tp / (tp + fp)`; reported as 1.0 when `tp + fp` is zero.
    pub precision: f64,
    /// Case ids whose current output != ground truth.
    pub failing: Vec<String>,
    /// Baseline-accepted gaps for this layer.
    pub known_gaps: Vec<String>,
    /// `failing` − `known_gaps`: new misses. Must be empty for a healthy gate.
    pub regressions: Vec<String>,
    /// `known_gaps` − `failing`: fixed cases that should leave the baseline.
    pub newly_fixed: Vec<String>,
}

/// Full evaluation result: one `LayerReport` per scored layer, as returned by
/// `run`.
#[derive(Debug, Serialize)]
pub struct EvalReport {
    /// Scores for the command-detection layer.
    pub detection: LayerReport,
    /// Scores for the enforcement-decision layer.
    pub decision: LayerReport,
}

impl EvalReport {
    /// Checks that each layer's failing set matches its baseline exactly.
    ///
    /// Layers are inspected in the order detection, decision. For each layer a
    /// non-empty `regressions` list and a non-empty `newly_fixed` list each
    /// contribute one message.
    ///
    /// # Errors
    ///
    /// Returns every collected message joined by newlines when any layer has
    /// regressions or stale baseline entries. This result is intended to drive
    /// both the CI gate and `mati eval`'s exit code.
    pub fn gate(&self) -> Result<(), String> {
        let mut problems: Vec<String> = Vec::new();
        for report in [&self.detection, &self.decision] {
            // Cases that fail now but are not accepted by the baseline.
            let regression = (!report.regressions.is_empty()).then(|| {
                format!(
                    "[{}] REGRESSION — output now wrong for cases not in baseline: {}\n  \
                     Fix the regression, or (if intended) add these ids to \
                     tests/fixtures/eval/baseline.json.",
                    report.layer,
                    report.regressions.join(", ")
                )
            });
            // Baseline entries that no longer fail and must be removed.
            let improvement = (!report.newly_fixed.is_empty()).then(|| {
                format!(
                    "[{}] IMPROVEMENT — baseline gaps now PASS: {}\n  \
                     Remove them from tests/fixtures/eval/baseline.json so the \
                     baseline stays honest and recall ratchets up.",
                    report.layer,
                    report.newly_fixed.join(", ")
                )
            });
            problems.extend(regression.into_iter().chain(improvement));
        }
        if !problems.is_empty() {
            return Err(problems.join("\n"));
        }
        Ok(())
    }
}

// ── Scoring ──────────────────────────────────────────────────────────────────

/// Translates a corpus `expect_class` string into the class the detector is
/// expected to report: `"none"` means no class at all.
///
/// # Panics
///
/// Panics on any string other than "cat_like", "grep_like" or "none"; that
/// indicates a malformed fixture.
fn parse_class(s: &str) -> Option<CommandClass> {
    if s == "none" {
        return None;
    }
    let class = match s {
        "cat_like" => CommandClass::CatLike,
        "grep_like" => CommandClass::GrepLike,
        unknown => panic!("corpus: bad expect_class {unknown:?}"),
    };
    Some(class)
}

/// Returns `true` when the detector's output for `c.cmd` matches the case's
/// ground truth: the same command class and the same files, in any order.
fn detection_pass(c: &DetectionCase) -> bool {
    let detected = classify_command(&c.cmd);
    if detected != parse_class(&c.expect_class) {
        return false;
    }
    // No detected class means no paths are extracted at all.
    let mut actual = detected
        .map(|class| extract_file_paths(&c.cmd, class))
        .unwrap_or_default();
    actual.sort();
    // Sort both sides so that `cat a.rs b.rs` matches {a.rs, b.rs} regardless
    // of the order in which paths were extracted or listed in the fixture.
    let mut expected = c.expect_paths.clone();
    expected.sort();
    actual == expected
}

/// Maps a `Decision` to the fixed lowercase name used in the corpus `expect`
/// field. Only the variant matters; any payload (human-readable context that
/// is not part of the contract) is ignored.
fn decision_variant(d: &Decision) -> &'static str {
    match d {
        // Payload-free variants.
        Decision::Allow => "allow",
        Decision::Tombstone => "tombstone",
        Decision::NoRecord => "no_record",
        Decision::NotFileRead => "not_file_read",
        // Variants carrying context that is deliberately not inspected.
        Decision::Advisory { .. } => "advisory",
        Decision::Deny { .. } => "deny",
        Decision::AlreadyConsulted { .. } => "already_consulted",
        Decision::Liability { .. } => "liability",
    }
}

/// Every name `decision_variant` can return. `run` uses this list to reject
/// decision cases whose `expect` is not a known variant name, so it must be
/// kept in sync with `decision_variant` by hand.
const DECISION_VARIANTS: &[&str] = &[
    "allow",
    "advisory",
    "deny",
    "already_consulted",
    "liability",
    "tombstone",
    "no_record",
    "not_file_read",
];

/// Returns `true` when `evaluate()` on the case's state yields the decision
/// variant named by `c.expect`.
fn decision_pass(c: &DecisionCase) -> bool {
    // The case only lends its data; `EnforcementInput` wants owned values.
    let input = EnforcementInput {
        rel_path: c.rel_path.clone(),
        file_record: c.file_record.clone(),
        gotcha_records: c.gotcha_records.clone(),
        already_consulted: c.already_consulted,
    };
    let outcome = evaluate(&input);
    let actual = decision_variant(&outcome.decision);
    c.expect == actual
}

/// Tallies `(id, is_violation, passed)` rows into a `LayerReport`.
///
/// Each row lands in exactly one confusion-matrix cell: violation/passed →
/// `tp`, violation/failed → `fn`, benign/passed → `tn`, benign/failed → `fp`.
/// Ids of failed rows form the failing set, which is compared with
/// `known_gaps` to derive `regressions` (failing but not accepted) and
/// `newly_fixed` (accepted but no longer failing). Those id lists are sorted,
/// since they are drawn from ordered sets.
fn score(
    layer: &'static str,
    rows: &[(String, bool, bool)],
    known_gaps: Vec<String>,
) -> LayerReport {
    // Quotient with an explicit value for the empty-denominator case.
    let ratio = |numerator: u32, denominator: u32, when_empty: f64| -> f64 {
        if denominator == 0 {
            when_empty
        } else {
            numerator as f64 / denominator as f64
        }
    };

    let mut true_pos = 0u32;
    let mut false_neg = 0u32;
    let mut true_neg = 0u32;
    let mut false_pos = 0u32;
    let mut failing: BTreeSet<String> = BTreeSet::new();
    for (id, is_violation, passed) in rows {
        let cell = match (*is_violation, *passed) {
            (true, true) => &mut true_pos,
            (true, false) => &mut false_neg,
            (false, true) => &mut true_neg,
            (false, false) => &mut false_pos,
        };
        *cell += 1;
        if !*passed {
            failing.insert(id.clone());
        }
    }

    let accepted: BTreeSet<String> = known_gaps.iter().cloned().collect();
    let regressions: Vec<String> = failing.difference(&accepted).cloned().collect();
    let newly_fixed: Vec<String> = accepted.difference(&failing).cloned().collect();

    LayerReport {
        layer,
        cases: rows.len() as u32,
        tp: true_pos,
        fn_: false_neg,
        tn: true_neg,
        fp: false_pos,
        // With no violation cases nothing was missed, so recall is 1.0.
        recall: ratio(true_pos, true_pos + false_neg, 1.0),
        // With no benign cases nothing was wrongly flagged, so the rate is 0.0.
        fp_rate: ratio(false_pos, false_pos + true_neg, 0.0),
        precision: ratio(true_pos, true_pos + false_pos, 1.0),
        failing: failing.into_iter().collect(),
        known_gaps,
        regressions,
        newly_fixed,
    }
}

/// Verifies that no case id occurs twice within one layer's corpus.
///
/// # Panics
///
/// Panics on the first repeated id, naming the layer and the offending id.
fn assert_unique_ids<'a>(layer: &str, ids: impl Iterator<Item = &'a str>) {
    let mut seen_ids: BTreeSet<&str> = BTreeSet::new();
    ids.for_each(|candidate| {
        // `insert` returns false when the id was already recorded.
        if !seen_ids.insert(candidate) {
            panic!("{layer} corpus: duplicate case id {candidate:?}");
        }
    });
}

/// Run the embedded corpus through the real enforcement functions and score it.
///
/// Panics only on a malformed corpus (bad label/expect/class, duplicate id,
/// or label↔expect inconsistency) — these are compile-embedded fixtures, so a
/// panic is a developer error caught immediately by the test or `mati eval`.
pub fn run() -> EvalReport {
    let baseline: Baseline = serde_json::from_str(BASELINE).expect("parse baseline.json");

    // Detection layer.
    let mut detection: Vec<DetectionCase> = Vec::new();
    for raw in [
        DETECTION_KNOWN_GOOD,
        DETECTION_BENIGN,
        DETECTION_ADVERSARIAL,
    ] {
        detection.extend(serde_json::from_str::<Vec<DetectionCase>>(raw).expect("parse detection"));
    }
    assert_unique_ids("detection", detection.iter().map(|c| c.id.as_str()));
    let det_rows: Vec<(String, bool, bool)> = detection
        .iter()
        .map(|c| {
            assert!(
                c.label == "violation" || c.label == "benign",
                "detection {}: bad label {:?}",
                c.id,
                c.label
            );
            (c.id.clone(), c.label == "violation", detection_pass(c))
        })
        .collect();
    let detection = score("detection", &det_rows, baseline.detection);

    // Decision layer.
    let decision: Vec<DecisionCase> =
        serde_json::from_str(DECISION_CASES).expect("parse decision corpus");
    assert_unique_ids("decision", decision.iter().map(|c| c.id.as_str()));
    let dec_rows: Vec<(String, bool, bool)> = decision
        .iter()
        .map(|c| {
            assert!(
                c.label == "violation" || c.label == "benign",
                "decision {}: bad label {:?}",
                c.id,
                c.label
            );
            assert!(
                DECISION_VARIANTS.contains(&c.expect.as_str()),
                "decision {}: bad expect {:?}",
                c.id,
                c.expect
            );
            // The confusion-matrix axis is "must Deny": keep label and expect
            // consistent so the matrix can't silently mislabel.
            assert_eq!(
                c.label == "violation",
                c.expect == "deny",
                "decision {}: label/expect mismatch (violation iff expect==deny)",
                c.id
            );
            (c.id.clone(), c.label == "violation", decision_pass(c))
        })
        .collect();
    let decision = score("decision", &dec_rows, baseline.decision);

    EvalReport {
        detection,
        decision,
    }
}

// ── Tests ────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// Detection case over the fixed command `cat src/main.rs`, with the
    /// caller choosing the ground truth to score against.
    fn cat_main_case(expect_class: &str, expect_paths: &[&str]) -> DetectionCase {
        DetectionCase {
            id: "x".into(),
            cmd: "cat src/main.rs".into(),
            label: "violation".into(),
            expect_class: expect_class.into(),
            expect_paths: expect_paths.iter().map(|p| (*p).to_string()).collect(),
            note: None,
        }
    }

    /// Decision case for a file with one confirmed gotcha, with the caller
    /// choosing the expected variant to score against.
    fn confirmed_gotcha_case(expect: &str) -> DecisionCase {
        let file_record = serde_json::json!({
            "confidence": {"value": 0.9}, "quality": {"value": 0.8},
            "staleness": {"value": 0.1, "tier": "fresh"},
            "payload": {"gotcha_keys": ["g"]}
        });
        let gotcha = serde_json::json!({
            "value": "r", "confidence": {"value": 0.9}, "quality": {"value": 0.8},
            "payload": {"confirmed": true}
        });
        DecisionCase {
            id: "x".into(),
            label: "violation".into(),
            rel_path: "src/a.rs".into(),
            file_record: Some(file_record),
            gotcha_records: HashMap::from([("g".to_string(), gotcha)]),
            already_consulted: false,
            expect: expect.into(),
            note: None,
        }
    }

    #[test]
    fn detection_pass_can_fail() {
        // Prove the scorer can trip: correct ground truth passes, a wrong
        // path or a wrong class fails.
        assert!(detection_pass(&cat_main_case("cat_like", &["src/main.rs"])));
        assert!(!detection_pass(&cat_main_case("cat_like", &["WRONG.rs"])));
        assert!(!detection_pass(&cat_main_case("none", &[])));
    }

    #[test]
    fn decision_pass_can_fail() {
        let matching = confirmed_gotcha_case("deny");
        assert!(decision_pass(&matching), "real deny case must pass");

        // Same state, but scored against the wrong expected variant.
        let mismatched = confirmed_gotcha_case("allow");
        assert!(
            !decision_pass(&mismatched),
            "a deny scored against expect=allow must fail"
        );
    }

    #[test]
    fn corpus_is_well_formed_and_gates() {
        // `run` loads and validates the embedded corpus (panicking on
        // malformed data); the gate then confirms the committed baseline
        // matches current behavior.
        let report = run();
        assert!(report.detection.cases > 0);
        assert!(report.decision.cases > 0);
        let verdict = report.gate();
        verdict.expect("embedded corpus must match its baseline");
    }
}