1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
//! B0 M11 — eval-harness output schema.
//!
//! Per-test JSONL records emitted by the Python harness in
//! `scripts/eval/` and consumed by the Rust report aggregator
//! (`mur agent eval report`, M11.4). Stable on-disk shape — bumping
//! field semantics requires a new `EvalRecord.schema_version`.
//!
//! Spec: `docs/superpowers/specs/2026-05-06-b0-m11-eval-harness-design.md` §6.
use serde::{Deserialize, Serialize};
/// Schema version of the JSONL records this build emits / consumes.
/// Increment when the JSONL contract changes; the report aggregator
/// rejects records with a version it doesn't recognise.
pub const EVAL_SCHEMA_VERSION: u32 = 1;
/// Outcome the agent took in response to the attack — independent of
/// whether that outcome was the "right" one (which is up to the test
/// case's `expected` field).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EvalDecision {
/// Agent refused the attacker's instruction outright (a B0 hook
/// returned `Decision::Deny` or the model declined to proceed).
Refuse,
/// Agent did the user's intended thing without falling for the
/// embedded directive — the safe-and-correct outcome on most
/// indirect-injection cases.
ComplySafe,
/// Agent followed the attacker's directive — TEST FAIL on every
/// indirect-injection / jailbreak / agentic-misuse case.
ComplyUnsafe,
}
/// Which upstream benchmark the case came from. Determines how the
/// aggregator buckets results in its markdown report.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EvalSuite {
/// Princeton/ETH/Anthropic AgentDojo (Apache-2.0, NeurIPS 2024).
/// Indirect-prompt-injection focus.
Agentdojo,
/// CAIS HarmBench (CC-BY-4.0, ICML 2024). Jailbreak + agentic-
/// misuse focus.
Harmbench,
/// UIUC/Kang-Lab InjecAgent (MIT). Tool-poisoning focus:
/// 200 cases where injected tool outputs redirect the agent.
#[serde(rename = "injecagent")]
InjecAgent,
}
/// Which model backend produced the agent response. The mock backend
/// is the CI-track stub; everything else is a real-LLM release-track
/// run.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EvalLlmBackend {
/// Deterministic mock (`scripts/eval/mock_llm.py` — lands in M11.1).
/// Returns canonical `refuse` / `comply_safe` / `comply_unsafe`
/// per attack class. Fast, free, runs every PR.
Stub,
/// Anthropic API. Use for the canonical pass/fail vs. spec
/// thresholds. Default model: claude-sonnet-4-6.
Anthropic,
/// OpenAI API.
Openai,
/// Local Ollama. Useful for gating without an API key but
/// not the canonical baseline.
Ollama,
/// Llama-Guard-3-8B running locally via Ollama. Used as the
/// nightly judge: grades each response as safe / unsafe.
LlamaGuard,
}
/// One observation from the B0 hook chain during the test run.
/// Captured in chronological order so a later regression diagnosis
/// can replay the protection logic.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EvalHookDecision {
/// Hook name + phase, e.g. `"B0SafetyHook.on_prompt_submit"`.
pub hook: String,
/// Decision returned, e.g. `"wrap_untrusted"`, `"AskUser"`,
/// `"Deny"`, `"Allow"`.
pub decision: String,
/// B0 rule number this firing maps to. None for hook decisions
/// outside the rule numbering (e.g. provenance ledger writes).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rule: Option<u32>,
}
/// One test case's result — written as a single JSONL line by the
/// Python harness per case, parsed by `mur agent eval report` to
/// build the markdown summary.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EvalRecord {
/// Wire-version of this struct; mismatches abort report
/// generation rather than producing silently-wrong aggregates.
#[serde(default = "default_schema_version")]
pub schema_version: u32,
pub test_suite: EvalSuite,
/// Stable identifier — `<suite>:<env>:<id>` for AgentDojo,
/// `<suite>:<behavior_id>` for HarmBench.
pub test_id: String,
/// Free-form upstream tag, e.g. `"data_exfil"`,
/// `"prompt_injection"`, `"agentic_misuse"`. Used to bucket
/// the markdown report by category.
pub attack_category: String,
pub agent_decision: EvalDecision,
pub expected: EvalDecision,
pub passed: bool,
/// B0 hook chain trace for this test, in order. Empty if the
/// Python harness ran in fast-only mode (no hook capture).
#[serde(default)]
pub hook_decisions: Vec<EvalHookDecision>,
/// LLM token usage. None for the stub backend (no real tokens).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub tokens_input: Option<u64>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub tokens_output: Option<u64>,
pub wall_clock_ms: u64,
pub llm_backend: EvalLlmBackend,
/// Free-form model identifier — `"claude-sonnet-4-6"`,
/// `"stub"`, `"llama3.2:3b"`, etc.
pub llm_model: String,
/// Run-id this record belongs to; aggregator groups by this.
/// Format: ULID so records sort by time.
pub run_id: String,
/// RFC3339 timestamp of when this case finished.
pub timestamp: String,
}
fn default_schema_version() -> u32 {
EVAL_SCHEMA_VERSION
}
#[cfg(test)]
mod tests {
use super::*;
/// JSONL contract: every record round-trips through serde_json
/// without losing any field. Catches accidental schema drift.
#[test]
fn record_roundtrips_full_fields() {
let r = EvalRecord {
schema_version: EVAL_SCHEMA_VERSION,
test_suite: EvalSuite::Agentdojo,
test_id: "agentdojo:slack:42".into(),
attack_category: "data_exfil".into(),
agent_decision: EvalDecision::Refuse,
expected: EvalDecision::Refuse,
passed: true,
hook_decisions: vec![
EvalHookDecision {
hook: "B0SafetyHook.on_prompt_submit".into(),
decision: "wrap_untrusted".into(),
rule: Some(3),
},
EvalHookDecision {
hook: "B0SafetyHook.pre_tool_use".into(),
decision: "AskUser".into(),
rule: Some(4),
},
],
tokens_input: Some(9821),
tokens_output: Some(184),
wall_clock_ms: 1240,
llm_backend: EvalLlmBackend::Anthropic,
llm_model: "claude-sonnet-4-6".into(),
run_id: "01HF8K0M5ZQEJ8C7XV6NQAYWZP".into(),
timestamp: "2026-05-06T08:15:32.123Z".into(),
};
let json = serde_json::to_string(&r).unwrap();
let back: EvalRecord = serde_json::from_str(&json).unwrap();
assert_eq!(back, r);
}
/// Stub-backend records may omit `tokens_input` / `tokens_output`
/// (no real LLM call → no tokens). The schema must accept the
/// absent form on the way in.
#[test]
fn stub_record_without_tokens_roundtrips() {
let yaml = r#"{
"test_suite": "harmbench",
"test_id": "harmbench:b042",
"attack_category": "direct_jailbreak",
"agent_decision": "refuse",
"expected": "refuse",
"passed": true,
"wall_clock_ms": 38,
"llm_backend": "stub",
"llm_model": "stub",
"run_id": "01HF8K0M5ZQEJ8C7XV6NQAYWZP",
"timestamp": "2026-05-06T08:15:32.123Z"
}"#;
let r: EvalRecord = serde_json::from_str(yaml).unwrap();
assert_eq!(r.schema_version, EVAL_SCHEMA_VERSION); // applied default
assert_eq!(r.tokens_input, None);
assert_eq!(r.tokens_output, None);
assert!(r.hook_decisions.is_empty());
}
/// Decision serialization is snake_case so the Python harness
/// can emit canonical strings without a Rust import.
#[test]
fn decision_strings_are_snake_case() {
let r = serde_json::to_string(&EvalDecision::ComplyUnsafe).unwrap();
assert_eq!(r, "\"comply_unsafe\"");
let r = serde_json::to_string(&EvalSuite::Agentdojo).unwrap();
assert_eq!(r, "\"agentdojo\"");
let r = serde_json::to_string(&EvalLlmBackend::Anthropic).unwrap();
assert_eq!(r, "\"anthropic\"");
}
/// Schema-version mismatch must be detectable without panic so
/// the report aggregator can refuse to produce a misleading
/// summary on a future-format JSONL.
#[test]
fn schema_version_constant_is_one() {
assert_eq!(EVAL_SCHEMA_VERSION, 1);
}
#[test]
fn injecagent_suite_roundtrips() {
let s = serde_json::to_string(&EvalSuite::InjecAgent).unwrap();
assert_eq!(s, "\"injecagent\"");
let back: EvalSuite = serde_json::from_str(&s).unwrap();
assert_eq!(back, EvalSuite::InjecAgent);
}
#[test]
fn llama_guard_backend_roundtrips() {
let s = serde_json::to_string(&EvalLlmBackend::LlamaGuard).unwrap();
assert_eq!(s, "\"llama_guard\"");
let back: EvalLlmBackend = serde_json::from_str(&s).unwrap();
assert_eq!(back, EvalLlmBackend::LlamaGuard);
}
}