codetether_agent/session/oracle.rs
1//! Replay-oracle evaluation harness (ClawVM §3).
2//!
3//! ## Role
4//!
5//! ClawVM proposes a replay oracle to separate *policy quality* from
6//! *budget insufficiency*: given a recorded trace and the same budget,
7//! an oracle with bounded future lookahead `h` picks representations
8//! that minimise faults. The online-minus-oracle fault gap then
9//! measures headroom vs. unavoidable workload pressure.
10//!
11//! ## Scope (Phase B step 22)
12//!
13//! This module delivers the evaluation primitive without wiring it
14//! into the live prompt loop (it is intentionally offline). It takes
15//! a recorded `Vec<Message>` plus a horizon `h` and produces a
16//! per-turn demand trace — which prior messages get referenced within
17//! the next `h` turns. That trace is the "ground truth" the Pareto
18//! eval harness can grade any [`DerivePolicy`] against.
19//!
20//! Demand signal today is a conservative lexical heuristic: a turn at
21//! index `t` is said to "need" turn `u` (with `u < t`) when:
22//!
23//! * The body of `t` mentions a file path that first appeared in `u`.
24//! * A `ToolResult` at `t` matches a `ToolCall` id introduced at `u`.
25//!
26//! Both signals are overly conservative — the real oracle can see
27//! model-internal attention — but they are faithful enough to seed
28//! the oracle-gap metric and to motivate future refinement.
29//!
30//! ## Invariants
31//!
32//! * Pure and offline: no provider, RLM, or filesystem IO.
33//! * Deterministic: same input → same output.
34//! * The full message vector is borrowed read-only; no mutation.
35//!
36//! ## Examples
37//!
38//! ```rust
39//! use codetether_agent::provider::{ContentPart, Message, Role};
40//! use codetether_agent::session::oracle::{replay_oracle, OracleReport};
41//!
42//! let msgs = vec![
43//! Message {
44//! role: Role::User,
45//! content: vec![ContentPart::Text { text: "edit src/lib.rs".into() }],
46//! },
47//! Message {
48//! role: Role::Assistant,
49//! content: vec![ContentPart::ToolCall {
50//! id: "call-1".into(),
51//! name: "Shell".into(),
52//! arguments: "{}".into(),
53//! thought_signature: None,
54//! }],
55//! },
56//! Message {
57//! role: Role::Tool,
58//! content: vec![ContentPart::ToolResult {
59//! tool_call_id: "call-1".into(),
60//! content: "ok".into(),
61//! }],
62//! },
63//! ];
64//! let report: OracleReport = replay_oracle(&msgs, 2);
65//! assert_eq!(report.demand.len(), msgs.len());
66//! ```
67
68use std::collections::HashMap;
69
70use crate::provider::{ContentPart, Message};
71use crate::session::relevance::extract;
72
/// Summary of an oracle replay over a recorded trace.
///
/// Equality is derived so two reports can be compared directly, e.g. to
/// check that replaying the same trace twice yields the same demand.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OracleReport {
    /// One entry per turn `t` of the trace: the sorted list of prior turn
    /// indices `u < t` that are referenced by any turn inside the
    /// lookahead window starting at `t` (turn `t` itself included).
    pub demand: Vec<Vec<usize>>,
    /// Horizon (lookahead, in turns) the oracle used.
    pub horizon: usize,
}

impl OracleReport {
    /// Number of `(t, u)` reference edges recorded in [`Self::demand`].
    ///
    /// This is the sum of the lengths of all per-turn lists, so the edges
    /// are distinct as long as each list holds no duplicates.
    ///
    /// Useful as a sanity check: zero edges on a long trace usually
    /// means the heuristic wasn't broad enough, not that the agent was
    /// uninteresting.
    pub fn reference_count(&self) -> usize {
        self.demand.iter().map(Vec::len).sum()
    }
}
94
95/// Compute an [`OracleReport`] over `messages` with horizon `h`.
96///
97/// Given the demand signal (lexical file references plus tool-call id
98/// matches) the report records, for every turn `t`, the set of prior
99/// turns needed to satisfy demand from `[t, t+h]`. Pareto harnesses
100/// compare that set to what a [`DerivePolicy`] actually kept.
101pub fn replay_oracle(messages: &[Message], h: usize) -> OracleReport {
102 let files_per_turn = index_files_per_turn(messages);
103 let tool_call_owner = index_tool_call_owners(messages);
104
105 let mut demand: Vec<Vec<usize>> = vec![Vec::new(); messages.len()];
106
107 for t in 0..messages.len() {
108 let window_end = (t + h).min(messages.len().saturating_sub(1));
109 for future_t in t..=window_end {
110 let future = &messages[future_t];
111 for u in references_for_turn(future, t, &files_per_turn, &tool_call_owner) {
112 if u < t && !demand[t].contains(&u) {
113 demand[t].push(u);
114 }
115 }
116 }
117 demand[t].sort_unstable();
118 }
119
120 OracleReport { demand, horizon: h }
121}
122
123/// Index files referenced by each turn so later turns can fast-lookup
124/// "first turn this path appeared in".
125fn index_files_per_turn(messages: &[Message]) -> Vec<Vec<String>> {
126 messages.iter().map(|m| extract(m).files).collect()
127}
128
129/// Map tool_call_id → owning turn index, so a later `ToolResult` can
130/// locate the `ToolCall` that introduced it.
131fn index_tool_call_owners(messages: &[Message]) -> HashMap<String, usize> {
132 let mut owners = HashMap::new();
133 for (idx, msg) in messages.iter().enumerate() {
134 for part in &msg.content {
135 if let ContentPart::ToolCall { id, .. } = part {
136 owners.insert(id.clone(), idx);
137 }
138 }
139 }
140 owners
141}
142
143/// Compute which prior turns the `future` message references.
144fn references_for_turn(
145 future: &Message,
146 current_idx: usize,
147 files_per_turn: &[Vec<String>],
148 tool_call_owner: &HashMap<String, usize>,
149) -> Vec<usize> {
150 let mut out = Vec::new();
151 for part in &future.content {
152 match part {
153 ContentPart::ToolResult { tool_call_id, .. } => {
154 if let Some(&owner) = tool_call_owner.get(tool_call_id) {
155 out.push(owner);
156 }
157 }
158 ContentPart::Text { text } => {
159 let file_refs = extract_text_file_tokens(text);
160 for file in file_refs {
161 for (u, files) in files_per_turn.iter().enumerate().take(current_idx) {
162 if files.iter().any(|f| f == &file) {
163 out.push(u);
164 break;
165 }
166 }
167 }
168 }
169 _ => {}
170 }
171 }
172 out
173}
174
175/// Re-use the relevance extractor on a synthetic single-text message
176/// to get the file tokens. Duplicated logic is cheaper than exposing
177/// the internal token-splitter.
178fn extract_text_file_tokens(text: &str) -> Vec<String> {
179 use crate::provider::Role;
180 let synthetic = Message {
181 role: Role::User,
182 content: vec![ContentPart::Text {
183 text: text.to_string(),
184 }],
185 };
186 extract(&synthetic).files
187}
188
#[cfg(test)]
mod tests {
    use super::*;
    use crate::provider::{ContentPart, Message, Role};

    /// Build a message with a single text part.
    fn text(role: Role, s: &str) -> Message {
        Message {
            role,
            content: vec![ContentPart::Text {
                text: s.to_string(),
            }],
        }
    }

    /// Build an assistant message holding one tool call with empty arguments.
    fn tool_call(id: &str, name: &str) -> Message {
        Message {
            role: Role::Assistant,
            content: vec![ContentPart::ToolCall {
                id: id.to_string(),
                name: name.to_string(),
                arguments: "{}".to_string(),
                thought_signature: None,
            }],
        }
    }

    /// Build a tool message answering the call with the given id.
    fn tool_result(id: &str, body: &str) -> Message {
        Message {
            role: Role::Tool,
            content: vec![ContentPart::ToolResult {
                tool_call_id: id.to_string(),
                content: body.to_string(),
            }],
        }
    }

    #[test]
    fn oracle_binds_tool_results_to_their_call_owners() {
        let msgs = vec![
            text(Role::User, "do the thing"),
            tool_call("call-1", "Shell"),
            tool_result("call-1", "ok"),
        ];
        let report = replay_oracle(&msgs, 5);
        // Turn 2 (ToolResult) should reference turn 1 (its ToolCall).
        assert!(report.demand[2].contains(&1));
        assert!(report.reference_count() >= 1);
    }

    #[test]
    fn oracle_tracks_file_references_across_turns() {
        let msgs = vec![
            text(Role::User, "edit src/lib.rs"),
            text(Role::Assistant, "ok"),
            text(Role::User, "now open src/lib.rs again"),
        ];
        let report = replay_oracle(&msgs, 5);
        // Turn 2 re-mentions the file first seen at turn 0.
        assert!(report.demand[2].contains(&0));
    }

    #[test]
    fn oracle_respects_horizon_bound() {
        let msgs = vec![
            text(Role::User, "edit src/lib.rs"),
            text(Role::Assistant, "noop"),
            text(Role::Assistant, "noop"),
            text(Role::User, "reopen src/lib.rs"),
        ];
        let short = replay_oracle(&msgs, 1);
        let long = replay_oracle(&msgs, 10);

        // Turn 1's one-turn window covers turns 1..=2 only, so the
        // re-mention at turn 3 is out of reach.
        assert!(short.demand[1].is_empty());
        // With the longer horizon turn 1 can see turn 3, which points
        // back at turn 0.
        assert!(long.demand[1].contains(&0));
        // A monotonic `>=` alone would also pass if the horizon were
        // ignored; the longer horizon must observe strictly more edges.
        assert!(long.reference_count() > short.reference_count());
    }

    #[test]
    fn report_over_empty_trace_is_empty() {
        let report = replay_oracle(&[], 4);
        assert!(report.demand.is_empty());
        assert_eq!(report.horizon, 4);
        assert_eq!(report.reference_count(), 0);
    }
}
271}