Skip to main content

codetether_agent/session/
oracle.rs

1//! Replay-oracle evaluation harness (ClawVM §3).
2//!
3//! ## Role
4//!
5//! ClawVM proposes a replay oracle to separate *policy quality* from
6//! *budget insufficiency*: given a recorded trace and the same budget,
7//! an oracle with bounded future lookahead `h` picks representations
8//! that minimise faults. The online-minus-oracle fault gap then
9//! measures headroom vs. unavoidable workload pressure.
10//!
11//! ## Scope (Phase B step 22)
12//!
13//! This module delivers the evaluation primitive without wiring it
14//! into the live prompt loop (it is intentionally offline). It takes
15//! a recorded `Vec<Message>` plus a horizon `h` and produces a
16//! per-turn demand trace — which prior messages get referenced within
17//! the next `h` turns. That trace is the "ground truth" the Pareto
18//! eval harness can grade any [`DerivePolicy`] against.
19//!
20//! Demand signal today is a conservative lexical heuristic: a turn at
21//! index `t` is said to "need" turn `u` (with `u < t`) when:
22//!
23//! * The body of `t` mentions a file path that first appeared in `u`.
24//! * A `ToolResult` at `t` matches a `ToolCall` id introduced at `u`.
25//!
26//! Both signals are overly conservative — the real oracle can see
27//! model-internal attention — but they are faithful enough to seed
28//! the oracle-gap metric and to motivate future refinement.
29//!
30//! ## Invariants
31//!
32//! * Pure and offline: no provider, RLM, or filesystem IO.
33//! * Deterministic: same input → same output.
34//! * The full message vector is borrowed read-only; no mutation.
35//!
36//! ## Examples
37//!
38//! ```rust
39//! use codetether_agent::provider::{ContentPart, Message, Role};
40//! use codetether_agent::session::oracle::{replay_oracle, OracleReport};
41//!
42//! let msgs = vec![
43//!     Message {
44//!         role: Role::User,
45//!         content: vec![ContentPart::Text { text: "edit src/lib.rs".into() }],
46//!     },
47//!     Message {
48//!         role: Role::Assistant,
49//!         content: vec![ContentPart::ToolCall {
50//!             id: "call-1".into(),
51//!             name: "Shell".into(),
52//!             arguments: "{}".into(),
53//!             thought_signature: None,
54//!         }],
55//!     },
56//!     Message {
57//!         role: Role::Tool,
58//!         content: vec![ContentPart::ToolResult {
59//!             tool_call_id: "call-1".into(),
60//!             content: "ok".into(),
61//!         }],
62//!     },
63//! ];
64//! let report: OracleReport = replay_oracle(&msgs, 2);
65//! assert_eq!(report.demand.len(), msgs.len());
66//! ```
67
68use std::collections::HashMap;
69
70use crate::provider::{ContentPart, Message};
71use crate::session::relevance::extract;
72
/// Summary of an oracle replay over a recorded trace.
///
/// Reports are plain data: two reports compare equal exactly when their
/// demand traces and horizons are equal, which lets callers assert that
/// replaying the same trace twice yields the same result.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OracleReport {
    /// For each turn `t` in the trace, the sorted list of prior turn
    /// indices `u < t` that `t` references within the `h`-turn
    /// lookahead window.
    pub demand: Vec<Vec<usize>>,
    /// Horizon the oracle used.
    pub horizon: usize,
}

impl OracleReport {
    /// Number of (t, u) reference edges recorded in [`Self::demand`].
    ///
    /// This is the sum of the lengths of the per-turn demand lists.
    /// Useful as a sanity check: zero edges on a long trace usually
    /// means the heuristic wasn't broad enough, not that the agent was
    /// uninteresting.
    #[must_use]
    pub fn reference_count(&self) -> usize {
        self.demand.iter().map(Vec::len).sum()
    }
}
94
95/// Compute an [`OracleReport`] over `messages` with horizon `h`.
96///
97/// Given the demand signal (lexical file references plus tool-call id
98/// matches) the report records, for every turn `t`, the set of prior
99/// turns needed to satisfy demand from `[t, t+h]`. Pareto harnesses
100/// compare that set to what a [`DerivePolicy`] actually kept.
101pub fn replay_oracle(messages: &[Message], h: usize) -> OracleReport {
102    let files_per_turn = index_files_per_turn(messages);
103    let tool_call_owner = index_tool_call_owners(messages);
104
105    let mut demand: Vec<Vec<usize>> = vec![Vec::new(); messages.len()];
106
107    for t in 0..messages.len() {
108        let window_end = (t + h).min(messages.len().saturating_sub(1));
109        for future_t in t..=window_end {
110            let future = &messages[future_t];
111            for u in references_for_turn(future, t, &files_per_turn, &tool_call_owner) {
112                if u < t && !demand[t].contains(&u) {
113                    demand[t].push(u);
114                }
115            }
116        }
117        demand[t].sort_unstable();
118    }
119
120    OracleReport { demand, horizon: h }
121}
122
123/// Index files referenced by each turn so later turns can fast-lookup
124/// "first turn this path appeared in".
125fn index_files_per_turn(messages: &[Message]) -> Vec<Vec<String>> {
126    messages.iter().map(|m| extract(m).files).collect()
127}
128
129/// Map tool_call_id → owning turn index, so a later `ToolResult` can
130/// locate the `ToolCall` that introduced it.
131fn index_tool_call_owners(messages: &[Message]) -> HashMap<String, usize> {
132    let mut owners = HashMap::new();
133    for (idx, msg) in messages.iter().enumerate() {
134        for part in &msg.content {
135            if let ContentPart::ToolCall { id, .. } = part {
136                owners.insert(id.clone(), idx);
137            }
138        }
139    }
140    owners
141}
142
143/// Compute which prior turns the `future` message references.
144fn references_for_turn(
145    future: &Message,
146    current_idx: usize,
147    files_per_turn: &[Vec<String>],
148    tool_call_owner: &HashMap<String, usize>,
149) -> Vec<usize> {
150    let mut out = Vec::new();
151    for part in &future.content {
152        match part {
153            ContentPart::ToolResult { tool_call_id, .. } => {
154                if let Some(&owner) = tool_call_owner.get(tool_call_id) {
155                    out.push(owner);
156                }
157            }
158            ContentPart::Text { text } => {
159                let file_refs = extract_text_file_tokens(text);
160                for file in file_refs {
161                    for (u, files) in files_per_turn.iter().enumerate().take(current_idx) {
162                        if files.iter().any(|f| f == &file) {
163                            out.push(u);
164                            break;
165                        }
166                    }
167                }
168            }
169            _ => {}
170        }
171    }
172    out
173}
174
175/// Re-use the relevance extractor on a synthetic single-text message
176/// to get the file tokens. Duplicated logic is cheaper than exposing
177/// the internal token-splitter.
178fn extract_text_file_tokens(text: &str) -> Vec<String> {
179    use crate::provider::Role;
180    let synthetic = Message {
181        role: Role::User,
182        content: vec![ContentPart::Text {
183            text: text.to_string(),
184        }],
185    };
186    extract(&synthetic).files
187}
188
#[cfg(test)]
mod tests {
    use super::*;
    use crate::provider::{ContentPart, Message, Role};

    /// Single-part text message with the given role.
    fn text_msg(role: Role, body: &str) -> Message {
        let part = ContentPart::Text {
            text: body.to_owned(),
        };
        Message {
            role,
            content: vec![part],
        }
    }

    /// Assistant message carrying one tool call with empty JSON arguments.
    fn call_msg(id: &str, name: &str) -> Message {
        let part = ContentPart::ToolCall {
            id: id.to_owned(),
            name: name.to_owned(),
            arguments: "{}".to_owned(),
            thought_signature: None,
        };
        Message {
            role: Role::Assistant,
            content: vec![part],
        }
    }

    /// Tool message carrying one tool result for the given call id.
    fn result_msg(id: &str, body: &str) -> Message {
        let part = ContentPart::ToolResult {
            tool_call_id: id.to_owned(),
            content: body.to_owned(),
        };
        Message {
            role: Role::Tool,
            content: vec![part],
        }
    }

    #[test]
    fn oracle_binds_tool_results_to_their_call_owners() {
        let trace = vec![
            text_msg(Role::User, "do the thing"),
            call_msg("call-1", "Shell"),
            result_msg("call-1", "ok"),
        ];
        let report = replay_oracle(&trace, 5);
        // The ToolResult at turn 2 must point back at its ToolCall at turn 1.
        assert!(report.demand[2].contains(&1));
        assert!(report.reference_count() >= 1);
    }

    #[test]
    fn oracle_tracks_file_references_across_turns() {
        let trace = vec![
            text_msg(Role::User, "edit src/lib.rs"),
            text_msg(Role::Assistant, "ok"),
            text_msg(Role::User, "now open src/lib.rs again"),
        ];
        let report = replay_oracle(&trace, 5);
        // Turn 2 mentions the file that first showed up at turn 0.
        assert!(report.demand[2].contains(&0));
    }

    #[test]
    fn oracle_respects_horizon_bound() {
        let trace = vec![
            text_msg(Role::User, "edit src/lib.rs"),
            text_msg(Role::Assistant, "noop"),
            text_msg(Role::Assistant, "noop"),
            text_msg(Role::User, "reopen src/lib.rs"),
        ];
        let short = replay_oracle(&trace, 1);
        let long = replay_oracle(&trace, 10);
        // A wider lookahead can only observe the same edges or more.
        assert!(long.reference_count() >= short.reference_count());
    }

    #[test]
    fn report_over_empty_trace_is_empty() {
        let report = replay_oracle(&[], 4);
        assert!(report.demand.is_empty());
        assert_eq!(report.horizon, 4);
        assert_eq!(report.reference_count(), 0);
    }
}