brainwires_eval/
recorder.rs

//! Tool call sequence recording and diffing.
//!
//! [`ToolSequenceRecorder`] is a lightweight, thread-safe recorder that
//! captures the ordered sequence of tool calls made during an agent run.
//! Attach it to an agent's pre-execution hook and call
//! [`ToolSequenceRecorder::diff_against`] at the end of a trial to verify
//! behavioural correctness.
8
9use std::collections::hash_map::DefaultHasher;
10use std::hash::{Hash, Hasher};
11use std::sync::{Arc, Mutex};
12use std::time::{SystemTime, UNIX_EPOCH};
13
14use serde::{Deserialize, Serialize};
15
16// ── Tool call record ──────────────────────────────────────────────────────────
17
18/// A single recorded tool call.
19#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
20pub struct ToolCallRecord {
21    /// Name of the tool that was invoked.
22    pub name: String,
23    /// A short fingerprint of the tool's input arguments (first 16 hex chars
24    /// of a FNV-style hash).  Used for lightweight argument comparison without
25    /// storing the full payload.
26    pub args_fingerprint: String,
27    /// Wall-clock timestamp of the call in milliseconds since Unix epoch.
28    pub timestamp_ms: u64,
29}
30
31impl ToolCallRecord {
32    fn new(name: impl Into<String>, args: &serde_json::Value) -> Self {
33        let name = name.into();
34        let args_fingerprint = fingerprint_json(args);
35        let timestamp_ms = SystemTime::now()
36            .duration_since(UNIX_EPOCH)
37            .map(|d| d.as_millis() as u64)
38            .unwrap_or(0);
39        Self {
40            name,
41            args_fingerprint,
42            timestamp_ms,
43        }
44    }
45}
46
47fn fingerprint_json(v: &serde_json::Value) -> String {
48    let mut h = DefaultHasher::new();
49    v.to_string().hash(&mut h);
50    format!("{:016x}", h.finish())
51}
52
53// ── Sequence diff ─────────────────────────────────────────────────────────────
54
55/// Result of comparing the recorded tool sequence against an expected sequence.
56#[derive(Debug, Clone, Serialize, Deserialize)]
57pub struct SequenceDiff {
58    /// The expected tool names (in order).
59    pub expected: Vec<String>,
60    /// The actual tool names recorded (in order).
61    pub actual: Vec<String>,
62    /// Edit distance between the two sequences (Levenshtein).
63    pub edit_distance: usize,
64    /// Similarity in [0, 1]: `1.0 − edit_distance / max(len_expected, len_actual)`.
65    /// `1.0` means an exact match; `0.0` means maximally different.
66    pub similarity: f64,
67}
68
69impl SequenceDiff {
70    /// Compute the diff between `expected` and `actual` name sequences.
71    pub fn compute(expected: &[&str], actual: &[String]) -> Self {
72        let exp: Vec<String> = expected.iter().map(|s| s.to_string()).collect();
73        let ed = levenshtein(
74            expected,
75            actual
76                .iter()
77                .map(|s| s.as_str())
78                .collect::<Vec<_>>()
79                .as_slice(),
80        );
81        let max_len = exp.len().max(actual.len());
82        let similarity = if max_len == 0 {
83            1.0
84        } else {
85            1.0 - (ed as f64 / max_len as f64)
86        };
87        Self {
88            expected: exp,
89            actual: actual.to_vec(),
90            edit_distance: ed,
91            similarity,
92        }
93    }
94
95    /// Returns `true` if the actual sequence exactly matches the expected one.
96    pub fn is_exact_match(&self) -> bool {
97        self.edit_distance == 0
98    }
99}
100
/// Compute the Levenshtein edit distance between two sequences of names.
///
/// Each `&str` is treated as one atomic symbol: the result is the minimum
/// number of whole-element insertions, deletions and substitutions needed to
/// turn `a` into `b`.  Two empty slices have distance `0`; if exactly one is
/// empty the distance is the length of the other.
///
/// Runs in `O(a.len() * b.len())` time but keeps only two rows of the DP
/// table, so extra memory is `O(b.len())`.
fn levenshtein(a: &[&str], b: &[&str]) -> usize {
    let m = b.len();

    // `prev[j]` is the distance between the prefix of `a` handled so far and
    // `b[..j]`; for the empty prefix that is simply `j` insertions.
    let mut prev: Vec<usize> = (0..=m).collect();
    let mut curr = vec![0usize; m + 1];

    for (i, a_item) in a.iter().enumerate() {
        // Turning `a[..=i]` into an empty sequence takes `i + 1` deletions.
        curr[0] = i + 1;
        for (j, b_item) in b.iter().enumerate() {
            curr[j + 1] = if a_item == b_item {
                // Matching elements cost nothing: carry the diagonal over.
                prev[j]
            } else {
                // Cheapest of deletion, insertion and substitution.
                1 + prev[j + 1].min(curr[j]).min(prev[j])
            };
        }
        std::mem::swap(&mut prev, &mut curr);
    }

    // After the final swap `prev` holds the row for all of `a`.
    prev[m]
}
123
124// ── Recorder ──────────────────────────────────────────────────────────────────
125
126/// Thread-safe recorder for tool call sequences.
127///
128/// Wrap in `Arc` and share across async tasks / agent hooks.
129///
130/// ## Example
131/// ```rust,ignore
132/// let recorder = ToolSequenceRecorder::new();
133/// recorder.record("read_file", &json!({"path": "main.rs"}));
134/// recorder.record("write_file", &json!({"path": "out.rs"}));
135///
136/// let diff = recorder.diff_against(&["read_file", "write_file"]);
137/// assert!(diff.is_exact_match());
138/// ```
139#[derive(Debug, Clone, Default)]
140pub struct ToolSequenceRecorder {
141    inner: Arc<Mutex<Vec<ToolCallRecord>>>,
142}
143
144impl ToolSequenceRecorder {
145    /// Create a new, empty recorder.
146    pub fn new() -> Self {
147        Self::default()
148    }
149
150    /// Record a tool call.  Safe to call from multiple threads / async tasks.
151    pub fn record(&self, name: impl Into<String>, args: &serde_json::Value) {
152        let record = ToolCallRecord::new(name, args);
153        self.inner
154            .lock()
155            .expect("recorder lock poisoned")
156            .push(record);
157    }
158
159    /// Return a snapshot of all recorded calls in insertion order.
160    pub fn calls(&self) -> Vec<ToolCallRecord> {
161        self.inner.lock().expect("recorder lock poisoned").clone()
162    }
163
164    /// Return only the tool names in insertion order.
165    pub fn call_names(&self) -> Vec<String> {
166        self.inner
167            .lock()
168            .expect("recorder lock poisoned")
169            .iter()
170            .map(|r| r.name.clone())
171            .collect()
172    }
173
174    /// Diff the recorded sequence against an expected list of tool names.
175    pub fn diff_against(&self, expected: &[&str]) -> SequenceDiff {
176        let actual = self.call_names();
177        SequenceDiff::compute(expected, &actual)
178    }
179
180    /// Clear all recorded calls.
181    pub fn reset(&self) {
182        self.inner.lock().expect("recorder lock poisoned").clear();
183    }
184
185    /// Number of recorded calls.
186    pub fn len(&self) -> usize {
187        self.inner.lock().expect("recorder lock poisoned").len()
188    }
189
190    /// Returns `true` if no calls have been recorded.
191    pub fn is_empty(&self) -> bool {
192        self.len() == 0
193    }
194}
195
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Build a recorder that has seen one call per name, each with empty
    /// object arguments.
    fn recorder_with(names: &[&str]) -> ToolSequenceRecorder {
        let recorder = ToolSequenceRecorder::new();
        for name in names {
            recorder.record(*name, &json!({}));
        }
        recorder
    }

    // ── Recording ────────────────────────────────────────────────────────

    #[test]
    fn test_record_and_retrieve() {
        let recorder = ToolSequenceRecorder::new();
        recorder.record("read_file", &json!({"path": "a.rs"}));
        recorder.record("write_file", &json!({"path": "b.rs"}));

        let snapshot = recorder.calls();
        assert_eq!(snapshot.len(), 2);
        assert_eq!(snapshot[0].name, "read_file");
        assert_eq!(snapshot[1].name, "write_file");
    }

    #[test]
    fn test_call_names() {
        let recorder = recorder_with(&["bash", "read_file"]);
        assert_eq!(recorder.call_names(), vec!["bash", "read_file"]);
    }

    #[test]
    fn test_reset_clears_calls() {
        let recorder = recorder_with(&["a"]);
        recorder.reset();
        assert!(recorder.is_empty());
    }

    // ── Diffing ──────────────────────────────────────────────────────────

    #[test]
    fn test_diff_exact_match() {
        let diff = recorder_with(&["a", "b", "c"]).diff_against(&["a", "b", "c"]);
        assert!(diff.is_exact_match());
        assert!((diff.similarity - 1.0).abs() < 1e-9);
    }

    #[test]
    fn test_diff_partial_match() {
        // "x" was recorded where "b" was expected.
        let diff = recorder_with(&["a", "x", "c"]).diff_against(&["a", "b", "c"]);
        assert!(!diff.is_exact_match());
        assert_eq!(diff.edit_distance, 1);
        assert!(diff.similarity > 0.5);
    }

    #[test]
    fn test_diff_empty_vs_expected() {
        let diff = ToolSequenceRecorder::new().diff_against(&["a", "b"]);
        assert_eq!(diff.edit_distance, 2);
        assert_eq!(diff.similarity, 0.0);
    }

    #[test]
    fn test_diff_both_empty() {
        let diff = ToolSequenceRecorder::new().diff_against(&[]);
        assert!(diff.is_exact_match());
        assert!((diff.similarity - 1.0).abs() < 1e-9);
    }

    // ── Argument fingerprints ────────────────────────────────────────────

    #[test]
    fn test_args_fingerprint_differs_for_different_args() {
        let first = ToolCallRecord::new("tool", &json!({"a": 1}));
        let second = ToolCallRecord::new("tool", &json!({"a": 2}));
        assert_ne!(first.args_fingerprint, second.args_fingerprint);
    }

    #[test]
    fn test_args_fingerprint_same_for_same_args() {
        let first = ToolCallRecord::new("tool", &json!({"x": "hello"}));
        let second = ToolCallRecord::new("tool", &json!({"x": "hello"}));
        assert_eq!(first.args_fingerprint, second.args_fingerprint);
    }

    // ── Levenshtein ──────────────────────────────────────────────────────

    #[test]
    fn test_levenshtein_identical() {
        let names = ["a", "b", "c"];
        assert_eq!(levenshtein(&names, &names), 0);
    }

    #[test]
    fn test_levenshtein_single_substitution() {
        assert_eq!(levenshtein(&["a", "b", "c"], &["a", "x", "c"]), 1);
    }

    #[test]
    fn test_levenshtein_insert_delete() {
        let short = ["a", "b"];
        let long = ["a", "b", "c"];
        assert_eq!(levenshtein(&short, &long), 1);
        assert_eq!(levenshtein(&long, &short), 1);
    }
}
brainwires_eval/recorder.rs

brainwires_eval/
recorder.rs