atomcode_core/turn/
loop_guard.rs

1//! Cross-batch tool-call loop guard.
2//!
3//! The runner already deduplicates *within a single LLM batch*
4//! (`is_dup` in `runner.rs`). This module covers the orthogonal case:
5//! a model emitting the same `(tool, arguments)` pair across many
6//! sequential batches with no progress between them — the symptom
7//! that produced the 22-identical-`Bash(cargo check)` screenshot.
8//!
9//! Trigger conditions (all required, to avoid the false positives that
10//! killed commit `9339cf1`'s removed `detect_call_loop`):
11//!
12//! * `(name, arguments)` exactly matches a prior call's pair —
13//!   normalised once via `\0` separator. No fuzzy / token-streak
14//!   detection (that was the bit that wrongly caught batches like ssh
15//!   `ls`/`cat`/`sshpass`).
16//! * Every prior matching call had identical output and identical
17//!   `success` — polling a flaky endpoint where the body changes is
18//!   real progress, not a loop.
19//! * No state-changing tool succeeded with a *new* key in between —
20//!   when an `edit_file` to a previously-untouched file lands, the
21//!   world has changed and any prior repeats no longer count. (A
22//!   state-changing call to a key that is *already* in the window
23//!   does NOT clear it: that means `edit_file` itself is being
24//!   spammed with identical args, which is a loop we want to catch.)
25//!
26//! The trigger threshold is intentionally low (third attempt blocks)
27//! so the model gets two "maybe this time" chances on transient
28//! flakes but doesn't burn a full turn budget repeating itself. The
29//! agent clears the window at every user-message boundary, so the
30//! per-turn-only semantics are guaranteed by the caller.
31
32use std::collections::hash_map::DefaultHasher;
33use std::hash::{Hash, Hasher};
34
35/// Strict trigger: identical `(name, args, output, success)` repeats.
36/// Catches deterministic loops where output is byte-stable. With
37/// THRESHOLD = 3, the first two attempts execute normally and the
38/// third is short-circuited.
39const THRESHOLD: usize = 3;
40
41/// Hard cap regardless of output drift. Catches loops where the model
42/// is clearly stuck on the same `(name, args)` but output varies
43/// slightly between runs — e.g., `cargo check 2>&1 | tail -20` whose
44/// warning order, paths, or timing differ run-to-run. Without this
45/// gate the strict trigger above never fires for cargo/test/lint
46/// polling and the model can spam 30+ identical commands before
47/// hitting the per-turn step budget. Threshold 6 leaves room for
48/// legitimate progress-polling (curl health-check waiting for a
49/// service to start) before blocking.
50const HARD_THRESHOLD: usize = 6;
51
52/// Maximum number of recent entries to keep. The window is per-turn
53/// (cleared at every user-message boundary by the agent), so a small
54/// bounded ring is enough — even a chatty model rarely emits more
55/// than a dozen tools per assistant turn. Sized above HARD_THRESHOLD
56/// so the soft-trigger count survives the ring eviction.
57const WINDOW: usize = 32;
58
59/// Tools whose successful execution counts as "progress" — when one
60/// fires with a previously-unseen key, the loop guard resets so any
61/// prior read-only repeats no longer count toward the threshold.
62const STATE_CHANGING: &[&str] =
63    &["edit_file", "write_file", "create_file", "search_replace"];
64
65#[derive(Debug, Clone, PartialEq, Eq)]
66struct Entry {
67    /// `name + "\0" + arguments` — the raw `(name, args)` pair pre-joined
68    /// so equality checks are a single string compare instead of a
69    /// tuple compare on every lookup.
70    key: String,
71    output_hash: u64,
72    success: bool,
73}
74
75#[derive(Default, Debug)]
76pub struct LoopGuardState {
77    recent: Vec<Entry>,
78}
79
80/// What the runner should do with the call it's about to dispatch.
81#[derive(Debug, PartialEq, Eq)]
82pub enum LoopGuardDecision {
83    /// Run the tool normally. After execution, call `record(...)`.
84    Allow,
85    /// Skip execution. The string carries a synthetic `ToolResult`
86    /// body explaining to the model why this attempt was blocked.
87    Block(String),
88}
89
90impl LoopGuardState {
91    pub fn clear(&mut self) {
92        self.recent.clear();
93    }
94
95    /// Decide whether to execute or block the call. Read-only — the
96    /// caller still needs `record(...)` after a real execution.
97    pub fn check(&self, name: &str, arguments: &str) -> LoopGuardDecision {
98        let key = make_key(name, arguments);
99        let matches: Vec<&Entry> = self.recent.iter().filter(|e| e.key == key).collect();
100
101        // Hard cap first: HARD_THRESHOLD identical-key repeats regardless
102        // of output drift. The strict trigger below would never fire for
103        // cargo / test / lint polling because their output is timing- or
104        // cache-dependent.
105        if matches.len() >= HARD_THRESHOLD - 1 {
106            return LoopGuardDecision::Block(format!(
107                "[Loop guard] `{}` with identical arguments has run {} time(s) \
108                 already in this turn. Output varies slightly each run but you \
109                 keep issuing the same query — that's a loop. Try a different \
110                 approach: change the command (different filter / different \
111                 file), edit code to make progress, or stop and ask the user. \
112                 Continuing to repeat will keep getting blocked.",
113                name,
114                matches.len()
115            ));
116        }
117
118        // Strict trigger: THRESHOLD identical-output, identical-success
119        // repeats. Catches deterministic loops where the model keeps
120        // hitting the exact same wall (same compile error, same grep
121        // result, etc).
122        if matches.len() < THRESHOLD - 1 {
123            return LoopGuardDecision::Allow;
124        }
125        let first = matches[0];
126        let all_same =
127            matches.iter().all(|e| e.output_hash == first.output_hash && e.success == first.success);
128        if !all_same {
129            return LoopGuardDecision::Allow;
130        }
131        LoopGuardDecision::Block(format!(
132            "[Loop guard] This exact tool call (`{}` with identical arguments) has \
133             run {} time(s) already in this turn with the same output and no \
134             intervening state change. Try a different approach: change the \
135             command, read different files, edit code to make progress, or stop \
136             and ask the user. Repeating the same query will not change the \
137             result.",
138            name,
139            matches.len()
140        ))
141    }
142
143    /// Record a real execution result. Call exactly once per non-blocked
144    /// dispatch. Handles the state-change reset internally so the runner
145    /// doesn't have to know the tool taxonomy.
146    pub fn record(&mut self, name: &str, arguments: &str, output: &str, success: bool) {
147        let key = make_key(name, arguments);
148        let is_state_changing = STATE_CHANGING.contains(&name);
149        let key_is_new = !self.recent.iter().any(|e| e.key == key);
150        if is_state_changing && success && key_is_new {
151            // Genuine progress: a previously-untouched edit/write/create
152            // landed. Wipe prior repeats — they're stale relative to the
153            // new world state. The just-executed call is still recorded
154            // below so we can detect spam-of-the-same-edit.
155            self.recent.clear();
156        }
157        self.recent.push(Entry {
158            key,
159            output_hash: hash_str(output),
160            success,
161        });
162        if self.recent.len() > WINDOW {
163            // Drop oldest. `remove(0)` on a Vec<Entry> of bounded size
164            // is fine — WINDOW is tiny and this runs at most once per
165            // tool dispatch.
166            self.recent.remove(0);
167        }
168    }
169}
170
171fn make_key(name: &str, arguments: &str) -> String {
172    // Canonicalise the JSON arguments so cosmetically-different repeats
173    // ({"a":1, "b":2} vs {"b":2,"a":1} vs {"a": 1, "b": 2}) collapse to the
174    // same key. Mirrors the in-batch `is_dup` normalisation in `runner.rs`
175    // so cross-batch and in-batch dedup share semantics. Non-JSON args
176    // pass through unchanged.
177    let normalised = match serde_json::from_str::<serde_json::Value>(arguments) {
178        Ok(v) => serde_json::to_string(&v).unwrap_or_else(|_| arguments.to_string()),
179        Err(_) => arguments.to_string(),
180    };
181    let mut s = String::with_capacity(name.len() + 1 + normalised.len());
182    s.push_str(name);
183    s.push('\0');
184    s.push_str(&normalised);
185    s
186}
187
188fn hash_str(s: &str) -> u64 {
189    let mut h = DefaultHasher::new();
190    s.hash(&mut h);
191    h.finish()
192}
193
194#[cfg(test)]
195mod tests {
196    use super::*;
197
198    fn allow(d: &LoopGuardDecision) -> bool {
199        matches!(d, LoopGuardDecision::Allow)
200    }
201    fn block(d: &LoopGuardDecision) -> bool {
202        matches!(d, LoopGuardDecision::Block(_))
203    }
204
205    #[test]
206    fn third_identical_call_is_blocked() {
207        let mut g = LoopGuardState::default();
208        let args = r#"{"command":"cargo check"}"#;
209        let out = "error[E0599]: no method named `foo`";
210
211        assert!(allow(&g.check("bash", args)));
212        g.record("bash", args, out, false);
213
214        assert!(allow(&g.check("bash", args)));
215        g.record("bash", args, out, false);
216
217        // Third call with identical (name, args, output, success) → blocked.
218        let d = g.check("bash", args);
219        assert!(block(&d), "expected Block, got {:?}", d);
220    }
221
222    #[test]
223    fn rotating_arguments_do_not_trigger() {
224        // Pins the false-positive case from the deleted `detect_call_loop`
225        // (commit 9339cf1): a model running `cargo check 2>&1 | head` with
226        // different `head -N` filters each time looks superficially
227        // similar but has different args. Must not trip the guard.
228        let mut g = LoopGuardState::default();
229        let cmds = [
230            r#"{"command":"cargo check 2>&1 | head -50"}"#,
231            r#"{"command":"cargo check 2>&1 | head -100"}"#,
232            r#"{"command":"cargo check 2>&1 | head -200"}"#,
233            r#"{"command":"cargo check --message-format=short"}"#,
234        ];
235        for c in cmds {
236            assert!(allow(&g.check("bash", c)));
237            g.record("bash", c, "compile error", false);
238        }
239        // Even the 5th distinct command is allowed.
240        assert!(allow(&g.check("bash", r#"{"command":"cargo build"}"#)));
241    }
242
243    #[test]
244    fn ssh_batch_with_distinct_commands_does_not_trigger() {
245        // The other 9339cf1 false positive: a five-tool ssh-into-box
246        // sequence (sshpass / ls / cat / cp / chmod). All bash, all
247        // distinct args → must not trigger. The (name, args) discrimination
248        // alone handles this; included here as a regression pin.
249        let mut g = LoopGuardState::default();
250        let cmds = [
251            r#"{"command":"sshpass -p ... ssh user@host echo ok"}"#,
252            r#"{"command":"sshpass -p ... ssh user@host ls /var/log"}"#,
253            r#"{"command":"sshpass -p ... ssh user@host cat /etc/hostname"}"#,
254            r#"{"command":"sshpass -p ... ssh user@host uname -a"}"#,
255            r#"{"command":"sshpass -p ... ssh user@host df -h"}"#,
256        ];
257        for c in cmds {
258            let d = g.check("bash", c);
259            assert!(allow(&d), "expected Allow for {:?}, got {:?}", c, d);
260            g.record("bash", c, "ok", true);
261        }
262    }
263
264    #[test]
265    fn changing_output_does_not_trigger() {
266        // Polling a flaky endpoint where the body genuinely changes
267        // each call → real signal, must not block.
268        let mut g = LoopGuardState::default();
269        let args = r#"{"command":"curl -s http://service/health"}"#;
270
271        g.record("bash", args, "starting up", false);
272        g.record("bash", args, "starting up", false);
273        // Third call: output would have changed → not the loop pattern.
274        // We simulate by recording a different prior output for #2,
275        // then asserting #3 is allowed.
276        let mut g2 = LoopGuardState::default();
277        g2.record("bash", args, "starting up", false);
278        g2.record("bash", args, "ready", true);
279        assert!(allow(&g2.check("bash", args)));
280    }
281
282    #[test]
283    fn intervening_state_change_resets_window() {
284        // The TDD scenario: bash(cargo test) → fails → edit_file →
285        // bash(cargo test) again → fails (same output) → bash(cargo test) again.
286        // Without the reset, the third bash would block. With it, the
287        // edit clears the window, so the post-edit bash calls start
288        // counting fresh.
289        let mut g = LoopGuardState::default();
290        let bash_args = r#"{"command":"cargo test"}"#;
291        let edit_args = r#"{"file_path":"src/lib.rs","old":"a","new":"b"}"#;
292        let test_out = "test failed: assertion 1 != 2";
293
294        g.record("bash", bash_args, test_out, false);
295        g.record("bash", bash_args, test_out, false);
296        // Now an edit lands → progress.
297        g.record("edit_file", edit_args, "ok", true);
298        // The bash calls from before the edit should no longer count.
299        assert!(allow(&g.check("bash", bash_args)));
300        g.record("bash", bash_args, test_out, false);
301        assert!(allow(&g.check("bash", bash_args)));
302        g.record("bash", bash_args, test_out, false);
303        // Now we've hit the threshold *post-edit* → block.
304        assert!(block(&g.check("bash", bash_args)));
305    }
306
307    #[test]
308    fn state_change_with_repeated_key_does_not_reset() {
309        // A model spamming `edit_file` to the same path with the same
310        // diff is itself a loop — the state-change reset rule only
311        // fires on a *new* key. After three identical edit attempts
312        // the third must block.
313        let mut g = LoopGuardState::default();
314        let edit_args = r#"{"file_path":"a.rs","old":"x","new":"y"}"#;
315
316        g.record("edit_file", edit_args, "ok", true);
317        g.record("edit_file", edit_args, "ok", true);
318        // Third identical edit is still a loop, not progress.
319        assert!(block(&g.check("edit_file", edit_args)));
320    }
321
322    #[test]
323    fn clear_resets_state() {
324        // Per-user-message boundary: agent calls clear() and the next
325        // call starts fresh.
326        let mut g = LoopGuardState::default();
327        let args = r#"{"command":"ls"}"#;
328        g.record("bash", args, "out", true);
329        g.record("bash", args, "out", true);
330        assert!(block(&g.check("bash", args)));
331        g.clear();
332        assert!(allow(&g.check("bash", args)));
333    }
334
335    #[test]
336    fn hard_cap_blocks_loops_even_when_output_drifts() {
337        // The cargo-check-30-times scenario: model keeps issuing the
338        // same bash command, but each run produces slightly different
339        // output (warning order / timing / cache state). The strict
340        // THRESHOLD never fires because outputs differ; the hard cap
341        // catches it at HARD_THRESHOLD-1 prior matches (5 records →
342        // 6th is blocked).
343        let mut g = LoopGuardState::default();
344        let args = r#"{"command":"cargo check 2>&1 | tail -20"}"#;
345        // Each record is the same call but with subtly different output.
346        for i in 0..5 {
347            assert!(
348                allow(&g.check("bash", args)),
349                "call {} should still allow",
350                i + 1
351            );
352            g.record("bash", args, &format!("warning #{}: drift", i), false);
353        }
354        // 6th attempt: HARD_THRESHOLD reached, block regardless of drift.
355        let d = g.check("bash", args);
356        assert!(block(&d), "expected hard-cap Block on 6th call, got {:?}", d);
357        if let LoopGuardDecision::Block(msg) = d {
358            assert!(
359                msg.contains("Output varies slightly"),
360                "hard-cap message expected, got: {}",
361                msg
362            );
363        }
364    }
365
366    #[test]
367    fn polling_below_hard_cap_still_allowed() {
368        // Legitimate progress polling (5 attempts of curl health-check)
369        // must still be allowed under the hard cap. Only at attempt 6+
370        // do we intervene. Five attempts is enough budget for typical
371        // service-startup polling.
372        let mut g = LoopGuardState::default();
373        let args = r#"{"command":"curl http://service/health"}"#;
374        for i in 0..5 {
375            let d = g.check("bash", args);
376            assert!(
377                allow(&d),
378                "polling attempt {} should be allowed, got {:?}",
379                i + 1,
380                d
381            );
382            g.record("bash", args, &format!("attempt {}: starting", i), false);
383        }
384        // 5 records made, 6th about to be blocked — pin the boundary.
385        assert!(block(&g.check("bash", args)));
386    }
387
388    #[test]
389    fn json_whitespace_variants_collapse_across_batches() {
390        // The cross-batch counterpart to the runner's in-batch dedup
391        // normalisation: same call, different JSON formatting across
392        // turns must still trip the guard at THRESHOLD. Mirrors the
393        // deepseek-v4-flash symptom: model emits identical Glob
394        // intent with whitespace drift turn-after-turn.
395        let mut g = LoopGuardState::default();
396        let v1 = r#"{"pattern":"**/*.rs"}"#;
397        let v2 = r#"{"pattern": "**/*.rs"}"#; // extra space
398        let v3 = r#"{ "pattern":"**/*.rs" }"#; // padded braces
399        let out = "203 files found";
400
401        g.record("glob", v1, out, true);
402        g.record("glob", v2, out, true);
403        // v3 is the same call by intent → must block.
404        let d = g.check("glob", v3);
405        assert!(block(&d), "expected Block on JSON-variant repeat, got {:?}", d);
406    }
407
408    #[test]
409    fn name_collision_is_not_a_match() {
410        // Defensive: the `\0` separator in make_key prevents
411        // (name="foo", args="bar") from colliding with
412        // (name="foob", args="ar") even though concatenation would.
413        let mut g = LoopGuardState::default();
414        g.record("foo", "bar", "out", true);
415        g.record("foo", "bar", "out", true);
416        // A call whose name+args concatenate to "foobar" but with a
417        // different split must not be considered a repeat.
418        assert!(allow(&g.check("foob", "ar")));
419    }
420}
atomcode_core/turn/loop_guard.rs

atomcode_core/turn/
loop_guard.rs