atomcode_core/turn/loop_guard.rs
1//! Cross-batch tool-call loop guard.
2//!
3//! The runner already deduplicates *within a single LLM batch*
4//! (`is_dup` in `runner.rs`). This module covers the orthogonal case:
5//! a model emitting the same `(tool, arguments)` pair across many
6//! sequential batches with no progress between them — the symptom
7//! that produced the 22-identical-`Bash(cargo check)` screenshot.
8//!
9//! Trigger conditions (all required, to avoid the false positives that
10//! killed commit `9339cf1`'s removed `detect_call_loop`):
11//!
12//! * `(name, arguments)` exactly matches a prior call's pair —
13//! normalised once via `\0` separator. No fuzzy / token-streak
14//! detection (that was the bit that wrongly caught batches like ssh
15//! `ls`/`cat`/`sshpass`).
16//! * Every prior matching call had identical output and identical
17//! `success` — polling a flaky endpoint where the body changes is
18//! real progress, not a loop.
19//! * No state-changing tool succeeded with a *new* key in between —
20//! when an `edit_file` to a previously-untouched file lands, the
21//! world has changed and any prior repeats no longer count. (A
22//! state-changing call to a key that is *already* in the window
23//! does NOT clear it: that means `edit_file` itself is being
24//! spammed with identical args, which is a loop we want to catch.)
25//!
26//! The trigger threshold is intentionally low (third attempt blocks)
27//! so the model gets two "maybe this time" chances on transient
28//! flakes but doesn't burn a full turn budget repeating itself. The
29//! agent clears the window at every user-message boundary, so the
30//! per-turn-only semantics are guaranteed by the caller.
31
32use std::collections::hash_map::DefaultHasher;
33use std::hash::{Hash, Hasher};
34
35/// Strict trigger: identical `(name, args, output, success)` repeats.
36/// Catches deterministic loops where output is byte-stable. With
37/// THRESHOLD = 3, the first two attempts execute normally and the
38/// third is short-circuited.
39const THRESHOLD: usize = 3;
40
41/// Hard cap regardless of output drift. Catches loops where the model
42/// is clearly stuck on the same `(name, args)` but output varies
43/// slightly between runs — e.g., `cargo check 2>&1 | tail -20` whose
44/// warning order, paths, or timing differ run-to-run. Without this
45/// gate the strict trigger above never fires for cargo/test/lint
46/// polling and the model can spam 30+ identical commands before
47/// hitting the per-turn step budget. Threshold 6 leaves room for
48/// legitimate progress-polling (curl health-check waiting for a
49/// service to start) before blocking.
50const HARD_THRESHOLD: usize = 6;
51
52/// Maximum number of recent entries to keep. The window is per-turn
53/// (cleared at every user-message boundary by the agent), so a small
54/// bounded ring is enough — even a chatty model rarely emits more
55/// than a dozen tools per assistant turn. Sized above HARD_THRESHOLD
56/// so the soft-trigger count survives the ring eviction.
57const WINDOW: usize = 32;
58
59/// Tools whose successful execution counts as "progress" — when one
60/// fires with a previously-unseen key, the loop guard resets so any
61/// prior read-only repeats no longer count toward the threshold.
62const STATE_CHANGING: &[&str] =
63 &["edit_file", "write_file", "create_file", "search_replace"];
64
65#[derive(Debug, Clone, PartialEq, Eq)]
66struct Entry {
67 /// `name + "\0" + arguments` — the raw `(name, args)` pair pre-joined
68 /// so equality checks are a single string compare instead of a
69 /// tuple compare on every lookup.
70 key: String,
71 output_hash: u64,
72 success: bool,
73}
74
75#[derive(Default, Debug)]
76pub struct LoopGuardState {
77 recent: Vec<Entry>,
78}
79
80/// What the runner should do with the call it's about to dispatch.
81#[derive(Debug, PartialEq, Eq)]
82pub enum LoopGuardDecision {
83 /// Run the tool normally. After execution, call `record(...)`.
84 Allow,
85 /// Skip execution. The string carries a synthetic `ToolResult`
86 /// body explaining to the model why this attempt was blocked.
87 Block(String),
88}
89
90impl LoopGuardState {
91 pub fn clear(&mut self) {
92 self.recent.clear();
93 }
94
95 /// Decide whether to execute or block the call. Read-only — the
96 /// caller still needs `record(...)` after a real execution.
97 pub fn check(&self, name: &str, arguments: &str) -> LoopGuardDecision {
98 let key = make_key(name, arguments);
99 let matches: Vec<&Entry> = self.recent.iter().filter(|e| e.key == key).collect();
100
101 // Hard cap first: HARD_THRESHOLD identical-key repeats regardless
102 // of output drift. The strict trigger below would never fire for
103 // cargo / test / lint polling because their output is timing- or
104 // cache-dependent.
105 if matches.len() >= HARD_THRESHOLD - 1 {
106 return LoopGuardDecision::Block(format!(
107 "[Loop guard] `{}` with identical arguments has run {} time(s) \
108 already in this turn. Output varies slightly each run but you \
109 keep issuing the same query — that's a loop. Try a different \
110 approach: change the command (different filter / different \
111 file), edit code to make progress, or stop and ask the user. \
112 Continuing to repeat will keep getting blocked.",
113 name,
114 matches.len()
115 ));
116 }
117
118 // Strict trigger: THRESHOLD identical-output, identical-success
119 // repeats. Catches deterministic loops where the model keeps
120 // hitting the exact same wall (same compile error, same grep
121 // result, etc).
122 if matches.len() < THRESHOLD - 1 {
123 return LoopGuardDecision::Allow;
124 }
125 let first = matches[0];
126 let all_same =
127 matches.iter().all(|e| e.output_hash == first.output_hash && e.success == first.success);
128 if !all_same {
129 return LoopGuardDecision::Allow;
130 }
131 LoopGuardDecision::Block(format!(
132 "[Loop guard] This exact tool call (`{}` with identical arguments) has \
133 run {} time(s) already in this turn with the same output and no \
134 intervening state change. Try a different approach: change the \
135 command, read different files, edit code to make progress, or stop \
136 and ask the user. Repeating the same query will not change the \
137 result.",
138 name,
139 matches.len()
140 ))
141 }
142
143 /// Record a real execution result. Call exactly once per non-blocked
144 /// dispatch. Handles the state-change reset internally so the runner
145 /// doesn't have to know the tool taxonomy.
146 pub fn record(&mut self, name: &str, arguments: &str, output: &str, success: bool) {
147 let key = make_key(name, arguments);
148 let is_state_changing = STATE_CHANGING.contains(&name);
149 let key_is_new = !self.recent.iter().any(|e| e.key == key);
150 if is_state_changing && success && key_is_new {
151 // Genuine progress: a previously-untouched edit/write/create
152 // landed. Wipe prior repeats — they're stale relative to the
153 // new world state. The just-executed call is still recorded
154 // below so we can detect spam-of-the-same-edit.
155 self.recent.clear();
156 }
157 self.recent.push(Entry {
158 key,
159 output_hash: hash_str(output),
160 success,
161 });
162 if self.recent.len() > WINDOW {
163 // Drop oldest. `remove(0)` on a Vec<Entry> of bounded size
164 // is fine — WINDOW is tiny and this runs at most once per
165 // tool dispatch.
166 self.recent.remove(0);
167 }
168 }
169}
170
171fn make_key(name: &str, arguments: &str) -> String {
172 // Canonicalise the JSON arguments so cosmetically-different repeats
173 // ({"a":1, "b":2} vs {"b":2,"a":1} vs {"a": 1, "b": 2}) collapse to the
174 // same key. Mirrors the in-batch `is_dup` normalisation in `runner.rs`
175 // so cross-batch and in-batch dedup share semantics. Non-JSON args
176 // pass through unchanged.
177 let normalised = match serde_json::from_str::<serde_json::Value>(arguments) {
178 Ok(v) => serde_json::to_string(&v).unwrap_or_else(|_| arguments.to_string()),
179 Err(_) => arguments.to_string(),
180 };
181 let mut s = String::with_capacity(name.len() + 1 + normalised.len());
182 s.push_str(name);
183 s.push('\0');
184 s.push_str(&normalised);
185 s
186}
187
188fn hash_str(s: &str) -> u64 {
189 let mut h = DefaultHasher::new();
190 s.hash(&mut h);
191 h.finish()
192}
193
194#[cfg(test)]
195mod tests {
196 use super::*;
197
198 fn allow(d: &LoopGuardDecision) -> bool {
199 matches!(d, LoopGuardDecision::Allow)
200 }
201 fn block(d: &LoopGuardDecision) -> bool {
202 matches!(d, LoopGuardDecision::Block(_))
203 }
204
205 #[test]
206 fn third_identical_call_is_blocked() {
207 let mut g = LoopGuardState::default();
208 let args = r#"{"command":"cargo check"}"#;
209 let out = "error[E0599]: no method named `foo`";
210
211 assert!(allow(&g.check("bash", args)));
212 g.record("bash", args, out, false);
213
214 assert!(allow(&g.check("bash", args)));
215 g.record("bash", args, out, false);
216
217 // Third call with identical (name, args, output, success) → blocked.
218 let d = g.check("bash", args);
219 assert!(block(&d), "expected Block, got {:?}", d);
220 }
221
222 #[test]
223 fn rotating_arguments_do_not_trigger() {
224 // Pins the false-positive case from the deleted `detect_call_loop`
225 // (commit 9339cf1): a model running `cargo check 2>&1 | head` with
226 // different `head -N` filters each time looks superficially
227 // similar but has different args. Must not trip the guard.
228 let mut g = LoopGuardState::default();
229 let cmds = [
230 r#"{"command":"cargo check 2>&1 | head -50"}"#,
231 r#"{"command":"cargo check 2>&1 | head -100"}"#,
232 r#"{"command":"cargo check 2>&1 | head -200"}"#,
233 r#"{"command":"cargo check --message-format=short"}"#,
234 ];
235 for c in cmds {
236 assert!(allow(&g.check("bash", c)));
237 g.record("bash", c, "compile error", false);
238 }
239 // Even the 5th distinct command is allowed.
240 assert!(allow(&g.check("bash", r#"{"command":"cargo build"}"#)));
241 }
242
243 #[test]
244 fn ssh_batch_with_distinct_commands_does_not_trigger() {
245 // The other 9339cf1 false positive: a five-tool ssh-into-box
246 // sequence (sshpass / ls / cat / cp / chmod). All bash, all
247 // distinct args → must not trigger. The (name, args) discrimination
248 // alone handles this; included here as a regression pin.
249 let mut g = LoopGuardState::default();
250 let cmds = [
251 r#"{"command":"sshpass -p ... ssh user@host echo ok"}"#,
252 r#"{"command":"sshpass -p ... ssh user@host ls /var/log"}"#,
253 r#"{"command":"sshpass -p ... ssh user@host cat /etc/hostname"}"#,
254 r#"{"command":"sshpass -p ... ssh user@host uname -a"}"#,
255 r#"{"command":"sshpass -p ... ssh user@host df -h"}"#,
256 ];
257 for c in cmds {
258 let d = g.check("bash", c);
259 assert!(allow(&d), "expected Allow for {:?}, got {:?}", c, d);
260 g.record("bash", c, "ok", true);
261 }
262 }
263
264 #[test]
265 fn changing_output_does_not_trigger() {
266 // Polling a flaky endpoint where the body genuinely changes
267 // each call → real signal, must not block.
268 let mut g = LoopGuardState::default();
269 let args = r#"{"command":"curl -s http://service/health"}"#;
270
271 g.record("bash", args, "starting up", false);
272 g.record("bash", args, "starting up", false);
273 // Third call: output would have changed → not the loop pattern.
274 // We simulate by recording a different prior output for #2,
275 // then asserting #3 is allowed.
276 let mut g2 = LoopGuardState::default();
277 g2.record("bash", args, "starting up", false);
278 g2.record("bash", args, "ready", true);
279 assert!(allow(&g2.check("bash", args)));
280 }
281
282 #[test]
283 fn intervening_state_change_resets_window() {
284 // The TDD scenario: bash(cargo test) → fails → edit_file →
285 // bash(cargo test) again → fails (same output) → bash(cargo test) again.
286 // Without the reset, the third bash would block. With it, the
287 // edit clears the window, so the post-edit bash calls start
288 // counting fresh.
289 let mut g = LoopGuardState::default();
290 let bash_args = r#"{"command":"cargo test"}"#;
291 let edit_args = r#"{"file_path":"src/lib.rs","old":"a","new":"b"}"#;
292 let test_out = "test failed: assertion 1 != 2";
293
294 g.record("bash", bash_args, test_out, false);
295 g.record("bash", bash_args, test_out, false);
296 // Now an edit lands → progress.
297 g.record("edit_file", edit_args, "ok", true);
298 // The bash calls from before the edit should no longer count.
299 assert!(allow(&g.check("bash", bash_args)));
300 g.record("bash", bash_args, test_out, false);
301 assert!(allow(&g.check("bash", bash_args)));
302 g.record("bash", bash_args, test_out, false);
303 // Now we've hit the threshold *post-edit* → block.
304 assert!(block(&g.check("bash", bash_args)));
305 }
306
307 #[test]
308 fn state_change_with_repeated_key_does_not_reset() {
309 // A model spamming `edit_file` to the same path with the same
310 // diff is itself a loop — the state-change reset rule only
311 // fires on a *new* key. After three identical edit attempts
312 // the third must block.
313 let mut g = LoopGuardState::default();
314 let edit_args = r#"{"file_path":"a.rs","old":"x","new":"y"}"#;
315
316 g.record("edit_file", edit_args, "ok", true);
317 g.record("edit_file", edit_args, "ok", true);
318 // Third identical edit is still a loop, not progress.
319 assert!(block(&g.check("edit_file", edit_args)));
320 }
321
322 #[test]
323 fn clear_resets_state() {
324 // Per-user-message boundary: agent calls clear() and the next
325 // call starts fresh.
326 let mut g = LoopGuardState::default();
327 let args = r#"{"command":"ls"}"#;
328 g.record("bash", args, "out", true);
329 g.record("bash", args, "out", true);
330 assert!(block(&g.check("bash", args)));
331 g.clear();
332 assert!(allow(&g.check("bash", args)));
333 }
334
335 #[test]
336 fn hard_cap_blocks_loops_even_when_output_drifts() {
337 // The cargo-check-30-times scenario: model keeps issuing the
338 // same bash command, but each run produces slightly different
339 // output (warning order / timing / cache state). The strict
340 // THRESHOLD never fires because outputs differ; the hard cap
341 // catches it at HARD_THRESHOLD-1 prior matches (5 records →
342 // 6th is blocked).
343 let mut g = LoopGuardState::default();
344 let args = r#"{"command":"cargo check 2>&1 | tail -20"}"#;
345 // Each record is the same call but with subtly different output.
346 for i in 0..5 {
347 assert!(
348 allow(&g.check("bash", args)),
349 "call {} should still allow",
350 i + 1
351 );
352 g.record("bash", args, &format!("warning #{}: drift", i), false);
353 }
354 // 6th attempt: HARD_THRESHOLD reached, block regardless of drift.
355 let d = g.check("bash", args);
356 assert!(block(&d), "expected hard-cap Block on 6th call, got {:?}", d);
357 if let LoopGuardDecision::Block(msg) = d {
358 assert!(
359 msg.contains("Output varies slightly"),
360 "hard-cap message expected, got: {}",
361 msg
362 );
363 }
364 }
365
366 #[test]
367 fn polling_below_hard_cap_still_allowed() {
368 // Legitimate progress polling (5 attempts of curl health-check)
369 // must still be allowed under the hard cap. Only at attempt 6+
370 // do we intervene. Five attempts is enough budget for typical
371 // service-startup polling.
372 let mut g = LoopGuardState::default();
373 let args = r#"{"command":"curl http://service/health"}"#;
374 for i in 0..5 {
375 let d = g.check("bash", args);
376 assert!(
377 allow(&d),
378 "polling attempt {} should be allowed, got {:?}",
379 i + 1,
380 d
381 );
382 g.record("bash", args, &format!("attempt {}: starting", i), false);
383 }
384 // 5 records made, 6th about to be blocked — pin the boundary.
385 assert!(block(&g.check("bash", args)));
386 }
387
388 #[test]
389 fn json_whitespace_variants_collapse_across_batches() {
390 // The cross-batch counterpart to the runner's in-batch dedup
391 // normalisation: same call, different JSON formatting across
392 // turns must still trip the guard at THRESHOLD. Mirrors the
393 // deepseek-v4-flash symptom: model emits identical Glob
394 // intent with whitespace drift turn-after-turn.
395 let mut g = LoopGuardState::default();
396 let v1 = r#"{"pattern":"**/*.rs"}"#;
397 let v2 = r#"{"pattern": "**/*.rs"}"#; // extra space
398 let v3 = r#"{ "pattern":"**/*.rs" }"#; // padded braces
399 let out = "203 files found";
400
401 g.record("glob", v1, out, true);
402 g.record("glob", v2, out, true);
403 // v3 is the same call by intent → must block.
404 let d = g.check("glob", v3);
405 assert!(block(&d), "expected Block on JSON-variant repeat, got {:?}", d);
406 }
407
408 #[test]
409 fn name_collision_is_not_a_match() {
410 // Defensive: the `\0` separator in make_key prevents
411 // (name="foo", args="bar") from colliding with
412 // (name="foob", args="ar") even though concatenation would.
413 let mut g = LoopGuardState::default();
414 g.record("foo", "bar", "out", true);
415 g.record("foo", "bar", "out", true);
416 // A call whose name+args concatenate to "foobar" but with a
417 // different split must not be considered a repeat.
418 assert!(allow(&g.check("foob", "ar")));
419 }
420}