noos 0.4.1

Reliability layer for Rust LLM agents: scope drift, cost circuit breaks, and procedural correction memory as event-driven Decisions.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
//! Tool-call observation channel (Path A, 0.3.0).
//!
//! **Scope note (P1 / P9b)**: I/O adapter sub-module, not cognitive.
//! Everything here is sequence tracking over opaque tool-name strings
//! — no sentiment lexicon, no topic inference, no text content
//! analysis. Tool names are treated as identifiers chosen by the host
//! application (`get_weather`, `search_kb`); the detector only cares
//! whether consecutive calls repeat. P1 applies to the wrapped
//! [`CognitiveSession`](crate::session::CognitiveSession); P9b is
//! satisfied by construction.
//!
//! Modern LLM agents make tool calls (function calls, API calls,
//! retrieval). Without visibility into these, the regulator cannot
//! detect a very common failure mode: the agent calls the same tool
//! over and over, burning cost without producing progress.
//!
//! This module tracks per-turn tool-call sequences. It exposes a
//! detector `detected_loop()` that fires `Some((tool, count))` when the
//! last [`TOOL_LOOP_THRESHOLD`] consecutive tool calls are the same
//! tool name — the signature pattern of "agent is stuck in a retry
//! loop on a single tool that is failing / not making progress."
//!
//! Counts reset on every [`LLMEvent::TurnStart`] so a new turn gets a
//! fresh view — accumulation across turns would conflate "legitimately
//! hit this tool 3 times across a multi-step plan" with "stuck in a
//! one-tool loop." The loop-detection unit of account is the turn.
//!
//! `ToolResult` events carry `duration_ms` + `success` for observability
//! hooks (accessible via [`ToolStatsAccumulator::total_duration_ms`],
//! [`ToolStatsAccumulator::failure_count`] and related getters); they do
//! not influence loop detection in 0.3.0.
//!
//! ## Gating (P10)
//!
//! This module produces the
//! [`CircuitBreakReason::RepeatedToolCallLoop`] variant of
//! [`Decision::CircuitBreak`] via [`Regulator::decide`].
//!
//! - **Suppresses**:
//!   [`Decision::ScopeDriftWarn`],
//!   [`Decision::ProceduralWarning`],
//!   [`Decision::Continue`]. Tool-loop halt is a circuit break — it
//!   dominates every advisory below it.
//! - **Suppressed by**:
//!   [`CircuitBreakReason::CostCapReached`] and
//!   [`CircuitBreakReason::QualityDeclineNoRecovery`]. The two
//!   cost / quality circuit breaks are considered more urgent in the
//!   priority chain — if the budget has already been blown or quality
//!   is collapsing, halting on tool pattern alone would mask the
//!   bigger problem.
//! - **Inactive when**: fewer than [`TOOL_LOOP_THRESHOLD`]
//!   [`LLMEvent::ToolCall`] events have landed since the last
//!   [`LLMEvent::TurnStart`], OR the most recent
//!   [`TOOL_LOOP_THRESHOLD`] calls include at least two distinct tool
//!   names (interleaving breaks a loop).
//!
//! [`CircuitBreakReason::RepeatedToolCallLoop`]: super::CircuitBreakReason::RepeatedToolCallLoop
//! [`CircuitBreakReason::CostCapReached`]: super::CircuitBreakReason::CostCapReached
//! [`CircuitBreakReason::QualityDeclineNoRecovery`]: super::CircuitBreakReason::QualityDeclineNoRecovery
//! [`Decision::CircuitBreak`]: super::Decision::CircuitBreak
//! [`Decision::ScopeDriftWarn`]: super::Decision::ScopeDriftWarn
//! [`Decision::ProceduralWarning`]: super::Decision::ProceduralWarning
//! [`Decision::Continue`]: super::Decision::Continue
//! [`Regulator::decide`]: super::Regulator::decide
//! [`LLMEvent::ToolCall`]: super::LLMEvent::ToolCall
//! [`LLMEvent::TurnStart`]: super::LLMEvent::TurnStart

use std::collections::HashMap;

// ── Constants ──────────────────────────────────────────────────────────

/// Number of consecutive calls to the same tool that trip the loop
/// circuit-break. Chosen to balance "false alarms on a legitimate
/// 3-retry strategy" against "waste from a runaway loop."
///
/// The comparison in [`ToolStatsAccumulator::detected_loop`] is
/// inclusive (`>=`): a trailing run of exactly this many same-name
/// calls already trips the detector.
///
/// 5 is conservative: a well-behaved agent almost never calls the
/// same tool 5× in a row without interleaving a reasoning or
/// different-tool step. Anthropic's agentic-retrieval loop
/// research observes loops typically exceed 10 repetitions when
/// they occur, so 5 catches them with margin.
pub const TOOL_LOOP_THRESHOLD: usize = 5;

// ── Records ────────────────────────────────────────────────────────────

/// One observed tool call.
///
/// Built by [`ToolStatsAccumulator::record_call`], which fills in
/// `turn_local_index` from the number of calls already recorded in the
/// current turn.
#[derive(Debug, Clone)]
pub struct ToolCallRecord {
    /// Tool name as reported by the agent (the caller owns the naming
    /// convention — `"search"`, `"db.query"`, `"exec_python"`, etc.).
    /// This is the only field loop detection compares.
    pub tool_name: String,
    /// Optional JSON-serialised args. Opaque to the regulator; stored
    /// only for observability / downstream inspection by the app.
    /// Never parsed or compared by the loop detector.
    pub args_json: Option<String>,
    /// Index within the current turn (0-based). Resets on `TurnStart`.
    pub turn_local_index: usize,
}

/// One observed tool result.
///
/// Built by [`ToolStatsAccumulator::record_result`]. Results are stored
/// for the observability accessors only; they are not consulted by
/// [`ToolStatsAccumulator::detected_loop`].
#[derive(Debug, Clone)]
pub struct ToolResultRecord {
    /// Tool name supplied by the caller together with the result.
    pub tool_name: String,
    /// Whether the caller reported the tool invocation as successful.
    /// `false` entries are what
    /// [`ToolStatsAccumulator::failure_count`] counts.
    pub success: bool,
    /// Caller-reported duration of the invocation, in milliseconds.
    /// Summed by [`ToolStatsAccumulator::total_duration_ms`].
    pub duration_ms: u64,
    /// Optional caller-supplied error description (for example
    /// `"timeout"`). Stored as-is; no accessor in this module reads it.
    pub error_summary: Option<String>,
    /// 0-based position among the results recorded in the current turn,
    /// taken from the result count at record time. Counted
    /// independently of [`ToolCallRecord::turn_local_index`].
    pub turn_local_index: usize,
}

// ── Accumulator ────────────────────────────────────────────────────────

/// Per-turn tool-call history + loop detector.
///
/// Reset via [`ToolStatsAccumulator::reset_turn`] at every
/// `LLMEvent::TurnStart`. The regulator drives that lifecycle; callers
/// don't interact with this type directly.
///
/// Calls and results are kept in two independent lists; nothing in this
/// type pairs a result with the call that produced it. The derived
/// `Default` yields an accumulator with both lists empty.
#[derive(Debug, Clone, Default)]
pub struct ToolStatsAccumulator {
    /// All tool calls observed in the current turn, in emission order.
    /// The trailing same-name run of this list is what loop detection
    /// inspects.
    calls: Vec<ToolCallRecord>,
    /// All tool results observed in the current turn, in emission order.
    /// Feeds only the count / duration / failure accessors.
    results: Vec<ToolResultRecord>,
}

impl ToolStatsAccumulator {
    /// Fresh accumulator with empty call and result history.
    pub fn new() -> Self {
        Self::default()
    }

    /// Mutable: clear per-turn history (calls and results). Called by
    /// the regulator on every `LLMEvent::TurnStart` so tool-loop
    /// detection only fires within a single turn (cross-turn
    /// accumulation would conflate "stuck in a loop this turn" with
    /// "legitimately touched this tool in several adjacent turns").
    pub fn reset_turn(&mut self) {
        self.calls.clear();
        self.results.clear();
    }

    /// Mutable: append a tool-call record to the per-turn history.
    /// Requires mutation because the history is accumulated state
    /// feeding [`Self::detected_loop`] and the observability getters.
    ///
    /// The record's `turn_local_index` is the number of calls already
    /// recorded this turn, so the first call after a reset gets `0`.
    pub fn record_call(&mut self, tool_name: String, args_json: Option<String>) {
        let turn_local_index = self.calls.len();
        self.calls.push(ToolCallRecord {
            tool_name,
            args_json,
            turn_local_index,
        });
    }

    /// Mutable: append a tool-result record to the per-turn history.
    /// Requires mutation because the history drives the observability
    /// accessors ([`Self::total_duration_ms`], [`Self::failure_count`]).
    /// Does not feed loop detection in 0.3.0.
    ///
    /// The record's `turn_local_index` counts results only; it is
    /// independent of the call index assigned by [`Self::record_call`].
    pub fn record_result(
        &mut self,
        tool_name: String,
        success: bool,
        duration_ms: u64,
        error_summary: Option<String>,
    ) {
        let turn_local_index = self.results.len();
        self.results.push(ToolResultRecord {
            tool_name,
            success,
            duration_ms,
            error_summary,
            turn_local_index,
        });
    }

    /// Total tool calls observed this turn.
    pub fn total_calls(&self) -> usize {
        self.calls.len()
    }

    /// Total tool results observed this turn.
    pub fn total_results(&self) -> usize {
        self.results.len()
    }

    /// Per-tool call counts this turn. Useful for observability —
    /// `{"db.query": 3, "search": 1}` style reporting.
    ///
    /// Returns a freshly built map on every call; tools that were not
    /// called this turn have no entry.
    pub fn counts_by_tool(&self) -> HashMap<String, usize> {
        let mut counts: HashMap<String, usize> = HashMap::new();
        for call in &self.calls {
            *counts.entry(call.tool_name.clone()).or_insert(0) += 1;
        }
        counts
    }

    /// Detect a consecutive-same-tool loop.
    ///
    /// Returns `Some((tool_name, count))` when the last [`TOOL_LOOP_THRESHOLD`]
    /// calls were all the same tool. `count` is the number of consecutive
    /// same-tool calls counting back from the most recent, so it can
    /// exceed the threshold when the run is longer.
    ///
    /// Returns `None` when:
    /// - Fewer than `TOOL_LOOP_THRESHOLD` calls have been observed.
    /// - The last `TOOL_LOOP_THRESHOLD` calls include at least two
    ///   distinct tools (the agent interleaved, which breaks a loop).
    pub fn detected_loop(&self) -> Option<(String, usize)> {
        // An empty history has no last call and therefore no loop.
        let last_tool = &self.calls.last()?.tool_name;
        // Length of the trailing run: walk backwards and stop at the
        // first call whose name differs. The run can never be longer
        // than the history itself, so "too few calls" is covered by the
        // threshold comparison below without a separate length check.
        let run_len = self
            .calls
            .iter()
            .rev()
            .take_while(|call| &call.tool_name == last_tool)
            .count();
        (run_len >= TOOL_LOOP_THRESHOLD).then(|| (last_tool.clone(), run_len))
    }

    /// Total wall-clock time spent in tool results this turn, in
    /// milliseconds. Zero when no results have been observed.
    ///
    /// Durations are caller-supplied, so the sum saturates at
    /// `u64::MAX` rather than overflowing (a plain `sum()` would panic
    /// in debug builds and wrap in release builds).
    pub fn total_duration_ms(&self) -> u64 {
        self.results
            .iter()
            .fold(0u64, |total, result| total.saturating_add(result.duration_ms))
    }

    /// Count of failed results (`success == false`) this turn.
    pub fn failure_count(&self) -> usize {
        self.results.iter().filter(|r| !r.success).count()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Longest same-tool run that must stay silent. Derived from the
    /// constant so the boundary tests keep testing the boundary if the
    /// threshold is ever retuned.
    const JUST_UNDER: usize = TOOL_LOOP_THRESHOLD - 1;

    /// Record `n` back-to-back, argument-less calls to `tool`.
    fn record_n(acc: &mut ToolStatsAccumulator, tool: &str, n: usize) {
        for _ in 0..n {
            acc.record_call(tool.into(), None);
        }
    }

    #[test]
    fn empty_has_no_loop() {
        let acc = ToolStatsAccumulator::new();
        assert!(acc.detected_loop().is_none());
        assert_eq!(acc.total_calls(), 0);
    }

    #[test]
    fn below_threshold_has_no_loop() {
        let mut acc = ToolStatsAccumulator::new();
        record_n(&mut acc, "search", JUST_UNDER);
        assert!(acc.detected_loop().is_none());
    }

    #[test]
    fn threshold_same_tool_detects_loop() {
        let mut acc = ToolStatsAccumulator::new();
        record_n(&mut acc, "search", TOOL_LOOP_THRESHOLD);
        let (tool, count) = acc.detected_loop().expect("loop should fire");
        assert_eq!(tool, "search");
        assert_eq!(count, TOOL_LOOP_THRESHOLD);
    }

    #[test]
    fn interleaved_tools_break_loop() {
        let mut acc = ToolStatsAccumulator::new();
        // (THRESHOLD-1) search + 1 db.query + (THRESHOLD-1) search ⇒ the
        // trailing same-tool run is THRESHOLD-1, below threshold, even
        // though "search" was called well over THRESHOLD times in total.
        record_n(&mut acc, "search", JUST_UNDER);
        acc.record_call("db.query".into(), None);
        record_n(&mut acc, "search", JUST_UNDER);
        assert!(acc.detected_loop().is_none());
    }

    #[test]
    fn reset_turn_clears_state() {
        let mut acc = ToolStatsAccumulator::new();
        record_n(&mut acc, "search", TOOL_LOOP_THRESHOLD);
        acc.record_result("search".into(), false, 10, None);
        assert!(acc.detected_loop().is_some());
        acc.reset_turn();
        assert!(acc.detected_loop().is_none());
        assert_eq!(acc.total_calls(), 0);
        // Results are per-turn state too and must be dropped with the calls.
        assert_eq!(acc.total_results(), 0);
        assert_eq!(acc.total_duration_ms(), 0);
        assert_eq!(acc.failure_count(), 0);
    }

    #[test]
    fn counts_by_tool_aggregates_correctly() {
        let mut acc = ToolStatsAccumulator::new();
        acc.record_call("search".into(), None);
        acc.record_call("search".into(), None);
        acc.record_call("db.query".into(), None);
        let counts = acc.counts_by_tool();
        assert_eq!(counts.get("search"), Some(&2));
        assert_eq!(counts.get("db.query"), Some(&1));
        assert_eq!(counts.len(), 2, "only tools that were called get an entry");
    }

    #[test]
    fn duration_and_failure_counters() {
        let mut acc = ToolStatsAccumulator::new();
        acc.record_result("search".into(), true, 100, None);
        acc.record_result("db.query".into(), false, 250, Some("timeout".into()));
        acc.record_result("search".into(), true, 50, None);
        assert_eq!(acc.total_duration_ms(), 400);
        assert_eq!(acc.failure_count(), 1);
        assert_eq!(acc.total_results(), 3);
    }

    #[test]
    fn results_do_not_feed_loop_detection() {
        // Only recorded calls drive detection; a pile of (failed)
        // results for one tool must not trip it on its own.
        let mut acc = ToolStatsAccumulator::new();
        for _ in 0..TOOL_LOOP_THRESHOLD {
            acc.record_result("search".into(), false, 1, Some("timeout".into()));
        }
        assert!(acc.detected_loop().is_none());
        assert_eq!(acc.total_calls(), 0);
    }

    #[test]
    fn turn_local_index_restarts_after_reset() {
        let mut acc = ToolStatsAccumulator::new();
        record_n(&mut acc, "search", 3);
        let indices: Vec<usize> = acc.calls.iter().map(|c| c.turn_local_index).collect();
        assert_eq!(indices, vec![0, 1, 2]);
        acc.reset_turn();
        record_n(&mut acc, "search", 1);
        assert_eq!(acc.calls[0].turn_local_index, 0);
    }

    // ── Adversarial tests (Session 31) ─────────────────────────────
    //
    // Session 30 audit flagged the existing test suite as "staged" —
    // all cases were simple "N same-tool calls in a row" patterns.
    // These tests cover shapes that could plausibly fool the detector
    // one way or the other, and document the exact design boundary
    // ("consecutive same tool at the trailing run, by name, regardless
    // of args, ignoring historical loops that resolved").

    #[test]
    fn loop_fires_on_tail_run_after_mixed_prefix() {
        // Agent starts exploring (explore, db.query), then falls into
        // search×THRESHOLD at the tail. Detection must look at the
        // trailing run only — the initial mixed prefix is irrelevant.
        let mut acc = ToolStatsAccumulator::new();
        acc.record_call("explore".into(), None);
        acc.record_call("db.query".into(), None);
        record_n(&mut acc, "search", TOOL_LOOP_THRESHOLD);
        let (tool, count) = acc
            .detected_loop()
            .expect("tail same-tool run should fire regardless of prefix");
        assert_eq!(tool, "search");
        assert_eq!(count, TOOL_LOOP_THRESHOLD);
    }

    #[test]
    fn loop_fires_on_same_tool_with_diverging_args() {
        // The $47k LangChain incident shape: agent "refines" args on
        // each retry but keeps calling the same tool. Args are opaque
        // to the detector — only the tool NAME drives loop detection.
        let mut acc = ToolStatsAccumulator::new();
        for i in 0..TOOL_LOOP_THRESHOLD {
            acc.record_call(
                "search_orders".into(),
                Some(format!("{{\"user_id\":42,\"attempt\":{i}}}")),
            );
        }
        let (tool, count) = acc
            .detected_loop()
            .expect("same name + varying args is still a loop");
        assert_eq!(tool, "search_orders");
        assert_eq!(count, TOOL_LOOP_THRESHOLD);
    }

    #[test]
    fn over_threshold_returns_actual_count_not_threshold() {
        // A run 3 past the threshold → count is the full run length,
        // not the threshold. The detector reports the full run depth so
        // apps can surface "loop depth" for debugging / telemetry.
        let depth = TOOL_LOOP_THRESHOLD + 3;
        let mut acc = ToolStatsAccumulator::new();
        record_n(&mut acc, "search", depth);
        let (_, count) = acc.detected_loop().expect("well over threshold");
        assert_eq!(
            count, depth,
            "count reflects actual run depth, not threshold"
        );
    }

    #[test]
    fn resolved_loop_followed_by_short_tail_does_not_fire() {
        // An earlier same-tool run that RESOLVED (agent switched
        // strategy) must not fire. Detection is on the trailing run —
        // a historical loop that the agent moved past is explicitly
        // out-of-scope.
        let mut acc = ToolStatsAccumulator::new();
        // Loop phase — but later interrupted
        record_n(&mut acc, "search", TOOL_LOOP_THRESHOLD);
        // Agent recognizes the loop and switches
        record_n(&mut acc, "db.query", 2);
        assert!(
            acc.detected_loop().is_none(),
            "historical loops that resolved are explicitly out-of-scope"
        );
    }

    #[test]
    fn alternating_high_volume_tools_do_not_fire() {
        // Many calls alternating A/B — agent IS stuck in a ping-pong,
        // but this is a different pathology ("A↔B ping-pong") that
        // RepeatedToolCallLoop does NOT claim to catch. Documented
        // boundary — a future CircuitBreakReason (e.g.,
        // AlternatingToolPingPong) could cover this; Noos 0.3.0
        // deliberately does not.
        let mut acc = ToolStatsAccumulator::new();
        for i in 0..(TOOL_LOOP_THRESHOLD * 4) {
            let tool = if i % 2 == 0 { "search" } else { "db.query" };
            acc.record_call(tool.into(), None);
        }
        assert!(
            acc.detected_loop().is_none(),
            "alternating A/B is not a same-tool consecutive loop by design"
        );
    }

    #[test]
    fn exact_threshold_boundary_both_sides() {
        // Paired boundary: THRESHOLD-1 silent, THRESHOLD fires. Guards
        // against an off-by-one regression.
        let mut under = ToolStatsAccumulator::new();
        record_n(&mut under, "t", JUST_UNDER);
        assert!(
            under.detected_loop().is_none(),
            "THRESHOLD - 1 must not fire"
        );

        let mut at = ToolStatsAccumulator::new();
        record_n(&mut at, "t", TOOL_LOOP_THRESHOLD);
        assert!(at.detected_loop().is_some(), "THRESHOLD must fire");
    }
}