Skip to main content

vtcode_core/tools/resilience/
mod.rs

//! Resilience primitives for tool execution.
//!
//! Collects the concerns that govern how tools recover from, and bound the
//! impact of, transient failures:
//!
//! - [`circuit_breaker`] - state-machine fault isolation (Closed/Open/HalfOpen)
//!   keyed by tool name. Tracks failure counts, applies exponential backoff,
//!   and short-circuits calls when a downstream is degraded.
//! - [`adaptive_rate_limiter`] - per-key token-bucket rate limiter with adaptive
//!   refill and priority weighting. Used to throttle tools whose cost varies
//!   with the call.
//! - [`rate_limiter`] - per-tool budgeted counter rate limiter. Provides the
//!   per-tool token bucket that the executor uses for synchronous admission.
//! - [`ToolResilience`] - the unified facade. New callers should use this
//!   rather than reaching for the individual primitives, so that all three
//!   invariants (admission, fault isolation, post-call accounting) are checked
//!   in one place.
//!
//! The primitives share callers (autonomous executor, tool pipeline, agent
//! error recovery) but address distinct failure modes; they are grouped here
//! so that a maintainer can audit the full resilience toolkit in one place.
22
23pub mod adaptive_rate_limiter;
24pub mod circuit_breaker;
25pub mod rate_limiter;
26
27use std::sync::Arc;
28use std::time::Duration;
29
30use once_cell::sync::Lazy;
31
32use vtcode_commons::ErrorCategory;
33
34use self::adaptive_rate_limiter::{AdaptiveRateLimiter, Priority};
35use self::circuit_breaker::CircuitBreaker;
36
/// Outcome of a tool call, used by [`ToolResilience::record_outcome`] to update
/// the circuit breaker.
///
/// The enum is a plain field-less value type: it is `Copy`, comparable with
/// `==`, and hashable, so callers can match on it, assert on it in tests, or
/// use it as a map key (for example when counting outcomes per tool).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CallOutcome {
    /// The call succeeded. Resets the failure counter for the tool.
    Success,
    /// The call failed due to a non-retryable error (invalid arguments, plan-mode
    /// denial, permission denial). Does not trip the circuit breaker.
    InvalidArgument,
    /// The call failed due to a retryable execution error. May trip the circuit
    /// breaker after `failure_threshold` consecutive occurrences.
    ExecutionError,
    /// The call failed due to cancellation (Drop guard, timeout, user abort).
    /// Treated like an execution error for breaker accounting.
    Cancelled,
}
53
54impl CallOutcome {
55    /// Map this outcome to the `ErrorCategory` used by the circuit breaker for
56    /// non-success outcomes. Returns `None` for [`CallOutcome::Success`] because
57    /// success is handled by a separate code path (`record_success`).
58    fn to_error_category(self) -> Option<ErrorCategory> {
59        match self {
60            CallOutcome::Success => None,
61            CallOutcome::InvalidArgument => Some(ErrorCategory::InvalidParameters),
62            CallOutcome::ExecutionError | CallOutcome::Cancelled => {
63                Some(ErrorCategory::ExecutionError)
64            }
65        }
66    }
67}
68
69/// Unified facade for tool resilience. Wraps the adaptive rate limiter and the
70/// circuit breaker so callers can use a single API for admission, fault
71/// isolation, and post-call accounting.
72///
73/// # Example
74///
75/// ```ignore
76/// use vtcode_core::tools::resilience::{GLOBAL_TOOL_RESILIENCE, CallOutcome, Priority};
77///
78/// // On entry:
79/// GLOBAL_TOOL_RESILIENCE
80///     .try_acquire("read_file", Priority::Normal)
81///     .map_err(|wait| anyhow!("rate limited; retry after {wait:?}"))?;
82///
83/// // On exit:
84/// match result {
85///     Ok(_) => GLOBAL_TOOL_RESILIENCE.record_success("read_file"),
86///     Err(e) if e.is_argument_error() => {
87///         GLOBAL_TOOL_RESILIENCE.record_outcome("read_file", CallOutcome::InvalidArgument);
88///     }
89///     Err(_) => {
90///         GLOBAL_TOOL_RESILIENCE.record_outcome("read_file", CallOutcome::ExecutionError);
91///     }
92/// }
93/// ```
94pub struct ToolResilience {
95    rate_limiter: AdaptiveRateLimiter,
96    circuit_breaker: CircuitBreaker,
97}
98
99impl ToolResilience {
100    /// Construct a new facade with the supplied adaptive rate limiter and
101    /// circuit breaker.
102    pub fn new(rate_limiter: AdaptiveRateLimiter, circuit_breaker: CircuitBreaker) -> Self {
103        Self {
104            rate_limiter,
105            circuit_breaker,
106        }
107    }
108
109    /// Try to acquire a token for the tool. Returns `Ok(())` when the call is
110    /// allowed. When the tool is currently rate limited the suggested wait
111    /// duration is returned in `Err`.
112    pub fn try_acquire(&self, tool_name: &str, priority: Priority) -> Result<(), Duration> {
113        // 1. Fault isolation first: a tool with an open circuit is rejected
114        //    immediately, even if the rate limiter has tokens to spare.
115        if !self.circuit_breaker.allow_request_for_tool(tool_name) {
116            let backoff = self
117                .circuit_breaker
118                .remaining_backoff(tool_name)
119                .unwrap_or_else(|| Duration::from_millis(100));
120            return Err(backoff);
121        }
122        // 2. Rate limit. Configure the priority (idempotent).
123        self.rate_limiter.set_priority(tool_name, priority);
124        self.rate_limiter.try_acquire(tool_name)
125    }
126
127    /// Record a successful call. Closes the circuit if it was HalfOpen.
128    pub fn record_success(&self, tool_name: &str) {
129        self.circuit_breaker.record_success_for_tool(tool_name);
130    }
131
132    /// Record a non-success outcome. Categorical errors that should not trip
133    /// the breaker (`InvalidArgument`) are routed through
134    /// `CallOutcome::InvalidArgument`. See [`CallOutcome`].
135    pub fn record_outcome(&self, tool_name: &str, outcome: CallOutcome) {
136        match outcome.to_error_category() {
137            // Success: reset the breaker (also covers HalfOpen -> Closed).
138            None => self.circuit_breaker.record_success_for_tool(tool_name),
139            // Failure: the breaker API is a no-op for non-circuit-breaking
140            // categories, so InvalidArgument collapses to a harmless call.
141            Some(category) => self
142                .circuit_breaker
143                .record_failure_category_for_tool(tool_name, category),
144        }
145    }
146
147    /// Diagnostic snapshot of the circuit breaker.
148    pub fn circuit_snapshot(&self) -> circuit_breaker::CircuitBreakerSnapshot {
149        self.circuit_breaker.snapshot()
150    }
151}
152
153/// Process-wide resilience facade. Constructed lazily from the shared adaptive
154/// rate limiter and a default circuit breaker.
155pub static GLOBAL_TOOL_RESILIENCE: Lazy<Arc<ToolResilience>> = Lazy::new(|| {
156    Arc::new(ToolResilience::new(
157        AdaptiveRateLimiter::default(),
158        CircuitBreaker::default(),
159    ))
160});
161
#[cfg(test)]
mod tests {
    use super::*;

    /// Two execution errors with `failure_threshold: 2` must cause the next
    /// admission attempt to be rejected.
    #[test]
    fn facade_records_success_and_failures() {
        let limiter = AdaptiveRateLimiter::new(8.0, 4.0);
        let breaker = CircuitBreaker::new(circuit_breaker::CircuitBreakerConfig {
            failure_threshold: 2,
            ..Default::default()
        });
        let resilience = ToolResilience::new(limiter, breaker);

        // Each of the first two calls is admitted and then reported as an
        // execution error, which should open the circuit.
        for admitted_message in ["first call allowed", "second call allowed"] {
            resilience
                .try_acquire("alpha", Priority::Normal)
                .expect(admitted_message);
            resilience.record_outcome("alpha", CallOutcome::ExecutionError);
        }

        // The third call must be turned away.
        let third = resilience.try_acquire("alpha", Priority::Normal);
        assert!(third.is_err(), "circuit should be open after 2 failures");
    }

    /// Invalid-argument failures must never open the circuit, even with a
    /// threshold of one.
    #[test]
    fn invalid_argument_does_not_trip_breaker() {
        let limiter = AdaptiveRateLimiter::new(8.0, 4.0);
        let breaker = CircuitBreaker::new(circuit_breaker::CircuitBreakerConfig {
            failure_threshold: 1,
            ..Default::default()
        });
        let resilience = ToolResilience::new(limiter, breaker);

        (0..3).for_each(|_| {
            resilience
                .try_acquire("beta", Priority::Normal)
                .expect("call allowed");
            resilience.record_outcome("beta", CallOutcome::InvalidArgument);
        });

        // After three invalid-argument failures the tool is still admitted.
        let after_invalid_arguments = resilience.try_acquire("beta", Priority::Normal);
        assert!(after_invalid_arguments.is_ok());
    }
}