vtcode_core/tools/resilience/mod.rs
1//! Resilience primitives for tool execution.
2//!
3//! Collects the concerns that govern how tools recover from and bound
4//! transient failures:
5//!
6//! - [`circuit_breaker`] - state-machine fault isolation (Closed/Open/HalfOpen)
7//! keyed by tool name. Tracks failure counts, applies exponential backoff,
8//! and short-circuits calls when a downstream is degraded.
9//! - [`adaptive_rate_limiter`] - per-key token-bucket rate limiter with adaptive
10//! refill and priority weighting. Used to throttle tools whose cost varies
11//! with the call.
12//! - [`rate_limiter`] - per-tool budgeted counter rate limiter. Provides the
13//! per-tool token bucket that the executor uses for synchronous admission.
14//! - [`ToolResilience`] - the unified facade. New callers should use this
15//! rather than reaching for the individual primitives, so that all three
16//! invariants (admission, fault isolation, post-call accounting) are checked
17//! in one place.
18//!
19//! The primitives share callers (autonomous executor, tool pipeline, agent
20//! error recovery) but address distinct failure modes; they are grouped here
21//! so a maintainer can audit the full resilience toolkit in one place.
22
23pub mod adaptive_rate_limiter;
24pub mod circuit_breaker;
25pub mod rate_limiter;
26
27use std::sync::Arc;
28use std::time::Duration;
29
30use once_cell::sync::Lazy;
31
32use vtcode_commons::ErrorCategory;
33
34use self::adaptive_rate_limiter::{AdaptiveRateLimiter, Priority};
35use self::circuit_breaker::CircuitBreaker;
36
/// Classification of a finished tool call, passed to
/// [`ToolResilience::record_outcome`] so the circuit breaker can be updated.
///
/// The type is a payload-free `Copy` tag. It also implements `PartialEq`,
/// `Eq`, and `Hash` so outcomes can be compared, asserted on in tests, and
/// used as keys (for example when tallying outcomes per tool).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CallOutcome {
    /// The call succeeded. Resets the failure counter for the tool.
    Success,
    /// The call failed due to a non-retryable error (invalid arguments, plan-mode
    /// denial, permission denial). Does not trip the circuit breaker.
    InvalidArgument,
    /// The call failed due to a retryable execution error. May trip the circuit
    /// breaker after `failure_threshold` consecutive occurrences.
    ExecutionError,
    /// The call failed due to cancellation (Drop guard, timeout, user abort).
    /// Treated like an execution error for breaker accounting.
    Cancelled,
}
53
54impl CallOutcome {
55 /// Map this outcome to the `ErrorCategory` used by the circuit breaker for
56 /// non-success outcomes. Returns `None` for [`CallOutcome::Success`] because
57 /// success is handled by a separate code path (`record_success`).
58 fn to_error_category(self) -> Option<ErrorCategory> {
59 match self {
60 CallOutcome::Success => None,
61 CallOutcome::InvalidArgument => Some(ErrorCategory::InvalidParameters),
62 CallOutcome::ExecutionError | CallOutcome::Cancelled => {
63 Some(ErrorCategory::ExecutionError)
64 }
65 }
66 }
67}
68
69/// Unified facade for tool resilience. Wraps the adaptive rate limiter and the
70/// circuit breaker so callers can use a single API for admission, fault
71/// isolation, and post-call accounting.
72///
73/// # Example
74///
75/// ```ignore
76/// use vtcode_core::tools::resilience::{GLOBAL_TOOL_RESILIENCE, CallOutcome, Priority};
77///
78/// // On entry:
79/// GLOBAL_TOOL_RESILIENCE
80/// .try_acquire("read_file", Priority::Normal)
81/// .map_err(|wait| anyhow!("rate limited; retry after {wait:?}"))?;
82///
83/// // On exit:
84/// match result {
85/// Ok(_) => GLOBAL_TOOL_RESILIENCE.record_success("read_file"),
86/// Err(e) if e.is_argument_error() => {
87/// GLOBAL_TOOL_RESILIENCE.record_outcome("read_file", CallOutcome::InvalidArgument);
88/// }
89/// Err(_) => {
90/// GLOBAL_TOOL_RESILIENCE.record_outcome("read_file", CallOutcome::ExecutionError);
91/// }
92/// }
93/// ```
94pub struct ToolResilience {
95 rate_limiter: AdaptiveRateLimiter,
96 circuit_breaker: CircuitBreaker,
97}
98
99impl ToolResilience {
100 /// Construct a new facade with the supplied adaptive rate limiter and
101 /// circuit breaker.
102 pub fn new(rate_limiter: AdaptiveRateLimiter, circuit_breaker: CircuitBreaker) -> Self {
103 Self {
104 rate_limiter,
105 circuit_breaker,
106 }
107 }
108
109 /// Try to acquire a token for the tool. Returns `Ok(())` when the call is
110 /// allowed. When the tool is currently rate limited the suggested wait
111 /// duration is returned in `Err`.
112 pub fn try_acquire(&self, tool_name: &str, priority: Priority) -> Result<(), Duration> {
113 // 1. Fault isolation first: a tool with an open circuit is rejected
114 // immediately, even if the rate limiter has tokens to spare.
115 if !self.circuit_breaker.allow_request_for_tool(tool_name) {
116 let backoff = self
117 .circuit_breaker
118 .remaining_backoff(tool_name)
119 .unwrap_or_else(|| Duration::from_millis(100));
120 return Err(backoff);
121 }
122 // 2. Rate limit. Configure the priority (idempotent).
123 self.rate_limiter.set_priority(tool_name, priority);
124 self.rate_limiter.try_acquire(tool_name)
125 }
126
127 /// Record a successful call. Closes the circuit if it was HalfOpen.
128 pub fn record_success(&self, tool_name: &str) {
129 self.circuit_breaker.record_success_for_tool(tool_name);
130 }
131
132 /// Record a non-success outcome. Categorical errors that should not trip
133 /// the breaker (`InvalidArgument`) are routed through
134 /// `CallOutcome::InvalidArgument`. See [`CallOutcome`].
135 pub fn record_outcome(&self, tool_name: &str, outcome: CallOutcome) {
136 match outcome.to_error_category() {
137 // Success: reset the breaker (also covers HalfOpen -> Closed).
138 None => self.circuit_breaker.record_success_for_tool(tool_name),
139 // Failure: the breaker API is a no-op for non-circuit-breaking
140 // categories, so InvalidArgument collapses to a harmless call.
141 Some(category) => self
142 .circuit_breaker
143 .record_failure_category_for_tool(tool_name, category),
144 }
145 }
146
147 /// Diagnostic snapshot of the circuit breaker.
148 pub fn circuit_snapshot(&self) -> circuit_breaker::CircuitBreakerSnapshot {
149 self.circuit_breaker.snapshot()
150 }
151}
152
153/// Process-wide resilience facade. Constructed lazily from the shared adaptive
154/// rate limiter and a default circuit breaker.
155pub static GLOBAL_TOOL_RESILIENCE: Lazy<Arc<ToolResilience>> = Lazy::new(|| {
156 Arc::new(ToolResilience::new(
157 AdaptiveRateLimiter::default(),
158 CircuitBreaker::default(),
159 ))
160});
161
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a facade around a fresh rate limiter and a breaker using `config`.
    fn facade_with(config: circuit_breaker::CircuitBreakerConfig) -> ToolResilience {
        ToolResilience::new(AdaptiveRateLimiter::new(8.0, 4.0), CircuitBreaker::new(config))
    }

    /// Two consecutive execution errors with `failure_threshold: 2` must cause
    /// the next admission attempt to be rejected.
    #[test]
    fn facade_records_success_and_failures() {
        let resilience = facade_with(circuit_breaker::CircuitBreakerConfig {
            failure_threshold: 2,
            ..Default::default()
        });

        // Both initial calls are admitted; each one is then reported as an
        // execution error, which opens the circuit.
        for admitted in ["first call allowed", "second call allowed"] {
            resilience
                .try_acquire("alpha", Priority::Normal)
                .expect(admitted);
            resilience.record_outcome("alpha", CallOutcome::ExecutionError);
        }

        // The next call must be rejected by the circuit breaker.
        let rejected = resilience.try_acquire("alpha", Priority::Normal);
        assert!(rejected.is_err(), "circuit should be open after 2 failures");
    }

    /// Invalid-argument failures must never open the circuit, even with the
    /// lowest possible failure threshold.
    #[test]
    fn invalid_argument_does_not_trip_breaker() {
        let resilience = facade_with(circuit_breaker::CircuitBreakerConfig {
            failure_threshold: 1,
            ..Default::default()
        });

        (0..3).for_each(|_| {
            resilience
                .try_acquire("beta", Priority::Normal)
                .expect("call allowed");
            resilience.record_outcome("beta", CallOutcome::InvalidArgument);
        });

        // After 3 invalid-argument failures, the circuit must still be Closed.
        assert!(resilience.try_acquire("beta", Priority::Normal).is_ok());
    }
}
212}