claude_agent/client/resilience/mod.rs

//! Resilience layer for Claude API client.
//!
//! Provides retry with exponential backoff and circuit breaker pattern.
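//!
//! # Example
//!
//! A minimal usage sketch. It assumes `crate::Error` implements `Clone`
//! (as required by the bounds on `Resilience::execute`) and a hypothetical
//! `fetch_completion()` async fn returning `Result<String, crate::Error>`:
//!
//! ```ignore
//! let resilience = Resilience::new(ResilienceConfig::default());
//! let reply = resilience
//!     .execute(|| Box::pin(async { fetch_completion().await }))
//!     .await?;
//! ```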

mod backoff;
mod circuit;

pub use backoff::ExponentialBackoff;
pub use circuit::{CircuitBreaker, CircuitConfig, CircuitState};

use std::sync::Arc;
use std::time::Duration;

/// Top-level configuration: retry policy, optional circuit breaker, and request timeout.
#[derive(Clone)]
pub struct ResilienceConfig {
    pub retry: RetryConfig,
    /// Circuit breaker settings; `None` disables the breaker entirely.
    pub circuit: Option<CircuitConfig>,
    /// Timeout applied to each individual attempt.
    pub timeout: Duration,
}

/// Retry policy: how many retries to allow and which error classes are retryable.
#[derive(Clone)]
pub struct RetryConfig {
    pub max_retries: u32,
    pub backoff: ExponentialBackoff,
    pub retry_on_rate_limit: bool,
    pub retry_on_server_error: bool,
    pub retry_on_network_error: bool,
}

impl Default for ResilienceConfig {
    fn default() -> Self {
        Self {
            retry: RetryConfig::default(),
            circuit: Some(CircuitConfig::default()),
            timeout: Duration::from_secs(120),
        }
    }
}

impl Default for RetryConfig {
    fn default() -> Self {
        Self {
            max_retries: 3,
            backoff: ExponentialBackoff::default(),
            retry_on_rate_limit: true,
            retry_on_server_error: true,
            retry_on_network_error: true,
        }
    }
}

impl ResilienceConfig {
    /// Disable retries and the circuit breaker; keep the default 120 s timeout.
    pub fn no_retry() -> Self {
        Self {
            retry: RetryConfig {
                max_retries: 0,
                ..Default::default()
            },
            circuit: None,
            timeout: Duration::from_secs(120),
        }
    }

    /// Retry up to 5 times with a custom backoff, keep an explicit circuit
    /// breaker configuration, and allow a generous 300 s timeout.
    pub fn aggressive() -> Self {
        Self {
            retry: RetryConfig {
                max_retries: 5,
                backoff: ExponentialBackoff::new(
                    Duration::from_millis(50),
                    Duration::from_secs(10),
                    2.0,
                ),
                ..Default::default()
            },
            circuit: Some(CircuitConfig {
                failure_threshold: 10,
                recovery_timeout: Duration::from_secs(60),
                success_threshold: 5,
            }),
            timeout: Duration::from_secs(300),
        }
    }

    /// Retry at most twice with a custom backoff and a tighter 60 s timeout.
    pub fn conservative() -> Self {
        Self {
            retry: RetryConfig {
                max_retries: 2,
                backoff: ExponentialBackoff::new(
                    Duration::from_millis(500),
                    Duration::from_secs(30),
                    2.0,
                ),
                ..Default::default()
            },
            circuit: Some(CircuitConfig::default()),
            timeout: Duration::from_secs(60),
        }
    }
}

/// Executes operations with timeout, retry, and circuit breaker handling
/// driven by a [`ResilienceConfig`].
pub struct Resilience {
    config: ResilienceConfig,
    circuit: Option<Arc<CircuitBreaker>>,
}

impl Resilience {
    pub fn new(config: ResilienceConfig) -> Self {
        let circuit = config
            .circuit
            .as_ref()
            .map(|c| Arc::new(CircuitBreaker::new(c.clone())));
        Self { config, circuit }
    }

    pub fn config(&self) -> &ResilienceConfig {
        &self.config
    }

    pub fn circuit(&self) -> Option<&Arc<CircuitBreaker>> {
        self.circuit.as_ref()
    }

    /// Runs `operation`, applying the configured timeout to each attempt and
    /// retrying with exponential backoff for errors the retry policy accepts.
    /// Fails fast with a 503 `Api` error while the circuit breaker is open.
    pub async fn execute<F, T, E>(&self, mut operation: F) -> Result<T, E>
    where
        F: FnMut() -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<T, E>> + Send>>,
        E: Into<crate::Error> + From<crate::Error> + Clone,
    {
        // Fail fast while the circuit breaker is refusing requests.
        if let Some(ref cb) = self.circuit
            && !cb.allow_request()
        {
            let err = crate::Error::Api {
                message: "Circuit breaker is open".to_string(),
                status: Some(503),
                error_type: Some("circuit_open".to_string()),
            };
            return Err(E::from(err));
        }

        let mut attempts = 0;
        loop {
            // Each attempt is bounded by the configured timeout.
            let result = tokio::time::timeout(self.config.timeout, operation()).await;

            match result {
                Ok(Ok(value)) => {
                    if let Some(ref cb) = self.circuit {
                        cb.record_success();
                    }
                    return Ok(value);
                }
                Ok(Err(e)) => {
                    let error: crate::Error = e.clone().into();

                    if let Some(ref cb) = self.circuit {
                        cb.record_failure();
                    }

                    attempts += 1;
                    if attempts > self.config.retry.max_retries {
                        return Err(e);
                    }

                    if !self.should_retry(&error) {
                        return Err(e);
                    }

                    let delay = self.config.retry.backoff.delay_for(attempts);

                    // Honor a server-supplied retry-after hint when it exceeds the backoff delay.
                    if let Some(retry_after) = error.retry_after() {
                        tokio::time::sleep(retry_after.max(delay)).await;
                    } else {
                        tokio::time::sleep(delay).await;
                    }
                }
                Err(_timeout) => {
                    // The attempt timed out: count it as a failure and retry if budget remains.
                    if let Some(ref cb) = self.circuit {
                        cb.record_failure();
                    }

                    attempts += 1;
                    if attempts > self.config.retry.max_retries {
                        return Err(E::from(crate::Error::Timeout(self.config.timeout)));
                    }

                    let delay = self.config.retry.backoff.delay_for(attempts);
                    tokio::time::sleep(delay).await;
                }
            }
        }
    }

    /// Classifies an error as retryable under the current retry policy.
    fn should_retry(&self, error: &crate::Error) -> bool {
        match error {
            crate::Error::RateLimit { .. } => self.config.retry.retry_on_rate_limit,
            crate::Error::Network(_) => self.config.retry.retry_on_network_error,
            // Any 5xx status, including 529 (overloaded), counts as a server error.
            crate::Error::Api {
                status: Some(500..=599),
                ..
            } => self.config.retry.retry_on_server_error,
            _ => false,
        }
    }
}

impl Default for Resilience {
    fn default() -> Self {
        Self::new(ResilienceConfig::default())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_default_config() {
        let config = ResilienceConfig::default();
        assert_eq!(config.retry.max_retries, 3);
        assert!(config.circuit.is_some());
    }

    #[test]
    fn test_no_retry_config() {
        let config = ResilienceConfig::no_retry();
        assert_eq!(config.retry.max_retries, 0);
        assert!(config.circuit.is_none());
    }

    #[test]
    fn test_aggressive_config() {
        let config = ResilienceConfig::aggressive();
        assert_eq!(config.retry.max_retries, 5);
        assert!(config.circuit.is_some());
    }
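
    // Added coverage for the conservative() preset; expected values mirror the
    // constructor above (max_retries = 2, default circuit breaker, 60 s timeout).
    #[test]
    fn test_conservative_config() {
        let config = ResilienceConfig::conservative();
        assert_eq!(config.retry.max_retries, 2);
        assert!(config.circuit.is_some());
        assert_eq!(config.timeout, Duration::from_secs(60));
    }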
}