ricecoder_mcp/error_recovery.rs

//! Error recovery and resilience mechanisms
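//!
//! Provides [`BackoffConfig`] for tuning exponential backoff, [`RetryHandler`]
//! for retrying recoverable operations (optionally bounded by a per-attempt
//! timeout), [`determine_recovery_strategy`] for mapping an [`Error`] to a
//! [`RecoveryStrategy`], and [`GracefulDegradationHandler`] for tracking which
//! resources remain usable after partial failures.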

use crate::error::{Error, Result};
use std::time::Duration;
use tracing::{debug, error, info, warn};

/// Exponential backoff configuration for reconnection attempts
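///
/// Delays grow as `initial_delay_ms * multiplier^attempt` and are capped at
/// `max_delay_ms`; see [`BackoffConfig::calculate_delay`].
///
/// # Example
///
/// A minimal sketch of the builder-style configuration (import path assumed;
/// adjust to this crate's module layout):
///
/// ```ignore
/// let config = BackoffConfig::new()
///     .with_initial_delay(250) // start at 250 ms
///     .with_max_delay(5_000)   // never wait longer than 5 s
///     .with_multiplier(2.0)    // double the delay on each retry
///     .with_max_retries(4);    // give up after 4 retries
/// assert_eq!(config.max_retries, 4);
/// ```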
#[derive(Debug, Clone)]
pub struct BackoffConfig {
    /// Initial delay in milliseconds
    pub initial_delay_ms: u64,
    /// Maximum delay in milliseconds
    pub max_delay_ms: u64,
    /// Multiplier for exponential backoff
    pub multiplier: f64,
    /// Maximum number of retries
    pub max_retries: u32,
}

impl BackoffConfig {
    /// Creates a new backoff configuration with default values
    pub fn new() -> Self {
        Self {
            initial_delay_ms: 100,
            max_delay_ms: 30000,
            multiplier: 2.0,
            max_retries: 5,
        }
    }

    /// Sets the initial delay
    pub fn with_initial_delay(mut self, delay_ms: u64) -> Self {
        self.initial_delay_ms = delay_ms;
        self
    }

    /// Sets the maximum delay
    pub fn with_max_delay(mut self, delay_ms: u64) -> Self {
        self.max_delay_ms = delay_ms;
        self
    }

    /// Sets the multiplier
    pub fn with_multiplier(mut self, multiplier: f64) -> Self {
        self.multiplier = multiplier;
        self
    }

    /// Sets the maximum retries
    pub fn with_max_retries(mut self, retries: u32) -> Self {
        self.max_retries = retries;
        self
    }

    /// Calculates the delay for a given retry attempt
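    ///
    /// The delay for attempt `n` is `initial_delay_ms * multiplier^n`
    /// milliseconds, capped at `max_delay_ms`.
    ///
    /// # Example
    ///
    /// Sketch of the growth pattern with the defaults (100 ms initial delay,
    /// multiplier 2.0); import path assumed:
    ///
    /// ```ignore
    /// let config = BackoffConfig::new();
    /// assert_eq!(config.calculate_delay(0), Duration::from_millis(100));
    /// assert_eq!(config.calculate_delay(1), Duration::from_millis(200));
    /// assert_eq!(config.calculate_delay(2), Duration::from_millis(400));
    /// ```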
    pub fn calculate_delay(&self, attempt: u32) -> Duration {
        let delay = (self.initial_delay_ms as f64 * self.multiplier.powi(attempt as i32)) as u64;
        let delay = delay.min(self.max_delay_ms);
        Duration::from_millis(delay)
    }
}

impl Default for BackoffConfig {
    fn default() -> Self {
        Self::new()
    }
}

/// Error recovery strategy
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RecoveryStrategy {
    /// Retry the operation
    Retry,
    /// Fail immediately
    Fail,
    /// Use fallback/cached data
    Fallback,
    /// Continue with available resources
    GracefulDegradation,
}

/// Determines the recovery strategy for an error
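///
/// # Example
///
/// Illustrative mapping only (import path assumed); the variants mirror the
/// match arms below:
///
/// ```ignore
/// // Transient failures are retried, permanent ones fail fast.
/// assert_eq!(
///     determine_recovery_strategy(&Error::ConnectionError("refused".to_string())),
///     RecoveryStrategy::Retry
/// );
/// assert_eq!(
///     determine_recovery_strategy(&Error::ToolNotFound("missing".to_string())),
///     RecoveryStrategy::Fail
/// );
/// ```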
pub fn determine_recovery_strategy(error: &Error) -> RecoveryStrategy {
    match error {
        // Recoverable errors should retry
        Error::TimeoutError(_) => RecoveryStrategy::Retry,
        Error::ConnectionError(_) => RecoveryStrategy::Retry,
        Error::ServerDisconnected(_) => RecoveryStrategy::Retry,
        Error::ExecutionInterrupted => RecoveryStrategy::Retry,

        // Server errors might be recoverable
        Error::ServerError(_) => RecoveryStrategy::Retry,

        // Permanent errors should fail
        Error::ToolNotFound(_) => RecoveryStrategy::Fail,
        Error::PermissionDenied(_) => RecoveryStrategy::Fail,
        Error::NamingConflict(_) => RecoveryStrategy::Fail,
        Error::MultipleNamingConflicts(_) => RecoveryStrategy::Fail,

        // Validation errors should fail
        Error::ValidationError(_) => RecoveryStrategy::Fail,
        Error::ParameterValidationError(_) => RecoveryStrategy::Fail,
        Error::OutputValidationError(_) => RecoveryStrategy::Fail,
        Error::InvalidToolParameters(_) => RecoveryStrategy::Fail,
        Error::InvalidToolOutput(_) => RecoveryStrategy::Fail,

        // Configuration errors might be recoverable with fallback
        Error::ConfigError(_) => RecoveryStrategy::Fallback,
        Error::ConfigValidationError(_) => RecoveryStrategy::Fallback,
        Error::StorageError(_) => RecoveryStrategy::Fallback,

        // Other errors should use graceful degradation
        _ => RecoveryStrategy::GracefulDegradation,
    }
}

/// Retry handler for operations with exponential backoff
pub struct RetryHandler {
    config: BackoffConfig,
}

impl RetryHandler {
    /// Creates a new retry handler with default configuration
    pub fn new() -> Self {
        Self {
            config: BackoffConfig::new(),
        }
    }

    /// Creates a new retry handler with custom configuration
    pub fn with_config(config: BackoffConfig) -> Self {
        Self { config }
    }

    /// Executes an operation with retry logic
    ///
    /// # Arguments
    /// * `operation_name` - Name of the operation for logging
    /// * `operation` - Async closure that performs the operation
    ///
    /// # Returns
    /// Result of the operation or error if all retries fail
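    ///
    /// # Example
    ///
    /// A minimal sketch: the closure returns a boxed future, mirroring the
    /// unit tests below (import path assumed, error handling elided):
    ///
    /// ```ignore
    /// let handler = RetryHandler::new();
    /// let value = handler
    ///     .execute_with_retry("fetch_config", || {
    ///         Box::pin(async {
    ///             // Real code would perform I/O here and map failures into
    ///             // the crate's `Error` type.
    ///             Ok::<u32, Error>(7)
    ///         })
    ///     })
    ///     .await?;
    /// assert_eq!(value, 7);
    /// ```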
    pub async fn execute_with_retry<F, T>(&self, operation_name: &str, mut operation: F) -> Result<T>
    where
        F: FnMut() -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<T>>>>,
    {
        let mut attempt = 0;

        loop {
            debug!(
                "Executing operation '{}' (attempt {}/{})",
                operation_name,
                attempt + 1,
                self.config.max_retries + 1
            );

            match operation().await {
                Ok(result) => {
                    if attempt > 0 {
                        info!(
                            "Operation '{}' succeeded after {} retries",
                            operation_name, attempt
                        );
                    }
                    return Ok(result);
                }
                Err(err) => {
                    if !err.is_recoverable() {
                        debug!(
                            "Operation '{}' failed with non-recoverable error: {}",
                            operation_name, err
                        );
                        return Err(err);
                    }

                    if attempt >= self.config.max_retries {
                        error!(
                            "Operation '{}' failed after {} retries: {}",
                            operation_name, attempt, err
                        );
                        return Err(Error::MaxRetriesExceeded(format!(
                            "Operation '{}' failed after {} retries: {}",
                            operation_name, attempt, err
                        )));
                    }

                    let delay = self.config.calculate_delay(attempt);
                    warn!(
                        "Operation '{}' failed (attempt {}): {}. Retrying in {:?}",
                        operation_name, attempt + 1, err, delay
                    );

                    tokio::time::sleep(delay).await;
                    attempt += 1;
                }
            }
        }
    }

    /// Executes an operation with retry logic and timeout
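    ///
    /// Each individual attempt is bounded by `timeout_ms`; a timed-out attempt
    /// is treated like a recoverable failure and retried with backoff until
    /// `max_retries` is exhausted.
    ///
    /// # Example
    ///
    /// Sketch (import path assumed):
    ///
    /// ```ignore
    /// let handler = RetryHandler::new();
    /// // Each attempt gets at most 2 seconds before it is retried.
    /// let ok = handler
    ///     .execute_with_retry_and_timeout("ping_server", 2_000, || {
    ///         Box::pin(async { Ok::<bool, Error>(true) })
    ///     })
    ///     .await?;
    /// ```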
    pub async fn execute_with_retry_and_timeout<F, T>(
        &self,
        operation_name: &str,
        timeout_ms: u64,
        mut operation: F,
    ) -> Result<T>
    where
        F: FnMut() -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<T>>>>,
    {
        let timeout = Duration::from_millis(timeout_ms);
        let mut attempt = 0;

        loop {
            debug!(
                "Executing operation '{}' with timeout {:?} (attempt {}/{})",
                operation_name,
                timeout,
                attempt + 1,
                self.config.max_retries + 1
            );

            match tokio::time::timeout(timeout, operation()).await {
                Ok(Ok(result)) => {
                    if attempt > 0 {
                        info!(
                            "Operation '{}' succeeded after {} retries",
                            operation_name, attempt
                        );
                    }
                    return Ok(result);
                }
                Ok(Err(err)) => {
                    if !err.is_recoverable() {
                        debug!(
                            "Operation '{}' failed with non-recoverable error: {}",
                            operation_name, err
                        );
                        return Err(err);
                    }

                    if attempt >= self.config.max_retries {
                        error!(
                            "Operation '{}' failed after {} retries: {}",
                            operation_name, attempt, err
                        );
                        return Err(Error::MaxRetriesExceeded(format!(
                            "Operation '{}' failed after {} retries: {}",
                            operation_name, attempt, err
                        )));
                    }

                    let delay = self.config.calculate_delay(attempt);
                    warn!(
                        "Operation '{}' failed (attempt {}): {}. Retrying in {:?}",
                        operation_name, attempt + 1, err, delay
                    );

                    tokio::time::sleep(delay).await;
                    attempt += 1;
                }
                Err(_) => {
                    if attempt >= self.config.max_retries {
                        error!(
                            "Operation '{}' timed out after {} retries",
                            operation_name, attempt
                        );
                        return Err(Error::TimeoutError(timeout_ms));
                    }

                    let delay = self.config.calculate_delay(attempt);
                    warn!(
                        "Operation '{}' timed out (attempt {}). Retrying in {:?}",
                        operation_name,
                        attempt + 1,
                        delay
                    );

                    tokio::time::sleep(delay).await;
                    attempt += 1;
                }
            }
        }
    }
}

impl Default for RetryHandler {
    fn default() -> Self {
        Self::new()
    }
}

/// Graceful degradation handler for managing partial failures
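///
/// # Example
///
/// Sketch of tracking which servers are still usable (import path assumed):
///
/// ```ignore
/// let mut handler = GracefulDegradationHandler::new();
/// handler.mark_available("server1".to_string());
/// handler.mark_unavailable("server2".to_string());
///
/// assert!(handler.has_available_resources());
/// assert_eq!(handler.availability_percentage(), 50.0);
/// ```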
#[derive(Debug, Clone)]
pub struct GracefulDegradationHandler {
    /// List of available resources
    available_resources: Vec<String>,
    /// List of unavailable resources
    unavailable_resources: Vec<String>,
}

impl GracefulDegradationHandler {
    /// Creates a new graceful degradation handler
    pub fn new() -> Self {
        Self {
            available_resources: Vec::new(),
            unavailable_resources: Vec::new(),
        }
    }

    /// Marks a resource as available
    pub fn mark_available(&mut self, resource_id: String) {
        // Avoid duplicate entries so repeated calls stay idempotent and do not
        // skew `availability_percentage`.
        if !self.available_resources.contains(&resource_id) {
            self.available_resources.push(resource_id.clone());
        }
        self.unavailable_resources.retain(|r| r != &resource_id);
        info!("Resource marked as available: {}", resource_id);
    }

    /// Marks a resource as unavailable
    pub fn mark_unavailable(&mut self, resource_id: String) {
        // Avoid duplicate entries so repeated calls stay idempotent and do not
        // skew `availability_percentage`.
        if !self.unavailable_resources.contains(&resource_id) {
            self.unavailable_resources.push(resource_id.clone());
        }
        self.available_resources.retain(|r| r != &resource_id);
        warn!("Resource marked as unavailable: {}", resource_id);
    }

    /// Gets the list of available resources
    pub fn get_available_resources(&self) -> Vec<String> {
        self.available_resources.clone()
    }

    /// Gets the list of unavailable resources
    pub fn get_unavailable_resources(&self) -> Vec<String> {
        self.unavailable_resources.clone()
    }

    /// Checks if any resources are available
    pub fn has_available_resources(&self) -> bool {
        !self.available_resources.is_empty()
    }

    /// Gets the availability percentage
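    ///
    /// Returns `available / (available + unavailable) * 100.0`, or `0.0` when
    /// no resources have been registered.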
    pub fn availability_percentage(&self) -> f64 {
        let total = self.available_resources.len() + self.unavailable_resources.len();
        if total == 0 {
            0.0
        } else {
            (self.available_resources.len() as f64 / total as f64) * 100.0
        }
    }
}

impl Default for GracefulDegradationHandler {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;

    #[test]
    fn test_backoff_config_default() {
        let config = BackoffConfig::new();
        assert_eq!(config.initial_delay_ms, 100);
        assert_eq!(config.max_delay_ms, 30000);
        assert_eq!(config.multiplier, 2.0);
        assert_eq!(config.max_retries, 5);
    }

    #[test]
    fn test_backoff_config_custom() {
        let config = BackoffConfig::new()
            .with_initial_delay(50)
            .with_max_delay(10000)
            .with_multiplier(1.5)
            .with_max_retries(3);

        assert_eq!(config.initial_delay_ms, 50);
        assert_eq!(config.max_delay_ms, 10000);
        assert_eq!(config.multiplier, 1.5);
        assert_eq!(config.max_retries, 3);
    }

    #[test]
    fn test_calculate_delay() {
        let config = BackoffConfig::new()
            .with_initial_delay(100)
            .with_max_delay(10000)
            .with_multiplier(2.0);

        assert_eq!(config.calculate_delay(0), Duration::from_millis(100));
        assert_eq!(config.calculate_delay(1), Duration::from_millis(200));
        assert_eq!(config.calculate_delay(2), Duration::from_millis(400));
        assert_eq!(config.calculate_delay(3), Duration::from_millis(800));
        assert_eq!(config.calculate_delay(4), Duration::from_millis(1600));
        assert_eq!(config.calculate_delay(5), Duration::from_millis(3200));
        assert_eq!(config.calculate_delay(6), Duration::from_millis(6400));
        assert_eq!(config.calculate_delay(7), Duration::from_millis(10000)); // Capped at max
    }

    #[test]
    fn test_determine_recovery_strategy_retry() {
        assert_eq!(
            determine_recovery_strategy(&Error::TimeoutError(5000)),
            RecoveryStrategy::Retry
        );
        assert_eq!(
            determine_recovery_strategy(&Error::ConnectionError("test".to_string())),
            RecoveryStrategy::Retry
        );
    }

    #[test]
    fn test_determine_recovery_strategy_fail() {
        assert_eq!(
            determine_recovery_strategy(&Error::ToolNotFound("test".to_string())),
            RecoveryStrategy::Fail
        );
        assert_eq!(
            determine_recovery_strategy(&Error::PermissionDenied("test".to_string())),
            RecoveryStrategy::Fail
        );
    }

    #[test]
    fn test_determine_recovery_strategy_fallback() {
        assert_eq!(
            determine_recovery_strategy(&Error::ConfigError("test".to_string())),
            RecoveryStrategy::Fallback
        );
    }

    #[test]
    fn test_retry_handler_default() {
        let handler = RetryHandler::new();
        assert_eq!(handler.config.max_retries, 5);
    }

    #[test]
    fn test_retry_handler_custom_config() {
        let config = BackoffConfig::new().with_max_retries(3);
        let handler = RetryHandler::with_config(config);
        assert_eq!(handler.config.max_retries, 3);
    }

    #[tokio::test]
    async fn test_execute_with_retry_success() {
        let handler = RetryHandler::new();
        let call_count = Arc::new(tokio::sync::Mutex::new(0));
        let call_count_clone = call_count.clone();

        let result = handler
            .execute_with_retry("test_op", || {
                let count = call_count_clone.clone();
                Box::pin(async move {
                    let mut c = count.lock().await;
                    *c += 1;
                    Ok::<i32, Error>(42)
                })
            })
            .await;

        assert!(result.is_ok());
        assert_eq!(result.unwrap(), 42);
        assert_eq!(*call_count.lock().await, 1);
    }

    #[tokio::test]
    async fn test_execute_with_retry_non_recoverable_error() {
        let handler = RetryHandler::new();
        let call_count = Arc::new(tokio::sync::Mutex::new(0));
        let call_count_clone = call_count.clone();

        let result = handler
            .execute_with_retry("test_op", || {
                let count = call_count_clone.clone();
                Box::pin(async move {
                    let mut c = count.lock().await;
                    *c += 1;
                    Err::<i32, Error>(Error::ToolNotFound("test".to_string()))
                })
            })
            .await;

        assert!(result.is_err());
        assert_eq!(*call_count.lock().await, 1); // Should not retry
    }

    #[test]
    fn test_graceful_degradation_handler() {
        let mut handler = GracefulDegradationHandler::new();

        handler.mark_available("server1".to_string());
        handler.mark_available("server2".to_string());
        handler.mark_unavailable("server3".to_string());

        assert_eq!(handler.get_available_resources().len(), 2);
        assert_eq!(handler.get_unavailable_resources().len(), 1);
        assert!(handler.has_available_resources());
        // Compare against the same expression the handler computes (2 of 3
        // available) rather than a hard-coded floating-point literal.
        assert_eq!(handler.availability_percentage(), 2.0 / 3.0 * 100.0);
    }

    #[test]
    fn test_graceful_degradation_handler_all_unavailable() {
        let mut handler = GracefulDegradationHandler::new();

        handler.mark_unavailable("server1".to_string());
        handler.mark_unavailable("server2".to_string());

        assert!(!handler.has_available_resources());
        assert_eq!(handler.availability_percentage(), 0.0);
    }

    #[test]
    fn test_graceful_degradation_handler_recovery() {
        let mut handler = GracefulDegradationHandler::new();

        handler.mark_unavailable("server1".to_string());
        assert!(!handler.has_available_resources());

        handler.mark_available("server1".to_string());
        assert!(handler.has_available_resources());
        assert_eq!(handler.get_unavailable_resources().len(), 0);
    }
}