rustkernel_core/resilience/
recovery.rs

1//! Recovery Policies
2//!
3//! Provides automatic recovery from transient failures.
4//!
5//! # Features
6//!
7//! - Configurable retry policies with backoff
8//! - Recovery strategies (retry, fallback, skip)
9//! - Checkpoint/restart support
10//!
11//! # Example
12//!
13//! ```rust,ignore
14//! use rustkernel_core::resilience::recovery::{RecoveryPolicy, RetryConfig};
15//!
16//! let policy = RecoveryPolicy::default()
17//!     .with_retry(RetryConfig::exponential(3, Duration::from_millis(100)));
18//!
19//! let result = policy.execute(|| async {
20//!     kernel.execute(input).await
21//! }).await?;
22//! ```
23
24use super::{ResilienceError, ResilienceResult};
25use serde::{Deserialize, Serialize};
26use std::time::Duration;
27
28/// Recovery policy for kernel failures
29#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct RecoveryPolicy {
31    /// Retry configuration
32    pub retry: Option<RetryConfig>,
33    /// Recovery strategy
34    pub strategy: RecoveryStrategy,
35    /// Whether to log recoveries
36    pub log_recoveries: bool,
37}
38
39impl Default for RecoveryPolicy {
40    fn default() -> Self {
41        Self {
42            retry: Some(RetryConfig::default()),
43            strategy: RecoveryStrategy::Retry,
44            log_recoveries: true,
45        }
46    }
47}
48
49impl RecoveryPolicy {
50    /// Production recovery policy
51    pub fn production() -> Self {
52        Self {
53            retry: Some(RetryConfig::exponential(3, Duration::from_millis(100))),
54            strategy: RecoveryStrategy::Retry,
55            log_recoveries: true,
56        }
57    }
58
59    /// Development recovery policy
60    pub fn development() -> Self {
61        Self {
62            retry: Some(RetryConfig::fixed(2, Duration::from_millis(50))),
63            strategy: RecoveryStrategy::Retry,
64            log_recoveries: true,
65        }
66    }
67
68    /// No recovery (fail immediately)
69    pub fn none() -> Self {
70        Self {
71            retry: None,
72            strategy: RecoveryStrategy::FailFast,
73            log_recoveries: false,
74        }
75    }
76
77    /// Set retry configuration
78    pub fn with_retry(mut self, config: RetryConfig) -> Self {
79        self.retry = Some(config);
80        self
81    }
82
83    /// Set recovery strategy
84    pub fn with_strategy(mut self, strategy: RecoveryStrategy) -> Self {
85        self.strategy = strategy;
86        self
87    }
88
89    /// Execute with recovery policy
90    pub async fn execute<F, Fut, T, E>(&self, f: F) -> ResilienceResult<T>
91    where
92        F: Fn() -> Fut,
93        Fut: std::future::Future<Output = Result<T, E>>,
94        E: Into<crate::error::KernelError> + std::fmt::Debug,
95    {
96        match self.strategy {
97            RecoveryStrategy::FailFast => f()
98                .await
99                .map_err(|e| ResilienceError::KernelError(e.into())),
100            RecoveryStrategy::Retry => {
101                if let Some(ref retry) = self.retry {
102                    retry.execute(f).await
103                } else {
104                    f().await
105                        .map_err(|e| ResilienceError::KernelError(e.into()))
106                }
107            }
108            RecoveryStrategy::Skip => {
109                // Skip strategy: return default or special value
110                // For now, just try once
111                f().await
112                    .map_err(|e| ResilienceError::KernelError(e.into()))
113            }
114        }
115    }
116}
117
118/// Recovery strategy
119#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
120#[serde(rename_all = "snake_case")]
121pub enum RecoveryStrategy {
122    /// Fail immediately without retrying
123    FailFast,
124    /// Retry with configured policy
125    #[default]
126    Retry,
127    /// Skip failed operations
128    Skip,
129}
130
131/// Retry configuration
132#[derive(Debug, Clone, Serialize, Deserialize)]
133pub struct RetryConfig {
134    /// Maximum retry attempts
135    pub max_retries: u32,
136    /// Initial delay between retries
137    pub initial_delay: Duration,
138    /// Maximum delay between retries
139    pub max_delay: Duration,
140    /// Backoff strategy
141    pub backoff: BackoffStrategy,
142    /// Jitter factor (0.0 - 1.0)
143    pub jitter: f64,
144    /// Whether to retry on all errors (default: true)
145    pub retry_all_errors: bool,
146}
147
148impl Default for RetryConfig {
149    fn default() -> Self {
150        Self {
151            max_retries: 3,
152            initial_delay: Duration::from_millis(100),
153            max_delay: Duration::from_secs(10),
154            backoff: BackoffStrategy::Exponential { factor: 2.0 },
155            jitter: 0.1,
156            retry_all_errors: true,
157        }
158    }
159}
160
161impl RetryConfig {
162    /// Create with exponential backoff
163    pub fn exponential(max_retries: u32, initial_delay: Duration) -> Self {
164        Self {
165            max_retries,
166            initial_delay,
167            backoff: BackoffStrategy::Exponential { factor: 2.0 },
168            ..Default::default()
169        }
170    }
171
172    /// Create with fixed delay
173    pub fn fixed(max_retries: u32, delay: Duration) -> Self {
174        Self {
175            max_retries,
176            initial_delay: delay,
177            backoff: BackoffStrategy::Fixed,
178            ..Default::default()
179        }
180    }
181
182    /// Create with linear backoff
183    pub fn linear(max_retries: u32, initial_delay: Duration) -> Self {
184        Self {
185            max_retries,
186            initial_delay,
187            backoff: BackoffStrategy::Linear {
188                increment: initial_delay,
189            },
190            ..Default::default()
191        }
192    }
193
194    /// Set max retries
195    pub fn max_retries(mut self, max: u32) -> Self {
196        self.max_retries = max;
197        self
198    }
199
200    /// Set initial delay
201    pub fn initial_delay(mut self, delay: Duration) -> Self {
202        self.initial_delay = delay;
203        self
204    }
205
206    /// Set max delay
207    pub fn max_delay(mut self, delay: Duration) -> Self {
208        self.max_delay = delay;
209        self
210    }
211
212    /// Set jitter factor
213    pub fn jitter(mut self, jitter: f64) -> Self {
214        self.jitter = jitter.clamp(0.0, 1.0);
215        self
216    }
217
218    /// Calculate delay for a given attempt
219    pub fn delay_for_attempt(&self, attempt: u32) -> Duration {
220        let base_delay = match self.backoff {
221            BackoffStrategy::Fixed => self.initial_delay,
222            BackoffStrategy::Linear { increment } => self.initial_delay + increment * attempt,
223            BackoffStrategy::Exponential { factor } => {
224                let multiplier = factor.powi(attempt as i32);
225                Duration::from_secs_f64(self.initial_delay.as_secs_f64() * multiplier)
226            }
227        };
228
229        // Apply max delay cap
230        let capped = base_delay.min(self.max_delay);
231
232        // Apply jitter
233        if self.jitter > 0.0 {
234            let jitter_range = capped.as_secs_f64() * self.jitter;
235            let jitter_amount = rand::random::<f64>() * jitter_range * 2.0 - jitter_range;
236            Duration::from_secs_f64((capped.as_secs_f64() + jitter_amount).max(0.0))
237        } else {
238            capped
239        }
240    }
241
242    /// Execute with retry
243    pub async fn execute<F, Fut, T, E>(&self, f: F) -> ResilienceResult<T>
244    where
245        F: Fn() -> Fut,
246        Fut: std::future::Future<Output = Result<T, E>>,
247        E: Into<crate::error::KernelError> + std::fmt::Debug,
248    {
249        let mut last_error = None;
250
251        for attempt in 0..=self.max_retries {
252            match f().await {
253                Ok(result) => {
254                    if attempt > 0 {
255                        tracing::info!(attempt = attempt, "Operation succeeded after retry");
256                    }
257                    return Ok(result);
258                }
259                Err(e) => {
260                    let kernel_error: crate::error::KernelError = e.into();
261
262                    // Check if we should retry
263                    if !self.retry_all_errors || attempt >= self.max_retries {
264                        tracing::warn!(
265                            attempt = attempt,
266                            error = ?kernel_error,
267                            "Operation failed, no more retries"
268                        );
269                        return Err(ResilienceError::MaxRetriesExceeded {
270                            retries: self.max_retries,
271                        });
272                    }
273
274                    let delay = self.delay_for_attempt(attempt);
275                    tracing::debug!(
276                        attempt = attempt,
277                        delay = ?delay,
278                        error = ?kernel_error,
279                        "Operation failed, retrying"
280                    );
281
282                    tokio::time::sleep(delay).await;
283                    last_error = Some(kernel_error);
284                }
285            }
286        }
287
288        Err(last_error.map(ResilienceError::KernelError).unwrap_or(
289            ResilienceError::MaxRetriesExceeded {
290                retries: self.max_retries,
291            },
292        ))
293    }
294}
295
296/// Backoff strategy
297#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
298#[serde(tag = "type", rename_all = "snake_case")]
299pub enum BackoffStrategy {
300    /// Fixed delay between retries
301    Fixed,
302    /// Linear increase in delay
303    Linear {
304        /// Amount to add each retry
305        increment: Duration,
306    },
307    /// Exponential increase in delay
308    Exponential {
309        /// Multiplication factor
310        factor: f64,
311    },
312}
313
314impl Default for BackoffStrategy {
315    fn default() -> Self {
316        Self::Exponential { factor: 2.0 }
317    }
318}
319
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building millisecond durations in assertions.
    fn ms(value: u64) -> Duration {
        Duration::from_millis(value)
    }

    /// Exponential backoff keeps its arguments and doubles the delay per attempt.
    #[test]
    fn test_retry_config_exponential() {
        let with_default_jitter = RetryConfig::exponential(3, ms(100));
        assert_eq!(with_default_jitter.max_retries, 3);
        assert_eq!(with_default_jitter.initial_delay, ms(100));

        // Jitter is disabled so the computed delays are exact.
        let exact = RetryConfig::exponential(3, ms(100)).jitter(0.0);
        for (attempt, expected_ms) in [(0, 100), (1, 200), (2, 400)] {
            assert_eq!(exact.delay_for_attempt(attempt), ms(expected_ms));
        }
    }

    /// Fixed backoff yields the same delay regardless of the attempt number.
    #[test]
    fn test_retry_config_fixed() {
        let exact = RetryConfig::fixed(5, ms(50)).jitter(0.0);

        for attempt in [0, 1, 5] {
            assert_eq!(exact.delay_for_attempt(attempt), ms(50));
        }
    }

    /// Linear backoff grows by the initial delay on every attempt.
    #[test]
    fn test_retry_config_linear() {
        let exact = RetryConfig::linear(3, ms(100)).jitter(0.0);

        for (attempt, expected_ms) in [(0, 100), (1, 200), (2, 300)] {
            assert_eq!(exact.delay_for_attempt(attempt), ms(expected_ms));
        }
    }

    /// Delays never exceed `max_delay` when jitter is disabled.
    #[test]
    fn test_max_delay_cap() {
        let cap = Duration::from_secs(5);
        let exact = RetryConfig::exponential(10, Duration::from_secs(1))
            .max_delay(cap)
            .jitter(0.0);

        // 1 s * 2^10 is far above the cap, so the cap must win.
        assert_eq!(exact.delay_for_attempt(10), cap);
    }

    /// The production preset retries and carries a retry configuration.
    #[test]
    fn test_recovery_policy() {
        let production = RecoveryPolicy::production();

        assert!(production.retry.is_some());
        assert_eq!(production.strategy, RecoveryStrategy::Retry);
    }

    /// The `none` preset fails fast and carries no retry configuration.
    #[test]
    fn test_recovery_policy_none() {
        let disabled = RecoveryPolicy::none();

        assert!(disabled.retry.is_none());
        assert_eq!(disabled.strategy, RecoveryStrategy::FailFast);
    }
}