fraiseql_server/encryption/error_recovery.rs

// Phase 12.3 Cycle 10: Error Recovery (GREEN)
//! Error recovery for encryption operations including Vault outages,
//! key expiry, network partitions, and graceful degradation strategies.

use std::sync::{
    Arc,
    atomic::{AtomicU64, Ordering},
};

use chrono::{DateTime, Duration, Utc};

/// Recovery strategy for encryption failures
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RecoveryStrategy {
    /// Use cached value if available
    UseCache,
    /// Retry with exponential backoff
    Retry,
    /// Fail fast
    FailFast,
    /// Degrade to read-only mode
    ReadOnly,
}

impl std::fmt::Display for RecoveryStrategy {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UseCache => write!(f, "use_cache"),
            Self::Retry => write!(f, "retry"),
            Self::FailFast => write!(f, "fail_fast"),
            Self::ReadOnly => write!(f, "read_only"),
        }
    }
}

/// Error category for diagnostics and recovery
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorCategory {
    /// Network connectivity issue
    NetworkError,
    /// Vault service unavailable
    VaultUnavailable,
    /// Encryption key not found
    KeyNotFound,
    /// Encryption key expired
    KeyExpired,
    /// Permission denied
    PermissionDenied,
    /// Encryption operation failed
    EncryptionFailed,
    /// Decryption operation failed
    DecryptionFailed,
    /// Cache miss
    CacheMiss,
    /// Unknown error
    Unknown,
}

impl std::fmt::Display for ErrorCategory {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NetworkError => write!(f, "network_error"),
            Self::VaultUnavailable => write!(f, "vault_unavailable"),
            Self::KeyNotFound => write!(f, "key_not_found"),
            Self::KeyExpired => write!(f, "key_expired"),
            Self::PermissionDenied => write!(f, "permission_denied"),
            Self::EncryptionFailed => write!(f, "encryption_failed"),
            Self::DecryptionFailed => write!(f, "decryption_failed"),
            Self::CacheMiss => write!(f, "cache_miss"),
            Self::Unknown => write!(f, "unknown"),
        }
    }
}

/// Error with context and recovery information
#[derive(Debug, Clone)]
pub struct RecoveryError {
    /// Error category
    pub category:    ErrorCategory,
    /// Human-readable message
    pub message:     String,
    /// Recovery strategy recommendation
    pub strategy:    RecoveryStrategy,
    /// Recovery suggestion for user
    pub suggestion:  String,
    /// Timestamp of error
    pub timestamp:   DateTime<Utc>,
    /// Retry count so far
    pub retry_count: u32,
    /// Can retry
    pub retryable:   bool,
}

impl RecoveryError {
    /// Create new recovery error
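    ///
    /// # Examples
    ///
    /// A minimal sketch of how the category alone fixes the strategy,
    /// retryability, and suggestion (marked `ignore` since the public
    /// import path for this module is not shown here):
    ///
    /// ```ignore
    /// let err = RecoveryError::new(ErrorCategory::VaultUnavailable, "connect refused");
    /// assert_eq!(err.strategy, RecoveryStrategy::UseCache);
    /// assert!(err.retryable);
    /// assert!(err.suggestion.contains("Vault"));
    /// ```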
    pub fn new(category: ErrorCategory, message: impl Into<String>) -> Self {
        let suggestion = match category {
            ErrorCategory::NetworkError => "Check network connectivity and retry".to_string(),
            ErrorCategory::VaultUnavailable => "Vault is unavailable. Check Vault status and retry after 30s".to_string(),
            ErrorCategory::KeyNotFound => "Encryption key not found. Check key reference in configuration".to_string(),
            ErrorCategory::KeyExpired => "Encryption key has expired. Key will be refreshed automatically. Retry the operation".to_string(),
            ErrorCategory::PermissionDenied => "Permission denied accessing encryption key. Check authentication credentials".to_string(),
            ErrorCategory::EncryptionFailed => "Encryption operation failed. Check input data and retry".to_string(),
            ErrorCategory::DecryptionFailed => "Decryption operation failed. Data may be corrupted".to_string(),
            ErrorCategory::CacheMiss => "Key not in cache. Vault fetch will be attempted".to_string(),
            ErrorCategory::Unknown => "An unknown error occurred. Check logs for details".to_string(),
        };

        let (strategy, retryable) = match category {
            ErrorCategory::NetworkError => (RecoveryStrategy::Retry, true),
            ErrorCategory::VaultUnavailable => (RecoveryStrategy::UseCache, true),
            ErrorCategory::KeyExpired => (RecoveryStrategy::Retry, true),
            ErrorCategory::KeyNotFound => (RecoveryStrategy::FailFast, false),
            ErrorCategory::PermissionDenied => (RecoveryStrategy::FailFast, false),
            ErrorCategory::EncryptionFailed => (RecoveryStrategy::FailFast, false),
            ErrorCategory::DecryptionFailed => (RecoveryStrategy::FailFast, false),
            ErrorCategory::CacheMiss => (RecoveryStrategy::Retry, true),
            ErrorCategory::Unknown => (RecoveryStrategy::FailFast, false),
        };

        Self {
            category,
            message: message.into(),
            strategy,
            suggestion,
            timestamp: Utc::now(),
            retry_count: 0,
            retryable,
        }
    }

    /// Set the retry count (builder-style)
    pub fn with_retry_count(mut self, count: u32) -> Self {
        self.retry_count = count;
        self
    }

    /// Get time since error
    pub fn age(&self) -> Duration {
        Utc::now() - self.timestamp
    }

    /// Check if error is fresh (less than 1 minute old)
    pub fn is_fresh(&self) -> bool {
        self.age() < Duration::minutes(1)
    }

    /// Check if this error suggests a transient issue (can retry)
    pub fn is_transient(&self) -> bool {
        self.retryable
    }

    /// Check if cache fallback is appropriate for this error
    pub fn should_use_cache(&self) -> bool {
        matches!(self.strategy, RecoveryStrategy::UseCache | RecoveryStrategy::ReadOnly)
    }

    /// Estimate milliseconds to wait before retry based on retry count
    pub fn retry_delay_ms(&self, config: &RetryConfig) -> u64 {
        config.backoff_delay_ms(self.retry_count)
    }
}

/// Retry configuration
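///
/// # Examples
///
/// A minimal sketch of the builder-style configuration (defaults come from
/// `new()` below: 3 retries, 100 ms initial backoff, 5 s cap, 2.0 multiplier):
///
/// ```ignore
/// let config = RetryConfig::new().with_max_retries(5);
/// assert!(config.should_retry(4));
/// assert!(!config.should_retry(5));
/// ```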
#[derive(Debug, Clone)]
pub struct RetryConfig {
    /// Maximum number of retries
    pub max_retries:        u32,
    /// Initial backoff delay in milliseconds
    pub initial_backoff_ms: u64,
    /// Maximum backoff delay in milliseconds
    pub max_backoff_ms:     u64,
    /// Backoff multiplier for exponential growth
    pub backoff_multiplier: f64,
}

impl RetryConfig {
    /// Create new retry config
    pub fn new() -> Self {
        Self {
            max_retries:        3,
            initial_backoff_ms: 100,
            max_backoff_ms:     5000,
            backoff_multiplier: 2.0,
        }
    }

    /// Set maximum retries
    pub fn with_max_retries(mut self, max: u32) -> Self {
        self.max_retries = max;
        self
    }

    /// Calculate backoff delay for retry attempt
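    ///
    /// The delay grows as `initial_backoff_ms * multiplier^attempt`, clamped
    /// to `max_backoff_ms`. With the defaults this gives 100, 200, 400,
    /// 800, ... ms, saturating at 5000 ms from attempt 6 onward
    /// (100 * 2^6 = 6400 > 5000):
    ///
    /// ```ignore
    /// let config = RetryConfig::new();
    /// assert_eq!(config.backoff_delay_ms(3), 800);
    /// assert_eq!(config.backoff_delay_ms(10), 5000); // clamped
    /// ```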
    pub fn backoff_delay_ms(&self, attempt: u32) -> u64 {
        let delay = self.initial_backoff_ms as f64 * self.backoff_multiplier.powi(attempt as i32);
        (delay as u64).min(self.max_backoff_ms)
    }

    /// Check if we should attempt retry based on attempt count
    pub fn should_retry(&self, attempt: u32) -> bool {
        attempt < self.max_retries
    }

    /// Add jitter to delay to prevent thundering herd
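    ///
    /// Returns a value within roughly ±10% of `backoff_delay_ms(attempt)`,
    /// so concurrent clients that fail together do not all retry at the
    /// same instant. A sketch of the invariant:
    ///
    /// ```ignore
    /// let config = RetryConfig::new();
    /// let base = config.backoff_delay_ms(2); // 400 ms
    /// let jittered = config.backoff_delay_with_jitter_ms(2);
    /// assert!(jittered >= base - base / 10 && jittered <= base + base / 10);
    /// ```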
    pub fn backoff_delay_with_jitter_ms(&self, attempt: u32) -> u64 {
        use rand::Rng;
        let base_delay = self.backoff_delay_ms(attempt);
        // Add ±10% jitter
        let jitter_percent = base_delay / 10;
        let mut rng = rand::thread_rng();
        let jitter = rng.gen_range(0..=jitter_percent);
        let use_add = rng.gen_bool(0.5);
        if use_add {
            base_delay + jitter
        } else {
            base_delay.saturating_sub(jitter)
        }
    }
}

impl Default for RetryConfig {
    fn default() -> Self {
        Self::new()
    }
}

/// Circuit breaker for fault tolerance
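///
/// The breaker moves between three states: `Closed` (normal operation),
/// `Open` (operations rejected after `failure_threshold` consecutive
/// failures), and `HalfOpen` (probing; `success_threshold` successes close
/// it again, a single failure re-opens it). A minimal sketch of the
/// lifecycle:
///
/// ```ignore
/// let breaker = CircuitBreaker::new(3, 2);
/// breaker.record_failure();
/// breaker.record_failure();
/// breaker.record_failure();    // third failure opens the circuit
/// assert!(breaker.is_open());
///
/// breaker.attempt_recovery(0); // timeout elapsed: probe in HalfOpen
/// breaker.record_success();
/// breaker.record_success();    // second success closes it again
/// assert!(!breaker.is_open());
/// ```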
#[derive(Debug)]
pub struct CircuitBreaker {
    /// Failure threshold before opening
    failure_threshold: u32,
    /// Success threshold to close
    success_threshold: u32,
    /// Current failure count
    failure_count:     Arc<AtomicU64>,
    /// Current success count
    success_count:     Arc<AtomicU64>,
    /// Circuit state
    state:             Arc<atomic::AtomicUsize>,
    /// Last state change time
    last_change:       Arc<std::sync::Mutex<DateTime<Utc>>>,
}

mod atomic {
    use std::sync::atomic::AtomicUsize as StdAtomicUsize;

    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    #[repr(usize)]
    pub enum CircuitState {
        Closed   = 0,
        Open     = 1,
        HalfOpen = 2,
    }

    impl CircuitState {
        pub fn from_usize(val: usize) -> Self {
            match val {
                0 => CircuitState::Closed,
                1 => CircuitState::Open,
                2 => CircuitState::HalfOpen,
                _ => CircuitState::Closed,
            }
        }

        pub fn to_usize(self) -> usize {
            self as usize
        }
    }

    pub struct AtomicUsize(StdAtomicUsize);

    impl AtomicUsize {
        pub fn new(val: CircuitState) -> Self {
            AtomicUsize(StdAtomicUsize::new(val.to_usize()))
        }

        pub fn load(&self) -> CircuitState {
            CircuitState::from_usize(self.0.load(std::sync::atomic::Ordering::Relaxed))
        }

        pub fn store(&self, val: CircuitState) {
            self.0.store(val.to_usize(), std::sync::atomic::Ordering::Relaxed)
        }
    }

    impl std::fmt::Debug for AtomicUsize {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            write!(f, "{:?}", self.load())
        }
    }
}

use atomic::AtomicUsize;
// Re-export so the `CircuitState` returned by public methods is nameable by callers.
pub use atomic::CircuitState;

impl CircuitBreaker {
    /// Create new circuit breaker
    pub fn new(failure_threshold: u32, success_threshold: u32) -> Self {
        Self {
            failure_threshold,
            success_threshold,
            failure_count: Arc::new(AtomicU64::new(0)),
            success_count: Arc::new(AtomicU64::new(0)),
            state: Arc::new(AtomicUsize::new(CircuitState::Closed)),
            last_change: Arc::new(std::sync::Mutex::new(Utc::now())),
        }
    }

    /// Record successful operation
    pub fn record_success(&self) {
        let state = self.state.load();
        match state {
            CircuitState::Closed => {
                self.failure_count.store(0, Ordering::Relaxed);
            },
            CircuitState::HalfOpen => {
                let success = self.success_count.fetch_add(1, Ordering::Relaxed) + 1;
                if success >= self.success_threshold as u64 {
                    self.state.store(CircuitState::Closed);
                    self.failure_count.store(0, Ordering::Relaxed);
                    self.success_count.store(0, Ordering::Relaxed);
                    if let Ok(mut last) = self.last_change.lock() {
                        *last = Utc::now();
                    }
                }
            },
            CircuitState::Open => {},
        }
    }

    /// Record failed operation
    pub fn record_failure(&self) {
        let state = self.state.load();
        match state {
            CircuitState::Closed => {
                let failures = self.failure_count.fetch_add(1, Ordering::Relaxed) + 1;
                if failures >= self.failure_threshold as u64 {
                    self.state.store(CircuitState::Open);
                    if let Ok(mut last) = self.last_change.lock() {
                        *last = Utc::now();
                    }
                }
            },
            CircuitState::Open => {
                // Already open; the transition to HalfOpen is driven by
                // `attempt_recovery` once the recovery timeout elapses.
            },
            CircuitState::HalfOpen => {
                self.state.store(CircuitState::Open);
                self.success_count.store(0, Ordering::Relaxed);
                if let Ok(mut last) = self.last_change.lock() {
                    *last = Utc::now();
                }
            },
        }
    }

    /// Check if operation allowed
    pub fn is_allowed(&self) -> bool {
        matches!(self.state.load(), CircuitState::Closed | CircuitState::HalfOpen)
    }

    /// Get current state
    pub fn state(&self) -> CircuitState {
        self.state.load()
    }

    /// Reset circuit breaker
    pub fn reset(&self) {
        self.state.store(CircuitState::Closed);
        self.failure_count.store(0, Ordering::Relaxed);
        self.success_count.store(0, Ordering::Relaxed);
    }

    /// Get time since last state change
    pub fn time_since_last_change(&self) -> Duration {
        if let Ok(last) = self.last_change.lock() {
            Utc::now() - *last
        } else {
            Duration::zero()
        }
    }

    /// Check if circuit should attempt recovery from Open state
    pub fn should_attempt_recovery(&self, recovery_timeout_ms: u64) -> bool {
        matches!(self.state.load(), CircuitState::Open)
            && self.time_since_last_change().num_milliseconds() as u64 >= recovery_timeout_ms
    }

    /// Attempt to transition from Open to HalfOpen after timeout
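    ///
    /// # Examples
    ///
    /// A sketch with a zero timeout so the probe happens immediately:
    ///
    /// ```ignore
    /// let breaker = CircuitBreaker::new(1, 1);
    /// breaker.record_failure();    // opens the circuit
    /// breaker.attempt_recovery(0); // 0 ms timeout: probe right away
    /// assert!(breaker.is_half_open());
    /// ```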
    pub fn attempt_recovery(&self, recovery_timeout_ms: u64) {
        if self.should_attempt_recovery(recovery_timeout_ms) {
            self.state.store(CircuitState::HalfOpen);
            self.success_count.store(0, Ordering::Relaxed);
            if let Ok(mut last) = self.last_change.lock() {
                *last = Utc::now();
            }
        }
    }

    /// Get current failure and success counts
    pub fn get_counts(&self) -> (u64, u64) {
        let failures = self.failure_count.load(Ordering::Relaxed);
        let successes = self.success_count.load(Ordering::Relaxed);
        (failures, successes)
    }

    /// Check if circuit is fully open (no operations allowed)
    pub fn is_open(&self) -> bool {
        matches!(self.state.load(), CircuitState::Open)
    }

    /// Check if circuit is half-open (limited operations for recovery)
    pub fn is_half_open(&self) -> bool {
        matches!(self.state.load(), CircuitState::HalfOpen)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_recovery_strategy_display() {
        assert_eq!(RecoveryStrategy::UseCache.to_string(), "use_cache");
        assert_eq!(RecoveryStrategy::Retry.to_string(), "retry");
        assert_eq!(RecoveryStrategy::FailFast.to_string(), "fail_fast");
        assert_eq!(RecoveryStrategy::ReadOnly.to_string(), "read_only");
    }

    #[test]
    fn test_error_category_display() {
        assert_eq!(ErrorCategory::NetworkError.to_string(), "network_error");
        assert_eq!(ErrorCategory::VaultUnavailable.to_string(), "vault_unavailable");
        assert_eq!(ErrorCategory::KeyNotFound.to_string(), "key_not_found");
    }

    #[test]
    fn test_recovery_error_network() {
        let error = RecoveryError::new(ErrorCategory::NetworkError, "Connection timeout");
        assert_eq!(error.category, ErrorCategory::NetworkError);
        assert_eq!(error.strategy, RecoveryStrategy::Retry);
        assert!(error.retryable);
    }

    #[test]
    fn test_recovery_error_vault_unavailable() {
        let error = RecoveryError::new(ErrorCategory::VaultUnavailable, "Vault unreachable");
        assert_eq!(error.category, ErrorCategory::VaultUnavailable);
        assert_eq!(error.strategy, RecoveryStrategy::UseCache);
        assert!(error.retryable);
    }

    #[test]
    fn test_recovery_error_key_not_found() {
        let error = RecoveryError::new(ErrorCategory::KeyNotFound, "Key missing");
        assert!(!error.retryable);
        assert_eq!(error.strategy, RecoveryStrategy::FailFast);
    }

    #[test]
    fn test_recovery_error_with_retry_count() {
        let error = RecoveryError::new(ErrorCategory::NetworkError, "Timeout").with_retry_count(2);
        assert_eq!(error.retry_count, 2);
    }

    #[test]
    fn test_recovery_error_is_fresh() {
        let error = RecoveryError::new(ErrorCategory::NetworkError, "Timeout");
        assert!(error.is_fresh());
    }

    #[test]
    fn test_retry_config_default() {
        let config = RetryConfig::default();
        assert_eq!(config.max_retries, 3);
        assert_eq!(config.initial_backoff_ms, 100);
        assert_eq!(config.max_backoff_ms, 5000);
    }

    #[test]
    fn test_retry_config_backoff_calculation() {
        let config = RetryConfig::new();
        assert_eq!(config.backoff_delay_ms(0), 100);
        assert_eq!(config.backoff_delay_ms(1), 200);
        assert_eq!(config.backoff_delay_ms(2), 400);
        assert_eq!(config.backoff_delay_ms(3), 800);
    }

    #[test]
    fn test_retry_config_max_backoff() {
        let config = RetryConfig::new().with_max_retries(10);
        let delay = config.backoff_delay_ms(10);
        assert_eq!(delay, config.max_backoff_ms);
    }

    #[test]
    fn test_circuit_breaker_closed() {
        let breaker = CircuitBreaker::new(3, 2);
        assert!(breaker.is_allowed());
        assert_eq!(breaker.state(), CircuitState::Closed);
    }

    #[test]
    fn test_circuit_breaker_opens_on_failure() {
        let breaker = CircuitBreaker::new(3, 2);
        breaker.record_failure();
        breaker.record_failure();
        breaker.record_failure();
        assert!(!breaker.is_allowed());
        assert_eq!(breaker.state(), CircuitState::Open);
    }

    #[test]
    fn test_circuit_breaker_half_open_on_success() {
        let breaker = CircuitBreaker::new(3, 2);
        // Open the circuit
        breaker.record_failure();
        breaker.record_failure();
        breaker.record_failure();

        // Manually transition to half-open for testing
        breaker.state.store(CircuitState::HalfOpen);
        breaker.record_success();
        breaker.record_success();

        assert!(breaker.is_allowed());
        assert_eq!(breaker.state(), CircuitState::Closed);
    }

    #[test]
    fn test_circuit_breaker_reset() {
        let breaker = CircuitBreaker::new(3, 2);
        breaker.record_failure();
        breaker.record_failure();
        breaker.record_failure();
        assert!(!breaker.is_allowed());

        breaker.reset();
        assert!(breaker.is_allowed());
        assert_eq!(breaker.state(), CircuitState::Closed);
    }

    #[test]
    fn test_recovery_error_is_transient() {
        let transient = RecoveryError::new(ErrorCategory::NetworkError, "Network issue");
        assert!(transient.is_transient());

        let not_transient = RecoveryError::new(ErrorCategory::KeyNotFound, "Key missing");
        assert!(!not_transient.is_transient());
    }

    #[test]
    fn test_recovery_error_should_use_cache() {
        let vault_error = RecoveryError::new(ErrorCategory::VaultUnavailable, "Vault down");
        assert!(vault_error.should_use_cache());

        let key_error = RecoveryError::new(ErrorCategory::KeyExpired, "Key expired");
        assert!(!key_error.should_use_cache());
    }

    #[test]
    fn test_retry_config_should_retry() {
        let config = RetryConfig::new().with_max_retries(3);
        assert!(config.should_retry(0));
        assert!(config.should_retry(2));
        assert!(!config.should_retry(3));
        assert!(!config.should_retry(5));
    }

    #[test]
    fn test_circuit_breaker_get_counts() {
        let breaker = CircuitBreaker::new(5, 3);
        let (failures, successes) = breaker.get_counts();
        assert_eq!(failures, 0);
        assert_eq!(successes, 0);

        breaker.record_failure();
        breaker.record_failure();
        let (failures, _) = breaker.get_counts();
        assert_eq!(failures, 2);
    }

    #[test]
    fn test_circuit_breaker_is_open() {
        let breaker = CircuitBreaker::new(2, 2);
        assert!(!breaker.is_open());

        breaker.record_failure();
        breaker.record_failure();
        assert!(breaker.is_open());
    }

    #[test]
    fn test_circuit_breaker_is_half_open() {
        let breaker = CircuitBreaker::new(2, 2);
        assert!(!breaker.is_half_open());

        breaker.record_failure();
        breaker.record_failure();
        breaker.state.store(CircuitState::HalfOpen);
        assert!(breaker.is_half_open());
    }

    #[test]
    fn test_circuit_breaker_attempt_recovery() {
        let breaker = CircuitBreaker::new(1, 2);
        breaker.record_failure();
        assert!(breaker.is_open());

        // Attempt recovery immediately (timeout not elapsed)
        breaker.attempt_recovery(1000);
        assert!(breaker.is_open());

        // Should transition to HalfOpen after timeout
        breaker.attempt_recovery(0);
        assert!(breaker.is_half_open());
    }
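
    // A minimal sketch of how the pieces compose: a retry loop that checks
    // the circuit breaker before each attempt and would back off between
    // attempts. The `vault_fetch` closure is hypothetical; it stands in for
    // a real Vault call that fails twice and then succeeds.
    #[test]
    fn test_retry_loop_with_circuit_breaker_sketch() {
        let config = RetryConfig::new().with_max_retries(3);
        let breaker = CircuitBreaker::new(3, 2);

        let mut calls = 0;
        let mut vault_fetch = || {
            calls += 1;
            calls > 2 // fail on the first two calls, succeed on the third
        };

        let mut attempt = 0;
        let mut succeeded = false;
        while config.should_retry(attempt) && breaker.is_allowed() {
            if vault_fetch() {
                breaker.record_success();
                succeeded = true;
                break;
            }
            breaker.record_failure();
            // In production this would sleep for
            // config.backoff_delay_with_jitter_ms(attempt) milliseconds.
            attempt += 1;
        }

        assert!(succeeded);
        assert_eq!(breaker.state(), CircuitState::Closed);
    }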
}