Skip to main content

rustkernel_core/resilience/
mod.rs

//! Resilience Patterns
//!
//! This module provides production-grade resilience patterns for RustKernels:
//!
//! - **Circuit Breaker**: Prevent cascade failures by detecting unhealthy kernels
//! - **Timeout**: Deadline propagation and timeout enforcement
//! - **Recovery**: Automatic recovery from transient failures
//! - **Health**: Health checking for liveness/readiness probes
//!
//! # Example
//!
//! ```rust,ignore
//! use rustkernel_core::resilience::{CircuitBreaker, CircuitBreakerConfig};
//! use std::time::Duration;
//!
//! let config = CircuitBreakerConfig::default()
//!     .failure_threshold(5)
//!     .reset_timeout(Duration::from_secs(30));
//!
//! let cb = CircuitBreaker::new("graph/pagerank", config);
//!
//! // Execute with circuit breaker protection
//! cb.execute(|| async {
//!     kernel.execute(input).await
//! }).await?;
//! ```
26
27pub mod circuit_breaker;
28pub mod health;
29pub mod recovery;
30pub mod timeout;
31
32pub use circuit_breaker::{CircuitBreaker, CircuitBreakerConfig, CircuitState};
33pub use health::{HealthCheck, HealthCheckResult, HealthProbe};
34pub use recovery::{RecoveryPolicy, RecoveryStrategy, RetryConfig};
35pub use timeout::{DeadlineContext, TimeoutConfig, TimeoutError};
36
37// Re-export ringkernel-core 0.4.2 health/resilience primitives for deep integration.
38pub use ringkernel_core::health as ring_health;
39pub use ringkernel_core::rate_limiting as ring_rate_limiting;
40pub use ringkernel_core::timeout as ring_timeout;
41
42use serde::{Deserialize, Serialize};
43use std::time::Duration;
44
45/// Unified resilience configuration
46#[derive(Debug, Clone, Serialize, Deserialize)]
47pub struct ResilienceConfig {
48    /// Circuit breaker configuration
49    pub circuit_breaker: Option<CircuitBreakerConfig>,
50    /// Timeout configuration
51    pub timeout: Option<TimeoutConfig>,
52    /// Recovery policy
53    pub recovery: Option<RecoveryPolicy>,
54    /// Health check configuration
55    pub health_check_interval: Duration,
56}
57
58impl Default for ResilienceConfig {
59    fn default() -> Self {
60        Self {
61            circuit_breaker: Some(CircuitBreakerConfig::default()),
62            timeout: Some(TimeoutConfig::default()),
63            recovery: Some(RecoveryPolicy::default()),
64            health_check_interval: Duration::from_secs(10),
65        }
66    }
67}
68
69impl ResilienceConfig {
70    /// Create a new resilience config
71    pub fn new() -> Self {
72        Self::default()
73    }
74
75    /// Disable all resilience features
76    pub fn disabled() -> Self {
77        Self {
78            circuit_breaker: None,
79            timeout: None,
80            recovery: None,
81            health_check_interval: Duration::from_secs(60),
82        }
83    }
84
85    /// Production configuration with conservative settings
86    pub fn production() -> Self {
87        Self {
88            circuit_breaker: Some(CircuitBreakerConfig::production()),
89            timeout: Some(TimeoutConfig::production()),
90            recovery: Some(RecoveryPolicy::production()),
91            health_check_interval: Duration::from_secs(10),
92        }
93    }
94
95    /// Development configuration with relaxed settings
96    pub fn development() -> Self {
97        Self {
98            circuit_breaker: Some(CircuitBreakerConfig::default()),
99            timeout: Some(TimeoutConfig::development()),
100            recovery: Some(RecoveryPolicy::development()),
101            health_check_interval: Duration::from_secs(30),
102        }
103    }
104
105    /// Set circuit breaker config
106    pub fn with_circuit_breaker(mut self, config: CircuitBreakerConfig) -> Self {
107        self.circuit_breaker = Some(config);
108        self
109    }
110
111    /// Set timeout config
112    pub fn with_timeout(mut self, config: TimeoutConfig) -> Self {
113        self.timeout = Some(config);
114        self
115    }
116
117    /// Set recovery policy
118    pub fn with_recovery(mut self, policy: RecoveryPolicy) -> Self {
119        self.recovery = Some(policy);
120        self
121    }
122
123    /// Set health check interval
124    pub fn with_health_check_interval(mut self, interval: Duration) -> Self {
125        self.health_check_interval = interval;
126        self
127    }
128}
129
130/// Result type for resilience operations
131pub type ResilienceResult<T> = std::result::Result<T, ResilienceError>;
132
133/// Errors from resilience patterns
134#[derive(Debug, thiserror::Error)]
135pub enum ResilienceError {
136    /// Circuit breaker is open
137    #[error("Circuit breaker is open for {kernel_id}")]
138    CircuitOpen {
139        /// The kernel ID whose circuit breaker is open
140        kernel_id: String,
141    },
142
143    /// Request timed out
144    #[error("Request timed out after {timeout:?}")]
145    Timeout {
146        /// The timeout duration that was exceeded
147        timeout: Duration,
148    },
149
150    /// Deadline exceeded
151    #[error("Deadline exceeded")]
152    DeadlineExceeded,
153
154    /// Max retries exceeded
155    #[error("Max retries ({retries}) exceeded")]
156    MaxRetriesExceeded {
157        /// The number of retries that were attempted
158        retries: u32,
159    },
160
161    /// Health check failed
162    #[error("Health check failed: {reason}")]
163    HealthCheckFailed {
164        /// The reason for the health check failure
165        reason: String,
166    },
167
168    /// Kernel error during execution
169    #[error("Kernel error: {0}")]
170    KernelError(#[from] crate::error::KernelError),
171}
172
#[cfg(test)]
mod tests {
    use super::*;

    /// The default preset enables every pattern and checks health every 10s.
    #[test]
    fn test_default_config() {
        let config = ResilienceConfig::default();
        assert!(config.circuit_breaker.is_some());
        assert!(config.timeout.is_some());
        assert!(config.recovery.is_some());
        assert_eq!(config.health_check_interval, Duration::from_secs(10));
    }

    /// `new()` is documented as an alias for `default()`.
    #[test]
    fn test_new_matches_default() {
        let config = ResilienceConfig::new();
        assert!(config.circuit_breaker.is_some());
        assert!(config.timeout.is_some());
        assert!(config.recovery.is_some());
        assert_eq!(
            config.health_check_interval,
            ResilienceConfig::default().health_check_interval
        );
    }

    /// The disabled preset clears all optional sections but keeps an interval.
    #[test]
    fn test_disabled_config() {
        let config = ResilienceConfig::disabled();
        assert!(config.circuit_breaker.is_none());
        assert!(config.timeout.is_none());
        assert!(config.recovery.is_none());
        assert_eq!(config.health_check_interval, Duration::from_secs(60));
    }

    /// The production preset populates every section, including recovery.
    #[test]
    fn test_production_config() {
        let config = ResilienceConfig::production();
        assert!(config.circuit_breaker.is_some());
        assert!(config.timeout.is_some());
        assert!(config.recovery.is_some());
        assert_eq!(config.health_check_interval, Duration::from_secs(10));
    }

    /// The development preset populates every section with a 30s interval.
    #[test]
    fn test_development_config() {
        let config = ResilienceConfig::development();
        assert!(config.circuit_breaker.is_some());
        assert!(config.timeout.is_some());
        assert!(config.recovery.is_some());
        assert_eq!(config.health_check_interval, Duration::from_secs(30));
    }

    /// Builder methods re-enable sections on a disabled configuration.
    #[test]
    fn test_builder_overrides() {
        let config = ResilienceConfig::disabled()
            .with_circuit_breaker(CircuitBreakerConfig::default())
            .with_timeout(TimeoutConfig::default())
            .with_recovery(RecoveryPolicy::default())
            .with_health_check_interval(Duration::from_secs(5));

        assert!(config.circuit_breaker.is_some());
        assert!(config.timeout.is_some());
        assert!(config.recovery.is_some());
        assert_eq!(config.health_check_interval, Duration::from_secs(5));
    }

    /// Error messages match the `#[error]` templates on each variant.
    #[test]
    fn test_error_messages() {
        let open = ResilienceError::CircuitOpen {
            kernel_id: "graph/pagerank".to_string(),
        };
        assert_eq!(
            open.to_string(),
            "Circuit breaker is open for graph/pagerank"
        );

        let timed_out = ResilienceError::Timeout {
            timeout: Duration::from_secs(5),
        };
        assert_eq!(timed_out.to_string(), "Request timed out after 5s");

        assert_eq!(
            ResilienceError::DeadlineExceeded.to_string(),
            "Deadline exceeded"
        );

        let exhausted = ResilienceError::MaxRetriesExceeded { retries: 3 };
        assert_eq!(exhausted.to_string(), "Max retries (3) exceeded");

        let unhealthy = ResilienceError::HealthCheckFailed {
            reason: "probe unreachable".to_string(),
        };
        assert_eq!(
            unhealthy.to_string(),
            "Health check failed: probe unreachable"
        );
    }
}