scirs2_metrics/optimization/distributed/fault_tolerance/mod.rs

//! Fault tolerance and recovery management
//!
//! This module provides comprehensive fault tolerance capabilities:
//! - Automatic failure detection and recovery
//! - Health monitoring and alerting
//! - Node replacement strategies
//! - Data backup and restoration
//! - Circuit breaker patterns
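//!
//! # Example
//!
//! A minimal usage sketch of the recovery manager defined below (the `use`
//! path is illustrative; adjust it to wherever this module is re-exported):
//!
//! ```ignore
//! use scirs2_metrics::optimization::distributed::fault_tolerance::{
//!     FaultRecoveryManager, FaultToleranceConfig, NodeMetrics,
//! };
//!
//! // Create a manager with default settings and start health monitoring.
//! let mut manager = FaultRecoveryManager::new(FaultToleranceConfig::default());
//! manager.start().expect("failed to start monitoring");
//!
//! // Register a node; each metrics update re-evaluates whether a recovery
//! // action is needed and triggers it automatically when enabled.
//! manager
//!     .register_node("node1".to_string(), NodeMetrics::healthy())
//!     .expect("failed to register node");
//! manager
//!     .update_node_metrics("node1", NodeMetrics::degraded())
//!     .expect("failed to update metrics");
//!
//! // Inspect overall cluster health.
//! let summary = manager.get_health_summary();
//! println!("healthy nodes: {}/{}", summary.healthy_nodes, summary.total_nodes);
//! ```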

use crate::error::{MetricsError, Result};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, VecDeque};
use std::sync::{Arc, Mutex, RwLock};
use std::time::{Duration, Instant, SystemTime};

pub use super::config::{FaultToleranceConfig, NodeReplacementStrategy};

/// Recovery action types
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecoveryActionType {
    /// Node failover to backup
    NodeFailover,
    /// Data replication to maintain redundancy
    DataReplication,
    /// Network healing and reconnection
    NetworkHeal,
    /// Service restart
    ServiceRestart,
    /// Resource scaling
    ResourceScaling,
    /// Configuration rollback
    ConfigRollback,
}

/// Node health status
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum NodeHealthStatus {
    /// Node is healthy and responsive
    Healthy,
    /// Node is degraded but functional
    Degraded,
    /// Node has failed
    Failed,
    /// Node status is unknown
    Unknown,
    /// Node is recovering from failure
    Recovering,
    /// Node is being maintained
    Maintenance,
}

/// Alert severity levels
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum AlertSeverity {
    /// Informational alert
    Info,
    /// Warning alert
    Warning,
    /// Error alert
    Error,
    /// Critical alert
    Critical,
    /// Emergency alert
    Emergency,
}

/// Recovery strategy options
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum RecoveryStrategy {
    /// Immediate recovery action
    Immediate,
    /// Graceful recovery with delay
    Graceful { delay: Duration },
    /// Manual recovery only
    Manual,
    /// Automatic with fallback to manual
    AutomaticWithFallback,
    /// Progressive recovery (try multiple strategies)
    Progressive,
}

/// Fault recovery manager
#[derive(Debug)]
pub struct FaultRecoveryManager {
    /// Configuration
    config: FaultToleranceConfig,
    /// Health monitor
    health_monitor: HealthMonitor,
    /// Recovery actions history
    recovery_history: Arc<RwLock<VecDeque<RecoveryAction>>>,
    /// Alert thresholds
    alert_thresholds: AlertThresholds,
    /// Active recovery operations
    active_recoveries: Arc<Mutex<HashMap<String, RecoveryOperation>>>,
    /// Node replacement queue
    replacement_queue: Arc<Mutex<VecDeque<NodeReplacementRequest>>>,
}

impl FaultRecoveryManager {
    /// Create a new fault recovery manager
    pub fn new(config: FaultToleranceConfig) -> Self {
        Self {
            health_monitor: HealthMonitor::new(config.health_check_interval),
            alert_thresholds: AlertThresholds::default(),
            recovery_history: Arc::new(RwLock::new(VecDeque::new())),
            active_recoveries: Arc::new(Mutex::new(HashMap::new())),
            replacement_queue: Arc::new(Mutex::new(VecDeque::new())),
            config,
        }
    }

    /// Start fault monitoring and recovery
    pub fn start(&mut self) -> Result<()> {
        self.health_monitor.start()?;
        Ok(())
    }

    /// Stop fault monitoring
    pub fn stop(&mut self) -> Result<()> {
        self.health_monitor.stop()?;
        Ok(())
    }

    /// Register a node for monitoring
    pub fn register_node(&mut self, node_id: String, metrics: NodeMetrics) -> Result<()> {
        self.health_monitor.register_node(node_id, metrics)
    }

    /// Unregister a node from monitoring
    pub fn unregister_node(&mut self, node_id: &str) -> Result<()> {
        self.health_monitor.unregister_node(node_id)
    }

    /// Update node metrics
    pub fn update_node_metrics(&mut self, node_id: &str, metrics: NodeMetrics) -> Result<()> {
        self.health_monitor
            .update_metrics(node_id, metrics.clone())?;

        // Check if recovery action is needed
        if let Some(action) = self.evaluate_recovery_need(node_id, &metrics)? {
            self.trigger_recovery(action)?;
        }

        Ok(())
    }

    /// Evaluate if recovery action is needed for a node
    fn evaluate_recovery_need(
        &self,
        node_id: &str,
        metrics: &NodeMetrics,
    ) -> Result<Option<RecoveryAction>> {
        // Check CPU threshold
        if metrics.cpu_usage > self.alert_thresholds.cpu_critical {
            return Ok(Some(RecoveryAction {
                id: format!(
                    "recovery_{}_cpu_{}",
                    node_id,
                    SystemTime::now()
                        .duration_since(SystemTime::UNIX_EPOCH)
                        .unwrap_or_default()
                        .as_millis()
                ),
                action_type: RecoveryActionType::ResourceScaling,
                target_node: node_id.to_string(),
                severity: AlertSeverity::Critical,
                description: format!("High CPU usage: {}%", metrics.cpu_usage),
                strategy: RecoveryStrategy::Immediate,
                created_at: SystemTime::now(),
                started_at: None,
                completed_at: None,
                status: RecoveryStatus::Pending,
                error: None,
            }));
        }

        // Check memory threshold
        if metrics.memory_usage > self.alert_thresholds.memory_critical {
            return Ok(Some(RecoveryAction {
                id: format!(
                    "recovery_{}_memory_{}",
                    node_id,
                    SystemTime::now()
                        .duration_since(SystemTime::UNIX_EPOCH)
                        .unwrap_or_default()
                        .as_millis()
                ),
                action_type: RecoveryActionType::ResourceScaling,
                target_node: node_id.to_string(),
                severity: AlertSeverity::Critical,
                description: format!("High memory usage: {}%", metrics.memory_usage),
                strategy: RecoveryStrategy::Immediate,
                created_at: SystemTime::now(),
                started_at: None,
                completed_at: None,
                status: RecoveryStatus::Pending,
                error: None,
            }));
        }

        // Check if node is unresponsive (no heartbeat for three check intervals)
        let last_heartbeat_age = metrics
            .last_heartbeat
            .elapsed()
            .unwrap_or_else(|_| Duration::from_secs(0));
        if last_heartbeat_age > Duration::from_secs(self.config.health_check_interval * 3) {
            return Ok(Some(RecoveryAction {
                id: format!(
                    "recovery_{}_heartbeat_{}",
                    node_id,
                    SystemTime::now()
                        .duration_since(SystemTime::UNIX_EPOCH)
                        .unwrap_or_default()
                        .as_millis()
                ),
                action_type: RecoveryActionType::NodeFailover,
                target_node: node_id.to_string(),
                severity: AlertSeverity::Emergency,
                description: format!("Node unresponsive for {:?}", last_heartbeat_age),
                strategy: RecoveryStrategy::Immediate,
                created_at: SystemTime::now(),
                started_at: None,
                completed_at: None,
                status: RecoveryStatus::Pending,
                error: None,
            }));
        }

        Ok(None)
    }

    /// Trigger a recovery action
    pub fn trigger_recovery(&mut self, mut action: RecoveryAction) -> Result<String> {
        if !self.config.auto_recovery && action.strategy != RecoveryStrategy::Manual {
            // Log the action but don't execute if auto recovery is disabled
            self.log_action(action);
            return Ok(
                "Recovery action logged but not executed (auto recovery disabled)".to_string(),
            );
        }

        // Mark the action as started before registering it as an active operation
        action.started_at = Some(SystemTime::now());
        action.status = RecoveryStatus::InProgress;

        let action_id = action.id.clone();
        let mut active_recoveries = self.active_recoveries.lock().expect("Operation failed");

        let recovery_op = RecoveryOperation {
            action: action.clone(),
            progress: 0.0,
            estimated_completion: None,
        };

        active_recoveries.insert(action_id.clone(), recovery_op);
        drop(active_recoveries);

        // Execute the recovery action
        match action.action_type {
            RecoveryActionType::NodeFailover => {
                self.execute_node_failover(&action)?;
            }
            RecoveryActionType::DataReplication => {
                self.execute_data_replication(&action)?;
            }
            RecoveryActionType::NetworkHeal => {
                self.execute_network_heal(&action)?;
            }
            RecoveryActionType::ServiceRestart => {
                self.execute_service_restart(&action)?;
            }
            RecoveryActionType::ResourceScaling => {
                self.execute_resource_scaling(&action)?;
            }
            RecoveryActionType::ConfigRollback => {
                self.execute_config_rollback(&action)?;
            }
        }

        self.log_action(action);
        Ok(action_id)
    }

    /// Execute node failover
    fn execute_node_failover(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement actual failover logic
        // This would involve:
        // 1. Identifying backup/standby nodes
        // 2. Migrating workload from failed node
        // 3. Updating routing tables
        // 4. Notifying cluster about the change

        println!("Executing node failover for node: {}", action.target_node);

        // Queue node replacement if configured
        match self.config.replacement_strategy {
            NodeReplacementStrategy::Immediate | NodeReplacementStrategy::HotStandby => {
                let mut queue = self.replacement_queue.lock().expect("Operation failed");
                queue.push_back(NodeReplacementRequest {
                    failed_node: action.target_node.clone(),
                    replacement_type: self.config.replacement_strategy.clone(),
                    requested_at: SystemTime::now(),
                    priority: match action.severity {
                        AlertSeverity::Emergency | AlertSeverity::Critical => {
                            ReplacementPriority::High
                        }
                        _ => ReplacementPriority::Normal,
                    },
                });
            }
            _ => {}
        }

        Ok(())
    }

    /// Execute data replication
    fn execute_data_replication(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement data replication logic
        println!(
            "Executing data replication for node: {}",
            action.target_node
        );
        Ok(())
    }

    /// Execute network healing
    fn execute_network_heal(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement network healing logic
        println!("Executing network heal for node: {}", action.target_node);
        Ok(())
    }

    /// Execute service restart
    fn execute_service_restart(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement service restart logic
        println!("Executing service restart for node: {}", action.target_node);
        Ok(())
    }

    /// Execute resource scaling
    fn execute_resource_scaling(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement resource scaling logic
        println!(
            "Executing resource scaling for node: {}",
            action.target_node
        );
        Ok(())
    }

    /// Execute configuration rollback
    fn execute_config_rollback(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement config rollback logic
        println!("Executing config rollback for node: {}", action.target_node);
        Ok(())
    }

    /// Log a recovery action
    fn log_action(&self, action: RecoveryAction) {
        let mut history = self.recovery_history.write().expect("Operation failed");
        history.push_back(action);

        // Keep only recent history
        while history.len() > 10000 {
            history.pop_front();
        }
    }

    /// Get recovery history
    pub fn get_recovery_history(&self) -> Vec<RecoveryAction> {
        let history = self.recovery_history.read().expect("Operation failed");
        history.iter().cloned().collect()
    }

    /// Get active recovery operations
    pub fn get_active_recoveries(&self) -> Vec<RecoveryOperation> {
        let active = self.active_recoveries.lock().expect("Operation failed");
        active.values().cloned().collect()
    }

    /// Complete a recovery operation
    pub fn complete_recovery(
        &mut self,
        action_id: &str,
        success: bool,
        error: Option<String>,
    ) -> Result<()> {
        let mut active_recoveries = self.active_recoveries.lock().expect("Operation failed");

        if let Some(mut recovery_op) = active_recoveries.remove(action_id) {
            recovery_op.action.completed_at = Some(SystemTime::now());
            recovery_op.action.status = if success {
                RecoveryStatus::Completed
            } else {
                RecoveryStatus::Failed
            };
            recovery_op.action.error = error;
            recovery_op.progress = 1.0;

            // Update history
            self.log_action(recovery_op.action);
        }

        Ok(())
    }

    /// Get cluster health summary
    pub fn get_health_summary(&self) -> HealthSummary {
        self.health_monitor.get_health_summary()
    }

    /// Update alert thresholds
    pub fn update_alert_thresholds(&mut self, thresholds: AlertThresholds) {
        self.alert_thresholds = thresholds;
    }

    /// Process node replacement requests
    pub fn process_replacement_requests(&mut self) -> Result<Vec<NodeReplacementRequest>> {
        let mut queue = self.replacement_queue.lock().expect("Operation failed");
        let requests: Vec<_> = queue.drain(..).collect();
        Ok(requests)
    }
}

/// Recovery action
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryAction {
    /// Unique action ID
    pub id: String,
    /// Type of recovery action
    pub action_type: RecoveryActionType,
    /// Target node for the action
    pub target_node: String,
    /// Alert severity
    pub severity: AlertSeverity,
    /// Human-readable description
    pub description: String,
    /// Recovery strategy
    pub strategy: RecoveryStrategy,
    /// When action was created
    pub created_at: SystemTime,
    /// When action started execution
    pub started_at: Option<SystemTime>,
    /// When action completed
    pub completed_at: Option<SystemTime>,
    /// Current status
    pub status: RecoveryStatus,
    /// Error message if failed
    pub error: Option<String>,
}

/// Recovery action status
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum RecoveryStatus {
    /// Action is pending execution
    Pending,
    /// Action is in progress
    InProgress,
    /// Action completed successfully
    Completed,
    /// Action failed
    Failed,
    /// Action was cancelled
    Cancelled,
}

/// Recovery operation (action with progress tracking)
#[derive(Debug, Clone)]
pub struct RecoveryOperation {
    /// The recovery action
    pub action: RecoveryAction,
    /// Progress as a fraction (0.0 - 1.0)
    pub progress: f64,
    /// Estimated completion time
    pub estimated_completion: Option<SystemTime>,
}

/// Health monitoring system
#[derive(Debug)]
pub struct HealthMonitor {
    /// Monitored nodes
    nodes: Arc<RwLock<HashMap<String, NodeMonitoringInfo>>>,
    /// Check interval in seconds
    check_interval: u64,
    /// Monitoring active flag
    is_monitoring: Arc<RwLock<bool>>,
}

impl HealthMonitor {
    /// Create a new health monitor
    pub fn new(check_interval: u64) -> Self {
        Self {
            nodes: Arc::new(RwLock::new(HashMap::new())),
            check_interval,
            is_monitoring: Arc::new(RwLock::new(false)),
        }
    }

    /// Start monitoring
    pub fn start(&mut self) -> Result<()> {
        let mut is_monitoring = self.is_monitoring.write().expect("Operation failed");
        *is_monitoring = true;

        // TODO: Start monitoring thread
        Ok(())
    }

    /// Stop monitoring
    pub fn stop(&mut self) -> Result<()> {
        let mut is_monitoring = self.is_monitoring.write().expect("Operation failed");
        *is_monitoring = false;

        // TODO: Stop monitoring thread
        Ok(())
    }

    /// Register a node for monitoring
    pub fn register_node(&mut self, node_id: String, metrics: NodeMetrics) -> Result<()> {
        let mut nodes = self.nodes.write().expect("Operation failed");

        let monitoring_info = NodeMonitoringInfo {
            node_id: node_id.clone(),
            current_metrics: metrics,
            health_status: NodeHealthStatus::Healthy,
            last_check: Instant::now(),
            failure_count: 0,
            recovery_attempts: 0,
            alerts: VecDeque::new(),
        };

        nodes.insert(node_id, monitoring_info);
        Ok(())
    }

    /// Unregister a node from monitoring
    pub fn unregister_node(&mut self, node_id: &str) -> Result<()> {
        let mut nodes = self.nodes.write().expect("Operation failed");
        nodes.remove(node_id);
        Ok(())
    }

    /// Update node metrics
    pub fn update_metrics(&mut self, node_id: &str, metrics: NodeMetrics) -> Result<()> {
        let mut nodes = self.nodes.write().expect("Operation failed");

        if let Some(monitoring_info) = nodes.get_mut(node_id) {
            monitoring_info.current_metrics = metrics;
            monitoring_info.last_check = Instant::now();
            monitoring_info.health_status =
                self.determine_health_status(&monitoring_info.current_metrics);
        } else {
            return Err(MetricsError::FaultToleranceError(format!(
                "Node {} not registered for monitoring",
                node_id
            )));
        }

        Ok(())
    }

    /// Determine health status based on metrics
    fn determine_health_status(&self, metrics: &NodeMetrics) -> NodeHealthStatus {
        // Check if node is responsive
        let heartbeat_age = metrics
            .last_heartbeat
            .elapsed()
            .unwrap_or_else(|_| Duration::from_secs(0));
        if heartbeat_age > Duration::from_secs(self.check_interval * 3) {
            return NodeHealthStatus::Failed;
        }

        // Check resource utilization; heavily loaded nodes are reported as degraded
        if metrics.cpu_usage > 85.0 || metrics.memory_usage > 85.0 {
            return NodeHealthStatus::Degraded;
        }

        NodeHealthStatus::Healthy
    }

    /// Get health summary for all nodes
    pub fn get_health_summary(&self) -> HealthSummary {
        let nodes = self.nodes.read().expect("Operation failed");

        let mut summary = HealthSummary {
            total_nodes: nodes.len(),
            healthy_nodes: 0,
            degraded_nodes: 0,
            failed_nodes: 0,
            unknown_nodes: 0,
            recovering_nodes: 0,
            maintenance_nodes: 0,
            last_updated: SystemTime::now(),
        };

        for monitoring_info in nodes.values() {
            match monitoring_info.health_status {
                NodeHealthStatus::Healthy => summary.healthy_nodes += 1,
                NodeHealthStatus::Degraded => summary.degraded_nodes += 1,
                NodeHealthStatus::Failed => summary.failed_nodes += 1,
                NodeHealthStatus::Unknown => summary.unknown_nodes += 1,
                NodeHealthStatus::Recovering => summary.recovering_nodes += 1,
                NodeHealthStatus::Maintenance => summary.maintenance_nodes += 1,
            }
        }

        summary
    }

    /// Get node health status
    pub fn get_node_health(&self, node_id: &str) -> Option<NodeHealthStatus> {
        let nodes = self.nodes.read().expect("Operation failed");
        nodes.get(node_id).map(|info| info.health_status.clone())
    }

    /// List all monitored nodes
    pub fn list_nodes(&self) -> Vec<String> {
        let nodes = self.nodes.read().expect("Operation failed");
        nodes.keys().cloned().collect()
    }
}

/// Node monitoring information
#[derive(Debug, Clone)]
pub struct NodeMonitoringInfo {
    /// Node ID
    pub node_id: String,
    /// Current metrics
    pub current_metrics: NodeMetrics,
    /// Current health status
    pub health_status: NodeHealthStatus,
    /// Last health check time
    pub last_check: Instant,
    /// Number of consecutive failures
    pub failure_count: usize,
    /// Number of recovery attempts
    pub recovery_attempts: usize,
    /// Recent alerts
    pub alerts: VecDeque<Alert>,
}

/// Node metrics for health monitoring
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeMetrics {
    /// CPU usage percentage (0-100)
    pub cpu_usage: f64,
    /// Memory usage percentage (0-100)
    pub memory_usage: f64,
    /// Disk usage percentage (0-100)
    pub disk_usage: f64,
    /// Network bandwidth utilization (0-100)
    pub network_usage: f64,
    /// Number of active connections
    pub active_connections: usize,
    /// Response time in milliseconds
    pub response_time_ms: f64,
    /// Error rate (0-1)
    pub error_rate: f64,
    /// Last heartbeat timestamp
    pub last_heartbeat: SystemTime,
    /// Custom metrics
    pub custom_metrics: HashMap<String, f64>,
}

impl NodeMetrics {
    /// Create default metrics for a healthy node
    pub fn healthy() -> Self {
        Self {
            cpu_usage: 10.0,
            memory_usage: 20.0,
            disk_usage: 30.0,
            network_usage: 5.0,
            active_connections: 10,
            response_time_ms: 50.0,
            error_rate: 0.001,
            last_heartbeat: SystemTime::now(),
            custom_metrics: HashMap::new(),
        }
    }

    /// Create metrics for a degraded node
    pub fn degraded() -> Self {
        Self {
            cpu_usage: 90.0, // Above the 85.0 threshold
            memory_usage: 80.0,
            disk_usage: 70.0,
            network_usage: 60.0,
            active_connections: 100,
            response_time_ms: 500.0,
            error_rate: 0.05,
            last_heartbeat: SystemTime::now(),
            custom_metrics: HashMap::new(),
        }
    }
}

/// Alert thresholds configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertThresholds {
    /// CPU usage warning threshold
    pub cpu_warning: f64,
    /// CPU usage critical threshold
    pub cpu_critical: f64,
    /// Memory usage warning threshold
    pub memory_warning: f64,
    /// Memory usage critical threshold
    pub memory_critical: f64,
    /// Response time warning threshold (ms)
    pub response_time_warning: f64,
    /// Response time critical threshold (ms)
    pub response_time_critical: f64,
    /// Error rate warning threshold
    pub error_rate_warning: f64,
    /// Error rate critical threshold
    pub error_rate_critical: f64,
}

impl Default for AlertThresholds {
    fn default() -> Self {
        Self {
            cpu_warning: 80.0,
            cpu_critical: 95.0,
            memory_warning: 80.0,
            memory_critical: 95.0,
            response_time_warning: 1000.0,
            response_time_critical: 5000.0,
            error_rate_warning: 0.01,
            error_rate_critical: 0.05,
        }
    }
}

/// Health summary for the entire cluster
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthSummary {
    /// Total number of nodes
    pub total_nodes: usize,
    /// Number of healthy nodes
    pub healthy_nodes: usize,
    /// Number of degraded nodes
    pub degraded_nodes: usize,
    /// Number of failed nodes
    pub failed_nodes: usize,
    /// Number of nodes with unknown status
    pub unknown_nodes: usize,
    /// Number of recovering nodes
    pub recovering_nodes: usize,
    /// Number of nodes in maintenance
    pub maintenance_nodes: usize,
    /// Last update timestamp
    pub last_updated: SystemTime,
}

impl HealthSummary {
    /// Calculate health percentage (healthy nodes as a share of all nodes; 100% for an empty cluster)
    pub fn health_percentage(&self) -> f64 {
        if self.total_nodes == 0 {
            return 100.0;
        }

        (self.healthy_nodes as f64 / self.total_nodes as f64) * 100.0
    }

    /// Check if the cluster is healthy (no failed nodes and at most 10% of nodes degraded)
    pub fn is_healthy(&self) -> bool {
        self.failed_nodes == 0 && self.degraded_nodes <= (self.total_nodes / 10)
    }
}

/// Alert information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Alert {
    /// Alert ID
    pub id: String,
    /// Alert severity
    pub severity: AlertSeverity,
    /// Alert message
    pub message: String,
    /// Alert timestamp
    pub timestamp: SystemTime,
    /// Whether alert has been acknowledged
    pub acknowledged: bool,
}

/// Node replacement request
#[derive(Debug, Clone)]
pub struct NodeReplacementRequest {
    /// Failed node ID
    pub failed_node: String,
    /// Type of replacement
    pub replacement_type: NodeReplacementStrategy,
    /// When replacement was requested
    pub requested_at: SystemTime,
    /// Request priority
    pub priority: ReplacementPriority,
}

/// Node replacement priority
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum ReplacementPriority {
    /// Low priority replacement
    Low,
    /// Normal priority replacement
    Normal,
    /// High priority replacement
    High,
    /// Emergency replacement
    Emergency,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_fault_recovery_manager_creation() {
        let config = FaultToleranceConfig::default();
        let manager = FaultRecoveryManager::new(config);
        assert_eq!(manager.get_recovery_history().len(), 0);
    }

    #[test]
    fn test_health_monitor_creation() {
        let monitor = HealthMonitor::new(30);
        assert_eq!(monitor.list_nodes().len(), 0);
    }

    #[test]
    fn test_node_registration() {
        let mut monitor = HealthMonitor::new(30);
        let metrics = NodeMetrics::healthy();

        monitor
            .register_node("node1".to_string(), metrics)
            .expect("Operation failed");
        assert_eq!(monitor.list_nodes().len(), 1);
        assert_eq!(
            monitor.get_node_health("node1"),
            Some(NodeHealthStatus::Healthy)
        );
    }
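
    // Additional sketch of the metrics-update path, using only APIs defined in
    // this module: a registered node's health should track its latest metrics,
    // and updates for unknown nodes should be rejected.
    #[test]
    fn test_metrics_update_and_unregistered_node() {
        let mut monitor = HealthMonitor::new(30);
        monitor
            .register_node("node1".to_string(), NodeMetrics::healthy())
            .expect("Operation failed");

        // Feeding degraded metrics should downgrade the node's health status.
        monitor
            .update_metrics("node1", NodeMetrics::degraded())
            .expect("Operation failed");
        assert_eq!(
            monitor.get_node_health("node1"),
            Some(NodeHealthStatus::Degraded)
        );

        // Updates for a node that was never registered must return an error.
        assert!(monitor
            .update_metrics("ghost", NodeMetrics::healthy())
            .is_err());
    }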

    #[test]
    fn test_health_status_determination() {
        let monitor = HealthMonitor::new(30);

        let healthy_metrics = NodeMetrics::healthy();
        assert_eq!(
            monitor.determine_health_status(&healthy_metrics),
            NodeHealthStatus::Healthy
        );

        let degraded_metrics = NodeMetrics::degraded();
        assert_eq!(
            monitor.determine_health_status(&degraded_metrics),
            NodeHealthStatus::Degraded
        );
    }

    #[test]
    fn test_recovery_action_creation() {
        let action = RecoveryAction {
            id: "test_action".to_string(),
            action_type: RecoveryActionType::NodeFailover,
            target_node: "node1".to_string(),
            severity: AlertSeverity::Critical,
            description: "Test recovery action".to_string(),
            strategy: RecoveryStrategy::Immediate,
            created_at: SystemTime::now(),
            started_at: None,
            completed_at: None,
            status: RecoveryStatus::Pending,
            error: None,
        };

        assert_eq!(action.status, RecoveryStatus::Pending);
        assert_eq!(action.severity, AlertSeverity::Critical);
    }
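
    // A minimal sketch of the recovery-trigger path. It only checks behaviour
    // that holds regardless of the `auto_recovery` setting in
    // `FaultToleranceConfig::default()`: every triggered action ends up in the
    // recovery history exactly once.
    #[test]
    fn test_trigger_recovery_records_history() {
        let config = FaultToleranceConfig::default();
        let mut manager = FaultRecoveryManager::new(config);

        let action = RecoveryAction {
            id: "recovery_node1_test".to_string(),
            action_type: RecoveryActionType::ResourceScaling,
            target_node: "node1".to_string(),
            severity: AlertSeverity::Warning,
            description: "Synthetic action for testing".to_string(),
            strategy: RecoveryStrategy::Immediate,
            created_at: SystemTime::now(),
            started_at: None,
            completed_at: None,
            status: RecoveryStatus::Pending,
            error: None,
        };

        manager.trigger_recovery(action).expect("Operation failed");

        let history = manager.get_recovery_history();
        assert_eq!(history.len(), 1);
        assert_eq!(history[0].target_node, "node1");
    }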

    #[test]
    fn test_health_summary() {
        let summary = HealthSummary {
            total_nodes: 10,
            healthy_nodes: 8,
            degraded_nodes: 1,
            failed_nodes: 1,
            unknown_nodes: 0,
            recovering_nodes: 0,
            maintenance_nodes: 0,
            last_updated: SystemTime::now(),
        };

        assert_eq!(summary.health_percentage(), 80.0);
        assert!(!summary.is_healthy()); // Has failed nodes
    }
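
    // Edge-case sketch: an empty cluster should report 100% health and count
    // as healthy, mirroring the guard in `health_percentage`.
    #[test]
    fn test_empty_health_summary() {
        let summary = HealthSummary {
            total_nodes: 0,
            healthy_nodes: 0,
            degraded_nodes: 0,
            failed_nodes: 0,
            unknown_nodes: 0,
            recovering_nodes: 0,
            maintenance_nodes: 0,
            last_updated: SystemTime::now(),
        };

        assert_eq!(summary.health_percentage(), 100.0);
        assert!(summary.is_healthy());
    }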

    #[test]
    fn test_alert_thresholds() {
        let thresholds = AlertThresholds::default();
        assert_eq!(thresholds.cpu_warning, 80.0);
        assert_eq!(thresholds.cpu_critical, 95.0);
        assert!(thresholds.cpu_critical > thresholds.cpu_warning);
    }
}