scirs2-metrics 0.3.2

//! Fault tolerance and recovery management
//!
//! This module provides comprehensive fault tolerance capabilities:
//! - Automatic failure detection and recovery
//! - Health monitoring and alerting
//! - Node replacement strategies
//! - Data backup and restoration
//! - Circuit breaker patterns

use crate::error::{MetricsError, Result};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, VecDeque};
use std::sync::{Arc, Mutex, RwLock};
use std::time::{Duration, Instant, SystemTime};

pub use super::config::{FaultToleranceConfig, NodeReplacementStrategy};

/// Recovery action types
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecoveryActionType {
    /// Node failover to backup
    NodeFailover,
    /// Data replication to maintain redundancy
    DataReplication,
    /// Network healing and reconnection
    NetworkHeal,
    /// Service restart
    ServiceRestart,
    /// Resource scaling
    ResourceScaling,
    /// Configuration rollback
    ConfigRollback,
}

/// Node health status
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum NodeHealthStatus {
    /// Node is healthy and responsive
    Healthy,
    /// Node is degraded but functional
    Degraded,
    /// Node has failed
    Failed,
    /// Node status is unknown
    Unknown,
    /// Node is recovering from failure
    Recovering,
    /// Node is being maintained
    Maintenance,
}

/// Alert severity levels
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum AlertSeverity {
    /// Informational alert
    Info,
    /// Warning alert
    Warning,
    /// Error alert
    Error,
    /// Critical alert
    Critical,
    /// Emergency alert
    Emergency,
}

/// Recovery strategy options
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum RecoveryStrategy {
    /// Immediate recovery action
    Immediate,
    /// Graceful recovery with delay
    Graceful { delay: Duration },
    /// Manual recovery only
    Manual,
    /// Automatic with fallback to manual
    AutomaticWithFallback,
    /// Progressive recovery (try multiple strategies)
    Progressive,
}

/// Fault recovery manager
#[derive(Debug)]
pub struct FaultRecoveryManager {
    /// Configuration
    config: FaultToleranceConfig,
    /// Health monitor
    health_monitor: HealthMonitor,
    /// Recovery actions history
    recovery_history: Arc<RwLock<VecDeque<RecoveryAction>>>,
    /// Alert thresholds
    alert_thresholds: AlertThresholds,
    /// Active recovery operations
    active_recoveries: Arc<Mutex<HashMap<String, RecoveryOperation>>>,
    /// Node replacement queue
    replacement_queue: Arc<Mutex<VecDeque<NodeReplacementRequest>>>,
}

impl FaultRecoveryManager {
    /// Create a new fault recovery manager
    pub fn new(config: FaultToleranceConfig) -> Self {
        Self {
            health_monitor: HealthMonitor::new(config.health_check_interval),
            alert_thresholds: AlertThresholds::default(),
            recovery_history: Arc::new(RwLock::new(VecDeque::new())),
            active_recoveries: Arc::new(Mutex::new(HashMap::new())),
            replacement_queue: Arc::new(Mutex::new(VecDeque::new())),
            config,
        }
    }

    /// Start fault monitoring and recovery
    pub fn start(&mut self) -> Result<()> {
        self.health_monitor.start()?;
        Ok(())
    }

    /// Stop fault monitoring
    pub fn stop(&mut self) -> Result<()> {
        self.health_monitor.stop()?;
        Ok(())
    }

    /// Register a node for monitoring
    pub fn register_node(&mut self, node_id: String, metrics: NodeMetrics) -> Result<()> {
        self.health_monitor.register_node(node_id, metrics)
    }

    /// Unregister a node from monitoring
    pub fn unregister_node(&mut self, node_id: &str) -> Result<()> {
        self.health_monitor.unregister_node(node_id)
    }

    /// Update node metrics
    pub fn update_node_metrics(&mut self, node_id: &str, metrics: NodeMetrics) -> Result<()> {
        self.health_monitor
            .update_metrics(node_id, metrics.clone())?;

        // Check if recovery action is needed
        if let Some(action) = self.evaluate_recovery_need(node_id, &metrics)? {
            self.trigger_recovery(action)?;
        }

        Ok(())
    }

    /// Evaluate if recovery action is needed for a node
    fn evaluate_recovery_need(
        &self,
        node_id: &str,
        metrics: &NodeMetrics,
    ) -> Result<Option<RecoveryAction>> {
        // Check CPU threshold
        if metrics.cpu_usage > self.alert_thresholds.cpu_critical {
            return Ok(Some(RecoveryAction {
                id: format!(
                    "recovery_{}_cpu_{}",
                    node_id,
                    Instant::now().elapsed().as_millis()
                ),
                action_type: RecoveryActionType::ResourceScaling,
                target_node: node_id.to_string(),
                severity: AlertSeverity::Critical,
                description: format!("High CPU usage: {}%", metrics.cpu_usage),
                strategy: RecoveryStrategy::Immediate,
                created_at: SystemTime::now(),
                started_at: None,
                completed_at: None,
                status: RecoveryStatus::Pending,
                error: None,
            }));
        }

        // Check memory threshold
        if metrics.memory_usage > self.alert_thresholds.memory_critical {
            return Ok(Some(RecoveryAction {
                id: format!(
                    "recovery_{}_memory_{}",
                    node_id,
                    Instant::now().elapsed().as_millis()
                ),
                action_type: RecoveryActionType::ResourceScaling,
                target_node: node_id.to_string(),
                severity: AlertSeverity::Critical,
                description: format!("High memory usage: {}%", metrics.memory_usage),
                strategy: RecoveryStrategy::Immediate,
                created_at: SystemTime::now(),
                started_at: None,
                completed_at: None,
                status: RecoveryStatus::Pending,
                error: None,
            }));
        }

        // Check if node is unresponsive
        let last_heartbeat_age = metrics
            .last_heartbeat
            .elapsed()
            .unwrap_or_else(|_| Duration::from_secs(0));
        if last_heartbeat_age > Duration::from_secs(self.config.health_check_interval * 3) {
            return Ok(Some(RecoveryAction {
                id: format!(
                    "recovery_{}_heartbeat_{}",
                    node_id,
                    SystemTime::now()
                        .duration_since(SystemTime::UNIX_EPOCH)
                        .unwrap_or_default()
                        .as_millis()
                ),
                action_type: RecoveryActionType::NodeFailover,
                target_node: node_id.to_string(),
                severity: AlertSeverity::Emergency,
                description: format!("Node unresponsive for {:?}", last_heartbeat_age),
                strategy: RecoveryStrategy::Immediate,
                created_at: SystemTime::now(),
                started_at: None,
                completed_at: None,
                status: RecoveryStatus::Pending,
                error: None,
            }));
        }

        Ok(None)
    }

    /// Trigger a recovery action
    pub fn trigger_recovery(&mut self, action: RecoveryAction) -> Result<String> {
        if !self.config.auto_recovery && action.strategy != RecoveryStrategy::Manual {
            // Log the action but don't execute if auto recovery is disabled
            self.log_action(action);
            return Ok(
                "Recovery action logged but not executed (auto recovery disabled)".to_string(),
            );
        }

        let action_id = action.id.clone();
        let mut active_recoveries = self.active_recoveries.lock().expect("Operation failed");

        let recovery_op = RecoveryOperation {
            action: action.clone(),
            progress: 0.0,
            estimated_completion: None,
        };

        active_recoveries.insert(action_id.clone(), recovery_op);
        drop(active_recoveries);

        // Execute the recovery action
        match action.action_type {
            RecoveryActionType::NodeFailover => {
                self.execute_node_failover(&action)?;
            }
            RecoveryActionType::DataReplication => {
                self.execute_data_replication(&action)?;
            }
            RecoveryActionType::NetworkHeal => {
                self.execute_network_heal(&action)?;
            }
            RecoveryActionType::ServiceRestart => {
                self.execute_service_restart(&action)?;
            }
            RecoveryActionType::ResourceScaling => {
                self.execute_resource_scaling(&action)?;
            }
            RecoveryActionType::ConfigRollback => {
                self.execute_config_rollback(&action)?;
            }
        }

        self.log_action(action);
        Ok(action_id)
    }

    /// Execute node failover
    fn execute_node_failover(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement actual failover logic
        // This would involve:
        // 1. Identifying backup/standby nodes
        // 2. Migrating workload from failed node
        // 3. Updating routing tables
        // 4. Notifying cluster about the change

        println!("Executing node failover for node: {}", action.target_node);

        // Queue node replacement if configured
        match self.config.replacement_strategy {
            NodeReplacementStrategy::Immediate | NodeReplacementStrategy::HotStandby => {
                let mut queue = self.replacement_queue.lock().expect("Operation failed");
                queue.push_back(NodeReplacementRequest {
                    failed_node: action.target_node.clone(),
                    replacement_type: self.config.replacement_strategy.clone(),
                    requested_at: SystemTime::now(),
                    priority: match action.severity {
                        AlertSeverity::Emergency | AlertSeverity::Critical => {
                            ReplacementPriority::High
                        }
                        _ => ReplacementPriority::Normal,
                    },
                });
            }
            _ => {}
        }

        Ok(())
    }

    /// Execute data replication
    fn execute_data_replication(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement data replication logic
        println!(
            "Executing data replication for node: {}",
            action.target_node
        );
        Ok(())
    }

    /// Execute network healing
    fn execute_network_heal(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement network healing logic
        println!("Executing network heal for node: {}", action.target_node);
        Ok(())
    }

    /// Execute service restart
    fn execute_service_restart(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement service restart logic
        println!("Executing service restart for node: {}", action.target_node);
        Ok(())
    }

    /// Execute resource scaling
    fn execute_resource_scaling(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement resource scaling logic
        println!(
            "Executing resource scaling for node: {}",
            action.target_node
        );
        Ok(())
    }

    /// Execute configuration rollback
    fn execute_config_rollback(&self, action: &RecoveryAction) -> Result<()> {
        // TODO: Implement config rollback logic
        println!("Executing config rollback for node: {}", action.target_node);
        Ok(())
    }

    /// Log a recovery action
    fn log_action(&self, action: RecoveryAction) {
        let mut history = self.recovery_history.write().expect("Operation failed");
        history.push_back(action);

        // Keep only recent history
        while history.len() > 10000 {
            history.pop_front();
        }
    }

    /// Get recovery history
    pub fn get_recovery_history(&self) -> Vec<RecoveryAction> {
        let history = self.recovery_history.read().expect("Operation failed");
        history.iter().cloned().collect()
    }

    /// Get active recovery operations
    pub fn get_active_recoveries(&self) -> Vec<RecoveryOperation> {
        let active = self.active_recoveries.lock().expect("Operation failed");
        active.values().cloned().collect()
    }

    /// Complete a recovery operation
    pub fn complete_recovery(
        &mut self,
        action_id: &str,
        success: bool,
        error: Option<String>,
    ) -> Result<()> {
        let mut active_recoveries = self.active_recoveries.lock().expect("Operation failed");

        if let Some(mut recovery_op) = active_recoveries.remove(action_id) {
            recovery_op.action.completed_at = Some(SystemTime::now());
            recovery_op.action.status = if success {
                RecoveryStatus::Completed
            } else {
                RecoveryStatus::Failed
            };
            recovery_op.action.error = error;
            recovery_op.progress = 1.0;

            // Update history
            self.log_action(recovery_op.action);
        }

        Ok(())
    }

    /// Get cluster health summary
    pub fn get_health_summary(&self) -> HealthSummary {
        self.health_monitor.get_health_summary()
    }

    /// Update alert thresholds
    pub fn update_alert_thresholds(&mut self, thresholds: AlertThresholds) {
        self.alert_thresholds = thresholds;
    }

    /// Process node replacement requests
    pub fn process_replacement_requests(&mut self) -> Result<Vec<NodeReplacementRequest>> {
        let mut queue = self.replacement_queue.lock().expect("Operation failed");
        let requests: Vec<_> = queue.drain(..).collect();
        Ok(requests)
    }
}

/// Recovery action
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryAction {
    /// Unique action ID
    pub id: String,
    /// Type of recovery action
    pub action_type: RecoveryActionType,
    /// Target node for the action
    pub target_node: String,
    /// Alert severity
    pub severity: AlertSeverity,
    /// Human-readable description
    pub description: String,
    /// Recovery strategy
    pub strategy: RecoveryStrategy,
    /// When action was created
    pub created_at: SystemTime,
    /// When action started execution
    pub started_at: Option<SystemTime>,
    /// When action completed
    pub completed_at: Option<SystemTime>,
    /// Current status
    pub status: RecoveryStatus,
    /// Error message if failed
    pub error: Option<String>,
}

/// Recovery action status
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum RecoveryStatus {
    /// Action is pending execution
    Pending,
    /// Action is in progress
    InProgress,
    /// Action completed successfully
    Completed,
    /// Action failed
    Failed,
    /// Action was cancelled
    Cancelled,
}

/// Recovery operation (action with progress tracking)
#[derive(Debug, Clone)]
pub struct RecoveryOperation {
    /// The recovery action
    pub action: RecoveryAction,
    /// Progress percentage (0.0 - 1.0)
    pub progress: f64,
    /// Estimated completion time
    pub estimated_completion: Option<SystemTime>,
}

/// Health monitoring system
#[derive(Debug)]
pub struct HealthMonitor {
    /// Monitored nodes
    nodes: Arc<RwLock<HashMap<String, NodeMonitoringInfo>>>,
    /// Check interval
    check_interval: u64,
    /// Monitoring active flag
    is_monitoring: Arc<RwLock<bool>>,
}

impl HealthMonitor {
    /// Create a new health monitor
    pub fn new(check_interval: u64) -> Self {
        Self {
            nodes: Arc::new(RwLock::new(HashMap::new())),
            check_interval,
            is_monitoring: Arc::new(RwLock::new(false)),
        }
    }

    /// Start monitoring
    pub fn start(&mut self) -> Result<()> {
        let mut is_monitoring = self.is_monitoring.write().expect("Operation failed");
        *is_monitoring = true;

        // TODO: Start monitoring thread
        Ok(())
    }

    /// Stop monitoring
    pub fn stop(&mut self) -> Result<()> {
        let mut is_monitoring = self.is_monitoring.write().expect("Operation failed");
        *is_monitoring = false;

        // TODO: Stop monitoring thread
        Ok(())
    }

    /// Register a node for monitoring
    pub fn register_node(&mut self, node_id: String, metrics: NodeMetrics) -> Result<()> {
        let mut nodes = self.nodes.write().expect("Operation failed");

        let monitoring_info = NodeMonitoringInfo {
            node_id: node_id.clone(),
            current_metrics: metrics,
            health_status: NodeHealthStatus::Healthy,
            last_check: Instant::now(),
            failure_count: 0,
            recovery_attempts: 0,
            alerts: VecDeque::new(),
        };

        nodes.insert(node_id, monitoring_info);
        Ok(())
    }

    /// Unregister a node from monitoring
    pub fn unregister_node(&mut self, node_id: &str) -> Result<()> {
        let mut nodes = self.nodes.write().expect("Operation failed");
        nodes.remove(node_id);
        Ok(())
    }

    /// Update node metrics
    pub fn update_metrics(&mut self, node_id: &str, metrics: NodeMetrics) -> Result<()> {
        let mut nodes = self.nodes.write().expect("Operation failed");

        if let Some(monitoring_info) = nodes.get_mut(node_id) {
            monitoring_info.current_metrics = metrics;
            monitoring_info.last_check = Instant::now();
            monitoring_info.health_status =
                self.determine_health_status(&monitoring_info.current_metrics);
        } else {
            return Err(MetricsError::FaultToleranceError(format!(
                "Node {} not registered for monitoring",
                node_id
            )));
        }

        Ok(())
    }

    /// Determine health status based on metrics
    fn determine_health_status(&self, metrics: &NodeMetrics) -> NodeHealthStatus {
        // Check if node is responsive
        let heartbeat_age = metrics
            .last_heartbeat
            .elapsed()
            .unwrap_or_else(|_| Duration::from_secs(0));
        if heartbeat_age > Duration::from_secs(self.check_interval * 3) {
            return NodeHealthStatus::Failed;
        }

        // Check resource utilization
        if metrics.cpu_usage > 95.0 || metrics.memory_usage > 95.0 {
            return NodeHealthStatus::Degraded;
        }

        if metrics.cpu_usage > 85.0 || metrics.memory_usage > 85.0 {
            return NodeHealthStatus::Degraded;
        }

        NodeHealthStatus::Healthy
    }

    /// Get health summary for all nodes
    pub fn get_health_summary(&self) -> HealthSummary {
        let nodes = self.nodes.read().expect("Operation failed");

        let mut summary = HealthSummary {
            total_nodes: nodes.len(),
            healthy_nodes: 0,
            degraded_nodes: 0,
            failed_nodes: 0,
            unknown_nodes: 0,
            recovering_nodes: 0,
            maintenance_nodes: 0,
            last_updated: SystemTime::now(),
        };

        for monitoring_info in nodes.values() {
            match monitoring_info.health_status {
                NodeHealthStatus::Healthy => summary.healthy_nodes += 1,
                NodeHealthStatus::Degraded => summary.degraded_nodes += 1,
                NodeHealthStatus::Failed => summary.failed_nodes += 1,
                NodeHealthStatus::Unknown => summary.unknown_nodes += 1,
                NodeHealthStatus::Recovering => summary.recovering_nodes += 1,
                NodeHealthStatus::Maintenance => summary.maintenance_nodes += 1,
            }
        }

        summary
    }

    /// Get node health status
    pub fn get_node_health(&self, node_id: &str) -> Option<NodeHealthStatus> {
        let nodes = self.nodes.read().expect("Operation failed");
        nodes.get(node_id).map(|info| info.health_status.clone())
    }

    /// List all monitored nodes
    pub fn list_nodes(&self) -> Vec<String> {
        let nodes = self.nodes.read().expect("Operation failed");
        nodes.keys().cloned().collect()
    }
}

/// Node monitoring information
#[derive(Debug, Clone)]
pub struct NodeMonitoringInfo {
    /// Node ID
    pub node_id: String,
    /// Current metrics
    pub current_metrics: NodeMetrics,
    /// Current health status
    pub health_status: NodeHealthStatus,
    /// Last health check time
    pub last_check: Instant,
    /// Number of consecutive failures
    pub failure_count: usize,
    /// Number of recovery attempts
    pub recovery_attempts: usize,
    /// Recent alerts
    pub alerts: VecDeque<Alert>,
}

/// Node metrics for health monitoring
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeMetrics {
    /// CPU usage percentage (0-100)
    pub cpu_usage: f64,
    /// Memory usage percentage (0-100)
    pub memory_usage: f64,
    /// Disk usage percentage (0-100)
    pub disk_usage: f64,
    /// Network bandwidth utilization (0-100)
    pub network_usage: f64,
    /// Number of active connections
    pub active_connections: usize,
    /// Response time in milliseconds
    pub response_time_ms: f64,
    /// Error rate (0-1)
    pub error_rate: f64,
    /// Last heartbeat timestamp
    pub last_heartbeat: SystemTime,
    /// Custom metrics
    pub custom_metrics: HashMap<String, f64>,
}

impl NodeMetrics {
    /// Create default metrics for a healthy node
    pub fn healthy() -> Self {
        Self {
            cpu_usage: 10.0,
            memory_usage: 20.0,
            disk_usage: 30.0,
            network_usage: 5.0,
            active_connections: 10,
            response_time_ms: 50.0,
            error_rate: 0.001,
            last_heartbeat: SystemTime::now(),
            custom_metrics: HashMap::new(),
        }
    }

    /// Create metrics for a degraded node
    pub fn degraded() -> Self {
        Self {
            cpu_usage: 90.0, // Above the 85.0 threshold
            memory_usage: 80.0,
            disk_usage: 70.0,
            network_usage: 60.0,
            active_connections: 100,
            response_time_ms: 500.0,
            error_rate: 0.05,
            last_heartbeat: SystemTime::now(),
            custom_metrics: HashMap::new(),
        }
    }
}

/// Alert thresholds configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertThresholds {
    /// CPU usage warning threshold
    pub cpu_warning: f64,
    /// CPU usage critical threshold
    pub cpu_critical: f64,
    /// Memory usage warning threshold
    pub memory_warning: f64,
    /// Memory usage critical threshold
    pub memory_critical: f64,
    /// Response time warning threshold (ms)
    pub response_time_warning: f64,
    /// Response time critical threshold (ms)
    pub response_time_critical: f64,
    /// Error rate warning threshold
    pub error_rate_warning: f64,
    /// Error rate critical threshold
    pub error_rate_critical: f64,
}

impl Default for AlertThresholds {
    fn default() -> Self {
        Self {
            cpu_warning: 80.0,
            cpu_critical: 95.0,
            memory_warning: 80.0,
            memory_critical: 95.0,
            response_time_warning: 1000.0,
            response_time_critical: 5000.0,
            error_rate_warning: 0.01,
            error_rate_critical: 0.05,
        }
    }
}

/// Health summary for the entire cluster
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthSummary {
    /// Total number of nodes
    pub total_nodes: usize,
    /// Number of healthy nodes
    pub healthy_nodes: usize,
    /// Number of degraded nodes
    pub degraded_nodes: usize,
    /// Number of failed nodes
    pub failed_nodes: usize,
    /// Number of nodes with unknown status
    pub unknown_nodes: usize,
    /// Number of recovering nodes
    pub recovering_nodes: usize,
    /// Number of nodes in maintenance
    pub maintenance_nodes: usize,
    /// Last update timestamp
    pub last_updated: SystemTime,
}

impl HealthSummary {
    /// Calculate health percentage
    pub fn health_percentage(&self) -> f64 {
        if self.total_nodes == 0 {
            return 100.0;
        }

        (self.healthy_nodes as f64 / self.total_nodes as f64) * 100.0
    }

    /// Check if cluster is healthy
    pub fn is_healthy(&self) -> bool {
        self.failed_nodes == 0 && self.degraded_nodes <= (self.total_nodes / 10)
    }
}

/// Alert information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Alert {
    /// Alert ID
    pub id: String,
    /// Alert severity
    pub severity: AlertSeverity,
    /// Alert message
    pub message: String,
    /// Alert timestamp
    pub timestamp: SystemTime,
    /// Whether alert has been acknowledged
    pub acknowledged: bool,
}

/// Node replacement request
#[derive(Debug, Clone)]
pub struct NodeReplacementRequest {
    /// Failed node ID
    pub failed_node: String,
    /// Type of replacement
    pub replacement_type: NodeReplacementStrategy,
    /// When replacement was requested
    pub requested_at: SystemTime,
    /// Request priority
    pub priority: ReplacementPriority,
}

/// Node replacement priority
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum ReplacementPriority {
    /// Low priority replacement
    Low,
    /// Normal priority replacement
    Normal,
    /// High priority replacement
    High,
    /// Emergency replacement
    Emergency,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_fault_recovery_manager_creation() {
        let config = FaultToleranceConfig::default();
        let manager = FaultRecoveryManager::new(config);
        assert_eq!(manager.get_recovery_history().len(), 0);
    }

    #[test]
    fn test_health_monitor_creation() {
        let monitor = HealthMonitor::new(30);
        assert_eq!(monitor.list_nodes().len(), 0);
    }

    #[test]
    fn test_node_registration() {
        let mut monitor = HealthMonitor::new(30);
        let metrics = NodeMetrics::healthy();

        monitor
            .register_node("node1".to_string(), metrics)
            .expect("Operation failed");
        assert_eq!(monitor.list_nodes().len(), 1);
        assert_eq!(
            monitor.get_node_health("node1"),
            Some(NodeHealthStatus::Healthy)
        );
    }

    #[test]
    fn test_health_status_determination() {
        let monitor = HealthMonitor::new(30);

        let healthy_metrics = NodeMetrics::healthy();
        assert_eq!(
            monitor.determine_health_status(&healthy_metrics),
            NodeHealthStatus::Healthy
        );

        let degraded_metrics = NodeMetrics::degraded();
        assert_eq!(
            monitor.determine_health_status(&degraded_metrics),
            NodeHealthStatus::Degraded
        );
    }

    #[test]
    fn test_recovery_action_creation() {
        let action = RecoveryAction {
            id: "test_action".to_string(),
            action_type: RecoveryActionType::NodeFailover,
            target_node: "node1".to_string(),
            severity: AlertSeverity::Critical,
            description: "Test recovery action".to_string(),
            strategy: RecoveryStrategy::Immediate,
            created_at: SystemTime::now(),
            started_at: None,
            completed_at: None,
            status: RecoveryStatus::Pending,
            error: None,
        };

        assert_eq!(action.status, RecoveryStatus::Pending);
        assert_eq!(action.severity, AlertSeverity::Critical);
    }

    #[test]
    fn test_health_summary() {
        let summary = HealthSummary {
            total_nodes: 10,
            healthy_nodes: 8,
            degraded_nodes: 1,
            failed_nodes: 1,
            unknown_nodes: 0,
            recovering_nodes: 0,
            maintenance_nodes: 0,
            last_updated: SystemTime::now(),
        };

        assert_eq!(summary.health_percentage(), 80.0);
        assert!(!summary.is_healthy()); // Has failed nodes
    }

    #[test]
    fn test_alert_thresholds() {
        let thresholds = AlertThresholds::default();
        assert_eq!(thresholds.cpu_warning, 80.0);
        assert_eq!(thresholds.cpu_critical, 95.0);
        assert!(thresholds.cpu_critical > thresholds.cpu_warning);
    }
}