oxirs-cluster 0.2.4

Raft-backed distributed dataset for high availability and horizontal scaling
//! # Automated Failover System
//!
//! Automated failover and recovery system for OxiRS cluster nodes.
//! Handles leader election, node replacement, and service recovery.
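//!
//! A minimal usage sketch (illustrative, marked `ignore`; the constructor
//! signatures are taken from the unit tests at the bottom of this file):
//!
//! ```ignore
//! use std::sync::Arc;
//!
//! let health_monitor = Arc::new(HealthMonitor::new(HealthMonitorConfig::default()));
//! let manager = FailoverManager::new(FailoverConfig::default(), health_monitor);
//!
//! manager.start().await?;
//! manager.register_node(1, true).await?;
//!
//! // React to failover events as they occur
//! while let Some(event) = manager.next_event().await {
//!     println!("failover event: {event:?}");
//! }
//! ```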

use crate::health_monitor::{HealthEvent, HealthMonitor, NodeHealthLevel};
use crate::raft::OxirsNodeId;
use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use tokio::sync::{mpsc, RwLock};
use tokio::time::interval;
use tracing::{debug, error, info, warn};

/// Failover strategy for handling node failures
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FailoverStrategy {
    /// Automatic failover with immediate replacement
    Immediate,
    /// Delayed failover with configurable grace period
    Delayed { grace_period: Duration },
    /// Manual failover requiring operator intervention
    Manual,
    /// Smart failover based on cluster state analysis
    Smart,
}

/// Recovery action to take when a node fails
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecoveryAction {
    /// Restart the failed service
    RestartService,
    /// Replace the failed node with a new instance
    ReplaceNode { replacement_node: OxirsNodeId },
    /// Redistribute workload to healthy nodes
    RedistributeLoad,
    /// Scale out the cluster by adding new nodes
    ScaleOut { new_nodes: Vec<OxirsNodeId> },
    /// Initiate leader election
    InitiateLeaderElection,
    /// No action required
    NoAction,
}

/// Failover configuration
#[derive(Debug, Clone)]
pub struct FailoverConfig {
    /// Failover strategy
    pub strategy: FailoverStrategy,
    /// Maximum time to wait before declaring failover
    pub max_failover_time: Duration,
    /// Minimum cluster size to maintain
    pub min_cluster_size: usize,
    /// Enable automatic recovery
    pub enable_auto_recovery: bool,
    /// Maximum number of concurrent recovery actions
    pub max_concurrent_recoveries: usize,
    /// Cooldown period between recovery attempts
    pub recovery_cooldown: Duration,
    /// Enable leader failover
    pub enable_leader_failover: bool,
    /// Leader election timeout
    pub leader_election_timeout: Duration,
}

impl Default for FailoverConfig {
    fn default() -> Self {
        Self {
            strategy: FailoverStrategy::Smart,
            max_failover_time: Duration::from_secs(60),
            min_cluster_size: 3,
            enable_auto_recovery: true,
            max_concurrent_recoveries: 2,
            recovery_cooldown: Duration::from_secs(30),
            enable_leader_failover: true,
            leader_election_timeout: Duration::from_secs(10),
        }
    }
}

/// Failover event types
#[derive(Debug, Clone)]
pub enum FailoverEvent {
    /// Failover initiated for a node
    FailoverInitiated(OxirsNodeId, String),
    /// Failover completed successfully
    FailoverCompleted(OxirsNodeId),
    /// Failover failed
    FailoverFailed(OxirsNodeId, String),
    /// Recovery action started
    RecoveryStarted(OxirsNodeId, RecoveryAction),
    /// Recovery action completed
    RecoveryCompleted(OxirsNodeId, RecoveryAction),
    /// Recovery action failed
    RecoveryFailed(OxirsNodeId, RecoveryAction, String),
    /// Leader election initiated
    LeaderElectionInitiated,
    /// New leader elected
    NewLeaderElected(OxirsNodeId),
    /// Cluster rebalancing started
    RebalancingStarted,
    /// Cluster rebalancing completed
    RebalancingCompleted,
}

/// Node state tracking for failover decisions
#[derive(Debug, Clone)]
pub struct NodeState {
    #[allow(dead_code)]
    node_id: OxirsNodeId,
    health: NodeHealthLevel,
    last_seen: Instant,
    failure_count: u32,
    recovery_attempts: u32,
    is_leader: bool,
    last_recovery_attempt: Option<Instant>,
}

impl NodeState {
    fn new(node_id: OxirsNodeId) -> Self {
        Self {
            node_id,
            health: NodeHealthLevel::Unknown,
            last_seen: Instant::now(),
            failure_count: 0,
            recovery_attempts: 0,
            is_leader: false,
            last_recovery_attempt: None,
        }
    }

    fn can_attempt_recovery(&self, cooldown: Duration) -> bool {
        if let Some(last_attempt) = self.last_recovery_attempt {
            Instant::now().duration_since(last_attempt) > cooldown
        } else {
            true
        }
    }
}

/// Automated failover manager
pub struct FailoverManager {
    /// Configuration
    config: FailoverConfig,
    /// Health monitor reference
    health_monitor: Arc<HealthMonitor>,
    /// Node states
    node_states: Arc<RwLock<HashMap<OxirsNodeId, NodeState>>>,
    /// Current leader
    current_leader: Arc<RwLock<Option<OxirsNodeId>>>,
    /// Active recovery operations
    active_recoveries: Arc<RwLock<HashSet<OxirsNodeId>>>,
    /// Event channel
    event_sender: mpsc::UnboundedSender<FailoverEvent>,
    event_receiver: Arc<RwLock<mpsc::UnboundedReceiver<FailoverEvent>>>,
    /// Running flag
    running: Arc<RwLock<bool>>,
}

impl FailoverManager {
    /// Create a new failover manager
    pub fn new(config: FailoverConfig, health_monitor: Arc<HealthMonitor>) -> Self {
        let (event_sender, event_receiver) = mpsc::unbounded_channel();

        Self {
            config,
            health_monitor,
            node_states: Arc::new(RwLock::new(HashMap::new())),
            current_leader: Arc::new(RwLock::new(None)),
            active_recoveries: Arc::new(RwLock::new(HashSet::new())),
            event_sender,
            event_receiver: Arc::new(RwLock::new(event_receiver)),
            running: Arc::new(RwLock::new(false)),
        }
    }

    /// Start the failover manager
    pub async fn start(&self) -> Result<()> {
        let mut running = self.running.write().await;
        if *running {
            return Ok(());
        }
        *running = true;

        info!("Starting automated failover manager");

        // Start health monitoring event processor
        self.start_health_event_processor().await;

        // Start periodic cluster health assessment
        self.start_cluster_assessment().await;

        // Start recovery coordinator
        self.start_recovery_coordinator().await;

        Ok(())
    }

    /// Stop the failover manager
    pub async fn stop(&self) {
        let mut running = self.running.write().await;
        *running = false;
        info!("Failover manager stopped");
    }

    /// Register a node for failover management
    pub async fn register_node(&self, node_id: OxirsNodeId, is_leader: bool) -> Result<()> {
        let mut states = self.node_states.write().await;
        let mut state = NodeState::new(node_id);
        state.is_leader = is_leader;
        states.insert(node_id, state);

        if is_leader {
            let mut current_leader = self.current_leader.write().await;
            *current_leader = Some(node_id);
        }

        info!(
            "Registered node {} for failover management (leader: {})",
            node_id, is_leader
        );
        Ok(())
    }

    /// Unregister a node from failover management
    pub async fn unregister_node(&self, node_id: OxirsNodeId) -> Result<()> {
        let mut states = self.node_states.write().await;
        states.remove(&node_id);

        let current_leader = self.current_leader.read().await;
        if current_leader.as_ref() == Some(&node_id) {
            drop(current_leader);
            let mut leader = self.current_leader.write().await;
            *leader = None;
        }

        info!("Unregistered node {} from failover management", node_id);
        Ok(())
    }

    /// Manually trigger failover for a node
    pub async fn trigger_failover(&self, node_id: OxirsNodeId, reason: String) -> Result<()> {
        info!("Manual failover triggered for node {}: {}", node_id, reason);

        let action = self.determine_recovery_action(node_id).await?;
        self.execute_recovery_action(node_id, action).await?;

        let _ = self
            .event_sender
            .send(FailoverEvent::FailoverInitiated(node_id, reason));
        Ok(())
    }

    /// Get next failover event
    pub async fn next_event(&self) -> Option<FailoverEvent> {
        let mut receiver = self.event_receiver.write().await;
        receiver.recv().await
    }

    /// Get current cluster status
    pub async fn get_cluster_status(&self) -> HashMap<OxirsNodeId, NodeState> {
        let states = self.node_states.read().await;
        states.clone()
    }

    /// Get current leader
    pub async fn get_current_leader(&self) -> Option<OxirsNodeId> {
        let leader = self.current_leader.read().await;
        *leader
    }

    /// Start health event processor
    async fn start_health_event_processor(&self) {
        let health_monitor = self.health_monitor.clone();
        let node_states = self.node_states.clone();
        let event_sender = self.event_sender.clone();
        let _config = self.config.clone();
        let running = self.running.clone();

        tokio::spawn(async move {
            while *running.read().await {
                if let Some(health_event) = health_monitor.next_event().await {
                    match health_event {
                        HealthEvent::NodeFailed(node_id) => {
                            info!("Node {} failed - initiating failover", node_id);
                            let _ = event_sender.send(FailoverEvent::FailoverInitiated(
                                node_id,
                                "Node health check failed".to_string(),
                            ));
                        }
                        HealthEvent::NodeRecovered(node_id) => {
                            info!("Node {} recovered", node_id);
                            let mut states = node_states.write().await;
                            if let Some(state) = states.get_mut(&node_id) {
                                state.health = NodeHealthLevel::Healthy;
                                state.failure_count = 0;
                            }
                        }
                        HealthEvent::NodeSuspected(node_id) => {
                            warn!("Node {} suspected of failure", node_id);
                            let mut states = node_states.write().await;
                            if let Some(state) = states.get_mut(&node_id) {
                                state.failure_count += 1;
                            }
                        }
                        _ => {}
                    }
                }
            }
        });
    }

    /// Start cluster assessment task
    async fn start_cluster_assessment(&self) {
        let node_states = self.node_states.clone();
        let current_leader = self.current_leader.clone();
        let config = self.config.clone();
        let event_sender = self.event_sender.clone();
        let running = self.running.clone();

        tokio::spawn(async move {
            let mut interval = interval(Duration::from_secs(10));

            while *running.read().await {
                interval.tick().await;

                let states = node_states.read().await;
                let healthy_nodes = states
                    .values()
                    .filter(|state| matches!(state.health, NodeHealthLevel::Healthy))
                    .count();

                // Check if cluster is below minimum size
                if healthy_nodes < config.min_cluster_size {
                    warn!(
                        "Cluster below minimum size: {} < {}",
                        healthy_nodes, config.min_cluster_size
                    );

                    // Trigger scale-out to bring cluster back to minimum size
                    let nodes_needed = config.min_cluster_size - healthy_nodes;
                    info!("Triggering scale-out to add {} new nodes", nodes_needed);

                    // Generate new node IDs (using timestamp-based IDs to avoid conflicts)
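                    // Note: second-resolution timestamps can still collide if two
                    // scale-outs start within the same second; a production system
                    // would allocate IDs through the cluster membership layer instead.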
                    let base_id = SystemTime::now()
                        .duration_since(UNIX_EPOCH)
                        .expect("SystemTime should be after UNIX_EPOCH")
                        .as_secs();
                    let new_nodes: Vec<OxirsNodeId> =
                        (0..nodes_needed).map(|i| base_id + i as u64).collect();

                    // Send scale-out event (node id 0 is a cluster-wide sentinel,
                    // matching the convention used by update_cluster_membership)
                    let scale_action = RecoveryAction::ScaleOut {
                        new_nodes: new_nodes.clone(),
                    };
                    let _ = event_sender.send(FailoverEvent::RecoveryStarted(0, scale_action));
                }

                // Check if leader is healthy
                let leader = current_leader.read().await;
                if let Some(leader_id) = *leader {
                    if let Some(leader_state) = states.get(&leader_id) {
                        if !matches!(leader_state.health, NodeHealthLevel::Healthy) {
                            drop(leader);
                            if config.enable_leader_failover {
                                let _ = event_sender.send(FailoverEvent::LeaderElectionInitiated);
                            }
                        }
                    }
                }
            }
        });
    }

    /// Start recovery coordinator
    async fn start_recovery_coordinator(&self) {
        let node_states = self.node_states.clone();
        let active_recoveries = self.active_recoveries.clone();
        let config = self.config.clone();
        let event_sender = self.event_sender.clone();
        let running = self.running.clone();

        tokio::spawn(async move {
            let mut interval = interval(Duration::from_secs(5));

            while *running.read().await {
                interval.tick().await;

                let states = node_states.read().await;
                let recoveries = active_recoveries.read().await;

                for (node_id, state) in states.iter() {
                    if matches!(state.health, NodeHealthLevel::Failed)
                        && !recoveries.contains(node_id)
                        && state.can_attempt_recovery(config.recovery_cooldown)
                        && recoveries.len() < config.max_concurrent_recoveries
                    {
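                        // Note: `recoveries.len()` above is a snapshot taken before the
                        // spawned tasks insert themselves into `active_recoveries`, so
                        // slightly more than `max_concurrent_recoveries` recoveries can
                        // be scheduled within a single tick.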
                        // Schedule recovery
                        debug!("Scheduling recovery for node {}", node_id);

                        // Determine appropriate recovery action based on cluster state
                        let node_id_copy = *node_id;
                        let node_states_clone = node_states.clone();
                        let active_recoveries_clone = active_recoveries.clone();
                        let config_clone = config.clone();
                        let event_sender_clone = event_sender.clone();

                        tokio::spawn(async move {
                            // The spawned task cannot hold a reference to the
                            // FailoverManager itself, so the logic of
                            // determine_recovery_action is duplicated inline here;
                            // a fuller implementation would share the manager via Arc.

                            // Mark node as being recovered
                            {
                                let mut recoveries = active_recoveries_clone.write().await;
                                recoveries.insert(node_id_copy);
                            }

                            // Determine recovery action
                            let recovery_action = {
                                let states = node_states_clone.read().await;
                                let state = states.get(&node_id_copy);

                                match state {
                                    Some(state) => match config_clone.strategy {
                                        FailoverStrategy::Immediate => {
                                            if state.is_leader {
                                                RecoveryAction::InitiateLeaderElection
                                            } else {
                                                RecoveryAction::RestartService
                                            }
                                        }
                                        FailoverStrategy::Delayed { .. } => {
                                            RecoveryAction::RestartService
                                        }
                                        FailoverStrategy::Manual => RecoveryAction::NoAction,
                                        FailoverStrategy::Smart => {
                                            let healthy_nodes = states
                                                .values()
                                                .filter(|s| {
                                                    matches!(s.health, NodeHealthLevel::Healthy)
                                                })
                                                .count();

                                            if healthy_nodes < config_clone.min_cluster_size {
                                                RecoveryAction::ScaleOut {
                                                    new_nodes: vec![node_id_copy + 1000],
                                                }
                                            } else if state.is_leader {
                                                RecoveryAction::InitiateLeaderElection
                                            } else {
                                                RecoveryAction::RedistributeLoad
                                            }
                                        }
                                    },
                                    None => RecoveryAction::NoAction,
                                }
                            };

                            // Send recovery started event
                            let _ = event_sender_clone.send(FailoverEvent::RecoveryStarted(
                                node_id_copy,
                                recovery_action.clone(),
                            ));

                            // Update state to record recovery attempt
                            {
                                let mut states = node_states_clone.write().await;
                                if let Some(state) = states.get_mut(&node_id_copy) {
                                    state.recovery_attempts += 1;
                                    state.last_recovery_attempt = Some(Instant::now());
                                }
                            }

                            // Simulate recovery execution
                            tokio::time::sleep(Duration::from_secs(2)).await;

                            // Send recovery completed event
                            let _ = event_sender_clone.send(FailoverEvent::RecoveryCompleted(
                                node_id_copy,
                                recovery_action,
                            ));

                            info!("Recovery completed for node {}", node_id_copy);

                            // Remove from active recoveries
                            {
                                let mut recoveries = active_recoveries_clone.write().await;
                                recoveries.remove(&node_id_copy);
                            }
                        });
                    }
                }
            }
        });
    }

    /// Determine appropriate recovery action for a failed node
    async fn determine_recovery_action(&self, node_id: OxirsNodeId) -> Result<RecoveryAction> {
        let states = self.node_states.read().await;
        let state = states
            .get(&node_id)
            .ok_or_else(|| anyhow!("Node {} not found", node_id))?;

        match self.config.strategy {
            FailoverStrategy::Immediate => {
                if state.is_leader {
                    Ok(RecoveryAction::InitiateLeaderElection)
                } else {
                    Ok(RecoveryAction::RestartService)
                }
            }
            FailoverStrategy::Delayed { .. } => {
                // Wait for grace period then restart
                Ok(RecoveryAction::RestartService)
            }
            FailoverStrategy::Manual => Ok(RecoveryAction::NoAction),
            FailoverStrategy::Smart => {
                // Analyze cluster state and determine best action
                let healthy_nodes = states
                    .values()
                    .filter(|s| matches!(s.health, NodeHealthLevel::Healthy))
                    .count();

                if healthy_nodes < self.config.min_cluster_size {
                    Ok(RecoveryAction::ScaleOut {
                        new_nodes: vec![node_id + 1000], // Placeholder
                    })
                } else if state.is_leader {
                    Ok(RecoveryAction::InitiateLeaderElection)
                } else {
                    Ok(RecoveryAction::RedistributeLoad)
                }
            }
        }
    }

    /// Execute a recovery action
    async fn execute_recovery_action(
        &self,
        node_id: OxirsNodeId,
        action: RecoveryAction,
    ) -> Result<()> {
        let mut active_recoveries = self.active_recoveries.write().await;
        active_recoveries.insert(node_id);
        drop(active_recoveries);

        let _ = self
            .event_sender
            .send(FailoverEvent::RecoveryStarted(node_id, action.clone()));

        let result = match action {
            RecoveryAction::RestartService => self.restart_service(node_id).await,
            RecoveryAction::ReplaceNode { replacement_node } => {
                self.replace_node(node_id, replacement_node).await
            }
            RecoveryAction::RedistributeLoad => self.redistribute_load(node_id).await,
            RecoveryAction::ScaleOut { ref new_nodes } => self.scale_out(new_nodes.clone()).await,
            RecoveryAction::InitiateLeaderElection => self.initiate_leader_election().await,
            RecoveryAction::NoAction => Ok(()),
        };

        let mut active_recoveries = self.active_recoveries.write().await;
        active_recoveries.remove(&node_id);

        match result {
            Ok(()) => {
                let _ = self
                    .event_sender
                    .send(FailoverEvent::RecoveryCompleted(node_id, action));
                info!("Recovery completed for node {}", node_id);
            }
            Err(e) => {
                let _ = self.event_sender.send(FailoverEvent::RecoveryFailed(
                    node_id,
                    action,
                    e.to_string(),
                ));
                error!("Recovery failed for node {}: {}", node_id, e);
            }
        }

        Ok(())
    }

    /// Restart a failed service
    async fn restart_service(&self, node_id: OxirsNodeId) -> Result<()> {
        info!("Restarting service for node {}", node_id);

        // Update node state to indicate recovery attempt
        {
            let mut states = self.node_states.write().await;
            if let Some(state) = states.get_mut(&node_id) {
                state.recovery_attempts += 1;
                state.last_recovery_attempt = Some(Instant::now());
            }
        }

        // Attempt to restart the service. A real implementation would go through
        // a process supervisor or orchestrator; this placeholder always succeeds.
        let restart_result: Result<()> = Ok(());

        match restart_result {
            Ok(()) => {
                info!(
                    "Successfully initiated service restart for node {}",
                    node_id
                );

                // Wait for service to stabilize
                tokio::time::sleep(Duration::from_secs(5)).await;

                // Verify the service is healthy again
                let health_check = self.health_monitor.get_node_health(node_id).await;
                match health_check {
                    Some(status) if matches!(status.health.status, NodeHealthLevel::Healthy) => {
                        let mut states = self.node_states.write().await;
                        if let Some(state) = states.get_mut(&node_id) {
                            state.health = NodeHealthLevel::Healthy;
                            state.failure_count = 0;
                            state.last_seen = Instant::now();
                        }
                        info!(
                            "Node {} successfully recovered after service restart",
                            node_id
                        );
                        Ok(())
                    }
                    _ => {
                        warn!("Node {} still unhealthy after restart attempt", node_id);
                        Err(anyhow!("Service restart failed - node still unhealthy"))
                    }
                }
            }
            Err(e) => {
                error!("Failed to restart service for node {}: {}", node_id, e);
                Err(e)
            }
        }
    }

    /// Replace a failed node with a new one
    async fn replace_node(
        &self,
        failed_node: OxirsNodeId,
        replacement_node: OxirsNodeId,
    ) -> Result<()> {
        info!("Replacing node {} with {}", failed_node, replacement_node);

        // Step 1: Mark the failed node as being replaced
        {
            let mut states = self.node_states.write().await;
            if let Some(state) = states.get_mut(&failed_node) {
                state.health = NodeHealthLevel::Failed;
                state.recovery_attempts += 1;
                state.last_recovery_attempt = Some(Instant::now());
            }
        }

        // Step 2: Register the replacement node
        self.register_node(replacement_node, false).await?;

        // Step 3: Initialize the replacement node with the failed node's data
        info!(
            "Initializing replacement node {} with data from {}",
            replacement_node, failed_node
        );

        // In a real implementation, this would:
        // - Transfer data from the failed node's replicas
        // - Update cluster membership configuration
        // - Ensure the replacement node is caught up with the cluster state
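
        // A hypothetical sketch of that catch-up via Raft snapshots
        // (illustrative names only; no such helpers exist in this module):
        //
        //     let snapshot = raft.latest_snapshot().await?;
        //     raft.install_snapshot(replacement_node, snapshot).await?;
        //     raft.replay_log_since(snapshot.last_index, replacement_node).await?;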

        // Step 4: Update cluster configuration to remove failed node and add replacement
        self.update_cluster_membership(failed_node, Some(replacement_node))
            .await?;

        // Step 5: Verify replacement node is healthy and integrated
        tokio::time::sleep(Duration::from_secs(10)).await;
        let health_check = self.health_monitor.get_node_health(replacement_node).await;

        match health_check {
            Some(status) if matches!(status.health.status, NodeHealthLevel::Healthy) => {
                info!(
                    "Node replacement completed successfully: {} -> {}",
                    failed_node, replacement_node
                );

                // Remove the failed node from tracking
                self.unregister_node(failed_node).await?;
                Ok(())
            }
            _ => {
                error!("Replacement node {} is not healthy", replacement_node);
                Err(anyhow!("Replacement node failed health check"))
            }
        }
    }

    /// Redistribute load from a failed node
    async fn redistribute_load(&self, node_id: OxirsNodeId) -> Result<()> {
        info!("Redistributing load from node {}", node_id);
        let _ = self.event_sender.send(FailoverEvent::RebalancingStarted);

        // Step 1: Identify healthy nodes for load redistribution
        let healthy_nodes = {
            let states = self.node_states.read().await;
            states
                .iter()
                .filter(|(id, state)| {
                    **id != node_id && matches!(state.health, NodeHealthLevel::Healthy)
                })
                .map(|(id, _)| *id)
                .collect::<Vec<_>>()
        };

        if healthy_nodes.is_empty() {
            error!("No healthy nodes available for load redistribution");
            return Err(anyhow!("No healthy nodes available"));
        }

        info!(
            "Redistributing load to {} healthy nodes",
            healthy_nodes.len()
        );

        // Step 2: Calculate load distribution strategy
        // In a real implementation, this would:
        // - Analyze current workload on the failed node
        // - Calculate optimal distribution across healthy nodes
        // - Consider node capacity and current load
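
        // As a sketch: an even split would hand each healthy node roughly
        // `1 / healthy_nodes.len()` of the failed node's workload; shard
        // accounting itself is not modeled in this module.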

        // Step 3: Perform data migration/rebalancing
        for target_node in &healthy_nodes {
            info!(
                "Migrating portion of load from {} to {}",
                node_id, target_node
            );

            // In a real implementation, this would trigger:
            // - Shard migration
            // - Query redirection
            // - Client connection rebalancing
            // - Cache invalidation and warm-up
        }

        // Step 4: Update routing tables and load balancer configuration
        self.update_load_balancer_config(&healthy_nodes).await?;

        // Step 5: Verify redistribution was successful
        tokio::time::sleep(Duration::from_secs(5)).await;
        let verification_result = self.verify_load_redistribution(&healthy_nodes).await;

        match verification_result {
            Ok(()) => {
                let _ = self.event_sender.send(FailoverEvent::RebalancingCompleted);
                info!("Load redistribution completed successfully");
                Ok(())
            }
            Err(e) => {
                error!("Load redistribution verification failed: {}", e);
                Err(e)
            }
        }
    }

    /// Update load balancer configuration after redistribution
    async fn update_load_balancer_config(&self, healthy_nodes: &[OxirsNodeId]) -> Result<()> {
        info!(
            "Updating load balancer configuration for {} nodes",
            healthy_nodes.len()
        );

        // In a real implementation, this would:
        // - Update service discovery entries
        // - Reconfigure load balancers
        // - Update client connection pools
        // - Notify monitoring systems

        // For now, just update our internal state
        for node_id in healthy_nodes {
            if let Ok(mut states) = self.node_states.try_write() {
                if let Some(state) = states.get_mut(node_id) {
                    state.last_seen = Instant::now();
                }
            }
        }

        Ok(())
    }

    /// Verify that load redistribution was successful
    async fn verify_load_redistribution(&self, healthy_nodes: &[OxirsNodeId]) -> Result<()> {
        info!(
            "Verifying load redistribution across {} nodes",
            healthy_nodes.len()
        );

        for node_id in healthy_nodes {
            let health_check = self.health_monitor.get_node_health(*node_id).await;
            match health_check {
                Some(status) if matches!(status.health.status, NodeHealthLevel::Healthy) => {
                    debug!("Node {} is healthy after load redistribution", node_id);
                }
                _ => {
                    warn!(
                        "Node {} became unhealthy during load redistribution",
                        node_id
                    );
                    return Err(anyhow!("Node {} failed during redistribution", node_id));
                }
            }
        }

        Ok(())
    }

    /// Scale out the cluster with new nodes
    async fn scale_out(&self, new_nodes: Vec<OxirsNodeId>) -> Result<()> {
        info!("Scaling out cluster with {} new nodes", new_nodes.len());

        if new_nodes.is_empty() {
            return Ok(());
        }

        // Step 1: Validate that new nodes don't conflict with existing ones
        {
            let states = self.node_states.read().await;
            for node_id in &new_nodes {
                if states.contains_key(node_id) {
                    return Err(anyhow!("Node {} already exists in cluster", node_id));
                }
            }
        }

        // Step 2: Initialize and register new nodes
        for &node_id in &new_nodes {
            info!("Adding new node {} to cluster", node_id);

            // Register the node with the failover manager
            self.register_node(node_id, false).await?;

            // Initialize the node's health monitoring
            self.health_monitor
                .register_node(node_id)
                .await
                .map_err(|e| {
                    anyhow!(
                        "Failed to register node {} with health monitor: {}",
                        node_id,
                        e
                    )
                })?;
        }

        // Step 3: Update cluster membership configuration
        for &node_id in &new_nodes {
            self.update_cluster_membership(0, Some(node_id)).await?;
        }

        // Step 4: Wait for nodes to join and stabilize
        info!("Waiting for new nodes to stabilize...");
        tokio::time::sleep(Duration::from_secs(10)).await;

        // Step 5: Verify all new nodes are healthy
        for &node_id in &new_nodes {
            let health_check = self.health_monitor.get_node_health(node_id).await;
            match health_check {
                Some(status) if matches!(status.health.status, NodeHealthLevel::Healthy) => {
                    info!("New node {} is healthy and integrated", node_id);
                }
                _ => {
                    error!("New node {} failed health check during scale-out", node_id);
                    // Try to clean up the failed node
                    let _ = self.unregister_node(node_id).await;
                    return Err(anyhow!("Scale-out failed: node {} is unhealthy", node_id));
                }
            }
        }

        // Step 6: Trigger rebalancing to distribute load to new nodes
        info!("Triggering load rebalancing across expanded cluster");
        let _ = self.event_sender.send(FailoverEvent::RebalancingStarted);

        // In a real implementation, this would:
        // - Redistribute shards across the larger cluster
        // - Update routing tables
        // - Migrate some data to new nodes for better load distribution

        let _ = self.event_sender.send(FailoverEvent::RebalancingCompleted);

        info!(
            "Successfully scaled out cluster with {} new nodes",
            new_nodes.len()
        );
        Ok(())
    }

    /// Initiate leader election
    async fn initiate_leader_election(&self) -> Result<()> {
        info!("Initiating leader election");
        let _ = self
            .event_sender
            .send(FailoverEvent::LeaderElectionInitiated);

        // Step 1: Clear current leader
        {
            let mut current_leader = self.current_leader.write().await;
            *current_leader = None;
        }

        // Step 2: Find eligible candidates (healthy nodes)
        let candidates = {
            let states = self.node_states.read().await;
            states
                .iter()
                .filter(|(_, state)| matches!(state.health, NodeHealthLevel::Healthy))
                .map(|(id, state)| (*id, state.clone()))
                .collect::<Vec<_>>()
        };

        if candidates.is_empty() {
            error!("No healthy nodes available for leader election");
            return Err(anyhow!("No healthy candidates for leader election"));
        }

        info!("Found {} candidates for leader election", candidates.len());

        // Step 3: Apply leader selection strategy
        let new_leader = self.select_best_leader_candidate(&candidates).await?;

        // Step 4: Record the election start; the configured election timeout
        // is read here but not yet enforced by this simulation
        let _election_timeout = self.config.leader_election_timeout;
        let election_start = Instant::now();

        // Step 5: Simulate consensus-based leader election
        // In a real Raft implementation, this would:
        // - Increment term
        // - Send RequestVote RPCs to all nodes
        // - Collect majority votes
        // - Handle election timeout and split votes
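
        // Sketched as code (hypothetical helpers, not part of this crate):
        //
        //     let term = current_term + 1;
        //     let votes = broadcast_request_vote(term, new_leader).await;
        //     let granted = votes.iter().filter(|v| v.vote_granted).count() + 1; // self-vote
        //     if granted > cluster_size / 2 { /* candidate becomes leader */ }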

        info!(
            "Conducting leader election with {} as candidate",
            new_leader
        );

        // Simulate election time
        tokio::time::sleep(Duration::from_millis(100)).await;

        // Step 6: Verify the selected leader is still healthy
        let health_check = self.health_monitor.get_node_health(new_leader).await;
        match health_check {
            Some(status) if matches!(status.health.status, NodeHealthLevel::Healthy) => {
                // Step 7: Finalize leadership
                {
                    let mut current_leader = self.current_leader.write().await;
                    *current_leader = Some(new_leader);
                }

                // Update the node state to reflect leadership
                {
                    let mut states = self.node_states.write().await;
                    // Clear leadership flag from all nodes
                    for state in states.values_mut() {
                        state.is_leader = false;
                    }
                    // Set new leader
                    if let Some(state) = states.get_mut(&new_leader) {
                        state.is_leader = true;
                        state.last_seen = Instant::now();
                    }
                }

                let _ = self
                    .event_sender
                    .send(FailoverEvent::NewLeaderElected(new_leader));

                let election_duration = election_start.elapsed();
                info!(
                    "Leader election completed in {:?}. New leader: {}",
                    election_duration, new_leader
                );
                Ok(())
            }
            _ => {
                error!(
                    "Selected leader candidate {} became unhealthy during election",
                    new_leader
                );
                Err(anyhow!("Leader candidate failed health check"))
            }
        }
    }

    /// Select the best candidate for leadership based on various criteria
    async fn select_best_leader_candidate(
        &self,
        candidates: &[(OxirsNodeId, NodeState)],
    ) -> Result<OxirsNodeId> {
        if candidates.is_empty() {
            return Err(anyhow!("No candidates available"));
        }

        // Strategy: Select the node with the lowest failure count and most recent activity
        let best_candidate = candidates
            .iter()
            .min_by(|(_, a), (_, b)| {
                // Primary: lowest failure count
                match a.failure_count.cmp(&b.failure_count) {
                    std::cmp::Ordering::Equal => {
                        // Secondary: most recently seen (highest last_seen)
                        b.last_seen.cmp(&a.last_seen)
                    }
                    other => other,
                }
            })
            .map(|(id, _)| *id)
            .ok_or_else(|| anyhow!("Failed to select candidate"))?;

        debug!(
            "Selected leader candidate {} from {} options",
            best_candidate,
            candidates.len()
        );
        Ok(best_candidate)
    }

    /// Update cluster membership configuration
    async fn update_cluster_membership(
        &self,
        remove_node: OxirsNodeId,
        add_node: Option<OxirsNodeId>,
    ) -> Result<()> {
        info!(
            "Updating cluster membership: remove={}, add={:?}",
            remove_node, add_node
        );

        // In a real implementation, this would:
        // - Create a new cluster configuration
        // - Propose the configuration change through Raft consensus
        // - Wait for majority agreement
        // - Apply the configuration change
        // - Update routing tables and service discovery
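
        // With an openraft-style API this could be a single call
        // (illustrative only, not verified against this crate's Raft layer):
        //
        //     raft.change_membership(new_member_set, false).await?;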

        // For now, just log the operation
        if remove_node != 0 {
            info!(
                "Would remove node {} from cluster configuration",
                remove_node
            );
        }

        if let Some(new_node) = add_node {
            info!("Would add node {} to cluster configuration", new_node);
        }

        // Simulate configuration update time
        tokio::time::sleep(Duration::from_millis(500)).await;

        info!("Cluster membership update completed");
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::health_monitor::HealthMonitorConfig;

    #[tokio::test]
    async fn test_failover_manager_creation() {
        let config = FailoverConfig::default();
        let health_monitor = Arc::new(HealthMonitor::new(HealthMonitorConfig::default()));
        let manager = FailoverManager::new(config, health_monitor);
        assert!(!*manager.running.read().await);
    }

    #[tokio::test]
    async fn test_node_registration() {
        let config = FailoverConfig::default();
        let health_monitor = Arc::new(HealthMonitor::new(HealthMonitorConfig::default()));
        let manager = FailoverManager::new(config, health_monitor);

        manager.register_node(1, true).await.unwrap();
        let leader = manager.get_current_leader().await;
        assert_eq!(leader, Some(1));
    }

    #[tokio::test]
    async fn test_recovery_action_determination() {
        let config = FailoverConfig {
            strategy: FailoverStrategy::Immediate,
            ..Default::default()
        };
        let health_monitor = Arc::new(HealthMonitor::new(HealthMonitorConfig::default()));
        let manager = FailoverManager::new(config, health_monitor);

        manager.register_node(1, true).await.unwrap();
        let action = manager.determine_recovery_action(1).await.unwrap();

        match action {
            RecoveryAction::InitiateLeaderElection => {}
            _ => panic!("Expected leader election for failed leader"),
        }
    }
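
    #[tokio::test]
    async fn test_leader_candidate_selection() {
        // Sketch test: candidate selection should prefer the node with the
        // fewest recorded failures. Uses only items defined in this file.
        let config = FailoverConfig::default();
        let health_monitor = Arc::new(HealthMonitor::new(HealthMonitorConfig::default()));
        let manager = FailoverManager::new(config, health_monitor);

        // Candidate 1 has recorded failures; candidate 2 is clean and should win.
        let mut flaky = NodeState::new(1);
        flaky.failure_count = 2;
        let stable = NodeState::new(2);

        let candidates = vec![(1, flaky), (2, stable)];
        let leader = manager
            .select_best_leader_candidate(&candidates)
            .await
            .unwrap();
        assert_eq!(leader, 2);
    }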
}