rivven-cluster 0.0.22

//! Cluster coordinator - orchestrates all cluster components
//!
//! The ClusterCoordinator is the main entry point for cluster operations.
//! It manages:
//! - Node lifecycle (join, leave, failure)
//! - Metadata consensus (Raft)
//! - Partition placement and rebalancing
//! - Leader election for partitions

use crate::config::{ClusterConfig, ClusterMode};
use crate::error::{ClusterError, Result};
use crate::membership::{Membership, MembershipEvent};
use crate::metadata::{MetadataCommand, MetadataStore};
use crate::node::{NodeId, NodeInfo};
use crate::partition::{PartitionId, TopicConfig};
use crate::placement::{PartitionPlacer, PlacementConfig};
use crate::raft::RaftNode;
use crate::replication::ReplicationManager;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{broadcast, RwLock};
use tracing::{debug, info, warn};

/// Lifecycle state of the [`ClusterCoordinator`].
///
/// The coordinator stores one of these values and exposes it through
/// `ClusterCoordinator::state()`; `ClusterHealth::is_healthy` only treats
/// `Leader` and `Follower` as serving states.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CoordinatorState {
    /// Starting up (initial state set by `new()` and at the top of `start()`)
    Starting,
    /// Joining cluster (set while SWIM membership is being initialised)
    Joining,
    /// Running as follower
    Follower,
    /// Running as leader (controller)
    Leader,
    /// Shutting down gracefully
    Leaving,
    /// Shutdown complete
    Stopped,
}

/// Cluster coordinator manages all cluster operations.
///
/// Holds the local node identity, the metadata store, an optional Raft node,
/// optional SWIM membership, the partition placer and the replication manager,
/// plus the bookkeeping needed for leadership checks, round-robin routing and
/// shutdown signalling.
pub struct ClusterCoordinator {
    /// Cluster configuration
    config: ClusterConfig,

    /// Our node information
    local_node: NodeInfo,

    /// Current coordinator state
    state: RwLock<CoordinatorState>,

    /// Local metadata store — only mutated directly in standalone/test mode
    /// (i.e. when no Raft node is wired).  When Raft is active the Raft state
    /// machine is the single source of truth; followers never apply membership
    /// commands locally to avoid a dual source-of-truth window.
    metadata: Arc<MetadataStore>,

    /// Raft consensus node — when available, ALL state mutations go through Raft
    /// and reads use the Raft state machine's authoritative metadata.
    /// This is `None` only in standalone test scenarios (e.g. unit tests).
    /// Wrapped in Arc<RwLock<Option<...>>> so spawned tasks (e.g. membership event handler)
    /// can access it even when set_raft_node() is called after start().
    raft_node: Arc<RwLock<Option<Arc<RwLock<RaftNode>>>>>,

    /// Cluster membership (SWIM protocol); `None` until `start_cluster()` has run
    membership: Option<Arc<Membership>>,

    /// Partition placer
    placer: RwLock<PartitionPlacer>,

    /// Replication manager
    replication: Arc<ReplicationManager>,

    /// Current Raft leader (used only when raft_node is None)
    raft_leader: RwLock<Option<NodeId>>,

    /// Whether we are the Raft leader (used only when raft_node is None)
    is_leader_flag: RwLock<bool>,

    /// Round-robin counter for partition selection (replaces wall-clock time)
    round_robin_counter: AtomicUsize,

    /// Shutdown signal
    shutdown_tx: broadcast::Sender<()>,

    /// Pending suspicion timers: node_id → cancel handle.
    ///
    /// When NodeSuspected fires, a grace period timer (15s) is started
    /// before reassigning partitions. If NodeRecovered fires before
    /// the timer expires, the pending reassignment is cancelled.
    /// Removing an entry drops its sender, which is what wakes the timer
    /// task's cancellation branch.
    pending_suspicions: Arc<dashmap::DashMap<String, tokio::sync::oneshot::Sender<()>>>,
}

impl ClusterCoordinator {
    /// Build a coordinator from `config` without starting any background work.
    ///
    /// The returned coordinator is in [`CoordinatorState::Starting`], has no
    /// Raft node and no membership wired, and is not marked as leader. Rack
    /// information from the config (if any) is attached to the local node and
    /// switches the partition placer into rack-aware mode.
    pub async fn new(config: ClusterConfig) -> Result<Self> {
        // Describe ourselves; attach the rack label only when one is configured.
        let base_node = NodeInfo::new(&config.node_id, config.client_addr, config.cluster_addr);
        let local_node = match config.rack.as_ref() {
            Some(rack) => base_node.with_rack(rack),
            None => base_node,
        };

        let placement = PlacementConfig {
            rack_aware: config.rack.is_some(),
            ..Default::default()
        };
        let placer = RwLock::new(PartitionPlacer::new(placement));

        let replication_manager =
            ReplicationManager::new(config.node_id.clone(), config.replication.clone());
        let replication = Arc::new(replication_manager);

        let metadata = Arc::new(MetadataStore::new());

        // Only the sender is kept; receivers are created on demand via subscribe().
        let (shutdown_tx, _) = broadcast::channel(1);

        let coordinator = Self {
            config,
            local_node,
            state: RwLock::new(CoordinatorState::Starting),
            metadata,
            raft_node: Arc::new(RwLock::new(None)),
            membership: None,
            placer,
            replication,
            raft_leader: RwLock::new(None),
            is_leader_flag: RwLock::new(false),
            round_robin_counter: AtomicUsize::new(0),
            shutdown_tx,
            pending_suspicions: Arc::new(dashmap::DashMap::new()),
        };
        Ok(coordinator)
    }

    /// Build a coordinator that runs on its own, with no clustering.
    ///
    /// The coordinator is immediately marked as leader, registers the local
    /// node in the local metadata store and adds it to the partition placer,
    /// so topics can be created without calling `start()`.
    pub async fn standalone(config: ClusterConfig) -> Result<Self> {
        let coordinator = Self::new(config).await?;

        // A standalone node is always its own leader.
        *coordinator.state.write().await = CoordinatorState::Leader;
        *coordinator.is_leader_flag.write().await = true;

        // Record ourselves in the local metadata store.
        let register_self = MetadataCommand::RegisterNode {
            info: coordinator.local_node.clone(),
        };
        coordinator.metadata.apply(0, register_self).await;

        // Make ourselves a placement candidate (incarnation 1 = first life).
        let mut self_node = crate::node::Node::new(coordinator.local_node.clone());
        self_node.mark_alive(1);
        coordinator.placer.write().await.add_node(&self_node);

        info!("Coordinator running in standalone mode");
        Ok(coordinator)
    }

    /// Start the coordinator in the mode selected by the configuration.
    ///
    /// Resets the state to [`CoordinatorState::Starting`] and then delegates
    /// to the standalone or cluster start-up path, returning its result.
    pub async fn start(&mut self) -> Result<()> {
        *self.state.write().await = CoordinatorState::Starting;

        match self.config.mode {
            ClusterMode::Standalone => self.start_standalone().await,
            ClusterMode::Cluster => self.start_cluster().await,
        }
    }

    /// Standalone start-up path.
    ///
    /// Registers the local node in the local metadata store, adds it to the
    /// partition placer and marks this coordinator as leader.
    async fn start_standalone(&mut self) -> Result<()> {
        // Record ourselves in the local metadata store.
        let register_self = MetadataCommand::RegisterNode {
            info: self.local_node.clone(),
        };
        self.metadata.apply(0, register_self).await;

        // Make ourselves a placement candidate (incarnation 1 = first life).
        let mut self_node = crate::node::Node::new(self.local_node.clone());
        self_node.mark_alive(1);
        self.placer.write().await.add_node(&self_node);

        *self.state.write().await = CoordinatorState::Leader;
        *self.is_leader_flag.write().await = true;

        info!(node_id = %self.config.node_id, "Started in standalone mode");
        Ok(())
    }

    /// Start in cluster mode.
    ///
    /// Initialises SWIM membership, spawns the background task that feeds
    /// membership events into [`Self::handle_membership_event`], joins the
    /// configured seed nodes (if any) and transitions to
    /// [`CoordinatorState::Follower`].
    ///
    /// The event task:
    /// - keeps running when the event receiver reports that it lagged behind
    ///   (the skipped events are logged and handling continues) instead of
    ///   silently terminating, and
    /// - exits when the coordinator's shutdown signal fires or the event
    ///   channel is closed.
    ///
    /// # Errors
    ///
    /// Returns an error if membership initialisation or joining the seeds fails.
    async fn start_cluster(&mut self) -> Result<()> {
        *self.state.write().await = CoordinatorState::Joining;

        // Initialize SWIM membership
        let membership = Membership::new(
            self.local_node.clone(),
            self.config.swim.clone(),
            self.shutdown_tx.subscribe(),
        )
        .await?;

        // Subscribe to membership events and to our own shutdown signal
        let mut events = membership.subscribe();
        let mut shutdown_rx = self.shutdown_tx.subscribe();
        let metadata = self.metadata.clone();
        let node_id = self.config.node_id.clone();
        let raft_node = self.raft_node.clone();
        let pending_suspicions = self.pending_suspicions.clone();

        // Spawn membership event handler
        tokio::spawn(async move {
            loop {
                let event = tokio::select! {
                    // Any outcome (signal received, sender dropped) means we should stop.
                    _ = shutdown_rx.recv() => break,
                    received = events.recv() => match received {
                        Ok(event) => event,
                        Err(broadcast::error::RecvError::Lagged(skipped)) => {
                            // Falling behind must not kill the handler: later
                            // join/fail events would otherwise never be processed.
                            warn!(
                                skipped,
                                "Membership event handler lagged; some events were dropped"
                            );
                            continue;
                        }
                        Err(broadcast::error::RecvError::Closed) => break,
                    },
                };

                Self::handle_membership_event(
                    &metadata,
                    &node_id,
                    event,
                    &raft_node,
                    &pending_suspicions,
                )
                .await;
            }
        });

        // Join cluster if we have seeds
        if !self.config.seeds.is_empty() {
            membership.join(&self.config.seeds).await?;
        }

        self.membership = Some(Arc::new(membership));
        *self.state.write().await = CoordinatorState::Follower;

        // Raft consensus is managed by RaftNode which is wired in via set_raft_node().
        // The coordinator checks RaftNode.is_leader() for authoritative leadership.
        //
        // Do NOT assume leadership just because seeds is empty.
        // Even the first node must wait for Raft to confirm leadership via
        // single-node bootstrap. Setting CoordinatorState::Leader without Raft
        // causes split-brain when two nodes start simultaneously with no seeds.
        if self.config.seeds.is_empty() {
            info!(
                node_id = %self.config.node_id,
                "First node — awaiting Raft bootstrap for leadership confirmation"
            );
        } else {
            info!(node_id = %self.config.node_id, "Started as cluster follower");
        }

        Ok(())
    }

    /// Handle a single SWIM membership event.
    ///
    /// Register/deregister commands are routed through
    /// `apply_membership_command`: with Raft wired, only the Raft leader
    /// proposes them and followers do nothing; without a Raft node (e.g. during
    /// bootstrap before set_raft_node() is called) they are applied to the
    /// local `MetadataStore`.
    ///
    /// # SWIM-Raft Consistency Window (§2.3)
    ///
    /// There is an inherent consistency window between SWIM and Raft:
    /// - **SWIM** (UDP gossip) detects node failure within milliseconds to seconds.
    /// - **Raft** proposals are asynchronous and may take ~1 heartbeat interval to commit.
    ///
    /// During this window, the Raft state machine still considers a suspected/failed node
    /// as alive. Clients may be routed to it and receive `NOT_LEADER_FOR_PARTITION` errors,
    /// which they handle via retry with exponential backoff.
    ///
    /// The bridging strategy:
    /// - **NodeJoined**: Register the node.
    /// - **NodeLeft**: Cancel any pending suspicion timer and deregister the node.
    /// - **NodeSuspected**: Start a 15s grace-period timer (at most one per node).
    ///   If it expires without being cancelled, partition leadership is reassigned,
    ///   but the node is NOT deregistered. This allows recovery without
    ///   re-registration if SWIM refutes the suspicion.
    /// - **NodeFailed**: Cancel any pending timer, deregister AND reassign
    ///   partition leadership immediately.
    /// - **NodeRecovered**: Cancel the pending suspicion timer, if any. The node
    ///   is already registered; SWIM handles re-joining.
    /// - **NodeStateChanged**: Logged only.
    ///
    /// The worst-case window duration is `election_timeout_max + network_RTT` (~600ms + RTT).
    ///
    /// On NodeFailed, automatically triggers leader election for all
    /// partitions that were led by the dead node, preventing partition unavailability.
    ///
    /// `_local_node_id` is currently unused.
    async fn handle_membership_event(
        metadata: &MetadataStore,
        _local_node_id: &str,
        event: MembershipEvent,
        raft_node: &Arc<RwLock<Option<Arc<RwLock<RaftNode>>>>>,
        pending_suspicions: &Arc<dashmap::DashMap<String, tokio::sync::oneshot::Sender<()>>>,
    ) {
        match event {
            MembershipEvent::NodeJoined(info) => {
                info!(node_id = %info.id, "Node joined cluster");
                let cmd = MetadataCommand::RegisterNode { info };
                Self::apply_membership_command(metadata, cmd, raft_node.as_ref()).await;
            }
            MembershipEvent::NodeLeft(node_id) => {
                info!(node_id = %node_id, "Node left cluster gracefully");
                // Cancel any pending suspicion timer for this node
                // (removing the entry drops the sender, which wakes the timer task)
                pending_suspicions.remove(&node_id);
                let cmd = MetadataCommand::DeregisterNode { node_id };
                Self::apply_membership_command(metadata, cmd, raft_node.as_ref()).await;
            }
            MembershipEvent::NodeFailed(node_id) => {
                warn!(node_id = %node_id, "Node failed");
                // Cancel any pending suspicion timer — the node is confirmed dead
                pending_suspicions.remove(&node_id);
                let cmd = MetadataCommand::DeregisterNode {
                    node_id: node_id.clone(),
                };
                Self::apply_membership_command(metadata, cmd, raft_node.as_ref()).await;

                // Automatically reassign partitions led by the dead node.
                // Only the Raft leader should trigger reassignment to avoid duplicate
                // proposals from multiple nodes.
                Self::reassign_dead_node_partitions(&node_id, raft_node.as_ref()).await;
            }
            MembershipEvent::NodeSuspected(node_id) => {
                warn!(node_id = %node_id, "Node suspected — starting grace period (15s)");
                // Instead of immediately reassigning partitions, start a grace
                // period timer. SWIM suspicion is provisional; the node may
                // recover in seconds. If NodeRecovered fires before the timer
                // expires, we cancel the pending reassignment.
                if pending_suspicions.contains_key(&node_id) {
                    debug!(node_id = %node_id, "Suspicion timer already pending, skipping");
                    return;
                }

                // The sender is parked in the map; the receiver is moved into the timer task.
                let (cancel_tx, cancel_rx) = tokio::sync::oneshot::channel::<()>();
                pending_suspicions.insert(node_id.clone(), cancel_tx);

                // Spawn a deferred reassignment task
                let raft_node = raft_node.clone();
                let suspicions = pending_suspicions.clone();
                let nid = node_id.clone();
                tokio::spawn(async move {
                    tokio::select! {
                        _ = tokio::time::sleep(Duration::from_secs(15)) => {
                            // Grace period expired — node is still suspected, proceed
                            // with partition reassignment.
                            warn!(node_id = %nid, "Suspicion grace period expired — reassigning partitions");
                            suspicions.remove(&nid);
                            Self::reassign_dead_node_partitions(&nid, &raft_node).await;
                        }
                        _ = cancel_rx => {
                            // NodeRecovered, NodeFailed or NodeLeft removed the map
                            // entry — the sender was dropped, so the timer is cancelled
                            debug!(node_id = %nid, "Suspicion timer cancelled (node recovered or failed)");
                        }
                    }
                });
            }
            MembershipEvent::NodeRecovered(node_id) => {
                info!(node_id = %node_id, "Node recovered");
                // Cancel pending suspicion timer if any
                if pending_suspicions.remove(&node_id).is_some() {
                    info!(node_id = %node_id, "Cancelled pending partition reassignment");
                }
            }
            MembershipEvent::NodeStateChanged { node_id, old, new } => {
                debug!(node_id = %node_id, ?old, ?new, "Node state changed");
            }
        }
    }

    /// Apply a membership command through the appropriate channel.
    ///
    /// - **Leader**: proposes the command through Raft so it is replicated to all nodes.
    /// - **Follower**: does *nothing* locally.  The leader's Raft proposal will
    ///   replicate through Raft and be applied by the state machine on every node
    ///   (including this follower), keeping the Raft log as the single source of
    ///   truth.
    /// - **No Raft node** (standalone / test / bootstrap): applies directly to the
    ///   local `MetadataStore`.
    async fn apply_membership_command(
        metadata: &MetadataStore,
        cmd: MetadataCommand,
        raft_node: &RwLock<Option<Arc<RwLock<RaftNode>>>>,
    ) {
        let raft_guard = raft_node.read().await;
        if let Some(ref raft) = *raft_guard {
            let raft_lock = raft.read().await;
            // Only the leader proposes through Raft to avoid duplicate proposals
            if raft_lock.is_leader() {
                if let Err(e) = raft_lock.propose(cmd).await {
                    warn!("Failed to propose membership command through Raft: {e}");
                }
                return;
            }
            // Followers do NOT apply hints locally.
            // The Raft state machine is the single source of truth.
            // The leader's proposal will replicate through Raft and be applied
            // by the state machine on all nodes (including this follower).
            debug!("Follower received membership event; will be applied via Raft replication");
        } else {
            // No Raft node yet (standalone / bootstrap / test): apply locally
            drop(raft_guard);
            metadata.apply(0, cmd).await;
        }
    }

    /// Automatic partition reassignment when a broker fails.
    ///
    /// For every partition that was led by the dead node, proposes a leader election
    /// via Raft using the deterministic "smallest ISR member" strategy. This ensures
    /// partitions automatically recover leadership without manual intervention.
    ///
    /// Only the Raft leader executes this to avoid duplicate proposals.
    ///
    /// # Leader-only limitation
    ///
    /// If the Raft **leader itself** dies, partition reassignment is stalled until
    /// a new leader is elected. During this window (bounded by `election_timeout_max`),
    /// partitions formerly led by the dead leader-node remain unavailable because no
    /// node will drive the reassignment proposals.  Once a new Raft leader is elected
    /// and SWIM detects the failure, the new leader will trigger reassignment.  If
    /// SWIM's failure event was already processed (and discarded) before the new Raft
    /// leader was elected, the partitions will remain orphaned until the next SWIM
    /// protocol round re-detects the dead node.  A future improvement could persist
    /// pending reassignment intents so the new leader replays them on election.
    async fn reassign_dead_node_partitions(
        dead_node_id: &str,
        raft_node: &RwLock<Option<Arc<RwLock<RaftNode>>>>,
    ) {
        let raft_guard = raft_node.read().await;
        let Some(ref raft) = *raft_guard else {
            return; // No Raft — standalone mode, nothing to reassign
        };
        let raft_lock = raft.read().await;

        if !raft_lock.is_leader() {
            return; // Only the leader drives reassignment
        }

        // Find all partitions that were led by the dead node
        let affected_partitions: Vec<(PartitionId, std::collections::HashSet<String>, u64)> = {
            let meta = raft_lock.metadata().await;
            meta.partitions_led_by(&dead_node_id.to_string())
                .into_iter()
                .filter_map(|pid| {
                    meta.topics.get(&pid.topic).and_then(|t| {
                        t.partition(pid.partition)
                            .map(|ps| (pid, ps.isr.clone(), ps.leader_epoch))
                    })
                })
                .collect()
        };

        if affected_partitions.is_empty() {
            debug!(
                dead_node = dead_node_id,
                "Dead node had no leader partitions"
            );
            return;
        }

        info!(
            dead_node = dead_node_id,
            affected = affected_partitions.len(),
            "Reassigning partitions from failed node"
        );

        let mut reassigned = 0u32;
        let mut failed = 0u32;

        for (partition_id, isr, leader_epoch) in affected_partitions {
            // Remove the dead node from ISR candidates
            let mut candidates: Vec<_> = isr.into_iter().filter(|n| n != dead_node_id).collect();
            candidates.sort(); // Deterministic: smallest ISR member wins

            let Some(new_leader) = candidates.first().cloned() else {
                warn!(
                    topic = partition_id.topic,
                    partition = partition_id.partition,
                    "No ISR candidates remaining — partition is offline"
                );
                failed += 1;
                continue;
            };

            let cmd = MetadataCommand::UpdatePartitionLeader {
                partition: partition_id.clone(),
                leader: new_leader.clone(),
                epoch: leader_epoch + 1,
            };

            if let Err(e) = raft_lock.propose(cmd).await {
                warn!(
                    topic = partition_id.topic,
                    partition = partition_id.partition,
                    new_leader = new_leader,
                    error = %e,
                    "Failed to propose partition leader reassignment"
                );
                failed += 1;
            } else {
                info!(
                    topic = partition_id.topic,
                    partition = partition_id.partition,
                    new_leader = new_leader,
                    epoch = leader_epoch + 1,
                    "Partition leader reassigned"
                );
                reassigned += 1;
            }
        }

        info!(
            dead_node = dead_node_id,
            reassigned, failed, "Partition reassignment complete"
        );
    }

    /// Attach a Raft consensus node to this coordinator.
    ///
    /// Once attached, state mutations (create_topic, delete_topic, leader
    /// election) are proposed through Raft and reads come from the Raft state
    /// machine's metadata. Until then the coordinator uses the local
    /// MetadataStore (standalone/test mode).
    ///
    /// The slot is shared with spawned tasks, so this waits for the write lock
    /// rather than using `try_write`, which would fail while any task holds a
    /// read lock.
    pub async fn set_raft_node(&mut self, raft_node: Arc<RwLock<RaftNode>>) {
        *self.raft_node.write().await = Some(raft_node);
    }

    /// Apply a metadata command via Raft when wired, otherwise locally.
    ///
    /// With a Raft node the command is proposed through consensus and any
    /// proposal error is returned. Without one (standalone/test) the command
    /// is applied straight to the local metadata store.
    async fn apply_command(&self, cmd: MetadataCommand) -> Result<()> {
        let slot = self.raft_node.read().await;
        match slot.as_ref() {
            Some(raft_node) => {
                raft_node.read().await.propose(cmd).await?;
            }
            None => {
                // Release the slot lock before touching the local store.
                drop(slot);
                self.metadata.apply(0, cmd).await;
            }
        }
        Ok(())
    }

    /// Execute a closure with read access to the current cluster metadata.
    ///
    /// Routes to Raft state machine when available for authoritative reads,
    /// otherwise uses the local metadata store.
    async fn with_metadata<F, R>(&self, f: F) -> R
    where
        F: FnOnce(&crate::metadata::ClusterMetadata) -> R,
    {
        let raft_guard = self.raft_node.read().await;
        if let Some(ref raft_node) = *raft_guard {
            let raft = raft_node.read().await;
            let meta = raft.metadata().await;
            f(&meta)
        } else {
            drop(raft_guard);
            let meta = self.metadata.read().await;
            f(&meta)
        }
    }

    /// Check if we are the cluster leader.
    ///
    /// When Raft is wired, checks the Raft node's authoritative leadership state.
    /// Otherwise falls back to the local is_leader flag.
    async fn check_is_leader(&self) -> bool {
        let raft_guard = self.raft_node.read().await;
        if let Some(ref raft_node) = *raft_guard {
            let raft = raft_node.read().await;
            raft.is_leader()
        } else {
            drop(raft_guard);
            *self.is_leader_flag.read().await
        }
    }

    /// Get current Raft leader node ID.
    async fn current_leader(&self) -> Option<NodeId> {
        let raft_guard = self.raft_node.read().await;
        if let Some(ref raft_node) = *raft_guard {
            let raft = raft_node.read().await;
            raft.leader().map(|leader_id| {
                if leader_id == raft.node_id() {
                    raft.node_id_str().to_string()
                } else {
                    leader_id.to_string()
                }
            })
        } else {
            drop(raft_guard);
            self.raft_leader.read().await.clone()
        }
    }

    /// Create a topic and assign replicas for each of its partitions.
    ///
    /// # Errors
    ///
    /// - `ClusterError::NotLeader` if this node is not the cluster leader.
    /// - `ClusterError::TopicAlreadyExists` if a topic with that name exists.
    /// - Any error from partition placement or from applying the command.
    pub async fn create_topic(&self, config: TopicConfig) -> Result<()> {
        if !self.check_is_leader().await {
            let leader = self.current_leader().await;
            return Err(ClusterError::NotLeader { leader });
        }

        // Refuse to overwrite an existing topic.
        let already_exists = self
            .with_metadata(|meta| meta.topics.contains_key(&config.name))
            .await;
        if already_exists {
            return Err(ClusterError::TopicAlreadyExists(config.name));
        }

        // One replica assignment per partition; the first placement error aborts.
        let placer = self.placer.read().await;
        let partition_assignments = (0..config.partitions)
            .map(|partition| {
                placer.assign_partition(&config.name, partition, config.replication_factor)
            })
            .collect::<std::result::Result<Vec<_>, _>>()?;
        drop(placer);

        // Apply via Raft consensus (or local metadata in standalone)
        let create = MetadataCommand::CreateTopic {
            config,
            partition_assignments,
        };
        self.apply_command(create).await
    }

    /// Delete the topic called `name`.
    ///
    /// # Errors
    ///
    /// - `ClusterError::NotLeader` if this node is not the cluster leader.
    /// - `ClusterError::TopicNotFound` if no such topic exists.
    /// - Any error from applying the delete command.
    pub async fn delete_topic(&self, name: &str) -> Result<()> {
        if !self.check_is_leader().await {
            let leader = self.current_leader().await;
            return Err(ClusterError::NotLeader { leader });
        }

        let known = self
            .with_metadata(|meta| meta.topics.contains_key(name))
            .await;
        if !known {
            return Err(ClusterError::TopicNotFound(name.to_string()));
        }

        let delete = MetadataCommand::DeleteTopic {
            name: name.to_string(),
        };
        self.apply_command(delete).await
    }

    /// Look up the leader of `topic`/`partition` in the current metadata.
    ///
    /// Returns `Ok(None)` when the metadata has no leader for that partition.
    /// This lookup itself never produces an error.
    pub async fn get_partition_leader(
        &self,
        topic: &str,
        partition: u32,
    ) -> Result<Option<NodeId>> {
        Ok(self
            .with_metadata(|meta| meta.find_leader(topic, partition).cloned())
            .await)
    }

    /// Elect a leader for `partition_id` and return the chosen node.
    ///
    /// The choice is deterministic: the lexicographically smallest ISR member
    /// wins, so every node picks the same leader for the same ISR state. The
    /// partition's leader epoch is incremented by one.
    ///
    /// # Errors
    ///
    /// - `ClusterError::NotLeader` if this node is not the cluster leader.
    /// - `ClusterError::PartitionNotFound` if the topic or partition is unknown.
    /// - `ClusterError::NotEnoughIsr` if the ISR is empty.
    /// - Any error from applying the leader update.
    pub async fn elect_partition_leader(&self, partition_id: &PartitionId) -> Result<NodeId> {
        if !self.check_is_leader().await {
            let leader = self.current_leader().await;
            return Err(ClusterError::NotLeader { leader });
        }

        // Take a copy of the partition state so no metadata guard is held below.
        let snapshot = self
            .with_metadata(|meta| {
                let topic = meta.topics.get(&partition_id.topic)?;
                topic.partition(partition_id.partition).cloned()
            })
            .await;
        let Some(partition) = snapshot else {
            return Err(ClusterError::PartitionNotFound {
                topic: partition_id.topic.clone(),
                partition: partition_id.partition,
            });
        };

        // Smallest ISR member wins.
        let Some(new_leader) = partition.isr.iter().min().cloned() else {
            return Err(ClusterError::NotEnoughIsr {
                required: 1,
                current: 0,
            });
        };

        let update = MetadataCommand::UpdatePartitionLeader {
            partition: partition_id.clone(),
            leader: new_leader.clone(),
            epoch: partition.leader_epoch + 1,
        };
        self.apply_command(update).await?;

        Ok(new_leader)
    }

    /// Get current coordinator state.
    ///
    /// Returns a copy of the value stored behind the state lock.
    pub async fn state(&self) -> CoordinatorState {
        *self.state.read().await
    }

    /// Check if we are the cluster leader (public API).
    ///
    /// Thin public wrapper around `check_is_leader`.
    pub async fn is_leader(&self) -> bool {
        self.check_is_leader().await
    }

    /// Get the local metadata store.
    ///
    /// This is the coordinator's own `MetadataStore`, not the Raft state
    /// machine's metadata that `with_metadata` reads when a Raft node is wired.
    pub fn metadata(&self) -> &Arc<MetadataStore> {
        &self.metadata
    }

    /// Get a reference to the shared replication manager.
    pub fn replication(&self) -> &Arc<ReplicationManager> {
        &self.replication
    }

    /// Get the local node's info, as built from the configuration in `new()`.
    pub fn local_node(&self) -> &NodeInfo {
        &self.local_node
    }

    /// Graceful shutdown
    pub async fn shutdown(&self) -> Result<()> {
        *self.state.write().await = CoordinatorState::Leaving;

        // Notify cluster we're leaving
        if let Some(membership) = &self.membership {
            membership.leave().await?;
        }

        // Signal all tasks to shutdown
        let _ = self.shutdown_tx.send(());

        *self.state.write().await = CoordinatorState::Stopped;
        info!(node_id = %self.config.node_id, "Coordinator shutdown complete");

        Ok(())
    }

    /// Collect a point-in-time health snapshot of the cluster.
    ///
    /// Combines the coordinator state, the leadership check, member counts from
    /// SWIM membership (or `1`/`1` when membership is not active) and topic and
    /// partition statistics from the current metadata.
    pub async fn health(&self) -> ClusterHealth {
        let state = *self.state.read().await;
        let is_leader = self.check_is_leader().await;

        let (node_count, healthy_nodes) = match self.membership.as_ref() {
            Some(members) => (members.member_count(), members.healthy_count()),
            None => (1, 1), // Standalone
        };

        let (topic_count, partition_count, offline_partitions, under_replicated_partitions) = self
            .with_metadata(|meta| {
                (
                    meta.topics.len(),
                    meta.topics
                        .values()
                        .map(|t| t.partitions.len())
                        .sum::<usize>(),
                    meta.offline_partitions().len(),
                    meta.under_replicated_partitions().len(),
                )
            })
            .await;

        ClusterHealth {
            state,
            is_leader,
            node_count,
            healthy_nodes,
            topic_count,
            partition_count,
            offline_partitions,
            under_replicated_partitions,
        }
    }

    // ========== Routing Helper Methods ==========

    /// Pick a partition of `topic` for a message that did not specify one.
    ///
    /// With a key, the partition is derived from a hash of the key, so equal
    /// keys map to the same partition while the partition count is unchanged.
    /// Without a key, partitions are handed out round-robin from an atomic
    /// counter.
    ///
    /// Returns `None` when the topic is unknown or has no partitions.
    pub async fn select_partition(&self, topic: &str, key: Option<&[u8]>) -> Option<u32> {
        let partition_count = self
            .with_metadata(|meta| meta.topics.get(topic).map_or(0, |t| t.partitions.len()))
            .await;
        if partition_count == 0 {
            return None;
        }

        let chosen = match key {
            Some(key) => {
                // Hash the key and fold it onto the partition range.
                use std::hash::{Hash, Hasher};
                let mut hasher = std::collections::hash_map::DefaultHasher::new();
                key.hash(&mut hasher);
                let digest = hasher.finish();
                (digest % partition_count as u64) as u32
            }
            None => {
                // Deterministic round-robin via atomic counter
                let ticket = self.round_robin_counter.fetch_add(1, Ordering::Relaxed);
                (ticket % partition_count) as u32
            }
        };
        Some(chosen)
    }

    /// Get the leader for a partition (async wrapper for routing)
    pub async fn partition_leader(&self, topic: &str, partition: u32) -> Option<String> {
        self.with_metadata(|meta| meta.find_leader(topic, partition).cloned())
            .await
    }

    /// Return the current leader epoch of `topic`/`partition` (§2.4 data-path
    /// fencing), or `None` when the topic or partition is unknown.
    pub async fn partition_leader_epoch(&self, topic: &str, partition: u32) -> Option<u64> {
        self.with_metadata(|meta| {
            let state = meta.topics.get(topic)?.partition(partition)?;
            Some(state.leader_epoch)
        })
        .await
    }

    /// Report whether `node_id` is in the ISR of `topic`/`partition`.
    ///
    /// Returns `false` when the topic or partition is unknown.
    pub async fn is_in_isr(&self, topic: &str, partition: u32, node_id: &str) -> bool {
        let membership = self
            .with_metadata(|meta| {
                let state = meta.topics.get(topic)?.partition(partition)?;
                Some(state.isr.contains(node_id))
            })
            .await;
        membership.unwrap_or(false)
    }

    /// Return some member of the ISR of `topic`/`partition` (for read routing).
    ///
    /// The member is whichever one the ISR collection yields first. `None` is
    /// returned when the topic or partition is unknown or the ISR is empty.
    pub async fn get_isr_member(&self, topic: &str, partition: u32) -> Option<String> {
        self.with_metadata(|meta| {
            let state = meta.topics.get(topic)?.partition(partition)?;
            state.isr.iter().next().cloned()
        })
        .await
    }
}

/// Cluster health information.
///
/// A point-in-time snapshot produced by `ClusterCoordinator::health()`.
#[derive(Debug, Clone)]
pub struct ClusterHealth {
    /// Coordinator lifecycle state at the time of the snapshot
    pub state: CoordinatorState,
    /// Whether this node was the cluster leader
    pub is_leader: bool,
    /// Number of cluster members (1 when membership is not active)
    pub node_count: usize,
    /// Number of members reported healthy (1 when membership is not active)
    pub healthy_nodes: usize,
    /// Number of topics in the metadata
    pub topic_count: usize,
    /// Total number of partitions across all topics
    pub partition_count: usize,
    /// Number of partitions the metadata reports as offline
    pub offline_partitions: usize,
    /// Number of partitions the metadata reports as under-replicated
    pub under_replicated_partitions: usize,
}

impl ClusterHealth {
    /// Whether the cluster counts as healthy.
    ///
    /// Healthy means: the coordinator is serving (leader or follower), at
    /// least one node is healthy, and no partition is offline.
    pub fn is_healthy(&self) -> bool {
        let serving = matches!(
            self.state,
            CoordinatorState::Leader | CoordinatorState::Follower
        );
        serving && self.healthy_nodes > 0 && self.offline_partitions == 0
    }

    /// Health summarised as one of `"healthy"`, `"degraded"` or `"unhealthy"`.
    ///
    /// A healthy cluster with under-replicated partitions is `"degraded"`.
    pub fn status(&self) -> &'static str {
        match (self.is_healthy(), self.under_replicated_partitions) {
            (false, _) => "unhealthy",
            (true, 0) => "healthy",
            (true, _) => "degraded",
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A standalone coordinator is its own leader and reports a healthy
    /// single-node cluster without `start()` being called.
    #[tokio::test]
    async fn test_standalone_coordinator() {
        let config = ClusterConfig::standalone();
        let coordinator = ClusterCoordinator::standalone(config).await.unwrap();

        assert!(coordinator.is_leader().await);
        assert_eq!(coordinator.state().await, CoordinatorState::Leader);

        // No membership is wired, so health falls back to a node count of 1.
        let health = coordinator.health().await;
        assert!(health.is_healthy());
        assert_eq!(health.node_count, 1);
    }

    /// Creating a topic in standalone mode writes it to the local metadata
    /// store with the requested number of partitions.
    #[tokio::test]
    async fn test_create_topic_standalone() {
        let config = ClusterConfig::standalone();
        let coordinator = ClusterCoordinator::standalone(config).await.unwrap();

        let topic_config = TopicConfig::new("test-topic", 3, 1);
        coordinator.create_topic(topic_config).await.unwrap();

        // Verify topic was created
        let topic = coordinator.metadata().get_topic("test-topic").await;
        assert!(topic.is_some());
        assert_eq!(topic.unwrap().partitions.len(), 3);
    }
}