//! LCPFS Cluster - Distributed Storage Cluster Management
//!
//! This module provides the core cluster management for LCPFS distributed mode,
//! tying together CRUSH placement, OSD management, and MDS coordination.
//!
//! # Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────────────┐
//! │                            LCPFS Cluster                            │
//! │                                                                     │
//! │   ┌─────────────────────────────────────────────────────────────┐   │
//! │   │                       Cluster Manager                       │   │
//! │   │  ┌─────────────┐  ┌─────────────┐  ┌─────────────────────┐  │   │
//! │   │  │ CRUSH Map   │  │ OSD Map     │  │ Failure Detector    │  │   │
//! │   │  │ (Placement) │  │ (Topology)  │  │ (Health Monitor)    │  │   │
//! │   │  └─────────────┘  └─────────────┘  └─────────────────────┘  │   │
//! │   └─────────────────────────────────────────────────────────────┘   │
//! │                             │                                       │
//! │          ┌──────────────────┼──────────────────┐                    │
//! │          ▼                  ▼                  ▼                    │
//! │   ┌─────────────┐    ┌─────────────┐    ┌─────────────┐             │
//! │   │    MDS      │    │   OSD 0     │    │   OSD 1     │   ...       │
//! │   │  (Metadata) │    │  (Storage)  │    │  (Storage)  │             │
//! │   └─────────────┘    └─────────────┘    └─────────────┘             │
//! │                                                                     │
//! │   ┌─────────────────────────────────────────────────────────────┐   │
//! │   │                      Placement Groups                       │   │
//! │   │  PG 0: [OSD0, OSD2, OSD4]    PG 1: [OSD1, OSD3, OSD5]  ...  │   │
//! │   └─────────────────────────────────────────────────────────────┘   │
//! └─────────────────────────────────────────────────────────────────────┘
//! ```
//!
//! # Components
//!
//! - **Cluster Manager**: Coordinates all cluster components
//! - **CRUSH Map**: Determines data placement across OSDs
//! - **OSD Map**: Tracks OSD status and topology
//! - **Failure Detector**: Monitors node health via heartbeats
//! - **Recovery Manager**: Handles data migration after failures
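//!
//! # Example
//!
//! A minimal bring-up sketch using only the types in this module. The block
//! is marked `ignore` because it is illustrative rather than a compiled
//! doctest:
//!
//! ```ignore
//! let mut cluster = ClusterManager::new(ClusterConfig {
//!     min_osds: 3,
//!     replication_factor: 3,
//!     ..Default::default()
//! });
//! cluster.init_cluster().unwrap();
//!
//! // The cluster stays `Initializing` until `min_osds` OSDs are up.
//! cluster.add_osd(0, "node0:6800", 1.0).unwrap();
//! cluster.add_osd(1, "node1:6800", 1.0).unwrap();
//! cluster.add_osd(2, "node2:6800", 1.0).unwrap();
//! assert_eq!(cluster.state(), ClusterState::Active);
//! ```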

#![cfg_attr(not(feature = "std"), no_std)]

extern crate alloc;

use alloc::collections::BTreeMap;
use alloc::format;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::fmt;

#[cfg(feature = "std")]
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};

#[cfg(not(feature = "std"))]
use core::sync::atomic::{AtomicBool, AtomicU64, Ordering};

use super::crush::{BucketType, CrushError, CrushMap};
use super::mds::{Mds, MdsConfig, MdsError};
use super::osd::{Osd, OsdConfig, OsdError, PgState, PlacementGroup};

// ============================================================================
// Cluster Configuration
// ============================================================================

/// Cluster configuration.
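///
/// # Example
///
/// Defaults can be overridden with struct-update syntax; a sketch, marked
/// `ignore` (not compiled as a doctest):
///
/// ```ignore
/// let config = ClusterConfig {
///     replication_factor: 2,
///     pg_count: 64,
///     ..Default::default()
/// };
/// ```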
#[derive(Debug, Clone)]
pub struct ClusterConfig {
    /// Cluster name.
    pub cluster_name: String,
    /// Cluster UUID.
    pub cluster_uuid: [u8; 16],
    /// Replication factor (default 3).
    pub replication_factor: usize,
    /// Number of placement groups (default 256).
    pub pg_count: u64,
    /// Minimum OSDs before cluster is active.
    pub min_osds: usize,
    /// Failure domain for replica placement.
    pub failure_domain: BucketType,
    /// Heartbeat interval in milliseconds.
    pub heartbeat_interval_ms: u64,
    /// Heartbeat timeout (mark as down after this).
    pub heartbeat_timeout_ms: u64,
    /// Recovery max bandwidth (bytes/sec, 0 = unlimited).
    pub recovery_max_bandwidth: u64,
    /// Scrub interval in seconds.
    pub scrub_interval_secs: u64,
    /// Enable auto-recovery.
    pub auto_recovery: bool,
}

impl Default for ClusterConfig {
    fn default() -> Self {
        Self {
            cluster_name: "lcpfs".to_string(),
            cluster_uuid: [0; 16],
            replication_factor: 3,
            pg_count: 256,
            min_osds: 3,
            failure_domain: BucketType::Host,
            heartbeat_interval_ms: 1000,
            heartbeat_timeout_ms: 20000,
            recovery_max_bandwidth: 0,
            scrub_interval_secs: 86400,
            auto_recovery: true,
        }
    }
}

// ============================================================================
// OSD Map (Cluster Topology)
// ============================================================================

/// OSD information in the cluster map.
#[derive(Debug, Clone)]
pub struct OsdInfo {
    /// OSD ID.
    pub id: u64,
    /// Network address.
    pub addr: String,
    /// Current state.
    pub state: OsdNodeState,
    /// Weight (for CRUSH).
    pub weight: f64,
    /// Failure domain (host, rack, etc.).
    pub failure_domain: String,
    /// Total capacity in bytes.
    pub capacity: u64,
    /// Used bytes.
    pub used: u64,
    /// Last heartbeat timestamp.
    pub last_heartbeat: u64,
    /// Epoch when this OSD joined.
    pub in_epoch: u64,
    /// Epoch when this OSD was marked up.
    pub up_epoch: u64,
}

/// OSD state in the cluster.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OsdNodeState {
    /// OSD is up and serving requests.
    Up,
    /// OSD is down (not responding).
    Down,
    /// OSD is joining the cluster.
    Joining,
    /// OSD is leaving the cluster.
    Leaving,
    /// OSD is marked out (data being migrated away).
    Out,
    /// OSD is destroyed/removed.
    Destroyed,
}

impl OsdNodeState {
    /// Check if OSD is available for operations.
    pub fn is_available(&self) -> bool {
        matches!(self, OsdNodeState::Up)
    }
}

/// OSD map - tracks all OSDs in the cluster.
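///
/// # Example
///
/// A sketch of how membership changes bump the epoch (`ignore`d; `osd_info`
/// stands in for a populated [`OsdInfo`]):
///
/// ```ignore
/// let mut map = OsdMap::new();
/// map.add_osd(osd_info); // epoch 0 -> 1
/// map.mark_down(0);      // epoch 1 -> 2, removed from `up_osds`
/// map.mark_up(0);        // epoch 2 -> 3, back in `up_osds`
/// assert_eq!(map.epoch, 3);
/// ```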
#[derive(Debug, Clone)]
pub struct OsdMap {
    /// Map epoch (incremented on each change).
    pub epoch: u64,
    /// All OSDs.
    pub osds: BTreeMap<u64, OsdInfo>,
    /// Up OSDs (subset of all OSDs).
    pub up_osds: Vec<u64>,
    /// In OSDs (OSDs with data).
    pub in_osds: Vec<u64>,
    /// Pool configurations.
    pub pools: BTreeMap<u64, PoolConfig>,
    /// PG to OSD mappings.
    pub pg_map: BTreeMap<u64, PgMapping>,
    /// Cluster flags.
    pub flags: ClusterFlags,
}

/// Pool configuration.
#[derive(Debug, Clone)]
pub struct PoolConfig {
    /// Pool ID.
    pub id: u64,
    /// Pool name.
    pub name: String,
    /// Replication size.
    pub size: usize,
    /// Minimum replicas for I/O.
    pub min_size: usize,
    /// Number of PGs.
    pub pg_count: u64,
    /// CRUSH rule name.
    pub crush_rule: String,
}

/// PG to OSD mapping.
#[derive(Debug, Clone)]
pub struct PgMapping {
    /// PG ID.
    pub pgid: u64,
    /// Pool ID.
    pub pool_id: u64,
    /// Acting set (current serving OSDs).
    pub acting: Vec<u64>,
    /// Up set (desired OSDs per CRUSH).
    pub up: Vec<u64>,
    /// Primary OSD.
    pub primary: u64,
    /// PG state.
    pub state: PgState,
    /// Last epoch when PG was active.
    pub last_active_epoch: u64,
}

/// Cluster-wide flags.
#[derive(Debug, Clone, Copy, Default)]
pub struct ClusterFlags {
    /// Pause all I/O.
    pub pauserd: bool,
    /// Pause writes.
    pub pausewr: bool,
    /// Don't mark OSDs down automatically.
    pub nodown: bool,
    /// Don't mark OSDs out automatically.
    pub noout: bool,
    /// Don't start recovery.
    pub norecover: bool,
    /// Don't start backfill.
    pub nobackfill: bool,
    /// Don't scrub.
    pub noscrub: bool,
    /// Don't deep scrub.
    pub nodeep_scrub: bool,
}

impl OsdMap {
    /// Create a new OSD map.
    pub fn new() -> Self {
        Self {
            epoch: 0,
            osds: BTreeMap::new(),
            up_osds: Vec::new(),
            in_osds: Vec::new(),
            pools: BTreeMap::new(),
            pg_map: BTreeMap::new(),
            flags: ClusterFlags::default(),
        }
    }

    /// Add an OSD.
    pub fn add_osd(&mut self, osd: OsdInfo) {
        let id = osd.id;
        if osd.state.is_available() && !self.up_osds.contains(&id) {
            self.up_osds.push(id);
        }
        if !self.in_osds.contains(&id) {
            self.in_osds.push(id);
        }
        self.osds.insert(id, osd);
        self.epoch += 1;
    }

    /// Mark OSD as down.
    pub fn mark_down(&mut self, osd_id: u64) {
        if let Some(osd) = self.osds.get_mut(&osd_id) {
            osd.state = OsdNodeState::Down;
            self.up_osds.retain(|&id| id != osd_id);
            self.epoch += 1;
        }
    }

    /// Mark OSD as up.
    pub fn mark_up(&mut self, osd_id: u64) {
        if let Some(osd) = self.osds.get_mut(&osd_id) {
            osd.state = OsdNodeState::Up;
            osd.up_epoch = self.epoch + 1;
            if !self.up_osds.contains(&osd_id) {
                self.up_osds.push(osd_id);
            }
            self.epoch += 1;
        }
    }

    /// Mark OSD as out (data migration).
    pub fn mark_out(&mut self, osd_id: u64) {
        if let Some(osd) = self.osds.get_mut(&osd_id) {
            osd.state = OsdNodeState::Out;
            self.in_osds.retain(|&id| id != osd_id);
            self.epoch += 1;
        }
    }

    /// Get OSD info.
    pub fn get_osd(&self, osd_id: u64) -> Option<&OsdInfo> {
        self.osds.get(&osd_id)
    }

    /// Get number of up OSDs.
    pub fn up_count(&self) -> usize {
        self.up_osds.len()
    }

    /// Get number of in OSDs.
    pub fn in_count(&self) -> usize {
        self.in_osds.len()
    }

    /// Create a pool with the default replicated CRUSH rule.
    pub fn create_pool(&mut self, name: &str, pg_count: u64, size: usize) -> u64 {
        let id = self.pools.len() as u64;
        let pool = PoolConfig {
            id,
            name: name.to_string(),
            size,
            min_size: size.div_ceil(2), // ceil(size / 2); a majority for odd sizes
            pg_count,
            crush_rule: "replicated_rule".to_string(),
        };
        self.pools.insert(id, pool);
        self.epoch += 1;
        id
    }
}

impl Default for OsdMap {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// Failure Detection
// ============================================================================

/// Failure detector using heartbeats.
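///
/// # Example
///
/// A sketch of the heartbeat/timeout cycle (`ignore`d, not compiled):
///
/// ```ignore
/// let mut fd = FailureDetector::new(1000, 5000);
/// fd.heartbeat(0, 1000);
/// assert!(fd.check(3000).is_empty()); // still within the 5000 ms timeout
/// assert_eq!(fd.check(7000), vec![0]); // 7000 > 1000 + 5000: marked failed
/// ```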
#[derive(Debug)]
pub struct FailureDetector {
    /// Heartbeat interval.
    interval_ms: u64,
    /// Timeout before marking as failed.
    timeout_ms: u64,
    /// Last heartbeat from each OSD.
    last_seen: BTreeMap<u64, u64>,
    /// Failed OSDs pending handling.
    failed: Vec<u64>,
}

impl FailureDetector {
    /// Create a new failure detector.
    pub fn new(interval_ms: u64, timeout_ms: u64) -> Self {
        Self {
            interval_ms,
            timeout_ms,
            last_seen: BTreeMap::new(),
            failed: Vec::new(),
        }
    }

    /// Record a heartbeat from an OSD.
    pub fn heartbeat(&mut self, osd_id: u64, timestamp: u64) {
        self.last_seen.insert(osd_id, timestamp);
        // Remove from failed if it was there
        self.failed.retain(|&id| id != osd_id);
    }

    /// Check for failed OSDs.
    pub fn check(&mut self, current_time: u64) -> Vec<u64> {
        let mut newly_failed = Vec::new();

        for (&osd_id, &last_time) in &self.last_seen {
            if current_time > last_time + self.timeout_ms && !self.failed.contains(&osd_id) {
                newly_failed.push(osd_id);
            }
        }

        // `newly_failed` contains only OSDs not already marked failed.
        self.failed.extend_from_slice(&newly_failed);

        newly_failed
    }

    /// Get all failed OSDs.
    pub fn get_failed(&self) -> &[u64] {
        &self.failed
    }

    /// Clear failure status (OSD recovered).
    pub fn clear_failure(&mut self, osd_id: u64) {
        self.failed.retain(|&id| id != osd_id);
    }

    /// Remove OSD from tracking.
    pub fn remove(&mut self, osd_id: u64) {
        self.last_seen.remove(&osd_id);
        self.failed.retain(|&id| id != osd_id);
    }
}

// ============================================================================
// Recovery Manager
// ============================================================================

/// Recovery operation state.
#[derive(Debug, Clone)]
pub struct RecoveryState {
    /// PG being recovered.
    pub pgid: u64,
    /// Source OSD (has the data).
    pub source_osd: u64,
    /// Target OSD (needs the data).
    pub target_osd: u64,
    /// Objects to recover.
    pub objects_total: u64,
    /// Objects recovered so far.
    pub objects_done: u64,
    /// Bytes to recover.
    pub bytes_total: u64,
    /// Bytes recovered so far.
    pub bytes_done: u64,
    /// Started timestamp.
    pub started: u64,
    /// Recovery state.
    pub state: RecoveryOpState,
}

/// State of a recovery operation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RecoveryOpState {
    /// Queued, not started.
    Queued,
    /// In progress.
    InProgress,
    /// Completed successfully.
    Completed,
    /// Failed.
    Failed,
    /// Cancelled.
    Cancelled,
}

/// Recovery manager.
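///
/// # Example
///
/// A sketch of bounded-concurrency scheduling (`ignore`d, not compiled):
///
/// ```ignore
/// let mut rm = RecoveryManager::new(2);
/// rm.queue_recovery(1, 0, 2);
/// rm.queue_recovery(2, 0, 3);
/// rm.queue_recovery(3, 0, 4);
/// assert_eq!(rm.start_recoveries(1000).len(), 2); // capped at max_concurrent
/// rm.complete(1);                                 // frees a slot
/// assert_eq!(rm.start_recoveries(2000).len(), 1);
/// ```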
#[derive(Debug)]
pub struct RecoveryManager {
    /// Max concurrent recoveries.
    max_concurrent: usize,
    /// Active recoveries.
    active: BTreeMap<u64, RecoveryState>,
    /// Queued recoveries.
    queue: Vec<RecoveryState>,
    /// Completed recoveries.
    completed: u64,
    /// Failed recoveries.
    failed: u64,
}

impl RecoveryManager {
    /// Create a new recovery manager.
    pub fn new(max_concurrent: usize) -> Self {
        Self {
            max_concurrent,
            active: BTreeMap::new(),
            queue: Vec::new(),
            completed: 0,
            failed: 0,
        }
    }

    /// Queue a recovery operation.
    pub fn queue_recovery(&mut self, pgid: u64, source_osd: u64, target_osd: u64) {
        // Check if already queued/active
        if self.active.contains_key(&pgid) {
            return;
        }
        if self.queue.iter().any(|r| r.pgid == pgid) {
            return;
        }

        let recovery = RecoveryState {
            pgid,
            source_osd,
            target_osd,
            objects_total: 0,
            objects_done: 0,
            bytes_total: 0,
            bytes_done: 0,
            started: 0,
            state: RecoveryOpState::Queued,
        };

        self.queue.push(recovery);
    }

    /// Start queued recoveries up to max_concurrent.
    pub fn start_recoveries(&mut self, current_time: u64) -> Vec<u64> {
        let mut started = Vec::new();

        while self.active.len() < self.max_concurrent && !self.queue.is_empty() {
            let mut recovery = self.queue.remove(0);
            recovery.state = RecoveryOpState::InProgress;
            recovery.started = current_time;

            started.push(recovery.pgid);
            self.active.insert(recovery.pgid, recovery);
        }

        started
    }

    /// Mark recovery as complete.
    pub fn complete(&mut self, pgid: u64) {
        if self.active.remove(&pgid).is_some() {
            self.completed += 1;
        }
    }

    /// Mark recovery as failed and re-queue it for retry.
    pub fn fail(&mut self, pgid: u64) {
        if let Some(mut recovery) = self.active.remove(&pgid) {
            self.failed += 1;
            // Re-queue for retry
            recovery.state = RecoveryOpState::Queued;
            self.queue.push(recovery);
        }
    }

    /// Update recovery progress.
    pub fn update_progress(&mut self, pgid: u64, objects_done: u64, bytes_done: u64) {
        if let Some(recovery) = self.active.get_mut(&pgid) {
            recovery.objects_done = objects_done;
            recovery.bytes_done = bytes_done;
        }
    }

    /// Get active recovery count.
    pub fn active_count(&self) -> usize {
        self.active.len()
    }

    /// Get queued recovery count.
    pub fn queued_count(&self) -> usize {
        self.queue.len()
    }

    /// Get recovery stats.
    pub fn stats(&self) -> (u64, u64, usize, usize) {
        (
            self.completed,
            self.failed,
            self.active.len(),
            self.queue.len(),
        )
    }
}

// ============================================================================
// Cluster Client
// ============================================================================

/// Client for interacting with the cluster.
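///
/// # Example
///
/// A sketch of the object-to-placement lookup path (`ignore`d; `pool_id` and
/// `oid` are placeholder values):
///
/// ```ignore
/// let client = cluster.client();
/// let pg = client.object_to_pg(pool_id, oid);     // deterministic hash
/// let osds = client.get_pg_osds(pool_id, pg)?;    // CRUSH placement
/// let primary = client.get_primary(pool_id, pg)?; // first OSD in the set
/// ```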
pub struct ClusterClient {
    /// Cluster configuration.
    config: ClusterConfig,
    /// CRUSH map for placement.
    crush_map: CrushMap,
    /// OSD map.
    osd_map: OsdMap,
    /// Request ID counter.
    request_id: AtomicU64,
}

impl ClusterClient {
    /// Create a new cluster client.
    pub fn new(config: ClusterConfig, crush_map: CrushMap, osd_map: OsdMap) -> Self {
        Self {
            config,
            crush_map,
            osd_map,
            request_id: AtomicU64::new(1),
        }
    }

    /// Generate next request ID.
    fn next_request_id(&self) -> u64 {
        self.request_id.fetch_add(1, Ordering::SeqCst)
    }

    /// Calculate PG for an object.
    pub fn object_to_pg(&self, pool_id: u64, oid: u64) -> u64 {
        let pool = match self.osd_map.pools.get(&pool_id) {
            Some(p) => p,
            None => return 0,
        };

        // Hash object ID to PG
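        // (0x9e3779b97f4a7c15 is 2^64 / phi, the Fibonacci-hashing constant,
        // which spreads sequential object IDs across PGs.)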
        let hash = oid.wrapping_mul(0x9e3779b97f4a7c15);
        hash % pool.pg_count
    }

    /// Get OSDs for a PG using CRUSH.
    pub fn get_pg_osds(&self, pool_id: u64, pgid: u64) -> Result<Vec<u64>, ClusterError> {
        let pool = self
            .osd_map
            .pools
            .get(&pool_id)
            .ok_or(ClusterError::PoolNotFound(pool_id))?;

        self.crush_map
            .select(&pool.crush_rule, pgid, pool.size)
            .map_err(ClusterError::CrushError)
    }

    /// Get the primary OSD for a PG.
    pub fn get_primary(&self, pool_id: u64, pgid: u64) -> Result<u64, ClusterError> {
        let osds = self.get_pg_osds(pool_id, pgid)?;
        osds.first()
            .copied()
            .ok_or(ClusterError::NoPrimaryAvailable)
    }

    /// Get cluster epoch.
    pub fn epoch(&self) -> u64 {
        self.osd_map.epoch
    }

    /// Check if cluster has enough OSDs.
    pub fn is_healthy(&self) -> bool {
        self.osd_map.up_count() >= self.config.min_osds
    }
}

// ============================================================================
// Cluster Manager
// ============================================================================

/// Cluster state.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClusterState {
    /// Cluster is initializing.
    Initializing,
    /// Cluster is active and healthy.
    Active,
    /// Cluster is degraded (some data may be unavailable).
    Degraded,
    /// Cluster is recovering.
    Recovering,
    /// Cluster is offline.
    Offline,
}

/// The main cluster manager.
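///
/// # Example
///
/// A sketch of the periodic health-check loop an embedder would drive
/// (`ignore`d; `now_ms`, `osd_id`, and the `log_*` helpers are hypothetical):
///
/// ```ignore
/// cluster.heartbeat(osd_id, now_ms);
/// for event in cluster.check_health(now_ms) {
///     match event {
///         ClusterEvent::OsdDown(id) => log_osd_down(id),
///         ClusterEvent::RecoveryStarted(pgid) => log_recovery(pgid),
///         _ => {}
///     }
/// }
/// ```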
pub struct ClusterManager {
    /// Configuration.
    config: ClusterConfig,
    /// Cluster state.
    state: ClusterState,
    /// CRUSH map.
    crush_map: CrushMap,
    /// OSD map.
    osd_map: OsdMap,
    /// Failure detector.
    failure_detector: FailureDetector,
    /// Recovery manager.
    recovery_manager: RecoveryManager,
    /// Local OSDs (for combined node).
    local_osds: BTreeMap<u64, Osd>,
    /// Local MDS (for combined node).
    local_mds: Option<Mds>,
    /// Current epoch.
    epoch: AtomicU64,
    /// Is cluster running.
    running: AtomicBool,
}

impl ClusterManager {
    /// Create a new cluster manager.
    pub fn new(config: ClusterConfig) -> Self {
        let crush_map = CrushMap::new();
        let osd_map = OsdMap::new();

        let failure_detector =
            FailureDetector::new(config.heartbeat_interval_ms, config.heartbeat_timeout_ms);

        let recovery_manager = RecoveryManager::new(4);

        Self {
            config,
            state: ClusterState::Initializing,
            crush_map,
            osd_map,
            failure_detector,
            recovery_manager,
            local_osds: BTreeMap::new(),
            local_mds: None,
            epoch: AtomicU64::new(0),
            running: AtomicBool::new(false),
        }
    }

    /// Initialize a new cluster.
    pub fn init_cluster(&mut self) -> Result<(), ClusterError> {
        // Create default pool
        self.osd_map.create_pool(
            "default",
            self.config.pg_count,
            self.config.replication_factor,
        );

        // Initialize CRUSH map with simple structure
        self.crush_map = CrushMap::simple(0);

        // Start MDS
        let mds_config = MdsConfig {
            pg_count: self.config.pg_count,
            ..Default::default()
        };
        self.local_mds = Some(Mds::new(mds_config));

        self.state = ClusterState::Initializing;
        self.running.store(true, Ordering::SeqCst);

        Ok(())
    }

    /// Add an OSD to the cluster.
    pub fn add_osd(&mut self, osd_id: u64, addr: &str, weight: f64) -> Result<(), ClusterError> {
        // Create OSD info
        let osd_info = OsdInfo {
            id: osd_id,
            addr: addr.to_string(),
            state: OsdNodeState::Up,
            weight,
            failure_domain: format!("host-{}", osd_id),
            capacity: 1024 * 1024 * 1024 * 100, // 100 GB default
            used: 0,
            last_heartbeat: 0,
            in_epoch: self.osd_map.epoch,
            up_epoch: self.osd_map.epoch,
        };

        // Add to OSD map
        self.osd_map.add_osd(osd_info.clone());

        // Update CRUSH map - add OSD to its failure domain host
        self.crush_map
            .add_osd(osd_id, &osd_info.failure_domain, weight);

        // Create local OSD if this is a combined node
        let osd_config = OsdConfig {
            id: osd_id,
            cluster_name: self.config.cluster_name.clone(),
            ..Default::default()
        };
        let mut osd = Osd::new(osd_config);
        osd.boot().map_err(ClusterError::OsdError)?;
        self.local_osds.insert(osd_id, osd);

        // Increment epoch
        self.epoch.fetch_add(1, Ordering::SeqCst);

        // Check if cluster is now active
        if self.osd_map.up_count() >= self.config.min_osds {
            self.state = ClusterState::Active;
            self.assign_pgs()?;
        }

        Ok(())
    }

    /// Remove an OSD from the cluster.
    pub fn remove_osd(&mut self, osd_id: u64, force: bool) -> Result<(), ClusterError> {
        if !force && self.osd_map.up_count() <= self.config.min_osds {
            return Err(ClusterError::InsufficientOsds);
        }

        // Mark as out (starts migration)
        self.osd_map.mark_out(osd_id);

        // Remove from failure detector
        self.failure_detector.remove(osd_id);

        // Remove local OSD
        if let Some(mut osd) = self.local_osds.remove(&osd_id) {
            osd.shutdown();
        }

        // Reassign PGs
        self.assign_pgs()?;

        self.epoch.fetch_add(1, Ordering::SeqCst);

        Ok(())
    }

    /// Assign PGs to OSDs.
    fn assign_pgs(&mut self) -> Result<(), ClusterError> {
        // Clone the pool table so `self.osd_map` can be mutated inside the loop.
        let pools = self.osd_map.pools.clone();
        for (pool_id, pool) in &pools {
            for pg_num in 0..pool.pg_count {
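                // The PG ID encodes the pool in the high 32 bits and the
                // PG number in the low 32 bits.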
                let pgid = (*pool_id << 32) | pg_num;

                // Get OSDs using CRUSH
                let osds = self
                    .crush_map
                    .select(&pool.crush_rule, pgid, pool.size)
                    .map_err(ClusterError::CrushError)?;

                if osds.is_empty() {
                    continue;
                }

                let primary = osds[0];
                let replicas: Vec<u64> = osds.iter().skip(1).copied().collect();

                // Update PG mapping
                let mapping = PgMapping {
                    pgid,
                    pool_id: *pool_id,
                    acting: osds.clone(),
                    up: osds.clone(),
                    primary,
                    state: PgState::Active,
                    last_active_epoch: self.epoch.load(Ordering::SeqCst),
                };
                self.osd_map.pg_map.insert(pgid, mapping);

                // Create PG on local OSDs
                for &osd_id in &osds {
                    if let Some(osd) = self.local_osds.get_mut(&osd_id) {
                        let pg = PlacementGroup::new(pgid, *pool_id, primary, replicas.clone());
                        osd.add_pg(pg);
                        osd.set_pg_state(pgid, PgState::Active);
                    }
                }
            }
        }

        Ok(())
    }

    /// Handle OSD heartbeat.
    pub fn heartbeat(&mut self, osd_id: u64, timestamp: u64) {
        self.failure_detector.heartbeat(osd_id, timestamp);

        // Update OSD info
        let was_down = match self.osd_map.osds.get_mut(&osd_id) {
            Some(osd_info) => {
                osd_info.last_heartbeat = timestamp;
                osd_info.state == OsdNodeState::Down
            }
            None => false,
        };

        // Bring a down OSD back up via the map so the epoch is bumped.
        if was_down {
            self.osd_map.mark_up(osd_id);
        }
    }

    /// Check for failures and trigger recovery.
    pub fn check_health(&mut self, current_time: u64) -> Vec<ClusterEvent> {
        let mut events = Vec::new();

        // Check for failed OSDs
        let failed = self.failure_detector.check(current_time);

        for osd_id in failed {
            if !self.osd_map.flags.nodown {
                self.osd_map.mark_down(osd_id);
                events.push(ClusterEvent::OsdDown(osd_id));

                // Mark affected PGs as degraded
                for (pgid, mapping) in &mut self.osd_map.pg_map {
                    if mapping.acting.contains(&osd_id) {
                        mapping.state = PgState::Degraded;
                        events.push(ClusterEvent::PgDegraded(*pgid));

                        // Queue recovery if enabled
                        if self.config.auto_recovery && !self.osd_map.flags.norecover {
                            if let Some(&source) = mapping.acting.iter().find(|&&id| id != osd_id) {
                                // Find a new target using the pool's CRUSH rule
                                // (consistent with `get_pg_osds` / `assign_pgs`).
                                if let Some(pool) = self.osd_map.pools.get(&mapping.pool_id) {
                                    if let Ok(new_osds) =
                                        self.crush_map.select(&pool.crush_rule, *pgid, 1)
                                    {
                                        if let Some(&target) = new_osds.first() {
                                            self.recovery_manager
                                                .queue_recovery(*pgid, source, target);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        // Start any queued recoveries
        let started = self.recovery_manager.start_recoveries(current_time);
        for pgid in started {
            events.push(ClusterEvent::RecoveryStarted(pgid));
        }

        // Update cluster state
        self.update_state();

        events
    }

    /// Update cluster state based on current health.
    fn update_state(&mut self) {
        let up_count = self.osd_map.up_count();

        if up_count < self.config.min_osds {
            self.state = ClusterState::Offline;
        } else if self.recovery_manager.active_count() > 0 {
            self.state = ClusterState::Recovering;
        } else if self
            .osd_map
            .pg_map
            .values()
            .any(|pg| pg.state == PgState::Degraded)
        {
            self.state = ClusterState::Degraded;
        } else {
            self.state = ClusterState::Active;
        }
    }

    /// Get cluster state.
    pub fn state(&self) -> ClusterState {
        self.state
    }

    /// Get cluster epoch.
    pub fn epoch(&self) -> u64 {
        self.epoch.load(Ordering::SeqCst)
    }

    /// Get CRUSH map.
    pub fn crush_map(&self) -> &CrushMap {
        &self.crush_map
    }

    /// Get OSD map.
    pub fn osd_map(&self) -> &OsdMap {
        &self.osd_map
    }

    /// Get cluster statistics.
    pub fn stats(&self) -> ClusterStats {
        let mut total_capacity = 0u64;
        let mut total_used = 0u64;

        for osd in self.osd_map.osds.values() {
            total_capacity += osd.capacity;
            total_used += osd.used;
        }

        let (recovery_completed, recovery_failed, recovery_active, recovery_queued) =
            self.recovery_manager.stats();

        ClusterStats {
            state: self.state,
            epoch: self.epoch.load(Ordering::SeqCst),
            osds_total: self.osd_map.osds.len(),
            osds_up: self.osd_map.up_count(),
            osds_in: self.osd_map.in_count(),
            pgs_total: self.osd_map.pg_map.len(),
            pgs_active: self
                .osd_map
                .pg_map
                .values()
                .filter(|pg| pg.state == PgState::Active)
                .count(),
            pgs_degraded: self
                .osd_map
                .pg_map
                .values()
                .filter(|pg| pg.state == PgState::Degraded)
                .count(),
            total_capacity,
            total_used,
            recovery_active,
            recovery_queued,
            recovery_completed,
            recovery_failed,
        }
    }

    /// Create a cluster client.
    pub fn client(&self) -> ClusterClient {
        ClusterClient::new(
            self.config.clone(),
            self.crush_map.clone(),
            self.osd_map.clone(),
        )
    }
}

/// Cluster statistics.
#[derive(Debug, Clone)]
pub struct ClusterStats {
    /// Current state.
    pub state: ClusterState,
    /// Current epoch.
    pub epoch: u64,
    /// Total OSDs.
    pub osds_total: usize,
    /// Up OSDs.
    pub osds_up: usize,
    /// In OSDs.
    pub osds_in: usize,
    /// Total PGs.
    pub pgs_total: usize,
    /// Active PGs.
    pub pgs_active: usize,
    /// Degraded PGs.
    pub pgs_degraded: usize,
    /// Total capacity (bytes).
    pub total_capacity: u64,
    /// Used capacity (bytes).
    pub total_used: u64,
    /// Active recoveries.
    pub recovery_active: usize,
    /// Queued recoveries.
    pub recovery_queued: usize,
    /// Completed recoveries.
    pub recovery_completed: u64,
    /// Failed recoveries.
    pub recovery_failed: u64,
}

/// Cluster events.
#[derive(Debug, Clone)]
pub enum ClusterEvent {
    /// OSD went down.
    OsdDown(u64),
    /// OSD came up.
    OsdUp(u64),
    /// OSD was added.
    OsdAdded(u64),
    /// OSD was removed.
    OsdRemoved(u64),
    /// PG became degraded.
    PgDegraded(u64),
    /// PG became active.
    PgActive(u64),
    /// Recovery started.
    RecoveryStarted(u64),
    /// Recovery completed.
    RecoveryCompleted(u64),
    /// Epoch changed.
    EpochChanged(u64),
}

// ============================================================================
// Cluster Errors
// ============================================================================

/// Cluster operation errors.
#[derive(Debug, Clone)]
pub enum ClusterError {
    /// Not enough OSDs.
    InsufficientOsds,
    /// Pool not found.
    PoolNotFound(u64),
    /// OSD not found.
    OsdNotFound(u64),
    /// PG not found.
    PgNotFound(u64),
    /// No primary available.
    NoPrimaryAvailable,
    /// CRUSH error.
    CrushError(CrushError),
    /// OSD error.
    OsdError(OsdError),
    /// MDS error.
    MdsError(MdsError),
    /// Cluster not ready.
    NotReady,
    /// Operation not allowed.
    NotAllowed(String),
}

impl fmt::Display for ClusterError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            ClusterError::InsufficientOsds => write!(f, "Insufficient OSDs"),
            ClusterError::PoolNotFound(id) => write!(f, "Pool {} not found", id),
            ClusterError::OsdNotFound(id) => write!(f, "OSD {} not found", id),
            ClusterError::PgNotFound(id) => write!(f, "PG {} not found", id),
            ClusterError::NoPrimaryAvailable => write!(f, "No primary OSD available"),
            ClusterError::CrushError(e) => write!(f, "CRUSH error: {}", e),
            ClusterError::OsdError(e) => write!(f, "OSD error: {}", e),
            ClusterError::MdsError(e) => write!(f, "MDS error: {}", e),
            ClusterError::NotReady => write!(f, "Cluster not ready"),
            ClusterError::NotAllowed(msg) => write!(f, "Not allowed: {}", msg),
        }
    }
}

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    fn create_test_cluster() -> ClusterManager {
        let config = ClusterConfig {
            min_osds: 1,
            replication_factor: 1,
            pg_count: 16,
            ..Default::default()
        };
        ClusterManager::new(config)
    }

    #[test]
    fn test_cluster_creation() {
        let cluster = create_test_cluster();
        assert_eq!(cluster.state(), ClusterState::Initializing);
    }

    #[test]
    fn test_cluster_init() {
        let mut cluster = create_test_cluster();
        cluster.init_cluster().unwrap();
        assert!(cluster.local_mds.is_some());
    }

    #[test]
    fn test_add_osd() {
        let mut cluster = create_test_cluster();
        cluster.init_cluster().unwrap();

        cluster.add_osd(0, "127.0.0.1:6800", 1.0).unwrap();

        assert_eq!(cluster.osd_map.up_count(), 1);
        assert_eq!(cluster.state(), ClusterState::Active);
    }

    #[test]
    fn test_multiple_osds() {
        let mut cluster = ClusterManager::new(ClusterConfig {
            min_osds: 3,
            replication_factor: 3,
            pg_count: 32,
            ..Default::default()
        });
        cluster.init_cluster().unwrap();

        cluster.add_osd(0, "node0:6800", 1.0).unwrap();
        assert_eq!(cluster.state(), ClusterState::Initializing);

        cluster.add_osd(1, "node1:6800", 1.0).unwrap();
        assert_eq!(cluster.state(), ClusterState::Initializing);

        cluster.add_osd(2, "node2:6800", 1.0).unwrap();
        assert_eq!(cluster.state(), ClusterState::Active);
    }

    #[test]
    fn test_osd_failure() {
        let mut cluster = create_test_cluster();
        cluster.init_cluster().unwrap();

        cluster.add_osd(0, "node0:6800", 1.0).unwrap();
        cluster.add_osd(1, "node1:6800", 1.0).unwrap();

        // Simulate heartbeats
        cluster.heartbeat(0, 1000);
        cluster.heartbeat(1, 1000);

        // Time passes, OSD 0 stops sending heartbeats
        // OSD 1 continues
        cluster.heartbeat(1, 30000);

        // Check health (OSD 0 should be marked down)
        let events = cluster.check_health(30000);

        assert!(events.iter().any(|e| matches!(e, ClusterEvent::OsdDown(0))));
    }

    #[test]
    fn test_failure_detector() {
        let mut fd = FailureDetector::new(1000, 5000);

        fd.heartbeat(0, 1000);
        fd.heartbeat(1, 1000);

        // No failures yet
        assert!(fd.check(3000).is_empty());

        // Both OSDs time out (7000 > 1000 + 5000)
        let failed = fd.check(7000);
        assert!(failed.contains(&0));
        assert!(failed.contains(&1));

        // Clear OSD 0
        fd.heartbeat(0, 8000);
        fd.clear_failure(0);

        assert!(!fd.get_failed().contains(&0));
    }

    #[test]
    fn test_recovery_manager() {
        let mut rm = RecoveryManager::new(2);

        rm.queue_recovery(1, 0, 2);
        rm.queue_recovery(2, 0, 3);
        rm.queue_recovery(3, 0, 4);

        assert_eq!(rm.queued_count(), 3);

        let started = rm.start_recoveries(1000);
        assert_eq!(started.len(), 2);
        assert_eq!(rm.active_count(), 2);
        assert_eq!(rm.queued_count(), 1);

        rm.complete(1);
        assert_eq!(rm.active_count(), 1);

        let started = rm.start_recoveries(2000);
        assert_eq!(started.len(), 1);
    }

    #[test]
    fn test_osd_map() {
        let mut map = OsdMap::new();

        let osd = OsdInfo {
            id: 0,
            addr: "localhost:6800".to_string(),
            state: OsdNodeState::Up,
            weight: 1.0,
            failure_domain: "host-0".to_string(),
            capacity: 1000000000,
            used: 0,
            last_heartbeat: 0,
            in_epoch: 0,
            up_epoch: 0,
        };

        map.add_osd(osd);
        assert_eq!(map.up_count(), 1);
        assert_eq!(map.in_count(), 1);

        map.mark_down(0);
        assert_eq!(map.up_count(), 0);

        map.mark_up(0);
        assert_eq!(map.up_count(), 1);
    }

    #[test]
    fn test_cluster_client() {
        let mut cluster = create_test_cluster();
        cluster.init_cluster().unwrap();
        cluster.add_osd(0, "node0:6800", 1.0).unwrap();
        cluster.add_osd(1, "node1:6800", 1.0).unwrap();

        let client = cluster.client();

        // Test PG calculation
        let pg1 = client.object_to_pg(0, 1000);
        let pg2 = client.object_to_pg(0, 1000);
        assert_eq!(pg1, pg2); // Deterministic

        // Different objects may map to different PGs
        let _pg3 = client.object_to_pg(0, 2000);

        assert!(client.is_healthy());
    }

    #[test]
    fn test_cluster_stats() {
        let mut cluster = create_test_cluster();
        cluster.init_cluster().unwrap();
        cluster.add_osd(0, "node0:6800", 1.0).unwrap();

        let stats = cluster.stats();

        assert_eq!(stats.osds_total, 1);
        assert_eq!(stats.osds_up, 1);
        assert_eq!(stats.state, ClusterState::Active);
    }

    #[test]
    fn test_pg_mapping() {
        let mut cluster = ClusterManager::new(ClusterConfig {
            min_osds: 2,
            replication_factor: 2,
            pg_count: 8,
            ..Default::default()
        });
        cluster.init_cluster().unwrap();

        cluster.add_osd(0, "node0:6800", 1.0).unwrap();
        cluster.add_osd(1, "node1:6800", 1.0).unwrap();

        // Check PGs were assigned
        assert!(!cluster.osd_map.pg_map.is_empty());

        // Each PG should have at most 2 OSDs (replication_factor = 2)
        for mapping in cluster.osd_map.pg_map.values() {
            assert!(mapping.acting.len() <= 2);
        }
    }

    #[test]
    fn test_cluster_flags() {
        let flags = ClusterFlags::default();

        assert!(!flags.pauserd);
        assert!(!flags.nodown);
        assert!(!flags.norecover);
    }

    #[test]
    fn test_pool_creation() {
        let mut map = OsdMap::new();

        let pool_id = map.create_pool("test", 64, 3);
        assert_eq!(pool_id, 0);

        let pool = map.pools.get(&pool_id).unwrap();
        assert_eq!(pool.name, "test");
        assert_eq!(pool.pg_count, 64);
        assert_eq!(pool.size, 3);
        assert_eq!(pool.min_size, 2); // Majority of 3
    }
}