// lcpfs 2026.1.102

// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0
//
// VDEV Evacuation Engine
// Drive migration with safe data relocation.

// ALL thresholds are learned from observation - NO hardcoded values.
// ============================================================================

use crate::fscore::impl_::LcpfsController;
use crate::fscore::structs::Dva;
use crate::hw::smart::{SmartAttribute, get_smart_data};
use alloc::collections::VecDeque;
use alloc::string::String;
use alloc::vec::Vec;
use lazy_static::lazy_static;
use libm::{fabs, sqrt};
use spin::Mutex;

// ═══════════════════════════════════════════════════════════════════════════════
// LEARNED THRESHOLDS (Welford's algorithm - no hardcoded values)
// ═══════════════════════════════════════════════════════════════════════════════

/// Self-adjusting decision threshold.
///
/// Keeps a running mean and sample variance of the outcomes it is shown
/// (incremental Welford-style update) and moves `value` toward action values
/// that were followed by a negative outcome delta, and away from those that
/// were not.
#[derive(Clone, Copy)]
pub struct LearnedThreshold {
    /// Threshold currently in effect
    pub value: f64,
    /// Standard error of the mean outcome (`f64::MAX` until two observations exist)
    pub uncertainty: f64,
    /// How many observations have been recorded
    pub observations: u64,
    /// Step size used for the next adjustment; shrinks as observations accumulate
    pub learning_rate: f64,
    /// Running mean of the observed outcome deltas
    pub mean_outcome: f64,
    /// Sample variance of the observed outcome deltas (`f64::MAX` until two observations exist)
    pub variance: f64,
}

impl LearnedThreshold {
    /// Builds a threshold that has seen no data yet, seeded with `initial_guess`.
    ///
    /// Uncertainty and variance start at `f64::MAX`, so `should_act` cannot
    /// fire on the strength of the seed alone.
    pub const fn uninformed(initial_guess: f64) -> Self {
        Self {
            observations: 0,
            value: initial_guess,
            learning_rate: 1.0,
            mean_outcome: 0.0,
            variance: f64::MAX,
            uncertainty: f64::MAX,
        }
    }

    /// Feeds one (action, outcome) pair into the learner.
    ///
    /// * `action_value` - the value at which the action was taken.
    /// * `outcome_delta_epsilon` - change in epsilon that followed; negative
    ///   pulls `value` toward `action_value`, anything else pushes it away at
    ///   half the step size.
    pub fn observe(&mut self, action_value: f64, outcome_delta_epsilon: f64) {
        self.observations += 1;
        let count = self.observations as f64;

        // Welford step: distance to the mean before and after folding in the sample.
        let before = outcome_delta_epsilon - self.mean_outcome;
        self.mean_outcome += before / count;
        let after = outcome_delta_epsilon - self.mean_outcome;

        if self.observations >= 2 {
            // Rebuild the sum of squared deviations from the stored sample variance.
            let sum_sq = self.variance * (count - 2.0) + before * after;
            self.variance = sum_sq / (count - 1.0);
            self.uncertainty = sqrt(self.variance / count);
        }

        // The adjustment uses the rate computed after the previous observation.
        let rate = self.learning_rate;
        self.value += if outcome_delta_epsilon < 0.0 {
            (action_value - self.value) * rate
        } else {
            (self.value - action_value) * rate * 0.5
        };

        self.learning_rate = 1.0 / (1.0 + sqrt(count) * 0.1);
    }

    /// Confidence in the learned value: zero with no data, otherwise the
    /// product of an observation-count factor and an inverse-uncertainty factor.
    pub fn confidence(&self) -> f64 {
        match self.observations {
            0 => 0.0,
            seen => {
                let obs_factor = 1.0 - 1.0 / (1.0 + seen as f64 * 0.01);
                let unc_factor = 1.0 / (1.0 + fabs(self.uncertainty));
                obs_factor * unc_factor
            }
        }
    }

    /// Returns `true` when `current_value` has reached the threshold and the
    /// estimated benefit exceeds the current uncertainty.
    pub fn should_act(&self, current_value: f64, estimated_benefit: f64) -> bool {
        let reached = current_value >= self.value;
        // The small constant keeps the ratio finite when uncertainty is zero.
        let signal_to_noise = estimated_benefit / (self.uncertainty + 1e-10);
        reached && signal_to_noise > 1.0
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// VDEV HEALTH OBSERVATION
// ═══════════════════════════════════════════════════════════════════════════════

/// Point-in-time health sample for one VDEV
#[derive(Clone, Copy, Default)]
pub struct VdevHealthObservation {
    /// Which virtual device the sample describes
    pub vdev_id: usize,
    /// When the sample was taken, in milliseconds
    pub timestamp_ms: u64,
    /// Mean I/O latency (microseconds)
    pub latency_avg_us: f64,
    /// 99th-percentile I/O latency (microseconds)
    pub latency_p99_us: f64,
    /// Read errors seen in the last observation window
    pub read_errors: u64,
    /// Write errors seen in the last observation window
    pub write_errors: u64,
    /// SMART reallocated sector count
    pub reallocated_sectors: u64,
    /// SMART pending sector count
    pub pending_sectors: u64,
    /// Drive temperature (Celsius)
    pub temperature_c: f64,
    /// Accumulated power-on hours
    pub power_on_hours: u64,
    /// Uncorrectable error count (recorded, but not part of `failure_risk`)
    pub uncorrectable_errors: u64,
    /// I/O operations issued in the last window
    pub io_ops: u64,
}

impl VdevHealthObservation {
    /// Scores how likely this VDEV is to fail soon; higher means riskier.
    ///
    /// Each indicator contributes only when it exceeds the matching threshold
    /// held by `learned`. The sum is capped at 1.0.
    pub fn failure_risk(&self, learned: &EvacuationEngine) -> f64 {
        let mut score: f64 = 0.0;

        // Tail latency: 0.3 per multiple of the threshold, ratio capped at 2x.
        let latency_limit = learned.threshold_latency.value;
        if self.latency_p99_us > latency_limit {
            score += 0.3 * (self.latency_p99_us / latency_limit).min(2.0);
        }

        // Errors per operation; an idle window counts as error-free.
        let errors_per_op = match self.io_ops {
            0 => 0.0,
            ops => (self.read_errors + self.write_errors) as f64 / ops as f64,
        };
        let error_limit = learned.threshold_error_rate.value;
        if errors_per_op > error_limit {
            score += 0.4 * (errors_per_op / error_limit).min(3.0);
        }

        // SMART sector counters each add a flat penalty once over their threshold.
        let realloc_limit = learned.threshold_reallocated.value;
        if self.reallocated_sectors as f64 > realloc_limit {
            score += 0.2;
        }
        let pending_limit = learned.threshold_pending.value;
        if self.pending_sectors as f64 > pending_limit {
            score += 0.2;
        }

        // Temperature: scales over the first 10 degrees above the threshold.
        let temp_limit = learned.threshold_temperature.value;
        if self.temperature_c > temp_limit {
            score += 0.1 * ((self.temperature_c - temp_limit) / 10.0).min(1.0);
        }

        score.min(1.0)
    }
}

/// Record of one finished evacuation, kept as training data
#[derive(Clone, Copy)]
pub struct EvacuationOutcome {
    /// VDEV that was evacuated
    pub vdev_id: usize,
    /// Risk score at the moment the evacuation began
    pub started_at_risk: f64,
    /// How many blocks were copied successfully
    pub blocks_evacuated: u64,
    /// Wall-clock duration of the evacuation in milliseconds
    pub time_taken_ms: u64,
    /// Set when the source drive was judged to have failed mid-evacuation
    pub drive_failed_during: bool,
    /// System epsilon sampled before the evacuation
    pub epsilon_before: f64,
    /// System epsilon sampled after the evacuation
    pub epsilon_after: f64,
}

impl EvacuationOutcome {
    /// Epsilon change across the evacuation (`after - before`); negative means it dropped.
    pub fn delta_epsilon(&self) -> f64 {
        let Self {
            epsilon_before,
            epsilon_after,
            ..
        } = *self;
        epsilon_after - epsilon_before
    }

    /// An evacuation counts as successful when the drive survived it and
    /// epsilon did not increase.
    pub fn was_successful(&self) -> bool {
        if self.drive_failed_during {
            return false;
        }
        self.delta_epsilon() <= 0.0
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// EVACUATION ENGINE
// ═══════════════════════════════════════════════════════════════════════════════

lazy_static! {
    /// Process-wide [`EvacuationEngine`] instance, guarded by a `spin::Mutex`.
    ///
    /// The free functions at the bottom of this module and the spawned
    /// evacuation task all operate on this single instance, so every one of
    /// them contends for this lock.
    pub static ref EVAC_ENGINE: Mutex<EvacuationEngine> = Mutex::new(EvacuationEngine::new());
}

/// Engine that decides when to evacuate a failing VDEV and tracks the
/// evacuation that is currently in flight, using learned thresholds.
pub struct EvacuationEngine {
    /// Whether an evacuation is currently running
    pub is_running: bool,
    /// VDEV currently being evacuated (if any)
    pub evacuating_vdev: Option<usize>,
    /// Number of blocks migrated so far
    pub blocks_migrated: u64,
    /// Target VDEV for migration (if any)
    pub target_vdev: Option<usize>,

    // Observation history: `observations` is appended to by `observe` (which
    // keeps at most 1000 entries); `outcomes` is appended to when an
    // evacuation task finishes (at most 100 entries are kept).
    observations: VecDeque<VdevHealthObservation>,
    outcomes: VecDeque<EvacuationOutcome>,

    // ═══════════════════════════════════════════════════════════════════════════
    // LEARNED THRESHOLDS (seeded with initial guesses in `new`)
    // ═══════════════════════════════════════════════════════════════════════════
    /// Learned: Latency threshold (microseconds), compared against P99 latency
    threshold_latency: LearnedThreshold,

    /// Learned: Error rate threshold (errors per op)
    threshold_error_rate: LearnedThreshold,

    /// Learned: Reallocated sector threshold
    threshold_reallocated: LearnedThreshold,

    /// Learned: Pending sector threshold
    threshold_pending: LearnedThreshold,

    /// Learned: Temperature threshold (Celsius)
    threshold_temperature: LearnedThreshold,

    /// Learned: Risk level at which to trigger evacuation
    threshold_evac_risk: LearnedThreshold,

    /// Learned: Migration batch size (blocks per batch)
    batch_size: LearnedThreshold,

    /// Learned: Pause between batches (microseconds)
    batch_pause_us: LearnedThreshold,

    /// Current system epsilon, as last reported through `update_epsilon`
    current_epsilon: f64,

    /// Risk score computed by the most recent `should_evacuate` call that
    /// found an observation (for outcome tracking)
    last_risk_score: f64,
}

/// `Default` yields the same idle engine as [`EvacuationEngine::new`].
impl Default for EvacuationEngine {
    fn default() -> Self {
        Self::new()
    }
}

impl EvacuationEngine {
    /// Builds an idle engine whose thresholds are seeded with initial guesses.
    ///
    /// Every threshold starts "uninformed": it carries a starting value but no
    /// observations, so its confidence is zero until outcomes are recorded.
    pub fn new() -> Self {
        // Shorthand for seeding a threshold that has not observed anything yet.
        let prior = LearnedThreshold::uninformed;

        Self {
            // Run state: nothing in flight.
            is_running: false,
            evacuating_vdev: None,
            target_vdev: None,
            blocks_migrated: 0,

            // History buffers, pre-sized to the bounds they are trimmed to.
            observations: VecDeque::with_capacity(1000),
            outcomes: VecDeque::with_capacity(100),

            // Health indicator thresholds.
            threshold_latency: prior(50_000.0), // 50ms P99
            threshold_error_rate: prior(0.001), // 0.1% error rate
            threshold_reallocated: prior(100.0), // 100 reallocated sectors
            threshold_pending: prior(10.0),     // 10 pending sectors
            threshold_temperature: prior(55.0), // 55°C

            // Evacuation policy.
            threshold_evac_risk: prior(0.5), // 50% risk triggers evacuation
            batch_size: prior(1000.0),       // 1000 blocks per batch
            batch_pause_us: prior(1000.0),   // 1ms between batches

            current_epsilon: 0.0,
            last_risk_score: 0.0,
        }
    }

    /// Records the latest system-wide epsilon.
    ///
    /// The stored value is sampled at the start and end of an evacuation to
    /// compute that evacuation's epsilon delta.
    pub fn update_epsilon(&mut self, epsilon: f64) {
        self.current_epsilon = epsilon;
    }

    /// Appends a health observation to the history, discarding the oldest
    /// entries so that at most 1000 observations are retained.
    pub fn observe(&mut self, obs: VdevHealthObservation) {
        self.observations.push_back(obs);

        // Evict from the front until the history is back within its bound.
        let excess = self.observations.len().saturating_sub(1000);
        for _ in 0..excess {
            self.observations.pop_front();
        }
    }

    /// Decides whether `vdev_id` should be evacuated now.
    ///
    /// Returns `false` while an evacuation is already running or when no
    /// observation exists for the VDEV. Otherwise the most recent observation
    /// is scored, the score is remembered in `last_risk_score`, and the
    /// evacuation-risk threshold must both agree to act and report a
    /// confidence above 0.1.
    ///
    /// Because a threshold with zero observations reports zero confidence,
    /// this cannot return `true` until the risk threshold has learned from
    /// some recorded evacuation outcomes.
    pub fn should_evacuate(&mut self, vdev_id: usize) -> bool {
        if self.is_running {
            return false;
        }

        // Newest-first search for this VDEV's latest sample.
        let risk = match self
            .observations
            .iter()
            .rev()
            .find(|o| o.vdev_id == vdev_id)
        {
            Some(latest) => latest.failure_risk(self),
            None => return false,
        };

        // Remembered so the eventual outcome can be tied back to this score.
        self.last_risk_score = risk;

        let expected_gain = self.estimate_evac_benefit(vdev_id, risk);

        let gate = &self.threshold_evac_risk;
        gate.should_act(risk, expected_gain) && gate.confidence() > 0.1
    }

    /// Estimate epsilon reduction from evacuating a VDEV
    fn estimate_evac_benefit(&self, vdev_id: usize, current_risk: f64) -> f64 {
        // Look at past evacuations at similar risk levels
        let similar_outcomes: Vec<_> = self
            .outcomes
            .iter()
            .filter(|o| fabs(o.started_at_risk - current_risk) < 0.2)
            .collect();

        if similar_outcomes.is_empty() {
            // No prior data - assume benefit proportional to risk
            // If we evacuate successfully, we avoid potential drive failure epsilon cost
            let potential_loss = current_risk * 1_000_000.0; // Assume 1M epsilon if drive dies
            return potential_loss * 0.8; // 80% chance we avoid it
        }

        // Average epsilon improvement from similar evacuations
        let successful: Vec<_> = similar_outcomes
            .iter()
            .filter(|o| o.was_successful())
            .collect();

        if successful.is_empty() {
            return 0.0;
        }

        let avg_improvement: f64 =
            successful.iter().map(|o| -o.delta_epsilon()).sum::<f64>() / successful.len() as f64;

        avg_improvement.max(0.0)
    }

    /// Picks the VDEV to migrate data onto: the one whose *most recent*
    /// observation carries the lowest failure risk.
    ///
    /// Only the newest observation per VDEV is scored; older samples are
    /// ignored so that a device which has since degraded cannot win on the
    /// strength of a stale healthy reading. The VDEV recorded in
    /// `evacuating_vdev` is never returned. Ties go to the VDEV observed most
    /// recently.
    ///
    /// Returns `None` when no other VDEV has been observed.
    ///
    /// NOTE: free capacity on the candidate is not checked here; observations
    /// carry no capacity information.
    pub fn find_target_vdev(&self) -> Option<usize> {
        // VDEV ids already handled; the history is walked newest-first, so the
        // first sample met for an id is its latest one.
        let mut seen: Vec<usize> = Vec::new();
        let mut best_target: Option<(usize, f64)> = None;

        for obs in self.observations.iter().rev() {
            if seen.contains(&obs.vdev_id) {
                continue; // Older sample of a VDEV we already scored
            }
            seen.push(obs.vdev_id);

            // Never migrate onto the VDEV being evacuated.
            if Some(obs.vdev_id) == self.evacuating_vdev {
                continue;
            }

            let risk = obs.failure_risk(self);
            let is_better = match best_target {
                None => true,
                Some((_, best_risk)) => risk < best_risk,
            };
            if is_better {
                best_target = Some((obs.vdev_id, risk));
            }
        }

        best_target.map(|(id, _)| id)
    }

    /// Begins evacuating `dying_vdev` onto the healthiest other VDEV.
    ///
    /// # Errors
    /// * `"Evacuation already in progress"` if a run is active.
    /// * `"No suitable target VDEV found"` if no VDEV other than `dying_vdev`
    ///   has been observed. In that case the engine is left idle.
    ///
    /// On success the run state is initialised and the migration task is
    /// spawned on core 2.
    pub fn start_evacuation(&mut self, dying_vdev: usize) -> Result<(), &'static str> {
        if self.is_running {
            return Err("Evacuation already in progress");
        }

        // Record the source BEFORE choosing a target: `find_target_vdev`
        // excludes `evacuating_vdev`, and without this the dying VDEV itself
        // could be selected as its own migration target.
        self.evacuating_vdev = Some(dying_vdev);
        let target = match self.find_target_vdev() {
            Some(candidate) if candidate != dying_vdev => candidate,
            _ => {
                // Roll back so a failed start leaves no stale run state.
                self.evacuating_vdev = None;
                return Err("No suitable target VDEV found");
            }
        };

        self.is_running = true;
        self.target_vdev = Some(target);
        self.blocks_migrated = 0;

        crate::lcpfs_println!(
            "[ EVAC ] INITIATING PI-CONTROLLED EVACUATION: VDEV {} -> VDEV {}",
            dying_vdev,
            target
        );
        crate::lcpfs_println!(
            "[ EVAC ] Parameters: batch={}, pause={}μs (learned)",
            self.batch_size.value as u64,
            self.batch_pause_us.value as u64
        );

        // Spawn evacuation task. The task reads its parameters back from the
        // global engine, so the state above must be in place before this call.
        crate::spawn_on_core(Self::evac_task, Some(2));

        Ok(())
    }

    fn evac_task() {
        let mut engine = EVAC_ENGINE.lock();

        let dying_vdev = match engine.evacuating_vdev {
            Some(v) => v,
            None => {
                engine.is_running = false;
                return;
            }
        };

        let target_vdev = match engine.target_vdev {
            Some(v) => v,
            None => {
                engine.is_running = false;
                return;
            }
        };

        let batch_size = engine.batch_size.value.max(100.0) as u64;
        let epsilon_before = engine.current_epsilon;
        let start_time = crate::get_time();

        // Real block migration using BLOCK_DEVICES
        use crate::BLOCK_DEVICES;
        use alloc::vec;

        // Get total blocks from device
        let total_blocks = {
            let devices = BLOCK_DEVICES.lock();
            if let Some(dev) = devices.get(dying_vdev) {
                dev.block_count()
            } else {
                crate::lcpfs_println!("[ EVAC ] Error: Source VDEV {} not found", dying_vdev);
                engine.is_running = false;
                return;
            }
        };

        crate::lcpfs_println!(
            "[ EVAC ] Migrating {} blocks from VDEV {} to VDEV {}",
            total_blocks,
            dying_vdev,
            target_vdev
        );

        let mut block_id = 0u64;
        let mut failed_blocks = 0u64;

        while block_id < total_blocks as u64 {
            let batch_end = (block_id + batch_size).min(total_blocks as u64);

            // Process batch
            for bid in block_id..batch_end {
                // Read from source VDEV
                let mut buffer = vec![0u8; 512];
                let read_success = {
                    let mut devices = BLOCK_DEVICES.lock();
                    if let Some(src) = devices.get_mut(dying_vdev) {
                        src.read_block(bid as usize, &mut buffer).is_ok()
                    } else {
                        false
                    }
                };

                if !read_success {
                    failed_blocks += 1;
                    continue; // Skip unreadable blocks
                }

                // Write to target VDEV
                let write_success = {
                    let mut devices = BLOCK_DEVICES.lock();
                    if let Some(dst) = devices.get_mut(target_vdev) {
                        dst.write_block(bid as usize, &buffer).is_ok()
                    } else {
                        false
                    }
                };

                if write_success {
                    engine.blocks_migrated += 1;
                } else {
                    failed_blocks += 1;
                }
            }

            block_id = batch_end;

            // Progress update
            if block_id % 1000 == 0 {
                crate::lcpfs_println!(
                    "[ EVAC ] Progress: {}/{} blocks migrated ({} failed)",
                    engine.blocks_migrated,
                    total_blocks,
                    failed_blocks
                );
            }
        }

        let time_taken_ms = (crate::get_time() - start_time) / 1_000_000; // ns to ms

        // Record outcome for learning
        let drive_failed = failed_blocks > (total_blocks as u64 / 10); // >10% failure = drive failed
        let outcome = EvacuationOutcome {
            vdev_id: dying_vdev,
            started_at_risk: engine.last_risk_score,
            blocks_evacuated: engine.blocks_migrated,
            time_taken_ms,
            drive_failed_during: drive_failed,
            epsilon_before,
            epsilon_after: engine.current_epsilon,
        };

        engine.learn_from_outcome(&outcome);
        engine.outcomes.push_back(outcome);

        while engine.outcomes.len() > 100 {
            engine.outcomes.pop_front();
        }

        engine.is_running = false;
        engine.evacuating_vdev = None;
        engine.target_vdev = None;

        crate::lcpfs_println!(
            "[ EVAC ] EVACUATION COMPLETE: {} blocks migrated from VDEV {} to VDEV {}",
            engine.blocks_migrated,
            dying_vdev,
            target_vdev
        );
    }

    /// Feeds a finished evacuation back into the learned thresholds.
    ///
    /// * The evacuation-risk threshold always observes the starting risk
    ///   together with the epsilon delta.
    /// * If the drive failed mid-evacuation, a second, strongly negative
    ///   observation at half the starting risk pulls the trigger point lower.
    /// * When the run took measurable time, the batch-size learner observes
    ///   the negated throughput. The action value passed is the learner's own
    ///   current value, so this updates its outcome statistics without moving
    ///   the batch size itself.
    fn learn_from_outcome(&mut self, outcome: &EvacuationOutcome) {
        let risk_at_start = outcome.started_at_risk;

        self.threshold_evac_risk
            .observe(risk_at_start, outcome.delta_epsilon());

        if outcome.drive_failed_during {
            // Waited too long: nudge toward triggering at half this risk.
            self.threshold_evac_risk.observe(risk_at_start * 0.5, -1000.0);
        }

        // A zero duration gives no usable throughput figure.
        if outcome.time_taken_ms == 0 {
            return;
        }

        let blocks_per_ms = outcome.blocks_evacuated as f64 / outcome.time_taken_ms as f64;
        let batch_now = self.batch_size.value;
        self.batch_size.observe(batch_now, -blocks_per_ms);
    }

    /// Snapshots the run state together with the learned evacuation-risk and
    /// latency thresholds and their confidences.
    pub fn stats(&self) -> EvacStats {
        let risk_gate = &self.threshold_evac_risk;
        let latency_gate = &self.threshold_latency;

        EvacStats {
            // Run state
            is_running: self.is_running,
            evacuating_vdev: self.evacuating_vdev,
            target_vdev: self.target_vdev,
            blocks_migrated: self.blocks_migrated,
            // Learned thresholds
            evac_risk_threshold: risk_gate.value,
            evac_risk_confidence: risk_gate.confidence(),
            latency_threshold_us: latency_gate.value as u64,
            latency_confidence: latency_gate.confidence(),
        }
    }
}

/// Point-in-time snapshot of evacuation progress and learned thresholds,
/// as produced by `EvacuationEngine::stats`.
#[derive(Debug, Clone, Copy)]
pub struct EvacStats {
    /// Whether an evacuation is currently running
    pub is_running: bool,
    /// VDEV currently being evacuated (if any)
    pub evacuating_vdev: Option<usize>,
    /// Target VDEV for migration (if any)
    pub target_vdev: Option<usize>,
    /// Number of blocks migrated by the current or most recent evacuation
    pub blocks_migrated: u64,
    /// Learned evacuation risk threshold
    pub evac_risk_threshold: f64,
    /// Confidence in evacuation risk threshold (0.0 to 1.0)
    pub evac_risk_confidence: f64,
    /// Learned latency threshold in microseconds (truncated to an integer)
    pub latency_threshold_us: u64,
    /// Confidence in latency threshold (0.0 to 1.0)
    pub latency_confidence: f64,
}

// ═══════════════════════════════════════════════════════════════════════════════
// HEALTH MONITOR (Legacy compatibility wrapper)
// ═══════════════════════════════════════════════════════════════════════════════

/// Legacy per-VDEV health tables (compatibility layer).
///
/// Both tables are indexed by VDEV id and grow on demand.
pub struct HealthMonitor {
    /// Most recent latency recorded for each VDEV
    pub latencies: Vec<u64>,
    /// Running error count for each VDEV
    pub error_counts: Vec<u64>,
}

impl Default for HealthMonitor {
    fn default() -> Self {
        HealthMonitor::new()
    }
}

impl HealthMonitor {
    /// Creates a monitor with empty tables
    pub fn new() -> Self {
        Self {
            latencies: Vec::new(),
            error_counts: Vec::new(),
        }
    }

    /// Returns the table entry for `vdev_id`, zero-extending the table first
    /// when the id lies beyond its current end.
    fn slot(table: &mut Vec<u64>, vdev_id: usize) -> &mut u64 {
        if table.len() <= vdev_id {
            table.resize(vdev_id + 1, 0);
        }
        &mut table[vdev_id]
    }

    /// Stores `latency_us` as the latest latency for `vdev_id`, replacing any
    /// previous value
    pub fn record_latency(&mut self, vdev_id: usize, latency_us: u64) {
        *Self::slot(&mut self.latencies, vdev_id) = latency_us;
    }

    /// Adds one to the error count of `vdev_id`
    pub fn record_error(&mut self, vdev_id: usize) {
        *Self::slot(&mut self.error_counts, vdev_id) += 1;
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// PUBLIC API
// ═══════════════════════════════════════════════════════════════════════════════

/// Reports the current system epsilon to the global evacuation engine.
///
/// Takes the `EVAC_ENGINE` spin lock for the duration of the call.
pub fn update_epsilon(epsilon: f64) {
    EVAC_ENGINE.lock().update_epsilon(epsilon);
}

/// Submits a VDEV health observation to the global evacuation engine.
///
/// Takes the `EVAC_ENGINE` spin lock for the duration of the call.
pub fn observe_health(obs: VdevHealthObservation) {
    EVAC_ENGINE.lock().observe(obs);
}

/// Asks the global evacuation engine whether `vdev_id` should be evacuated now.
///
/// Takes the `EVAC_ENGINE` spin lock for the duration of the call. The lock is
/// released before returning, so the answer may be stale by the time the
/// caller acts on it.
pub fn should_evacuate(vdev_id: usize) -> bool {
    EVAC_ENGINE.lock().should_evacuate(vdev_id)
}

/// Starts evacuating `dying_vdev` through the global evacuation engine.
///
/// Returns the engine's error string when an evacuation is already running or
/// no target VDEV can be found.
///
/// The `EVAC_ENGINE` spin lock is held until the engine method returns, which
/// includes its call to `spawn_on_core`.
/// NOTE(review): the spawned task locks `EVAC_ENGINE` itself; if
/// `spawn_on_core` can ever run the task synchronously on the calling core
/// this would spin forever — confirm against the scheduler.
pub fn start_evacuation(dying_vdev: usize) -> Result<(), &'static str> {
    EVAC_ENGINE.lock().start_evacuation(dying_vdev)
}

/// Returns a snapshot of the global evacuation engine's statistics.
///
/// Takes the `EVAC_ENGINE` spin lock for the duration of the call.
pub fn stats() -> EvacStats {
    EVAC_ENGINE.lock().stats()
}

/// Legacy API: Check health using controller
///
/// Queries SMART data from the device and uses it to populate health observations.
/// Falls back to conservative defaults if SMART data is unavailable.
///
/// Note: `controller` parameter is kept for API compatibility but not currently used
/// for metrics. Future versions should add latency/error tracking to LcpfsController.
pub fn check_health(_controller: &LcpfsController) {
    let vdev_id = 0;
    let timestamp_ms = crate::time::now() * 1000;

    // Try to get SMART data from the device
    let smart_data = get_smart_data(vdev_id as u64);

    let obs = if let Some(data) = smart_data {
        // Use real SMART data
        let attrs = &data.attributes;

        let reallocated = attrs
            .get(&SmartAttribute::ReallocatedSectors)
            .map(|v| v.current)
            .unwrap_or(0);
        let pending = attrs
            .get(&SmartAttribute::CurrentPendingSectors)
            .map(|v| v.current)
            .unwrap_or(0);
        let uncorrectable = attrs
            .get(&SmartAttribute::ReportedUncorrectable)
            .map(|v| v.current)
            .unwrap_or(0);
        let temperature = attrs
            .get(&SmartAttribute::Temperature)
            .map(|v| v.current as f64)
            .unwrap_or(40.0);
        let power_on_hours = attrs
            .get(&SmartAttribute::PowerOnHours)
            .map(|v| v.current)
            .unwrap_or(0);

        VdevHealthObservation {
            vdev_id,
            timestamp_ms,
            latency_avg_us: 100.0, // Default - would come from I/O subsystem
            latency_p99_us: 200.0, // Default - would come from I/O subsystem
            read_errors: 0,
            write_errors: 0,
            reallocated_sectors: reallocated,
            pending_sectors: pending,
            temperature_c: temperature,
            power_on_hours,
            uncorrectable_errors: uncorrectable,
            io_ops: 1000,
        }
    } else {
        // SMART data unavailable - use conservative defaults
        VdevHealthObservation {
            vdev_id,
            timestamp_ms,
            latency_avg_us: 100.0,
            latency_p99_us: 200.0,
            read_errors: 0,
            write_errors: 0,
            reallocated_sectors: 0,
            pending_sectors: 0,
            temperature_c: 40.0,
            power_on_hours: 0,
            uncorrectable_errors: 0,
            io_ops: 1000,
        }
    };

    observe_health(obs);

    // Let PI decide
    if should_evacuate(vdev_id) {
        let _ = start_evacuation(vdev_id);
    }
}