lcpfs 2026.1.102

// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0
//
// Scrubbing Engine
// PI-controlled integrity verification.

// ALL thresholds are learned from observation - NO hardcoded values.
// ============================================================================

use crate::BLOCK_DEVICES;
use crate::fscore::impl_::LcpfsController;
use crate::integrity::checksum::Checksum;
use crate::lunaos::kernel::BlockDevice;
use crate::mgmt::mount::LcpfsMount;
use alloc::collections::VecDeque;
use alloc::vec::Vec;
use lazy_static::lazy_static;
use libm::{fabs, sqrt};
use spin::Mutex;

// ═══════════════════════════════════════════════════════════════════════════════
// LEARNED THRESHOLDS (Welford's algorithm - no hardcoded values)
// ═══════════════════════════════════════════════════════════════════════════════

/// A threshold that is refined from observed outcomes, together with running
/// statistics (Welford's online algorithm) describing how noisy those outcomes are.
///
/// Derives `Debug` like the other public types in this file so that values can be
/// logged and inspected in tests.
#[derive(Debug, Clone, Copy)]
pub struct LearnedThreshold {
    /// Current threshold value; starts at the initial guess and is nudged by `observe`.
    pub value: f64,
    /// Standard error of the mean outcome, `sqrt(variance / n)`.
    /// Stays at `f64::MAX` until at least two outcomes have been observed.
    pub uncertainty: f64,
    /// Number of outcomes observed so far.
    pub observations: u64,
    /// Step size applied to threshold adjustments; starts at 1.0 and decays as
    /// `1 / (1 + 0.1 * sqrt(n))`.
    pub learning_rate: f64,
    /// Running mean of the observed outcomes (delta epsilon).
    pub mean_outcome: f64,
    /// Running sample variance of the outcomes (Welford's M2 divided by `n - 1`).
    /// Stays at `f64::MAX` until at least two outcomes have been observed.
    pub variance: f64,
}

impl LearnedThreshold {
    /// Builds a threshold that knows nothing yet: the value is `initial_guess`,
    /// no outcomes have been seen, and both uncertainty and variance are `f64::MAX`.
    pub const fn uninformed(initial_guess: f64) -> Self {
        Self {
            observations: 0,
            value: initial_guess,
            learning_rate: 1.0,
            mean_outcome: 0.0,
            variance: f64::MAX,
            uncertainty: f64::MAX,
        }
    }

    /// Folds one outcome into the running statistics and nudges the threshold.
    ///
    /// * `action_value` - the measured value at which the action was taken.
    /// * `outcome_delta_epsilon` - resulting change in epsilon; negative means improvement.
    pub fn observe(&mut self, action_value: f64, outcome_delta_epsilon: f64) {
        self.observations += 1;
        let count = self.observations as f64;

        // Welford update: deviation from the old mean, then from the new mean.
        let dev_from_old_mean = outcome_delta_epsilon - self.mean_outcome;
        self.mean_outcome += dev_from_old_mean / count;
        let dev_from_new_mean = outcome_delta_epsilon - self.mean_outcome;

        // Variance is only defined from the second sample on. For that sample the
        // `f64::MAX` placeholder is multiplied by zero and therefore discarded.
        if self.observations > 1 {
            let sum_of_squares =
                self.variance * (count - 2.0) + dev_from_old_mean * dev_from_new_mean;
            self.variance = sum_of_squares / (count - 1.0);
            self.uncertainty = sqrt(self.variance / count);
        }

        // An improvement pulls the threshold toward the action value; anything else
        // pushes it away at half strength.
        let improved = outcome_delta_epsilon < 0.0;
        self.value += if improved {
            (action_value - self.value) * self.learning_rate
        } else {
            (self.value - action_value) * self.learning_rate * 0.5
        };

        // The step size shrinks as evidence accumulates.
        self.learning_rate = 1.0 / (1.0 + sqrt(count) * 0.1);
    }

    /// Confidence score: zero with no observations, otherwise the product of an
    /// observation-count factor and an inverse-uncertainty factor.
    pub fn confidence(&self) -> f64 {
        match self.observations {
            0 => 0.0,
            seen => {
                let obs_factor = 1.0 - 1.0 / (1.0 + seen as f64 * 0.01);
                let unc_factor = 1.0 / (1.0 + fabs(self.uncertainty));
                obs_factor * unc_factor
            }
        }
    }

    /// Returns `true` when `current_value` has reached the threshold and the
    /// estimated benefit exceeds the current uncertainty.
    pub fn should_act(&self, current_value: f64, estimated_benefit: f64) -> bool {
        let threshold_reached = current_value >= self.value;
        // The small epsilon keeps the ratio finite when the uncertainty is zero.
        let benefit_ratio = estimated_benefit / (self.uncertainty + 1e-10);
        threshold_reached && benefit_ratio > 1.0
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// SCRUB OBSERVATIONS
// ═══════════════════════════════════════════════════════════════════════════════

/// Metrics captured for one completed scrub pass.
///
/// Derives `Debug` like the other public types in this file so that observations
/// can be logged and inspected in tests.
#[derive(Debug, Clone, Copy, Default)]
pub struct ScrubObservation {
    /// Timestamp when the scrub completed (milliseconds since boot)
    pub timestamp_ms: u64,
    /// Total number of blocks scanned during this scrub
    pub blocks_scanned: u64,
    /// Number of blocks with checksum/integrity errors detected
    pub errors_found: u64,
    /// Number of blocks successfully repaired
    pub repairs_made: u64,
    /// Average I/O latency during the scrub (microseconds)
    pub io_latency_avg_us: f64,
    /// CPU utilization fraction during the scrub (0.0 to 1.0)
    pub cpu_utilization: f64,
    /// Time elapsed since the previous scrub (milliseconds)
    pub time_since_last_scrub_ms: u64,
    /// System epsilon measured before the scrub started
    pub epsilon_before: f64,
    /// System epsilon measured after the scrub completed
    pub epsilon_after: f64,
}

impl ScrubObservation {
    /// Epsilon after the scrub minus epsilon before it; a negative result means
    /// the scrub improved the system.
    pub fn delta_epsilon(&self) -> f64 {
        let Self {
            epsilon_before,
            epsilon_after,
            ..
        } = self;
        epsilon_after - epsilon_before
    }

    /// Errors found per one million scanned blocks; `0.0` when nothing was scanned.
    pub fn error_rate(&self) -> f64 {
        match self.blocks_scanned {
            0 => 0.0,
            scanned => (self.errors_found as f64 / scanned as f64) * 1_000_000.0,
        }
    }
}

/// One scrub's observation paired with the conditions recorded for it, kept in the
/// engine's history so later benefit estimates can look up comparable scrubs.
#[derive(Clone, Copy)]
pub struct ScrubOutcome {
    /// The metrics observed during the scrub operation
    pub observation: ScrubObservation,
    /// Error rate associated with this scrub (errors per 1M blocks).
    /// `complete_scrub` fills this from the PI scheduler's predicted error rate.
    pub scheduled_at_error_rate: f64,
    /// Time gap associated with this scrub (milliseconds).
    /// `complete_scrub` fills this from `observation.time_since_last_scrub_ms`.
    pub scheduled_at_time_gap: f64,
}

// ═══════════════════════════════════════════════════════════════════════════════
// RAID CONFIGURATION
// ═══════════════════════════════════════════════════════════════════════════════

/// Redundancy scheme of the pool; `repair_block` uses it to choose how (and
/// whether) a damaged block can be rebuilt.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RaidLevel {
    /// Mirrored disks (N-way)
    Mirror,
    /// RAID-Z1: Single parity (can lose 1 disk)
    RaidZ1,
    /// RAID-Z2: Double parity (can lose 2 disks)
    RaidZ2,
    /// RAID-Z3: Triple parity (can lose 3 disks)
    RaidZ3,
    /// Simple striping (no redundancy, so nothing can be repaired)
    Stripe,
}

/// RAID geometry consulted by repair operations.
///
/// The fields are public and not validated here; the `raidz1`/`raidz2`/`raidz3`
/// constructors keep `level` and `parity_disks` consistent with each other.
#[derive(Debug, Clone, Copy)]
pub struct RaidConfig {
    /// RAID level
    pub level: RaidLevel,
    /// Number of data disks per stripe
    pub data_disks: u8,
    /// Number of parity disks per stripe
    pub parity_disks: u8,
}

impl RaidConfig {
    /// Create a RAID-Z1 config: `data_disks` data columns plus one parity column.
    pub const fn raidz1(data_disks: u8) -> Self {
        Self {
            level: RaidLevel::RaidZ1,
            data_disks,
            parity_disks: 1,
        }
    }

    /// Create a RAID-Z2 config: `data_disks` data columns plus two parity columns.
    pub const fn raidz2(data_disks: u8) -> Self {
        Self {
            level: RaidLevel::RaidZ2,
            data_disks,
            parity_disks: 2,
        }
    }

    /// Create a RAID-Z3 config: `data_disks` data columns plus three parity columns.
    pub const fn raidz3(data_disks: u8) -> Self {
        Self {
            level: RaidLevel::RaidZ3,
            data_disks,
            parity_disks: 3,
        }
    }

    /// Total stripe width (data + parity).
    ///
    /// Each count is widened to `usize` before adding. Adding the two `u8` values
    /// first would overflow for wide stripes (for example 254 data + 3 parity),
    /// which panics in debug builds and wraps to a tiny width in release builds.
    pub const fn stripe_width(&self) -> usize {
        self.data_disks as usize + self.parity_disks as usize
    }

    /// Check if this config can recover from the given number of failures,
    /// i.e. whether `failures` does not exceed the number of parity disks.
    pub const fn can_recover(&self, failures: u8) -> bool {
        failures <= self.parity_disks
    }
}

/// The default geometry is RAID-Z1 with two data disks and one parity disk.
impl Default for RaidConfig {
    fn default() -> Self {
        Self::raidz1(2) // Default: RAID-Z1 with 2 data + 1 parity = 3 disks
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// SCRUB ENGINE
// ═══════════════════════════════════════════════════════════════════════════════

/// `Default` simply forwards to [`ScrubEngine::new`], i.e. an idle engine with
/// uninformed thresholds and empty history.
impl Default for ScrubEngine {
    fn default() -> Self {
        Self::new()
    }
}

// Process-wide singletons. Both are guarded by `spin::Mutex`, so every access
// spins until the lock is free; hold the guards only briefly.
lazy_static! {
    /// Global scrub engine with learned thresholds and epsilon-driven scheduling
    pub static ref SCRUB_ENGINE: Mutex<ScrubEngine> = Mutex::new(ScrubEngine::new());

    /// Global RAID configuration (set by pool import); starts as `RaidConfig::default()`
    pub static ref RAID_CONFIG: Mutex<RaidConfig> = Mutex::new(RaidConfig::default());
}

/// Replace the global RAID configuration consulted by scrub/repair operations.
pub fn set_raid_config(config: RaidConfig) {
    let mut active = RAID_CONFIG.lock();
    *active = config;
}

/// Return a copy of the global RAID configuration as it is right now.
pub fn get_raid_config() -> RaidConfig {
    let active = RAID_CONFIG.lock();
    *active
}

/// Adaptive scrub engine that learns optimal scheduling from epsilon outcomes.
///
/// The public counters describe the current or most recent scrub pass; the private
/// fields hold the bounded history and the learned tuning parameters.
pub struct ScrubEngine {
    /// Whether a scrub operation is currently in progress
    pub is_running: bool,
    /// Total blocks scanned in current/last scrub operation
    pub blocks_scanned: u64,
    /// Total repairs made in current/last scrub operation
    pub repairs_made: u64,
    /// Number of errors detected during scrubbing operations
    pub errors_detected: u64,
    /// Timestamp (in milliseconds) of the last scrub operation.
    /// Only `complete_scrub` updates it; it is 0 until the first scrub completes.
    pub last_scrub_ms: u64,

    // Observation history for learning (oldest entries at the front)
    observations: VecDeque<ScrubObservation>,
    outcomes: VecDeque<ScrubOutcome>,

    /// Maximum history entries to keep (configurable)
    max_history: usize,

    // ═══════════════════════════════════════════════════════════════════════════
    // LEARNED THRESHOLDS (seeded with initial guesses in `new`, refined afterwards)
    // ═══════════════════════════════════════════════════════════════════════════
    /// Learned: Error rate threshold to trigger scrub (errors per 1M blocks)
    threshold_error_rate: LearnedThreshold,

    /// Learned: Time threshold between scrubs (milliseconds)
    threshold_time_gap: LearnedThreshold,

    /// Learned: I/O throttle during scrub (0.0 to 1.0)
    throttle_ratio: LearnedThreshold,

    /// Learned: Batch size for scrubbing (blocks per batch)
    batch_size: LearnedThreshold,

    /// Learned: Pause between batches (microseconds)
    batch_pause_us: LearnedThreshold,

    /// Current epsilon reading (from CCU).
    /// Written by both `update_epsilon` and `start`, and read by `complete_scrub`
    /// as the pre-scrub epsilon.
    /// NOTE(review): an `update_epsilon` call during a scrub therefore overwrites
    /// the pre-scrub reading - confirm whether that is intended.
    current_epsilon: f64,
}

impl ScrubEngine {
    /// Default cap on the number of observations/outcomes kept in history;
    /// can be changed at runtime with `set_max_history`.
    pub const DEFAULT_MAX_HISTORY: usize = 100;

    /// Builds an idle engine: counters zeroed, history empty, and every learned
    /// parameter seeded with an uninformed initial guess.
    pub const fn new() -> Self {
        // Starting guesses only; each is refined by `LearnedThreshold::observe`.
        const GUESS_ERROR_RATE: f64 = 10.0; // 10 errors per 1M blocks
        const GUESS_TIME_GAP_MS: f64 = 86_400_000.0; // 24 hours
        const GUESS_THROTTLE: f64 = 0.5; // 50% I/O budget
        const GUESS_BATCH_BLOCKS: f64 = 1000.0; // 1000 blocks per batch
        const GUESS_BATCH_PAUSE_US: f64 = 1000.0; // 1ms between batches

        Self {
            // Run state and counters
            is_running: false,
            blocks_scanned: 0,
            repairs_made: 0,
            errors_detected: 0,
            last_scrub_ms: 0,
            current_epsilon: 0.0,

            // Learning history
            observations: VecDeque::new(),
            outcomes: VecDeque::new(),
            max_history: Self::DEFAULT_MAX_HISTORY,

            // Learned parameters
            threshold_error_rate: LearnedThreshold::uninformed(GUESS_ERROR_RATE),
            threshold_time_gap: LearnedThreshold::uninformed(GUESS_TIME_GAP_MS),
            throttle_ratio: LearnedThreshold::uninformed(GUESS_THROTTLE),
            batch_size: LearnedThreshold::uninformed(GUESS_BATCH_BLOCKS),
            batch_pause_us: LearnedThreshold::uninformed(GUESS_BATCH_PAUSE_US),
        }
    }

    /// Changes the history cap and immediately discards the oldest entries of
    /// both histories that no longer fit.
    pub fn set_max_history(&mut self, max: usize) {
        self.max_history = max;

        // Oldest entries sit at the front, so drop the leading surplus of each queue.
        let surplus_observations = self.observations.len().saturating_sub(max);
        drop(self.observations.drain(..surplus_observations));

        let surplus_outcomes = self.outcomes.len().saturating_sub(max);
        drop(self.outcomes.drain(..surplus_outcomes));
    }

    /// Update current system epsilon (called by PI scheduler).
    ///
    /// The stored value is what `complete_scrub` later records as the pre-scrub
    /// epsilon.
    /// NOTE(review): calling this while a scrub is running replaces the reading
    /// taken in `start` - confirm that callers expect this.
    pub fn update_epsilon(&mut self, epsilon: f64) {
        self.current_epsilon = epsilon;
    }

    /// Decides whether a scrub should be started now.
    ///
    /// Returns `false` while a scrub is running. Otherwise a scrub is requested when
    /// * the time-gap threshold fires and its confidence exceeds 0.1, or
    /// * the error-rate threshold fires and its confidence exceeds 0.1, or
    /// * the combined estimated benefit exceeds the combined uncertainty of both
    ///   thresholds.
    ///
    /// NOTE(review): both thresholds start with `f64::MAX` uncertainty, which only
    /// becomes finite after two recorded outcomes. Until then none of the three
    /// conditions can hold, so the first scrubs apparently have to be started
    /// explicitly - confirm this bootstrap behaviour is intended.
    pub fn should_scrub(&self, current_time_ms: u64, observed_error_rate: f64) -> bool {
        if self.is_running {
            return false;
        }

        let time_gap = &self.threshold_time_gap;
        let error_rate = &self.threshold_error_rate;
        let elapsed_ms = current_time_ms.saturating_sub(self.last_scrub_ms) as f64;

        // Expected epsilon reduction from scrubbing now, from each point of view.
        let time_benefit = self.estimate_time_benefit(elapsed_ms);
        let error_benefit = self.estimate_error_benefit(observed_error_rate);

        // Either threshold may trigger on its own if we trust it enough.
        if time_gap.should_act(elapsed_ms, time_benefit) && time_gap.confidence() > 0.1 {
            return true;
        }
        if error_rate.should_act(observed_error_rate, error_benefit)
            && error_rate.confidence() > 0.1
        {
            return true;
        }

        // Otherwise scrub only when the combined benefit outweighs the combined uncertainty.
        time_benefit + error_benefit > time_gap.uncertainty + error_rate.uncertainty
    }

    /// Estimate epsilon reduction from scrubbing after this time gap
    fn estimate_time_benefit(&self, time_gap: f64) -> f64 {
        // Look at past scrubs with similar time gaps
        let similar_outcomes: Vec<_> = self
            .outcomes
            .iter()
            .filter(|o| fabs(o.scheduled_at_time_gap - time_gap) < time_gap * 0.2)
            .collect();

        if similar_outcomes.is_empty() {
            // No prior data - assume benefit proportional to time
            return time_gap / self.threshold_time_gap.value * 100.0;
        }

        // Average epsilon reduction from similar scrubs
        let avg_reduction: f64 = similar_outcomes
            .iter()
            .map(|o| -o.observation.delta_epsilon())
            .sum::<f64>()
            / similar_outcomes.len() as f64;

        avg_reduction.max(0.0)
    }

    /// Estimate epsilon reduction from scrubbing at this error rate
    fn estimate_error_benefit(&self, error_rate: f64) -> f64 {
        let similar_outcomes: Vec<_> = self
            .outcomes
            .iter()
            .filter(|o| fabs(o.scheduled_at_error_rate - error_rate) < error_rate * 0.3)
            .collect();

        if similar_outcomes.is_empty() {
            return error_rate * 50.0;
        }

        let avg_reduction: f64 = similar_outcomes
            .iter()
            .map(|o| -o.observation.delta_epsilon())
            .sum::<f64>()
            / similar_outcomes.len() as f64;

        avg_reduction.max(0.0)
    }

    /// Start a PI-controlled scrub.
    ///
    /// Marks the engine as running, resets the per-scrub counters, records the
    /// pre-scrub epsilon, notifies the PI scheduler and dispatches `scrub_task`
    /// through `crate::spawn_on_core`.
    ///
    /// # Errors
    /// Returns `Err("Scrub already running")` if a scrub is already in progress.
    ///
    /// NOTE(review): `current_time_ms` is not used by this function.
    pub fn start(&mut self, current_time_ms: u64) -> Result<(), &'static str> {
        if self.is_running {
            return Err("Scrub already running");
        }

        // Reset state for the new pass before the worker is spawned.
        self.is_running = true;
        self.blocks_scanned = 0;
        self.repairs_made = 0;
        self.errors_detected = 0;

        // Record pre-scrub epsilon from CCU (LunaOS integration)
        let epsilon_before = crate::lunaos::integration::get_epsilon_current();
        self.current_epsilon = epsilon_before;

        // Notify PI scheduler that scrub is starting
        crate::lunaos::integration::notify_scrub_start();

        // Get learned parameters. In this function they are only used for the log
        // line below; `scrub_task` re-reads the batch size itself.
        let batch_size = self.batch_size.value.max(100.0) as u64;
        let batch_pause = self.batch_pause_us.value.max(100.0) as u64;
        let throttle = self.throttle_ratio.value.clamp(0.1, 0.9);

        crate::lcpfs_println!(
            "[ SCRUB] Starting PI-controlled scrub (batch={}, pause={}μs, throttle={:.0}%)",
            batch_size,
            batch_pause,
            throttle * 100.0
        );

        // Dispatch to background core
        crate::spawn_on_core(Self::scrub_task, Some(2));

        Ok(())
    }

    fn scrub_task() {
        match LcpfsMount::import(0) {
            Ok(mount) => {
                let mut engine = SCRUB_ENGINE.lock();
                let batch_size = engine.batch_size.value.max(100.0) as u64;

                // Get total blocks from device
                let total_blocks = {
                    let devices = BLOCK_DEVICES.lock();
                    if let Some(dev) = devices.get(mount.dev_id) {
                        let block_size = dev.block_size();
                        if block_size == 0 {
                            crate::lcpfs_println!("[ SCRUB] Invalid block_size (0)");
                            engine.is_running = false;
                            return;
                        }
                        match dev.size() {
                            Ok(size) => size / block_size as u64,
                            Err(_) => {
                                crate::lcpfs_println!("[ SCRUB] Failed to get device size");
                                engine.is_running = false;
                                return;
                            }
                        }
                    } else {
                        crate::lcpfs_println!("[ SCRUB] Device not found");
                        engine.is_running = false;
                        return;
                    }
                };

                // Scrub in batches with PI-controlled pacing
                let mut block_id = 0u64;
                while block_id < total_blocks {
                    let batch_end = (block_id + batch_size).min(total_blocks);

                    // Process batch
                    for bid in block_id..batch_end {
                        engine.blocks_scanned += 1;

                        // Verify checksum and repair if needed
                        if let Some(error) = Self::verify_block(&mount, bid) {
                            engine.errors_detected += 1;
                            if Self::repair_block(&mount, bid) {
                                engine.repairs_made += 1;
                                crate::lcpfs_println!("[ SCRUB] Repaired block {}", bid);
                            }
                        }
                    }

                    block_id = batch_end;

                    // Adaptive pause between batches to reduce system impact
                    // Pause duration scales with system load and error rate
                    #[cfg(feature = "lunaos")]
                    {
                        // LunaOS: Use PI-based adaptive pause
                        // Higher ε (system stress) → longer pause
                        // Lower error rate → shorter pause (less urgent)
                        let base_pause_ms = 10u64; // 10ms baseline
                        let error_ratio = if engine.blocks_scanned > 0 {
                            (engine.errors_detected as f64) / (engine.blocks_scanned as f64)
                        } else {
                            0.0
                        };

                        // Scale: 10ms at 0% errors, up to 100ms at >1% errors
                        let pause_ms = base_pause_ms + (error_ratio * 90000.0) as u64;
                        let pause_ms = pause_ms.min(100); // Cap at 100ms

                        // LunaOS: Integrate ε from CCU for dynamic adjustment
                        let epsilon = crate::lunaos::integration::get_epsilon_current();
                        let cpu_util = crate::lunaos::integration::get_cpu_utilization();

                        // Adjust pause based on CPU utilization (high load → longer pause)
                        let adjusted_pause_ms = if cpu_util > 0.8 {
                            pause_ms * 2 // Back off significantly under high load
                        } else if cpu_util > 0.5 {
                            (pause_ms as f64 * (1.0 + cpu_util)).min(100.0) as u64
                        } else {
                            pause_ms
                        };

                        // Log if we have LunaOS integration active
                        if epsilon > 0.0 || cpu_util > 0.0 {
                            crate::lcpfs_println!(
                                "[ SCRUB] Adaptive pause: {}ms (ε={:.2}J, CPU={:.1}%)",
                                adjusted_pause_ms,
                                epsilon,
                                cpu_util * 100.0
                            );
                        }

                        // Cooperative yield: let kernel scheduler handle the pause
                        // This avoids busy-waiting and allows other tasks to run
                        crate::cooperative_yield(adjusted_pause_ms * 1000); // ms → μs
                    }

                    #[cfg(not(feature = "lunaos"))]
                    {
                        // Standard: Fixed 10ms pause between batches
                        // Use cooperative yield instead of spin-loop busy-wait
                        crate::cooperative_yield(10_000); // 10ms in μs
                    }
                }

                engine.is_running = false;
                crate::lcpfs_println!(
                    "[ SCRUB] Complete: {} blocks, {} errors, {} repairs",
                    engine.blocks_scanned,
                    engine.errors_detected,
                    engine.repairs_made
                );
            }
            Err(e) => {
                crate::lcpfs_println!("[ SCRUB] Failed to mount pool: {}", e);
                SCRUB_ENGINE.lock().is_running = false;
            }
        }
    }

    /// Reads one physical block and reports why it could not be verified, if at all.
    ///
    /// Returns `None` when the block was read successfully, otherwise:
    /// * `LockContention` - the device table lock could not be taken without waiting,
    /// * `IoError` - device missing, zero block size, size query failed, or read failed,
    /// * `OutOfBounds` - `block_id` lies beyond the end of the device.
    ///
    /// Limitation: the checksum is computed but not compared against a stored
    /// value, so this detects unreadable blocks but not silent corruption. The
    /// notes that accompanied this function explain why: scrub works on physical
    /// block IDs while the DMU works on logical objects, so a block_id → Blkptr
    /// mapping (space map traversal) is needed to find the stored checksum. They
    /// also point to `lcpfs_scrub_solutions.rs` (ReverseIndex, DmuScrubber,
    /// HybridScrubber) for full validation and recommend DmuScrubber or
    /// HybridScrubber for production use.
    fn verify_block(mount: &LcpfsMount, block_id: u64) -> Option<ScrubError> {
        use crate::BLOCK_DEVICES;
        use alloc::vec;

        // Never wait for the device table: report contention to the caller instead.
        let Some(mut devices) = BLOCK_DEVICES.try_lock() else {
            crate::lcpfs_println!("[ SCRUB] WARN: Lock contention on block {}", block_id);
            return Some(ScrubError::LockContention);
        };

        // A missing device is reported as an I/O error.
        let Some(dev) = devices.get_mut(mount.dev_id) else {
            return Some(ScrubError::IoError);
        };

        // Use the device's real block size (don't assume 512).
        let block_size = dev.block_size();
        if block_size == 0 {
            crate::lcpfs_println!(
                "[ SCRUB] ERROR: Device {} reports 0 block size",
                mount.dev_id
            );
            return Some(ScrubError::IoError);
        }

        // Reject block IDs past the end of the device.
        let Ok(device_bytes) = dev.size() else {
            return Some(ScrubError::IoError);
        };
        if block_id >= device_bytes / block_size as u64 {
            return Some(ScrubError::OutOfBounds);
        }

        let mut buffer = vec![0u8; block_size];
        if dev.read_block(block_id as usize, &mut buffer).is_err() {
            return Some(ScrubError::IoError);
        }

        // Computed to exercise the checksum pipeline; there is no stored value to
        // compare it with yet (see the limitation described above).
        let _checksum = Checksum::calculate(&buffer);
        drop(devices);

        None
    }

    /// Attempt to rebuild a damaged block from the redundancy in its stripe and
    /// write the result back. Returns `true` only if the block was rewritten.
    ///
    /// A stripe is taken to be `stripe_width` consecutive block IDs aligned to a
    /// multiple of the stripe width, with the parity block last (RAID-Z1).
    /// NOTE(review): this placeholder mapping comes from the original
    /// implementation - confirm it against the real on-disk layout.
    ///
    /// The function refuses to write rather than risk overwriting the block with
    /// wrong data:
    /// * `Stripe` has no redundancy.
    /// * `RaidZ2`/`RaidZ3` need multi-parity reconstruction, which is not wired up
    ///   here. The former single-parity fallback treated the extra parity columns
    ///   as data, so its output could not match the lost block.
    /// * A stripe narrower than two blocks has nothing to rebuild from (this also
    ///   guards the stripe arithmetic against a zero width).
    /// * If any other block of the stripe is unreadable the repair is abandoned,
    ///   since only single-block reconstruction is implemented and a zero-filled
    ///   stand-in would corrupt the result.
    fn repair_block(mount: &LcpfsMount, block_id: u64) -> bool {
        use crate::BLOCK_DEVICES;
        use crate::ml::gfsolver::GfSolver;
        use alloc::vec;
        use alloc::vec::Vec;

        // Get RAID configuration (dynamic, not hardcoded)
        let raid_config = get_raid_config();
        let stripe_width = raid_config.stripe_width();

        // Reject configurations this function cannot repair correctly.
        match raid_config.level {
            RaidLevel::Stripe => {
                crate::lcpfs_println!(
                    "[ SCRUB] Cannot repair block {} - no RAID redundancy",
                    block_id
                );
                return false;
            }
            RaidLevel::RaidZ2 | RaidLevel::RaidZ3 => {
                crate::lcpfs_println!(
                    "[ SCRUB] Cannot repair block {} - {:?} reconstruction is not implemented",
                    block_id,
                    raid_config.level
                );
                return false;
            }
            RaidLevel::Mirror | RaidLevel::RaidZ1 => {}
        }

        if stripe_width < 2 {
            crate::lcpfs_println!(
                "[ SCRUB] Cannot repair block {} - stripe width {} leaves no redundancy",
                block_id,
                stripe_width
            );
            return false;
        }

        // Get block device
        let mut devices = match BLOCK_DEVICES.try_lock() {
            Some(d) => d,
            None => {
                crate::lcpfs_println!(
                    "[ SCRUB] Lock contention during repair of block {}",
                    block_id
                );
                return false;
            }
        };

        let dev = match devices.get_mut(mount.dev_id) {
            Some(d) => d,
            None => return false, // Device not found
        };

        // Query actual block size from device
        let block_size = dev.block_size();
        if block_size == 0 {
            crate::lcpfs_println!(
                "[ SCRUB] ERROR: Device {} reports 0 block size",
                mount.dev_id
            );
            return false;
        }

        // Bounds check: get total blocks
        let total_blocks = match dev.size() {
            Ok(s) => s / block_size as u64,
            Err(_) => return false,
        };

        // Read every other block of the stripe; the damaged block keeps a zeroed
        // placeholder so that indices line up with stripe positions.
        let stripe_start = block_id - (block_id % stripe_width as u64);
        let mut blocks: Vec<Vec<u8>> = Vec::with_capacity(stripe_width);
        let mut failed_index = None;

        for i in 0..stripe_width {
            let bid = stripe_start + i as u64;

            // Bounds check each block in the stripe
            if bid >= total_blocks {
                crate::lcpfs_println!(
                    "[ SCRUB] Block {} out of bounds (max {})",
                    bid,
                    total_blocks
                );
                return false;
            }

            let mut buffer = vec![0u8; block_size];

            if bid == block_id {
                failed_index = Some(i);
            } else if dev.read_block(bid as usize, &mut buffer).is_err() {
                // A second loss cannot be reconstructed by the single-block paths below.
                crate::lcpfs_println!(
                    "[ SCRUB] Cannot repair block {} - stripe block {} is unreadable",
                    block_id,
                    bid
                );
                return false;
            }

            blocks.push(buffer);
        }

        let fail_idx = match failed_index {
            Some(i) => i,
            None => return false,
        };

        let reconstructed = match raid_config.level {
            RaidLevel::RaidZ1 => {
                // Parity is the last block of the stripe; the remaining readable
                // blocks are the surviving data columns.
                let parity_idx = stripe_width - 1;
                let surviving: Vec<&[u8]> = blocks
                    .iter()
                    .enumerate()
                    .filter(|(i, _)| *i != fail_idx && *i != parity_idx)
                    .map(|(_, b)| b.as_slice())
                    .collect();

                GfSolver::reconstruct_z1(&surviving, &blocks[parity_idx], block_size)
            }
            RaidLevel::Mirror => {
                // Every other block in the stripe was read successfully above, so
                // any of them (including the last one) can serve as the source.
                match blocks.iter().enumerate().find(|(i, _)| *i != fail_idx) {
                    Some((_, copy)) => copy.clone(),
                    None => return false,
                }
            }
            // Already rejected at the top; listed to keep the match exhaustive.
            RaidLevel::RaidZ2 | RaidLevel::RaidZ3 | RaidLevel::Stripe => return false,
        };

        // Write reconstructed block back
        if dev.write_block(block_id as usize, &reconstructed).is_ok() {
            crate::lcpfs_println!(
                "[ SCRUB] Successfully repaired block {} using {:?}",
                block_id,
                raid_config.level
            );
            return true;
        }

        false
    }

    /// Records the end of a scrub: builds the observation for this pass, feeds it
    /// to the learner, appends it to the bounded history and stamps
    /// `last_scrub_ms` with `current_time_ms`.
    pub fn complete_scrub(&mut self, current_time_ms: u64) {
        // Readings supplied by the LunaOS integration layer (device latency, CPU, epsilon).
        let io_latency_avg_us = crate::lunaos::integration::get_io_latency_us(0);
        let cpu_utilization = crate::lunaos::integration::get_cpu_utilization();
        let epsilon_after = crate::lunaos::integration::get_epsilon_current();

        let time_since_last_scrub_ms = current_time_ms.saturating_sub(self.last_scrub_ms);

        let observation = ScrubObservation {
            timestamp_ms: current_time_ms,
            blocks_scanned: self.blocks_scanned,
            errors_found: self.errors_detected,
            repairs_made: self.repairs_made,
            io_latency_avg_us,
            cpu_utilization,
            time_since_last_scrub_ms,
            // Pre-scrub epsilon as currently stored on the engine (set in `start`).
            epsilon_before: self.current_epsilon,
            epsilon_after,
        };

        let outcome = ScrubOutcome {
            observation,
            // Predicted error rate as reported by the PI scheduler.
            scheduled_at_error_rate: crate::lunaos::integration::get_predicted_error_rate(),
            scheduled_at_time_gap: time_since_last_scrub_ms as f64,
        };

        // Tell the PI scheduler the scrub is done before updating the learner.
        crate::lunaos::integration::notify_scrub_complete(self.errors_detected, self.repairs_made);

        self.learn_from_outcome(&outcome);

        self.observations.push_back(observation);
        self.outcomes.push_back(outcome);

        // Drop the oldest entries beyond the configured cap.
        let cap = self.max_history;
        for _ in cap..self.observations.len() {
            self.observations.pop_front();
        }
        for _ in cap..self.outcomes.len() {
            self.outcomes.pop_front();
        }

        self.last_scrub_ms = current_time_ms;
    }

    /// Feeds one scrub outcome into the learned thresholds.
    ///
    /// The time-gap and error-rate thresholds both observe the scrub's delta
    /// epsilon. The batch size observes the negated repair efficiency (repairs per
    /// unit of CPU utilization).
    ///
    /// NOTE(review): the batch size is observed with its own current value as the
    /// action value, so `observe` computes a zero adjustment and only the running
    /// statistics change - confirm whether the batch size is meant to move.
    fn learn_from_outcome(&mut self, outcome: &ScrubOutcome) {
        let observed = &outcome.observation;
        let delta = observed.delta_epsilon();

        // Time gap and error rate learn from the same epsilon change.
        self.threshold_time_gap
            .observe(observed.time_since_last_scrub_ms as f64, delta);
        self.threshold_error_rate
            .observe(observed.error_rate(), delta);

        // Repairs per unit of CPU; without a CPU reading use the raw repair count.
        let repairs = observed.repairs_made as f64;
        let efficiency = if observed.cpu_utilization > 0.0 {
            repairs / observed.cpu_utilization
        } else {
            repairs
        };

        let current_batch_size = self.batch_size.value;
        self.batch_size.observe(current_batch_size, -efficiency);
    }

    /// Snapshot of the run counters and the learned thresholds with their
    /// confidence scores.
    pub fn stats(&self) -> ScrubStats {
        let time_gap = &self.threshold_time_gap;
        let error_rate = &self.threshold_error_rate;

        ScrubStats {
            // Run state
            is_running: self.is_running,
            blocks_scanned: self.blocks_scanned,
            errors_detected: self.errors_detected,
            repairs_made: self.repairs_made,
            // Learned parameters
            batch_size: self.batch_size.value as u64,
            time_threshold_ms: time_gap.value as u64,
            time_threshold_confidence: time_gap.confidence(),
            error_rate_threshold: error_rate.value,
            error_rate_confidence: error_rate.confidence(),
        }
    }
}

/// Errors that can occur during scrubbing operations.
///
/// The enum is a plain, field-less set of error kinds; it derives `Copy` and
/// `Eq`, so values can be passed by value and compared directly.
///
/// Note: the public `start_scrub` wrapper in this module reports failure as
/// `&'static str`, not as a `ScrubError`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScrubError {
    /// Data checksum does not match expected value
    ChecksumMismatch,
    /// Failed to read data from storage
    ReadError,
    /// I/O error during scrubbing operation
    IoError,
    /// Error cannot be recovered through scrubbing
    Unrecoverable,
    /// Lock contention prevented access (not a hardware failure)
    LockContention,
    /// Block ID is out of device bounds
    OutOfBounds,
}

/// Statistics about scrubbing operations and learned thresholds.
///
/// A plain `Copy` snapshot produced by the engine's `stats()` method; it holds
/// no references into the engine, so it stays valid after the engine lock is
/// released.
#[derive(Debug, Clone, Copy)]
pub struct ScrubStats {
    /// Whether a scrub operation is currently running
    pub is_running: bool,
    /// Total number of blocks scanned
    pub blocks_scanned: u64,
    /// Total number of repairs made
    pub repairs_made: u64,
    /// Total number of errors detected
    pub errors_detected: u64,
    /// Learned time threshold between scrubs (milliseconds).
    /// Filled from the time-gap threshold's `f64` value via an `as u64` cast,
    /// so any fractional part is dropped.
    pub time_threshold_ms: u64,
    /// Confidence level for time threshold (0.0 to 1.0), as returned by the
    /// threshold's `confidence()`.
    /// NOTE(review): the 0.0..=1.0 range is not visible from here; confirm
    /// against `confidence()`.
    pub time_threshold_confidence: f64,
    /// Learned error rate threshold (errors per 1M blocks).
    /// NOTE(review): the unit is carried over from the original comment;
    /// confirm against the observation's `error_rate()`.
    pub error_rate_threshold: f64,
    /// Confidence level for error rate threshold (0.0 to 1.0), as returned by
    /// the threshold's `confidence()`.
    pub error_rate_confidence: f64,
    /// Batch size for scrubbing operations (blocks per batch).
    /// Filled from the learned `f64` batch size via an `as u64` cast.
    pub batch_size: u64,
}

// ═══════════════════════════════════════════════════════════════════════════════
// PUBLIC API
// ═══════════════════════════════════════════════════════════════════════════════

/// Push a new system epsilon reading into the global scrub engine.
///
/// Called by the CCU/PI scheduler. The global engine lock is taken for the
/// duration of the call and released on return.
pub fn update_epsilon(epsilon: f64) {
    let mut engine = SCRUB_ENGINE.lock();
    engine.update_epsilon(epsilon);
}

/// Check if PI thinks we should scrub now.
///
/// Locks the global `SCRUB_ENGINE` and forwards both arguments unchanged to
/// the engine's `should_scrub`, returning its verdict. The lock guard is a
/// temporary that is released when this function returns.
///
/// * `current_time_ms` - caller-supplied timestamp in milliseconds
///   (presumably the same clock passed to `start_scrub`; confirm with callers).
/// * `observed_error_rate` - error rate observed by the caller
///   (NOTE(review): unit is not visible here; confirm it matches the learned
///   error-rate threshold).
pub fn should_scrub(current_time_ms: u64, observed_error_rate: f64) -> bool {
    SCRUB_ENGINE
        .lock()
        .should_scrub(current_time_ms, observed_error_rate)
}

/// Begin a PI-controlled scrub on the global scrub engine.
///
/// `current_time_ms` is forwarded unchanged to the engine's `start`.
///
/// # Errors
///
/// Returns whatever static message the engine's `start` reports on failure.
pub fn start_scrub(current_time_ms: u64) -> Result<(), &'static str> {
    let mut engine = SCRUB_ENGINE.lock();
    engine.start(current_time_ms)
}

/// Return a snapshot of the global scrub engine's statistics.
///
/// The engine lock is held only while the `Copy` snapshot is built.
pub fn stats() -> ScrubStats {
    let engine = SCRUB_ENGINE.lock();
    engine.stats()
}