use alloc::collections::BTreeMap;
use alloc::vec;
use alloc::vec::Vec;
use crate::FsError;
use crate::ml::gfalgo::GfAlgo;
use crate::ml::gfsolver::GfSolver;
/// Geometry of a dRAID (distributed RAID) vdev.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DraidConfig {
    /// Number of data columns per redundancy group.
    pub data_disks: usize,
    /// Parity columns per group (1..=3, i.e. draid1/draid2/draid3).
    pub parity_level: u8,
    /// Distributed spare capacity, expressed in whole disks.
    pub spare_disks: usize,
    /// Total number of child disks in the vdev.
    pub children: usize,
    /// Per-column block size in bytes.
    pub block_size: usize,
    /// Seed used to derive per-row disk permutations.
    pub permutation_base: u64,
}

impl DraidConfig {
    /// Validates the geometry and builds a config with the default block
    /// size (128 KiB) and permutation seed.
    pub fn new(
        data_disks: usize,
        parity_level: u8,
        spare_disks: usize,
        children: usize,
    ) -> Result<Self, DraidError> {
        // Only single, double and triple parity are supported.
        if parity_level == 0 || parity_level > 3 {
            return Err(DraidError::InvalidConfig("parity level must be 1, 2, or 3"));
        }
        let group_width = data_disks + parity_level as usize;
        // The children must hold at least one full group plus the spares.
        if children < group_width + spare_disks {
            return Err(DraidError::InvalidConfig(
                "not enough children for data + parity + spare",
            ));
        }
        if data_disks == 0 {
            return Err(DraidError::InvalidConfig("data_disks must be > 0"));
        }
        if spare_disks == 0 {
            return Err(DraidError::InvalidConfig("spare_disks must be >= 1"));
        }
        Ok(Self {
            data_disks,
            parity_level,
            spare_disks,
            children,
            block_size: 128 * 1024,
            permutation_base: 0xDEAD_BEEF_CAFE_BABE,
        })
    }

    /// Columns per redundancy group (data + parity).
    #[inline]
    pub fn group_width(&self) -> usize {
        self.data_disks + self.parity_level as usize
    }

    /// How many complete groups fit in one permutation row.
    pub fn groups_per_row(&self) -> usize {
        (self.children - self.spare_disks) / self.group_width()
    }

    /// Bytes of user data stored per row.
    pub fn data_per_row(&self) -> usize {
        self.groups_per_row() * self.data_disks * self.block_size
    }

    /// Fraction of non-spare capacity that holds user data.
    pub fn efficiency(&self) -> f64 {
        let usable = self.children - self.spare_disks;
        let data_cols = self.groups_per_row() * self.data_disks;
        data_cols as f64 / usable as f64
    }

    /// Number of simultaneous disk failures each group survives.
    pub fn fault_tolerance(&self) -> usize {
        usize::from(self.parity_level)
    }

    /// Parses a spec of the form `draidP:Dd:Cc:Ss`.
    pub fn parse(s: &str) -> Result<Self, DraidError> {
        // Strip `suffix` from `part` and parse the remainder as a number,
        // reporting `msg` on any failure.
        fn field<T: core::str::FromStr>(
            part: &str,
            suffix: char,
            msg: &'static str,
        ) -> Result<T, DraidError> {
            part.strip_suffix(suffix)
                .and_then(|n| n.parse::<T>().ok())
                .ok_or(DraidError::InvalidConfig(msg))
        }
        let parts: Vec<&str> = s.split(':').collect();
        if parts.len() != 4 {
            return Err(DraidError::InvalidConfig("format: draidN:Xd:Yc:Zs"));
        }
        let parity = parts[0]
            .strip_prefix("draid")
            .and_then(|n| n.parse::<u8>().ok())
            .ok_or(DraidError::InvalidConfig("invalid draid prefix"))?;
        let data = field::<usize>(parts[1], 'd', "invalid data disk count")?;
        let children = field::<usize>(parts[2], 'c', "invalid children count")?;
        let spares = field::<usize>(parts[3], 's', "invalid spare count")?;
        Self::new(data, parity, spares, children)
    }
}

impl core::fmt::Display for DraidConfig {
    /// Renders the config in the same `draidP:Dd:Cc:Ss` form `parse` accepts.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(
            f,
            "draid{}:{}d:{}c:{}s",
            self.parity_level, self.data_disks, self.children, self.spare_disks
        )
    }
}

/// Errors surfaced by the dRAID layer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DraidError {
    InvalidConfig(&'static str),
    DiskFailed(usize),
    TooManyFailures,
    BlockNotFound,
    IoError,
    RebuildInProgress,
    InvalidOffset,
    ScrubError(&'static str),
}
impl From<DraidError> for FsError {
fn from(e: DraidError) -> Self {
match e {
DraidError::InvalidConfig(msg) => FsError::InvalidArgument { reason: msg },
DraidError::DiskFailed(disk) => FsError::IoError {
vdev: disk,
reason: "disk failed",
},
DraidError::TooManyFailures => FsError::IoError {
vdev: 0,
reason: "too many disk failures",
},
DraidError::BlockNotFound => FsError::IoError {
vdev: 0,
reason: "block not found",
},
DraidError::IoError => FsError::IoError {
vdev: 0,
reason: "I/O error",
},
DraidError::RebuildInProgress => FsError::IoError {
vdev: 0,
reason: "rebuild in progress",
},
DraidError::InvalidOffset => FsError::InvalidArgument {
reason: "invalid offset",
},
DraidError::ScrubError(msg) => FsError::InvalidArgument { reason: msg },
}
}
}
/// Health of a single child disk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DiskState {
    Online,
    Degraded,
    Faulted,
    Resilvering,
    Offline,
}

/// One redundancy group within a permutation row: the physical disks
/// backing its data and parity columns.
#[derive(Debug, Clone)]
pub struct DraidGroup {
    pub group_id: usize,
    pub row_id: u64,
    pub data_columns: Vec<usize>,
    pub parity_columns: Vec<usize>,
}

impl DraidGroup {
    /// Total number of columns (data + parity) in the group.
    pub fn width(&self) -> usize {
        self.data_columns.len() + self.parity_columns.len()
    }

    /// Physical disk backing logical column `col`; data columns come
    /// first, then parity. `None` when `col` is out of range.
    pub fn column_disk(&self, col: usize) -> Option<usize> {
        match self.data_columns.get(col) {
            Some(&disk) => Some(disk),
            None => self
                .parity_columns
                .get(col - self.data_columns.len())
                .copied(),
        }
    }

    /// True when any member disk is faulted or offline.
    pub fn has_failures(&self, disk_states: &[DiskState]) -> bool {
        self.data_columns
            .iter()
            .chain(self.parity_columns.iter())
            .any(|&disk| matches!(disk_states[disk], DiskState::Faulted | DiskState::Offline))
    }

    /// All member disks that are faulted or offline, in column order.
    pub fn failed_disks(&self, disk_states: &[DiskState]) -> Vec<usize> {
        self.data_columns
            .iter()
            .chain(self.parity_columns.iter())
            .filter(|&&disk| matches!(disk_states[disk], DiskState::Faulted | DiskState::Offline))
            .copied()
            .collect()
    }
}
/// Derives a deterministic pseudo-random disk permutation for each row.
pub struct PermutationGenerator {
    num_disks: usize,
    base_seed: u64,
}

impl PermutationGenerator {
    /// Creates a generator over `num_disks` disks seeded with `base_seed`.
    pub fn new(num_disks: usize, base_seed: u64) -> Self {
        Self {
            num_disks,
            base_seed,
        }
    }

    /// Fisher-Yates shuffle of `0..num_disks` driven by an LCG seeded from
    /// `base_seed + row`, so the same row always yields the same layout.
    pub fn get_permutation(&self, row: u64) -> Vec<usize> {
        let mut order: Vec<usize> = (0..self.num_disks).collect();
        let mut state = self.base_seed.wrapping_add(row);
        let mut i = order.len();
        while i > 1 {
            i -= 1;
            // Knuth-style 64-bit LCG constants.
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            let j = (state as usize) % (i + 1);
            order.swap(i, j);
        }
        order
    }

    /// Disk backing `column` in `row`'s permutation (the column index
    /// wraps modulo the disk count).
    pub fn map_to_disk(&self, row: u64, column: usize) -> usize {
        self.get_permutation(row)[column % self.num_disks]
    }
}
/// In-memory dRAID vdev: distributes data, parity and spare capacity
/// across `children` disks using deterministic per-row permutations.
pub struct DraidVdev {
    // Immutable geometry (disk counts, parity level, block size).
    config: DraidConfig,
    // Current health of each child disk, indexed by disk id.
    disk_states: Vec<DiskState>,
    // Deterministic per-row disk shuffles.
    perm_gen: PermutationGenerator,
    // Simulated on-disk storage: disk id -> (offset -> block bytes).
    block_data: BTreeMap<usize, BTreeMap<u64, Vec<u8>>>,
    // Per-disk rebuild cursor (blocks already reflowed), keyed by disk id.
    rebuild_progress: BTreeMap<usize, u64>,
    // High-water mark of written blocks, used to bound rebuild/scrub scans.
    blocks_per_disk: u64,
    // Present while a scrub pass is running.
    scrub_state: Option<ScrubState>,
    // Running I/O and repair counters.
    stats: DraidStats,
}
/// Cumulative I/O and repair counters for a vdev.
#[derive(Debug, Clone, Default)]
pub struct DraidStats {
    // Logical read requests served.
    pub reads: u64,
    // Logical write requests completed.
    pub writes: u64,
    // Reads that required parity reconstruction.
    pub reconstruction_reads: u64,
    // Blocks reflowed during rebuilds.
    pub blocks_rebuilt: u64,
    // Checksum mismatches; never incremented in the code visible here.
    pub checksum_errors: u64,
    // Payload bytes returned to readers.
    pub bytes_read: u64,
    // Payload bytes accepted from writers.
    pub bytes_written: u64,
}
impl DraidVdev {
pub fn new(config: DraidConfig) -> Self {
let perm_gen = PermutationGenerator::new(config.children, config.permutation_base);
let disk_states = vec![DiskState::Online; config.children];
Self {
config,
disk_states,
perm_gen,
block_data: BTreeMap::new(),
rebuild_progress: BTreeMap::new(),
blocks_per_disk: 0,
scrub_state: None,
stats: DraidStats::default(),
}
}
    /// Read-only view of the vdev geometry.
    pub fn config(&self) -> &DraidConfig {
        &self.config
    }
    /// Current per-disk health, indexed by disk id.
    pub fn disk_states(&self) -> &[DiskState] {
        &self.disk_states
    }
    /// Running I/O and repair counters.
    pub fn stats(&self) -> &DraidStats {
        &self.stats
    }
    /// Resolves the redundancy group that stores the data at `offset`.
    pub fn get_group_for_offset(&self, offset: u64) -> Result<DraidGroup, DraidError> {
        let row = self.offset_to_row(offset);
        let group_id = self.offset_to_group(offset);
        self.get_group(row, group_id)
    }
pub fn get_group(&self, row: u64, group_id: usize) -> Result<DraidGroup, DraidError> {
let groups_per_row = self.config.groups_per_row();
if group_id >= groups_per_row {
return Err(DraidError::InvalidOffset);
}
let perm = self.perm_gen.get_permutation(row);
let group_width = self.config.group_width();
let start_col = group_id * group_width;
let data_columns: Vec<usize> = (0..self.config.data_disks)
.map(|i| perm[start_col + i])
.collect();
let parity_columns: Vec<usize> = (0..self.config.parity_level as usize)
.map(|i| perm[start_col + self.config.data_disks + i])
.collect();
Ok(DraidGroup {
group_id,
row_id: row,
data_columns,
parity_columns,
})
}
    /// Permutation row containing `offset` (rows hold `data_per_row` bytes).
    fn offset_to_row(&self, offset: u64) -> u64 {
        let data_per_row = self.config.data_per_row() as u64;
        offset / data_per_row
    }
    /// Group index within the row for `offset`.
    fn offset_to_group(&self, offset: u64) -> usize {
        let data_per_row = self.config.data_per_row() as u64;
        let data_per_group = (self.config.data_disks * self.config.block_size) as u64;
        let offset_in_row = offset % data_per_row;
        (offset_in_row / data_per_group) as usize
    }
    /// Data column within the group for `offset`.
    /// NOTE(review): not called anywhere in this file — confirm whether it
    /// has external users before removing.
    fn offset_to_column(&self, offset: u64) -> usize {
        let data_per_group = (self.config.data_disks * self.config.block_size) as u64;
        let offset_in_group = offset % data_per_group;
        (offset_in_group / self.config.block_size as u64) as usize
    }
    /// Writes one logical stripe at `offset`: splits `data` into
    /// `data_disks` zero-padded columns, computes parity, and stores each
    /// column on its permuted disk. Columns whose disk is faulted/offline
    /// are silently skipped — that data is only recoverable via parity
    /// until a rebuild runs.
    ///
    /// NOTE(review): every column is keyed by the stripe's logical
    /// `offset`, not a per-disk block address — readers must use the same
    /// keying (read_normal/read_degraded do).
    pub fn write_block(&mut self, data: &[u8], offset: u64) -> Result<(), DraidError> {
        let group = self.get_group_for_offset(offset)?;
        let failed = group.failed_disks(&self.disk_states);
        // More failures than parity columns means the stripe is lost.
        if failed.len() > self.config.parity_level as usize {
            return Err(DraidError::TooManyFailures);
        }
        let chunks = self.split_data_to_columns(data);
        if chunks.len() != self.config.data_disks {
            return Err(DraidError::InvalidConfig("data size mismatch"));
        }
        let parities = self.compute_parity(&chunks)?;
        for (i, chunk) in chunks.iter().enumerate() {
            let disk = group.data_columns[i];
            // Only healthy disks accept writes; failed columns rely on parity.
            if self.disk_states[disk] == DiskState::Online
                || self.disk_states[disk] == DiskState::Degraded
            {
                self.write_disk_block(disk, offset, chunk)?;
            }
        }
        for (i, parity) in parities.iter().enumerate() {
            let disk = group.parity_columns[i];
            if self.disk_states[disk] == DiskState::Online
                || self.disk_states[disk] == DiskState::Degraded
            {
                self.write_disk_block(disk, offset, parity)?;
            }
        }
        self.stats.writes += 1;
        self.stats.bytes_written += data.len() as u64;
        Ok(())
    }
    /// Reads `size` bytes of payload starting at `offset` (offsets are not
    /// re-based within a stripe, so callers are expected to pass the same
    /// offsets they wrote). Falls back to parity reconstruction when
    /// member disks of the stripe's group have failed.
    pub fn read_block(&mut self, offset: u64, size: usize) -> Result<Vec<u8>, DraidError> {
        let group = self.get_group_for_offset(offset)?;
        let failed = group.failed_disks(&self.disk_states);
        // More failures than parity columns means the stripe is unrecoverable.
        if failed.len() > self.config.parity_level as usize {
            return Err(DraidError::TooManyFailures);
        }
        self.stats.reads += 1;
        if failed.is_empty() {
            self.read_normal(&group, offset, size)
        } else {
            self.stats.reconstruction_reads += 1;
            self.read_degraded(&group, offset, size, &failed)
        }
    }
fn read_normal(
&mut self,
group: &DraidGroup,
offset: u64,
size: usize,
) -> Result<Vec<u8>, DraidError> {
let mut result = Vec::with_capacity(size);
for &disk in &group.data_columns {
if let Some(block) = self.read_disk_block(disk, offset)? {
result.extend_from_slice(&block);
} else {
result.extend(vec![0u8; self.config.block_size]);
}
}
result.truncate(size);
self.stats.bytes_read += result.len() as u64;
Ok(result)
}
    /// Degraded-path read: gathers surviving data and parity columns, asks
    /// `reconstruct_data` to rebuild the missing data columns, then
    /// reassembles the stripe in column order.
    fn read_degraded(
        &mut self,
        group: &DraidGroup,
        offset: u64,
        size: usize,
        failed_disks: &[usize],
    ) -> Result<Vec<u8>, DraidError> {
        // Surviving data blocks, tagged with their column index.
        let mut surviving_data: Vec<(Vec<u8>, usize)> = Vec::new();
        let mut failed_indices: Vec<usize> = Vec::new();
        for (col, &disk) in group.data_columns.iter().enumerate() {
            if failed_disks.contains(&disk) {
                failed_indices.push(col);
            } else if let Some(block) = self.read_disk_block(disk, offset)? {
                // NOTE(review): a healthy disk with no block at `offset` is
                // silently omitted rather than treated as zeros — confirm
                // that is intended for sparse stripes.
                surviving_data.push((block, col));
            }
        }
        // Parity blocks from healthy parity columns, in P/Q/R order.
        let mut parity_blocks: Vec<Vec<u8>> = Vec::new();
        for &disk in &group.parity_columns {
            if !failed_disks.contains(&disk) {
                if let Some(block) = self.read_disk_block(disk, offset)? {
                    parity_blocks.push(block);
                }
            }
        }
        let block_size = self.config.block_size;
        let reconstructed =
            self.reconstruct_data(&surviving_data, &parity_blocks, &failed_indices, block_size)?;
        // Merge surviving and reconstructed columns, then restore order.
        let mut all_data: Vec<(Vec<u8>, usize)> = surviving_data;
        for (i, recon) in reconstructed.into_iter().enumerate() {
            all_data.push((recon, failed_indices[i]));
        }
        all_data.sort_by_key(|(_, col)| *col);
        let mut result = Vec::with_capacity(size);
        for (data, _) in all_data {
            result.extend_from_slice(&data);
        }
        result.truncate(size);
        self.stats.bytes_read += result.len() as u64;
        Ok(result)
    }
    /// Rebuilds the blocks for `failed_indices` (data-column positions)
    /// from surviving data columns and whatever parity blocks are
    /// available, dispatching on the configured parity level. When some
    /// parity is missing but the failure count is low enough, it falls
    /// back to a lower-order solve (e.g. draid2 with only P available can
    /// still recover a single failure).
    fn reconstruct_data(
        &self,
        surviving: &[(Vec<u8>, usize)],
        parities: &[Vec<u8>],
        failed_indices: &[usize],
        block_size: usize,
    ) -> Result<Vec<Vec<u8>>, DraidError> {
        if failed_indices.is_empty() {
            return Ok(Vec::new());
        }
        let parity_level = self.config.parity_level as usize;
        if failed_indices.len() > parity_level {
            return Err(DraidError::TooManyFailures);
        }
        match parity_level {
            1 => {
                // Single parity: XOR of survivors with P recovers one column.
                if parities.is_empty() {
                    return Err(DraidError::TooManyFailures);
                }
                let surviving_refs: Vec<&[u8]> =
                    surviving.iter().map(|(d, _)| d.as_slice()).collect();
                let recovered = GfSolver::reconstruct_z1(&surviving_refs, &parities[0], block_size);
                Ok(vec![recovered])
            }
            2 => {
                if parities.len() < 2 {
                    // Q is missing: one failure can still be solved with P.
                    if failed_indices.len() == 1 && !parities.is_empty() {
                        let surviving_refs: Vec<&[u8]> =
                            surviving.iter().map(|(d, _)| d.as_slice()).collect();
                        let recovered =
                            GfSolver::reconstruct_z1(&surviving_refs, &parities[0], block_size);
                        return Ok(vec![recovered]);
                    }
                    return Err(DraidError::TooManyFailures);
                }
                let surviving_refs: Vec<(&[u8], usize)> = surviving
                    .iter()
                    .map(|(d, idx)| (d.as_slice(), *idx))
                    .collect();
                GfSolver::reconstruct_z2(
                    failed_indices,
                    &surviving_refs,
                    &parities[0],
                    &parities[1],
                    block_size,
                )
                .map_err(|_| DraidError::TooManyFailures)
            }
            3 => {
                if parities.len() < 3 {
                    // R is missing: up to two failures can still use the
                    // double-parity solver.
                    if failed_indices.len() <= 2 && parities.len() >= 2 {
                        let surviving_refs: Vec<(&[u8], usize)> = surviving
                            .iter()
                            .map(|(d, idx)| (d.as_slice(), *idx))
                            .collect();
                        return GfSolver::reconstruct_z2(
                            failed_indices,
                            &surviving_refs,
                            &parities[0],
                            &parities[1],
                            block_size,
                        )
                        .map_err(|_| DraidError::TooManyFailures);
                    }
                    return Err(DraidError::TooManyFailures);
                }
                let surviving_refs: Vec<(&[u8], usize)> = surviving
                    .iter()
                    .map(|(d, idx)| (d.as_slice(), *idx))
                    .collect();
                GfSolver::reconstruct_z3(
                    failed_indices,
                    &surviving_refs,
                    &parities[0],
                    &parities[1],
                    &parities[2],
                    block_size,
                )
                .map_err(|_| DraidError::TooManyFailures)
            }
            // Unreachable for configs built via DraidConfig::new, which
            // only accepts parity levels 1..=3.
            _ => Err(DraidError::InvalidConfig("invalid parity level")),
        }
    }
pub fn mark_disk_failed(&mut self, disk_id: usize) -> Result<(), DraidError> {
if disk_id >= self.config.children {
return Err(DraidError::InvalidConfig("invalid disk ID"));
}
crate::lcpfs_println!(
"[ dRAID ] Disk {} marked as FAULTED (was {:?})",
disk_id,
self.disk_states[disk_id]
);
self.disk_states[disk_id] = DiskState::Faulted;
Ok(())
}
    /// Transitions a faulted disk to `Resilvering` and initializes its
    /// rebuild cursor at block 0.
    pub fn start_rebuild(&mut self, failed_disk: usize) -> Result<(), DraidError> {
        if failed_disk >= self.config.children {
            return Err(DraidError::InvalidConfig("invalid disk ID"));
        }
        // Only a faulted disk can be rebuilt.
        if self.disk_states[failed_disk] != DiskState::Faulted {
            return Err(DraidError::InvalidConfig("disk is not faulted"));
        }
        let failed_count = self
            .disk_states
            .iter()
            .filter(|&&s| matches!(s, DiskState::Faulted | DiskState::Offline))
            .count();
        // Beyond parity_level concurrent failures the data is gone.
        if failed_count > self.config.parity_level as usize {
            return Err(DraidError::TooManyFailures);
        }
        crate::lcpfs_println!(
            "[ dRAID ] Starting rebuild for disk {} (distributed across {} surviving disks)",
            failed_disk,
            self.config.children - failed_count
        );
        self.disk_states[failed_disk] = DiskState::Resilvering;
        self.rebuild_progress.insert(failed_disk, 0);
        Ok(())
    }
    /// Runs up to `batch_size` block reflows for a resilvering disk and
    /// advances its cursor; returns how many blocks were rebuilt this call.
    ///
    /// NOTE(review): on completion the disk is parked `Offline` (its data
    /// now lives in spare space on the survivors), while
    /// `DraidPool::update_rebuild_progress` sets `Online` at 100% — the two
    /// end-states disagree; confirm which is intended.
    pub fn rebuild_step(&mut self, failed_disk: usize, batch_size: u64) -> Result<u64, DraidError> {
        if self.disk_states[failed_disk] != DiskState::Resilvering {
            return Err(DraidError::InvalidConfig("disk is not resilvering"));
        }
        let progress = *self.rebuild_progress.get(&failed_disk).unwrap_or(&0);
        let mut rebuilt = 0u64;
        // Full scan of every row/group; only groups that actually use the
        // failed disk and lie past the saved cursor are reflowed.
        for row in 0..self.blocks_per_disk.max(1) {
            if rebuilt >= batch_size {
                break;
            }
            for group_id in 0..self.config.groups_per_row() {
                let group = self.get_group(row, group_id)?;
                let uses_disk = group.data_columns.contains(&failed_disk)
                    || group.parity_columns.contains(&failed_disk);
                if !uses_disk {
                    continue;
                }
                // Linear block number used as the rebuild cursor.
                let block_num = row * self.config.groups_per_row() as u64 + group_id as u64;
                if block_num < progress {
                    continue;
                }
                if let Ok(()) = self.reflow_block(&group, failed_disk) {
                    rebuilt += 1;
                    self.stats.blocks_rebuilt += 1;
                }
            }
        }
        self.rebuild_progress
            .insert(failed_disk, progress + rebuilt);
        // No work done (or cursor past the end) means the rebuild is done.
        if rebuilt == 0 || progress + rebuilt >= self.blocks_per_disk.max(1) {
            crate::lcpfs_println!("[ dRAID ] Rebuild complete for disk {}", failed_disk);
            self.disk_states[failed_disk] = DiskState::Offline;
            self.rebuild_progress.remove(&failed_disk);
        }
        Ok(rebuilt)
    }
    /// Reconstructs the failed disk's column for one group and writes it
    /// to distributed-spare space on a surviving disk.
    fn reflow_block(&mut self, group: &DraidGroup, failed_disk: usize) -> Result<(), DraidError> {
        // Logical column (data first, then parity) held by the failed disk.
        let failed_col = group
            .data_columns
            .iter()
            .position(|&d| d == failed_disk)
            .or_else(|| {
                group
                    .parity_columns
                    .iter()
                    .position(|&d| d == failed_disk)
                    .map(|p| self.config.data_disks + p)
            });
        let failed_col = match failed_col {
            Some(c) => c,
            // This group doesn't touch the failed disk: nothing to do.
            None => return Ok(()),
        };
        // NOTE(review): blocks are stored keyed by the stripe's logical
        // write offset, but this derives `row_id * block_size`; the two
        // only agree at row 0 / offset 0 — verify the keying scheme.
        let offset = group.row_id * self.config.block_size as u64;
        let mut surviving: Vec<(Vec<u8>, usize)> = Vec::new();
        let mut parities: Vec<Vec<u8>> = Vec::new();
        for (col, &disk) in group.data_columns.iter().enumerate() {
            if disk != failed_disk {
                if let Ok(Some(block)) = self.read_disk_block(disk, offset) {
                    surviving.push((block, col));
                }
            }
        }
        for &disk in &group.parity_columns {
            if disk != failed_disk {
                if let Ok(Some(block)) = self.read_disk_block(disk, offset) {
                    parities.push(block);
                }
            }
        }
        let block_size = self.config.block_size;
        let failed_indices = vec![failed_col];
        let reconstructed =
            self.reconstruct_data(&surviving, &parities, &failed_indices, block_size)?;
        if reconstructed.is_empty() {
            return Ok(());
        }
        // High bit tags the offset as living in distributed-spare space.
        let spare_offset = offset | 0x8000_0000_0000_0000;
        let surviving_disks: Vec<usize> = (0..self.config.children)
            .filter(|&d| {
                d != failed_disk
                    && matches!(self.disk_states[d], DiskState::Online | DiskState::Degraded)
            })
            .collect();
        if surviving_disks.is_empty() {
            return Err(DraidError::TooManyFailures);
        }
        // Deterministically spread spare blocks across the survivors.
        let target_disk = surviving_disks[(spare_offset as usize) % surviving_disks.len()];
        self.write_disk_block(target_disk, spare_offset, &reconstructed[0])?;
        Ok(())
    }
pub fn get_rebuild_progress(&self, disk_id: usize) -> Option<f64> {
self.rebuild_progress.get(&disk_id).map(|&progress| {
if self.blocks_per_disk == 0 {
100.0
} else {
(progress as f64 / self.blocks_per_disk as f64) * 100.0
}
})
}
pub fn rebuild_speedup(&self) -> f64 {
let failed_count = self
.disk_states
.iter()
.filter(|&&s| matches!(s, DiskState::Faulted | DiskState::Resilvering))
.count();
let active_disks = self.config.children - failed_count;
let traditional_disks = self.config.group_width() - 1;
if traditional_disks == 0 {
1.0
} else {
active_disks as f64 / traditional_disks as f64
}
}
fn split_data_to_columns(&self, data: &[u8]) -> Vec<Vec<u8>> {
let block_size = self.config.block_size;
let mut chunks = Vec::with_capacity(self.config.data_disks);
for i in 0..self.config.data_disks {
let start = i * block_size;
let end = ((i + 1) * block_size).min(data.len());
if start < data.len() {
let mut chunk = data[start..end].to_vec();
chunk.resize(block_size, 0);
chunks.push(chunk);
} else {
chunks.push(vec![0u8; block_size]);
}
}
chunks
}
    /// Computes P (and Q/R for double/triple parity) over the data
    /// columns. P is a plain XOR; Q and R are GF(2^8) sums with
    /// per-column coefficients 2^col and 4^col respectively.
    fn compute_parity(&self, data_columns: &[Vec<u8>]) -> Result<Vec<Vec<u8>>, DraidError> {
        let block_size = self.config.block_size;
        let parity_level = self.config.parity_level as usize;
        let mut parities = Vec::with_capacity(parity_level);
        // P: XOR of all data columns.
        let mut p = vec![0u8; block_size];
        for col in data_columns {
            for (i, &byte) in col.iter().enumerate() {
                p[i] ^= byte;
            }
        }
        parities.push(p);
        if parity_level >= 2 {
            // Q: Reed-Solomon row with generator 2.
            let mut q = vec![0u8; block_size];
            for (col_idx, col) in data_columns.iter().enumerate() {
                let coeff = gf_pow_2(col_idx);
                for (i, &byte) in col.iter().enumerate() {
                    q[i] ^= GfAlgo::multiply(byte, coeff);
                }
            }
            parities.push(q);
        }
        if parity_level >= 3 {
            // R: Reed-Solomon row with generator 4.
            let mut r = vec![0u8; block_size];
            for (col_idx, col) in data_columns.iter().enumerate() {
                let coeff = gf_pow_4(col_idx);
                for (i, &byte) in col.iter().enumerate() {
                    r[i] ^= GfAlgo::multiply(byte, coeff);
                }
            }
            parities.push(r);
        }
        Ok(parities)
    }
fn write_disk_block(
&mut self,
disk: usize,
offset: u64,
data: &[u8],
) -> Result<(), DraidError> {
let disk_map = self.block_data.entry(disk).or_default();
disk_map.insert(offset, data.to_vec());
let max_offset = offset / self.config.block_size as u64 + 1;
if max_offset > self.blocks_per_disk {
self.blocks_per_disk = max_offset;
}
Ok(())
}
fn read_disk_block(&self, disk: usize, offset: u64) -> Result<Option<Vec<u8>>, DraidError> {
if matches!(
self.disk_states[disk],
DiskState::Faulted | DiskState::Offline
) {
return Err(DraidError::DiskFailed(disk));
}
Ok(self
.block_data
.get(&disk)
.and_then(|m| m.get(&offset))
.cloned())
}
pub fn can_tolerate_failure(&self) -> bool {
let current_failures = self
.disk_states
.iter()
.filter(|&&s| matches!(s, DiskState::Faulted | DiskState::Offline))
.count();
current_failures < self.config.parity_level as usize
}
pub fn get_status(&self) -> (usize, usize, usize, usize, usize) {
let online = self
.disk_states
.iter()
.filter(|&&s| s == DiskState::Online)
.count();
let degraded = self
.disk_states
.iter()
.filter(|&&s| s == DiskState::Degraded)
.count();
let faulted = self
.disk_states
.iter()
.filter(|&&s| s == DiskState::Faulted)
.count();
let resilvering = self
.disk_states
.iter()
.filter(|&&s| s == DiskState::Resilvering)
.count();
let offline = self
.disk_states
.iter()
.filter(|&&s| s == DiskState::Offline)
.count();
(online, degraded, faulted, resilvering, offline)
}
}
/// Cursor and counters for an in-flight scrub pass.
#[derive(Debug, Clone)]
pub struct ScrubState {
    // Next row to verify.
    pub current_row: u64,
    // Next group within that row.
    pub current_group: usize,
    // Total rows this pass will cover.
    pub total_rows: u64,
    // Parity mismatches detected so far.
    pub errors_found: u64,
    // Mismatches successfully repaired.
    pub errors_repaired: u64,
    // Start timestamp; always set to 0 here (no clock source wired up).
    pub started: u64,
}
impl DraidVdev {
pub fn start_scrub(&mut self) -> Result<(), DraidError> {
if self.scrub_state.is_some() {
return Err(DraidError::ScrubError("scrub already in progress"));
}
let total_rows = self.blocks_per_disk.max(1);
self.scrub_state = Some(ScrubState {
current_row: 0,
current_group: 0,
total_rows,
errors_found: 0,
errors_repaired: 0,
started: 0, });
crate::lcpfs_println!(
"[ dRAID ] Starting scrub ({} rows, {} groups/row)",
total_rows,
self.config.groups_per_row()
);
Ok(())
}
    /// Verifies up to `batch_size` groups starting at the saved scrub
    /// cursor, attempting repair on mismatches; returns how many groups
    /// were checked.
    pub fn scrub_step(&mut self, batch_size: u64) -> Result<u64, DraidError> {
        // Copy the cursor out so `self` isn't borrowed during the scan.
        let (current_row, current_group, total_rows) = {
            let state = match &self.scrub_state {
                Some(s) => s,
                None => return Err(DraidError::ScrubError("no scrub in progress")),
            };
            (state.current_row, state.current_group, state.total_rows)
        };
        let mut row = current_row;
        let mut group_idx = current_group;
        let mut checked = 0u64;
        let mut errors_found = 0u64;
        let mut errors_repaired = 0u64;
        let groups_per_row = self.config.groups_per_row();
        while checked < batch_size {
            if row >= total_rows {
                break;
            }
            let group = self.get_group(row, group_idx)?;
            // NOTE(review): same `row * block_size` keying as reflow_block
            // — it only matches what write_block stored at offset 0;
            // verify the keying scheme.
            let offset = row * self.config.block_size as u64;
            if self.verify_group(&group, offset).is_err() {
                errors_found += 1;
                // Only attempt repair while parity headroom remains.
                if self.can_tolerate_failure() && self.repair_group(&group, offset).is_ok() {
                    errors_repaired += 1;
                }
            }
            checked += 1;
            group_idx += 1;
            if group_idx >= groups_per_row {
                group_idx = 0;
                row += 1;
            }
        }
        // Persist the advanced cursor and accumulated counters.
        if let Some(state) = &mut self.scrub_state {
            state.current_row = row;
            state.current_group = group_idx;
            state.errors_found += errors_found;
            state.errors_repaired += errors_repaired;
        }
        Ok(checked)
    }
fn verify_group(&self, group: &DraidGroup, offset: u64) -> Result<(), DraidError> {
let block_size = self.config.block_size;
let mut data_columns: Vec<Vec<u8>> = Vec::new();
for &disk in &group.data_columns {
match self.read_disk_block(disk, offset) {
Ok(Some(block)) => data_columns.push(block),
Ok(None) => data_columns.push(vec![0u8; block_size]),
Err(_) => return Err(DraidError::IoError),
}
}
let mut parity_columns: Vec<Vec<u8>> = Vec::new();
for &disk in &group.parity_columns {
match self.read_disk_block(disk, offset) {
Ok(Some(block)) => parity_columns.push(block),
Ok(None) => parity_columns.push(vec![0u8; block_size]),
Err(_) => return Err(DraidError::IoError),
}
}
let mut computed_p = vec![0u8; block_size];
for col in &data_columns {
for (i, &byte) in col.iter().enumerate() {
computed_p[i] ^= byte;
}
}
if !parity_columns.is_empty() && computed_p != parity_columns[0] {
return Err(DraidError::ScrubError("P parity mismatch"));
}
Ok(())
}
    /// Attempts to repair one group at `offset` after a verify failure:
    /// if a member data disk errors on read, its column is reconstructed
    /// from the survivors and written back.
    fn repair_group(&mut self, group: &DraidGroup, offset: u64) -> Result<(), DraidError> {
        let mut surviving: Vec<(Vec<u8>, usize)> = Vec::new();
        let mut parities: Vec<Vec<u8>> = Vec::new();
        // Last data disk that failed to read, if any.
        let mut bad_disk: Option<usize> = None;
        for (col, &disk) in group.data_columns.iter().enumerate() {
            match self.read_disk_block(disk, offset) {
                Ok(Some(block)) => surviving.push((block, col)),
                // Unwritten locations count as zero blocks.
                Ok(None) => surviving.push((vec![0u8; self.config.block_size], col)),
                Err(_) => bad_disk = Some(disk),
            }
        }
        for &disk in &group.parity_columns {
            match self.read_disk_block(disk, offset) {
                Ok(Some(block)) => parities.push(block),
                Ok(None) => parities.push(vec![0u8; self.config.block_size]),
                // Unreadable parity simply isn't used for the solve.
                Err(_) => {}
            }
        }
        if let Some(disk) = bad_disk {
            // Temporarily fault the disk so reconstruction treats it as lost.
            let old_state = self.disk_states[disk];
            self.disk_states[disk] = DiskState::Faulted;
            let failed_col = group.data_columns.iter().position(|&d| d == disk);
            if let Some(col) = failed_col {
                let reconstructed =
                    self.reconstruct_data(&surviving, &parities, &[col], self.config.block_size)?;
                if !reconstructed.is_empty() {
                    // Restore the state before writing so the write isn't
                    // rejected for targeting a faulted disk.
                    self.disk_states[disk] = old_state;
                    self.write_disk_block(disk, offset, &reconstructed[0])?;
                }
            }
            // NOTE(review): if reconstruction fails (the `?` above) or the
            // disk held no data column, the disk is left Faulted — confirm
            // that escalation is intended.
        }
        Ok(())
    }
pub fn get_scrub_progress(&self) -> Option<f64> {
self.scrub_state.as_ref().map(|state| {
let total = state.total_rows * self.config.groups_per_row() as u64;
if total == 0 {
100.0
} else {
let current = state.current_row * self.config.groups_per_row() as u64
+ state.current_group as u64;
(current as f64 / total as f64) * 100.0
}
})
}
pub fn finish_scrub(&mut self) -> Option<(u64, u64)> {
self.scrub_state.take().map(|state| {
crate::lcpfs_println!(
"[ dRAID ] Scrub complete: {} errors found, {} repaired",
state.errors_found,
state.errors_repaired
);
(state.errors_found, state.errors_repaired)
})
}
}
/// 2^n in GF(2^8), computed by repeated field multiplication.
fn gf_pow_2(n: usize) -> u8 {
    (0..n).fold(1u8, |acc, _| GfAlgo::multiply(acc, 2))
}

/// 4^n in GF(2^8); the base 4 is derived as 2*2 in the field.
fn gf_pow_4(n: usize) -> u8 {
    let four = GfAlgo::multiply(2, 2);
    (0..n).fold(1u8, |acc, _| GfAlgo::multiply(acc, four))
}
/// Thin pool wrapper around a single dRAID vdev.
pub struct DraidPool {
    vdev: DraidVdev,
}

impl DraidPool {
    /// Creates a pool backed by a fresh vdev built from `config`.
    pub fn new(config: DraidConfig) -> Self {
        Self {
            vdev: DraidVdev::new(config),
        }
    }

    /// Disk permutation used for `stripe_id`'s row.
    pub fn get_stripe_layout(&mut self, stripe_id: u64) -> Vec<usize> {
        self.vdev.perm_gen.get_permutation(stripe_id)
    }

    /// Marks `disk_id` as faulted.
    pub fn mark_failed(&mut self, disk_id: usize) -> Result<(), &'static str> {
        self.vdev
            .mark_disk_failed(disk_id)
            .map_err(|_| "Invalid disk ID")
    }

    /// Starts a rebuild of `failed_disk_id`; on success returns the id of
    /// the next disk round-robin (the nominal reflow target).
    pub fn start_rebuild(&mut self, failed_disk_id: usize) -> Result<usize, &'static str> {
        self.vdev
            .start_rebuild(failed_disk_id)
            .map_err(|_| "Cannot start rebuild")?;
        Ok((failed_disk_id + 1) % self.vdev.config.children)
    }

    /// Records rebuild progress for `disk_id` as a percentage; at 100%
    /// the disk is returned to service and its cursor dropped.
    ///
    /// Fix: an out-of-range `disk_id` previously panicked on the direct
    /// `disk_states[disk_id]` index — every other entry point validates
    /// the id first. Invalid ids are now ignored.
    pub fn update_rebuild_progress(&mut self, disk_id: usize, progress: f64) {
        if disk_id >= self.vdev.config.children {
            return;
        }
        if progress >= 100.0 {
            self.vdev.disk_states[disk_id] = DiskState::Online;
            self.vdev.rebuild_progress.remove(&disk_id);
        } else {
            let blocks = ((progress / 100.0) * self.vdev.blocks_per_disk as f64) as u64;
            self.vdev.rebuild_progress.insert(disk_id, blocks);
        }
    }

    /// Number of disks currently faulted or offline.
    pub fn failed_disk_count(&self) -> usize {
        self.vdev
            .disk_states
            .iter()
            .filter(|&&s| matches!(s, DiskState::Faulted | DiskState::Offline))
            .count()
    }

    /// True while one more disk loss can still be absorbed by parity.
    pub fn can_tolerate_failure(&self) -> bool {
        self.vdev.can_tolerate_failure()
    }

    /// Distributed-rebuild speedup factor versus a traditional rebuild.
    pub fn rebuild_speedup(&self) -> f64 {
        self.vdev.rebuild_speedup()
    }

    /// Per-state disk counts: `(online, degraded, faulted, resilvering, offline)`.
    pub fn get_stats(&self) -> (usize, usize, usize, usize, usize) {
        self.vdev.get_status()
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_draid_config_valid() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
assert_eq!(config.data_disks, 4);
assert_eq!(config.parity_level, 1);
assert_eq!(config.spare_disks, 1);
assert_eq!(config.children, 8);
assert_eq!(config.group_width(), 5); }
#[test]
fn test_draid_config_draid2() {
let config = DraidConfig::new(4, 2, 2, 12).expect("should create valid config");
assert_eq!(config.group_width(), 6); assert_eq!(config.fault_tolerance(), 2);
}
#[test]
fn test_draid_config_draid3() {
let config = DraidConfig::new(8, 3, 3, 24).expect("should create valid config");
assert_eq!(config.group_width(), 11); assert_eq!(config.fault_tolerance(), 3);
}
#[test]
fn test_draid_config_invalid_parity() {
assert!(DraidConfig::new(4, 0, 1, 8).is_err());
assert!(DraidConfig::new(4, 4, 1, 8).is_err());
}
#[test]
fn test_draid_config_not_enough_disks() {
assert!(DraidConfig::new(4, 1, 1, 5).is_err()); }
#[test]
fn test_draid_config_no_spare() {
assert!(DraidConfig::new(4, 1, 0, 8).is_err());
}
#[test]
fn test_draid_config_parse() {
let config = DraidConfig::parse("draid2:4d:10c:2s").expect("should parse");
assert_eq!(config.parity_level, 2);
assert_eq!(config.data_disks, 4);
assert_eq!(config.children, 10);
assert_eq!(config.spare_disks, 2);
}
#[test]
fn test_draid_config_efficiency() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
let eff = config.efficiency();
assert!(eff > 0.5 && eff < 0.6);
}
#[test]
fn test_permutation_deterministic() {
let perm_gen = PermutationGenerator::new(10, 0xDEADBEEF);
let perm1 = perm_gen.get_permutation(42);
let perm2 = perm_gen.get_permutation(42);
assert_eq!(perm1, perm2, "Same row should give same permutation");
}
#[test]
fn test_permutation_different_rows() {
let perm_gen = PermutationGenerator::new(10, 0xDEADBEEF);
let perm1 = perm_gen.get_permutation(0);
let perm2 = perm_gen.get_permutation(1);
assert_ne!(
perm1, perm2,
"Different rows should give different permutations"
);
}
#[test]
fn test_permutation_covers_all_disks() {
let perm_gen = PermutationGenerator::new(10, 0xDEADBEEF);
let perm = perm_gen.get_permutation(0);
assert_eq!(perm.len(), 10);
let mut sorted = perm.clone();
sorted.sort();
let expected: Vec<usize> = (0..10).collect();
assert_eq!(sorted, expected);
}
#[test]
fn test_map_to_disk() {
let perm_gen = PermutationGenerator::new(8, 0xCAFE);
let disk = perm_gen.map_to_disk(5, 3);
assert!(disk < 8);
let disk2 = perm_gen.map_to_disk(5, 3);
assert_eq!(disk, disk2);
}
#[test]
fn test_get_group() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
let vdev = DraidVdev::new(config);
let group = vdev.get_group(0, 0).expect("should get group");
assert_eq!(group.data_columns.len(), 4);
assert_eq!(group.parity_columns.len(), 1);
assert_eq!(group.width(), 5);
}
#[test]
fn test_group_disk_mapping() {
let config = DraidConfig::new(4, 2, 1, 10).expect("should create valid config");
let vdev = DraidVdev::new(config);
let group = vdev.get_group(0, 0).expect("should get group");
for &disk in &group.data_columns {
assert!(disk < 10);
}
for &disk in &group.parity_columns {
assert!(disk < 10);
}
let mut all_disks = group.data_columns.clone();
all_disks.extend(&group.parity_columns);
all_disks.sort();
all_disks.dedup();
assert_eq!(all_disks.len(), 6); }
#[test]
fn test_write_and_read_block() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
let mut vdev = DraidVdev::new(config);
let data: Vec<u8> = (0..512).map(|i| (i % 256) as u8).collect();
vdev.write_block(&data, 0).expect("should write");
let read_data = vdev.read_block(0, 512).expect("should read");
assert_eq!(read_data, data);
}
#[test]
fn test_write_and_read_multiple_blocks() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
let mut vdev = DraidVdev::new(config);
let block_size = config.data_disks * config.block_size;
for i in 0..5 {
let data: Vec<u8> = (0..block_size).map(|j| ((i + j) % 256) as u8).collect();
let offset = i as u64 * block_size as u64;
vdev.write_block(&data, offset).expect("should write");
}
for i in 0..5 {
let expected: Vec<u8> = (0..block_size).map(|j| ((i + j) % 256) as u8).collect();
let offset = i as u64 * block_size as u64;
let read_data = vdev.read_block(offset, block_size).expect("should read");
assert_eq!(read_data, expected);
}
}
#[test]
fn test_degraded_read_draid1() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
let mut vdev = DraidVdev::new(config);
let data: Vec<u8> = (0..512).map(|i| (i % 256) as u8).collect();
vdev.write_block(&data, 0).expect("should write");
let group = vdev.get_group(0, 0).expect("should get group");
let failed_disk = group.data_columns[0];
vdev.mark_disk_failed(failed_disk)
.expect("should mark failed");
let read_data = vdev.read_block(0, 512).expect("should read degraded");
assert_eq!(read_data, data);
}
#[test]
fn test_degraded_read_draid2() {
    // Double parity must survive two simultaneous data-column failures.
    let cfg = DraidConfig::new(4, 2, 1, 10).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    let group = vdev.get_group(0, 0).expect("should get group");
    for col in 0..2 {
        vdev.mark_disk_failed(group.data_columns[col])
            .expect("should mark failed");
    }
    let recovered = vdev.read_block(0, 512).expect("should read degraded");
    assert_eq!(recovered, payload);
}
#[test]
fn test_too_many_failures() {
    // Single parity cannot reconstruct after two data-column failures,
    // so the read must report an error instead of returning garbage.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    let group = vdev.get_group(0, 0).expect("should get group");
    for col in 0..2 {
        vdev.mark_disk_failed(group.data_columns[col])
            .expect("should mark failed");
    }
    assert!(vdev.read_block(0, 512).is_err());
}
#[test]
fn test_start_rebuild() {
    // Starting a rebuild on a failed disk should move it into the
    // Resilvering state.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    let failed = 3;
    vdev.mark_disk_failed(failed).expect("should mark failed");
    vdev.start_rebuild(failed).expect("should start rebuild");
    assert_eq!(vdev.disk_states[failed], DiskState::Resilvering);
}
#[test]
fn test_rebuild_speedup() {
    // The distributed-spare layout should yield a rebuild speedup in a
    // narrow expected band for this geometry (strictly between 1.7 and 1.8).
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    vdev.mark_disk_failed(0).expect("should mark failed");
    let speedup = vdev.rebuild_speedup();
    assert!(speedup > 1.7, "speedup too low: {}", speedup);
    assert!(speedup < 1.8, "speedup too high: {}", speedup);
}
#[test]
fn test_can_tolerate_failure() {
    // Double parity tolerates up to two failures; the second failure
    // exhausts the redundancy budget.
    let cfg = DraidConfig::new(4, 2, 1, 10).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    assert!(vdev.can_tolerate_failure());
    vdev.mark_disk_failed(0).expect("should mark failed");
    assert!(vdev.can_tolerate_failure());
    vdev.mark_disk_failed(1).expect("should mark failed");
    assert!(!vdev.can_tolerate_failure());
}
#[test]
fn test_scrub_start_and_progress() {
    // Once a scrub is running, progress must be queryable.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    vdev.start_scrub().expect("should start scrub");
    assert!(vdev.get_scrub_progress().is_some());
}
#[test]
fn test_scrub_double_start_fails() {
    // A second start_scrub while one is in flight must be rejected.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    vdev.start_scrub().expect("should start scrub");
    assert!(vdev.start_scrub().is_err());
}
#[test]
fn test_scrub_complete() {
    // Drive a scrub to completion in 100-block steps; a freshly written,
    // healthy vdev should finish with zero errors and zero repairs.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    vdev.start_scrub().expect("should start scrub");
    // scrub_step returns how many blocks it checked; 0 means done.
    while vdev.scrub_step(100).expect("should scrub") != 0 {}
    let (errors, repaired) = vdev.finish_scrub().expect("should have scrub state");
    assert_eq!(errors, 0);
    assert_eq!(repaired, 0);
}
#[test]
fn test_legacy_draid_pool() {
    // Stripe layouts cover all children and differ between rows
    // (the permutation actually rotates the mapping).
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    let first = pool.get_stripe_layout(0);
    let second = pool.get_stripe_layout(1);
    assert_eq!(first.len(), 10);
    assert_eq!(second.len(), 10);
    assert_ne!(first, second);
}
#[test]
fn test_legacy_disk_failure() {
    // Marking one disk failed must be reflected in the failure count.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    pool.mark_failed(5).expect("should mark failed");
    assert_eq!(pool.failed_disk_count(), 1);
}
#[test]
fn test_legacy_rebuild() {
    // Rebuild should hand back a spare slot: a valid child index that is
    // not the failed disk itself.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    let failed = 5;
    pool.mark_failed(failed).expect("should mark failed");
    let spare = pool.start_rebuild(failed).expect("should start rebuild");
    assert!(spare < 10);
    assert_ne!(spare, failed);
}
#[test]
fn test_legacy_failure_tolerance() {
    // Parity 2: tolerant with zero or one failure, not after the second.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    assert!(pool.can_tolerate_failure());
    pool.mark_failed(0).expect("should mark failed");
    assert!(pool.can_tolerate_failure());
    pool.mark_failed(1).expect("should mark failed");
    assert!(!pool.can_tolerate_failure());
}
#[test]
fn test_legacy_rebuild_speedup() {
    // For this geometry the expected rebuild speedup sits strictly
    // between 1.4 and 1.5.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let pool = DraidPool::new(cfg);
    let speedup = pool.rebuild_speedup();
    assert!(speedup > 1.4, "speedup too low: {}", speedup);
    assert!(speedup < 1.5, "speedup too high: {}", speedup);
}
#[test]
fn test_legacy_stats() {
    // After one failure plus a started rebuild, the per-state counters
    // must still account for every child, with exactly one resilvering.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    pool.mark_failed(5).expect("should mark failed");
    pool.start_rebuild(5).expect("should start rebuild");
    let (online, degraded, faulted, resilvering, offline) = pool.get_stats();
    assert_eq!(online + degraded + faulted + resilvering + offline, 10);
    assert_eq!(resilvering, 1);
}
#[test]
fn test_legacy_permutation_deterministic() {
    // Asking for the same row twice must yield an identical layout —
    // the permutation is a pure function of the row index.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    let row = 42;
    assert_eq!(pool.get_stripe_layout(row), pool.get_stripe_layout(row));
}
#[test]
fn test_parity_computation_z1() {
    // Single parity is a plain XOR: 0x11 ^ 0x22 ^ 0x33 ^ 0x44 == 0x44.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let vdev = DraidVdev::new(cfg);
    let columns: Vec<Vec<u8>> = [0x11u8, 0x22, 0x33, 0x44]
        .iter()
        .map(|&fill| vec![fill; 128 * 1024])
        .collect();
    let parities = vdev.compute_parity(&columns).expect("should compute");
    assert_eq!(parities.len(), 1);
    assert_eq!(parities[0][0], 0x44);
}
#[test]
fn test_parity_computation_z2() {
    // Double parity: P is the XOR row (0x44 here); Q is a distinct
    // Galois-field combination that must be non-zero for this input.
    let cfg = DraidConfig::new(4, 2, 1, 10).expect("should create valid config");
    let vdev = DraidVdev::new(cfg);
    let columns: Vec<Vec<u8>> = [0x11u8, 0x22, 0x33, 0x44]
        .iter()
        .map(|&fill| vec![fill; 128 * 1024])
        .collect();
    let parities = vdev.compute_parity(&columns).expect("should compute");
    assert_eq!(parities.len(), 2);
    assert_eq!(parities[0][0], 0x44);
    assert_ne!(parities[1][0], 0);
}
#[test]
fn test_parity_computation_z3() {
    // Triple parity must yield three parity columns.
    let cfg = DraidConfig::new(4, 3, 1, 12).expect("should create valid config");
    let vdev = DraidVdev::new(cfg);
    let columns: Vec<Vec<u8>> = [0x11u8, 0x22, 0x33, 0x44]
        .iter()
        .map(|&fill| vec![fill; 128 * 1024])
        .collect();
    let parities = vdev.compute_parity(&columns).expect("should compute");
    assert_eq!(parities.len(), 3);
}
#[test]
fn test_stats_tracking() {
    // One write then one read must bump the matching op counters and
    // record non-zero byte totals.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    assert_eq!(vdev.stats().writes, 1);
    assert!(vdev.stats().bytes_written > 0);
    vdev.read_block(0, 512).expect("should read");
    assert_eq!(vdev.stats().reads, 1);
    assert!(vdev.stats().bytes_read > 0);
}
#[test]
fn test_reconstruction_stats() {
    // A degraded read should be counted as a reconstruction read.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    let victim = vdev.get_group(0, 0).expect("should get group").data_columns[0];
    vdev.mark_disk_failed(victim).expect("should mark failed");
    vdev.read_block(0, 512).expect("should read degraded");
    assert_eq!(vdev.stats().reconstruction_reads, 1);
}
}