use alloc::collections::BTreeMap;
use alloc::vec;
use alloc::vec::Vec;
use crate::FsError;
use crate::ml::gfalgo::GfAlgo;
use crate::ml::gfsolver::GfSolver;
/// Geometry of a dRAID (distributed RAID) vdev.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DraidConfig {
    /// Number of data columns per redundancy group.
    pub data_disks: usize,
    /// Parity columns per group (1..=3, i.e. draid1/draid2/draid3).
    pub parity_level: u8,
    /// Distributed spare capacity, expressed in whole disks.
    pub spare_disks: usize,
    /// Total number of child disks in the vdev.
    pub children: usize,
    /// Per-column block size in bytes.
    pub block_size: usize,
    /// Seed used to derive per-row disk permutations.
    pub permutation_base: u64,
}

impl DraidConfig {
    /// Validates the geometry and builds a config with the default block
    /// size (128 KiB) and permutation seed.
    pub fn new(
        data_disks: usize,
        parity_level: u8,
        spare_disks: usize,
        children: usize,
    ) -> Result<Self, DraidError> {
        // Only single, double and triple parity are supported.
        if parity_level == 0 || parity_level > 3 {
            return Err(DraidError::InvalidConfig("parity level must be 1, 2, or 3"));
        }
        let group_width = data_disks + parity_level as usize;
        // The children must hold at least one full group plus the spares.
        if children < group_width + spare_disks {
            return Err(DraidError::InvalidConfig(
                "not enough children for data + parity + spare",
            ));
        }
        if data_disks == 0 {
            return Err(DraidError::InvalidConfig("data_disks must be > 0"));
        }
        if spare_disks == 0 {
            return Err(DraidError::InvalidConfig("spare_disks must be >= 1"));
        }
        Ok(Self {
            data_disks,
            parity_level,
            spare_disks,
            children,
            block_size: 128 * 1024,
            permutation_base: 0xDEAD_BEEF_CAFE_BABE,
        })
    }

    /// Columns per redundancy group (data + parity).
    #[inline]
    pub fn group_width(&self) -> usize {
        self.data_disks + self.parity_level as usize
    }

    /// How many complete groups fit in one permutation row.
    pub fn groups_per_row(&self) -> usize {
        (self.children - self.spare_disks) / self.group_width()
    }

    /// Bytes of user data stored per row.
    pub fn data_per_row(&self) -> usize {
        self.groups_per_row() * self.data_disks * self.block_size
    }

    /// Fraction of non-spare capacity that holds user data.
    pub fn efficiency(&self) -> f64 {
        let usable = self.children - self.spare_disks;
        let data_cols = self.groups_per_row() * self.data_disks;
        data_cols as f64 / usable as f64
    }

    /// Number of simultaneous disk failures each group survives.
    pub fn fault_tolerance(&self) -> usize {
        usize::from(self.parity_level)
    }

    /// Parses a spec of the form `draidP:Dd:Cc:Ss`.
    pub fn parse(s: &str) -> Result<Self, DraidError> {
        // Strip `suffix` from `part` and parse the remainder as a number,
        // reporting `msg` on any failure.
        fn field<T: core::str::FromStr>(
            part: &str,
            suffix: char,
            msg: &'static str,
        ) -> Result<T, DraidError> {
            part.strip_suffix(suffix)
                .and_then(|n| n.parse::<T>().ok())
                .ok_or(DraidError::InvalidConfig(msg))
        }
        let parts: Vec<&str> = s.split(':').collect();
        if parts.len() != 4 {
            return Err(DraidError::InvalidConfig("format: draidN:Xd:Yc:Zs"));
        }
        let parity = parts[0]
            .strip_prefix("draid")
            .and_then(|n| n.parse::<u8>().ok())
            .ok_or(DraidError::InvalidConfig("invalid draid prefix"))?;
        let data = field::<usize>(parts[1], 'd', "invalid data disk count")?;
        let children = field::<usize>(parts[2], 'c', "invalid children count")?;
        let spares = field::<usize>(parts[3], 's', "invalid spare count")?;
        Self::new(data, parity, spares, children)
    }
}

impl core::fmt::Display for DraidConfig {
    /// Renders the config in the same `draidP:Dd:Cc:Ss` form `parse` accepts.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(
            f,
            "draid{}:{}d:{}c:{}s",
            self.parity_level, self.data_disks, self.children, self.spare_disks
        )
    }
}

/// Errors surfaced by the dRAID layer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DraidError {
    InvalidConfig(&'static str),
    DiskFailed(usize),
    TooManyFailures,
    BlockNotFound,
    IoError,
    RebuildInProgress,
    InvalidOffset,
    ScrubError(&'static str),
}
impl From<DraidError> for FsError {
fn from(e: DraidError) -> Self {
match e {
DraidError::InvalidConfig(msg) => FsError::InvalidArgument { reason: msg },
DraidError::DiskFailed(disk) => FsError::IoError {
vdev: disk,
reason: "disk failed",
},
DraidError::TooManyFailures => FsError::IoError {
vdev: 0,
reason: "too many disk failures",
},
DraidError::BlockNotFound => FsError::IoError {
vdev: 0,
reason: "block not found",
},
DraidError::IoError => FsError::IoError {
vdev: 0,
reason: "I/O error",
},
DraidError::RebuildInProgress => FsError::IoError {
vdev: 0,
reason: "rebuild in progress",
},
DraidError::InvalidOffset => FsError::InvalidArgument {
reason: "invalid offset",
},
DraidError::ScrubError(msg) => FsError::InvalidArgument { reason: msg },
}
}
}
/// Health of a single child disk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DiskState {
    Online,
    Degraded,
    Faulted,
    Resilvering,
    Offline,
}

/// One redundancy group within a permutation row: the physical disks
/// backing its data and parity columns.
#[derive(Debug, Clone)]
pub struct DraidGroup {
    pub group_id: usize,
    pub row_id: u64,
    pub data_columns: Vec<usize>,
    pub parity_columns: Vec<usize>,
}

impl DraidGroup {
    /// Total number of columns (data + parity) in the group.
    pub fn width(&self) -> usize {
        self.data_columns.len() + self.parity_columns.len()
    }

    /// Physical disk backing logical column `col`; data columns come
    /// first, then parity. `None` when `col` is out of range.
    pub fn column_disk(&self, col: usize) -> Option<usize> {
        match self.data_columns.get(col) {
            Some(&disk) => Some(disk),
            None => self
                .parity_columns
                .get(col - self.data_columns.len())
                .copied(),
        }
    }

    /// True when any member disk is faulted or offline.
    pub fn has_failures(&self, disk_states: &[DiskState]) -> bool {
        self.data_columns
            .iter()
            .chain(self.parity_columns.iter())
            .any(|&disk| matches!(disk_states[disk], DiskState::Faulted | DiskState::Offline))
    }

    /// All member disks that are faulted or offline, in column order.
    pub fn failed_disks(&self, disk_states: &[DiskState]) -> Vec<usize> {
        self.data_columns
            .iter()
            .chain(self.parity_columns.iter())
            .filter(|&&disk| matches!(disk_states[disk], DiskState::Faulted | DiskState::Offline))
            .copied()
            .collect()
    }
}
/// Derives a deterministic pseudo-random disk permutation for each row.
pub struct PermutationGenerator {
    num_disks: usize,
    base_seed: u64,
}

impl PermutationGenerator {
    /// Creates a generator over `num_disks` disks seeded with `base_seed`.
    pub fn new(num_disks: usize, base_seed: u64) -> Self {
        Self {
            num_disks,
            base_seed,
        }
    }

    /// Fisher-Yates shuffle of `0..num_disks` driven by an LCG seeded from
    /// `base_seed + row`, so the same row always yields the same layout.
    pub fn get_permutation(&self, row: u64) -> Vec<usize> {
        let mut order: Vec<usize> = (0..self.num_disks).collect();
        let mut state = self.base_seed.wrapping_add(row);
        let mut i = order.len();
        while i > 1 {
            i -= 1;
            // Knuth-style 64-bit LCG constants.
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            let j = (state as usize) % (i + 1);
            order.swap(i, j);
        }
        order
    }

    /// Disk backing `column` in `row`'s permutation (the column index
    /// wraps modulo the disk count).
    pub fn map_to_disk(&self, row: u64, column: usize) -> usize {
        self.get_permutation(row)[column % self.num_disks]
    }
}
/// In-memory dRAID vdev: distributes data, parity and spare capacity
/// across `children` disks using deterministic per-row permutations.
pub struct DraidVdev {
    // Immutable geometry (disk counts, parity level, block size).
    config: DraidConfig,
    // Current health of each child disk, indexed by disk id.
    disk_states: Vec<DiskState>,
    // Deterministic per-row disk shuffles.
    perm_gen: PermutationGenerator,
    // Simulated on-disk storage: disk id -> (offset -> block bytes).
    block_data: BTreeMap<usize, BTreeMap<u64, Vec<u8>>>,
    // Per-disk rebuild cursor (blocks already reflowed), keyed by disk id.
    rebuild_progress: BTreeMap<usize, u64>,
    // High-water mark of written blocks, used to bound rebuild/scrub scans.
    blocks_per_disk: u64,
    // Present while a scrub pass is running.
    scrub_state: Option<ScrubState>,
    // Running I/O and repair counters.
    stats: DraidStats,
}
/// Cumulative I/O and repair counters for a vdev.
#[derive(Debug, Clone, Default)]
pub struct DraidStats {
    // Logical read requests served.
    pub reads: u64,
    // Logical write requests completed.
    pub writes: u64,
    // Reads that required parity reconstruction.
    pub reconstruction_reads: u64,
    // Blocks reflowed during rebuilds.
    pub blocks_rebuilt: u64,
    // Checksum mismatches; never incremented in the code visible here.
    pub checksum_errors: u64,
    // Payload bytes returned to readers.
    pub bytes_read: u64,
    // Payload bytes accepted from writers.
    pub bytes_written: u64,
}
impl DraidVdev {
pub fn new(config: DraidConfig) -> Self {
let perm_gen = PermutationGenerator::new(config.children, config.permutation_base);
let disk_states = vec![DiskState::Online; config.children];
Self {
config,
disk_states,
perm_gen,
block_data: BTreeMap::new(),
rebuild_progress: BTreeMap::new(),
blocks_per_disk: 0,
scrub_state: None,
stats: DraidStats::default(),
}
}
    /// Read-only view of the vdev geometry.
    pub fn config(&self) -> &DraidConfig {
        &self.config
    }
    /// Current per-disk health, indexed by disk id.
    pub fn disk_states(&self) -> &[DiskState] {
        &self.disk_states
    }
    /// Running I/O and repair counters.
    pub fn stats(&self) -> &DraidStats {
        &self.stats
    }
    /// Resolves the redundancy group that stores the data at `offset`.
    pub fn get_group_for_offset(&self, offset: u64) -> Result<DraidGroup, DraidError> {
        let row = self.offset_to_row(offset);
        let group_id = self.offset_to_group(offset);
        self.get_group(row, group_id)
    }
pub fn get_group(&self, row: u64, group_id: usize) -> Result<DraidGroup, DraidError> {
let groups_per_row = self.config.groups_per_row();
if group_id >= groups_per_row {
return Err(DraidError::InvalidOffset);
}
let perm = self.perm_gen.get_permutation(row);
let group_width = self.config.group_width();
let start_col = group_id * group_width;
let data_columns: Vec<usize> = (0..self.config.data_disks)
.map(|i| perm[start_col + i])
.collect();
let parity_columns: Vec<usize> = (0..self.config.parity_level as usize)
.map(|i| perm[start_col + self.config.data_disks + i])
.collect();
Ok(DraidGroup {
group_id,
row_id: row,
data_columns,
parity_columns,
})
}
    /// Permutation row containing `offset` (rows hold `data_per_row` bytes).
    fn offset_to_row(&self, offset: u64) -> u64 {
        let data_per_row = self.config.data_per_row() as u64;
        offset / data_per_row
    }
    /// Group index within the row for `offset`.
    fn offset_to_group(&self, offset: u64) -> usize {
        let data_per_row = self.config.data_per_row() as u64;
        let data_per_group = (self.config.data_disks * self.config.block_size) as u64;
        let offset_in_row = offset % data_per_row;
        (offset_in_row / data_per_group) as usize
    }
    /// Data column within the group for `offset`.
    /// NOTE(review): not called anywhere in this file — confirm whether it
    /// has external users before removing.
    fn offset_to_column(&self, offset: u64) -> usize {
        let data_per_group = (self.config.data_disks * self.config.block_size) as u64;
        let offset_in_group = offset % data_per_group;
        (offset_in_group / self.config.block_size as u64) as usize
    }
    /// Writes one logical stripe at `offset`: splits `data` into
    /// `data_disks` zero-padded columns, computes parity, and stores each
    /// column on its permuted disk. Columns whose disk is faulted/offline
    /// are silently skipped — that data is only recoverable via parity
    /// until a rebuild runs.
    ///
    /// NOTE(review): every column is keyed by the stripe's logical
    /// `offset`, not a per-disk block address — readers must use the same
    /// keying (read_normal/read_degraded do).
    pub fn write_block(&mut self, data: &[u8], offset: u64) -> Result<(), DraidError> {
        let group = self.get_group_for_offset(offset)?;
        let failed = group.failed_disks(&self.disk_states);
        // More failures than parity columns means the stripe is lost.
        if failed.len() > self.config.parity_level as usize {
            return Err(DraidError::TooManyFailures);
        }
        let chunks = self.split_data_to_columns(data);
        if chunks.len() != self.config.data_disks {
            return Err(DraidError::InvalidConfig("data size mismatch"));
        }
        let parities = self.compute_parity(&chunks)?;
        for (i, chunk) in chunks.iter().enumerate() {
            let disk = group.data_columns[i];
            // Only healthy disks accept writes; failed columns rely on parity.
            if self.disk_states[disk] == DiskState::Online
                || self.disk_states[disk] == DiskState::Degraded
            {
                self.write_disk_block(disk, offset, chunk)?;
            }
        }
        for (i, parity) in parities.iter().enumerate() {
            let disk = group.parity_columns[i];
            if self.disk_states[disk] == DiskState::Online
                || self.disk_states[disk] == DiskState::Degraded
            {
                self.write_disk_block(disk, offset, parity)?;
            }
        }
        self.stats.writes += 1;
        self.stats.bytes_written += data.len() as u64;
        Ok(())
    }
    /// Reads `size` bytes of payload starting at `offset` (offsets are not
    /// re-based within a stripe, so callers are expected to pass the same
    /// offsets they wrote). Falls back to parity reconstruction when
    /// member disks of the stripe's group have failed.
    pub fn read_block(&mut self, offset: u64, size: usize) -> Result<Vec<u8>, DraidError> {
        let group = self.get_group_for_offset(offset)?;
        let failed = group.failed_disks(&self.disk_states);
        // More failures than parity columns means the stripe is unrecoverable.
        if failed.len() > self.config.parity_level as usize {
            return Err(DraidError::TooManyFailures);
        }
        self.stats.reads += 1;
        if failed.is_empty() {
            self.read_normal(&group, offset, size)
        } else {
            self.stats.reconstruction_reads += 1;
            self.read_degraded(&group, offset, size, &failed)
        }
    }
fn read_normal(
&mut self,
group: &DraidGroup,
offset: u64,
size: usize,
) -> Result<Vec<u8>, DraidError> {
let mut result = Vec::with_capacity(size);
for &disk in &group.data_columns {
if let Some(block) = self.read_disk_block(disk, offset)? {
result.extend_from_slice(&block);
} else {
result.extend(vec![0u8; self.config.block_size]);
}
}
result.truncate(size);
self.stats.bytes_read += result.len() as u64;
Ok(result)
}
    /// Degraded-path read: gathers surviving data and parity columns, asks
    /// `reconstruct_data` to rebuild the missing data columns, then
    /// reassembles the stripe in column order.
    fn read_degraded(
        &mut self,
        group: &DraidGroup,
        offset: u64,
        size: usize,
        failed_disks: &[usize],
    ) -> Result<Vec<u8>, DraidError> {
        // Surviving data blocks, tagged with their column index.
        let mut surviving_data: Vec<(Vec<u8>, usize)> = Vec::new();
        let mut failed_indices: Vec<usize> = Vec::new();
        for (col, &disk) in group.data_columns.iter().enumerate() {
            if failed_disks.contains(&disk) {
                failed_indices.push(col);
            } else if let Some(block) = self.read_disk_block(disk, offset)? {
                // NOTE(review): a healthy disk with no block at `offset` is
                // silently omitted rather than treated as zeros — confirm
                // that is intended for sparse stripes.
                surviving_data.push((block, col));
            }
        }
        // Parity blocks from healthy parity columns, in P/Q/R order.
        let mut parity_blocks: Vec<Vec<u8>> = Vec::new();
        for &disk in &group.parity_columns {
            if !failed_disks.contains(&disk) {
                if let Some(block) = self.read_disk_block(disk, offset)? {
                    parity_blocks.push(block);
                }
            }
        }
        let block_size = self.config.block_size;
        let reconstructed =
            self.reconstruct_data(&surviving_data, &parity_blocks, &failed_indices, block_size)?;
        // Merge surviving and reconstructed columns, then restore order.
        let mut all_data: Vec<(Vec<u8>, usize)> = surviving_data;
        for (i, recon) in reconstructed.into_iter().enumerate() {
            all_data.push((recon, failed_indices[i]));
        }
        all_data.sort_by_key(|(_, col)| *col);
        let mut result = Vec::with_capacity(size);
        for (data, _) in all_data {
            result.extend_from_slice(&data);
        }
        result.truncate(size);
        self.stats.bytes_read += result.len() as u64;
        Ok(result)
    }
    /// Rebuilds the blocks for `failed_indices` (data-column positions)
    /// from surviving data columns and whatever parity blocks are
    /// available, dispatching on the configured parity level. When some
    /// parity is missing but the failure count is low enough, it falls
    /// back to a lower-order solve (e.g. draid2 with only P available can
    /// still recover a single failure).
    fn reconstruct_data(
        &self,
        surviving: &[(Vec<u8>, usize)],
        parities: &[Vec<u8>],
        failed_indices: &[usize],
        block_size: usize,
    ) -> Result<Vec<Vec<u8>>, DraidError> {
        if failed_indices.is_empty() {
            return Ok(Vec::new());
        }
        let parity_level = self.config.parity_level as usize;
        if failed_indices.len() > parity_level {
            return Err(DraidError::TooManyFailures);
        }
        match parity_level {
            1 => {
                // Single parity: XOR of survivors with P recovers one column.
                if parities.is_empty() {
                    return Err(DraidError::TooManyFailures);
                }
                let surviving_refs: Vec<&[u8]> =
                    surviving.iter().map(|(d, _)| d.as_slice()).collect();
                let recovered = GfSolver::reconstruct_z1(&surviving_refs, &parities[0], block_size);
                Ok(vec![recovered])
            }
            2 => {
                if parities.len() < 2 {
                    // Q is missing: one failure can still be solved with P.
                    if failed_indices.len() == 1 && !parities.is_empty() {
                        let surviving_refs: Vec<&[u8]> =
                            surviving.iter().map(|(d, _)| d.as_slice()).collect();
                        let recovered =
                            GfSolver::reconstruct_z1(&surviving_refs, &parities[0], block_size);
                        return Ok(vec![recovered]);
                    }
                    return Err(DraidError::TooManyFailures);
                }
                let surviving_refs: Vec<(&[u8], usize)> = surviving
                    .iter()
                    .map(|(d, idx)| (d.as_slice(), *idx))
                    .collect();
                GfSolver::reconstruct_z2(
                    failed_indices,
                    &surviving_refs,
                    &parities[0],
                    &parities[1],
                    block_size,
                )
                .map_err(|_| DraidError::TooManyFailures)
            }
            3 => {
                if parities.len() < 3 {
                    // R is missing: up to two failures can still use the
                    // double-parity solver.
                    if failed_indices.len() <= 2 && parities.len() >= 2 {
                        let surviving_refs: Vec<(&[u8], usize)> = surviving
                            .iter()
                            .map(|(d, idx)| (d.as_slice(), *idx))
                            .collect();
                        return GfSolver::reconstruct_z2(
                            failed_indices,
                            &surviving_refs,
                            &parities[0],
                            &parities[1],
                            block_size,
                        )
                        .map_err(|_| DraidError::TooManyFailures);
                    }
                    return Err(DraidError::TooManyFailures);
                }
                let surviving_refs: Vec<(&[u8], usize)> = surviving
                    .iter()
                    .map(|(d, idx)| (d.as_slice(), *idx))
                    .collect();
                GfSolver::reconstruct_z3(
                    failed_indices,
                    &surviving_refs,
                    &parities[0],
                    &parities[1],
                    &parities[2],
                    block_size,
                )
                .map_err(|_| DraidError::TooManyFailures)
            }
            // Unreachable for configs built via DraidConfig::new, which
            // only accepts parity levels 1..=3.
            _ => Err(DraidError::InvalidConfig("invalid parity level")),
        }
    }
pub fn mark_disk_failed(&mut self, disk_id: usize) -> Result<(), DraidError> {
if disk_id >= self.config.children {
return Err(DraidError::InvalidConfig("invalid disk ID"));
}
crate::lcpfs_println!(
"[ dRAID ] Disk {} marked as FAULTED (was {:?})",
disk_id,
self.disk_states[disk_id]
);
self.disk_states[disk_id] = DiskState::Faulted;
Ok(())
}
    /// Transitions a faulted disk to `Resilvering` and initializes its
    /// rebuild cursor at block 0.
    pub fn start_rebuild(&mut self, failed_disk: usize) -> Result<(), DraidError> {
        if failed_disk >= self.config.children {
            return Err(DraidError::InvalidConfig("invalid disk ID"));
        }
        // Only a faulted disk can be rebuilt.
        if self.disk_states[failed_disk] != DiskState::Faulted {
            return Err(DraidError::InvalidConfig("disk is not faulted"));
        }
        let failed_count = self
            .disk_states
            .iter()
            .filter(|&&s| matches!(s, DiskState::Faulted | DiskState::Offline))
            .count();
        // Beyond parity_level concurrent failures the data is gone.
        if failed_count > self.config.parity_level as usize {
            return Err(DraidError::TooManyFailures);
        }
        crate::lcpfs_println!(
            "[ dRAID ] Starting rebuild for disk {} (distributed across {} surviving disks)",
            failed_disk,
            self.config.children - failed_count
        );
        self.disk_states[failed_disk] = DiskState::Resilvering;
        self.rebuild_progress.insert(failed_disk, 0);
        Ok(())
    }
    /// Runs up to `batch_size` block reflows for a resilvering disk and
    /// advances its cursor; returns how many blocks were rebuilt this call.
    ///
    /// NOTE(review): on completion the disk is parked `Offline` (its data
    /// now lives in spare space on the survivors), while
    /// `DraidPool::update_rebuild_progress` sets `Online` at 100% — the two
    /// end-states disagree; confirm which is intended.
    pub fn rebuild_step(&mut self, failed_disk: usize, batch_size: u64) -> Result<u64, DraidError> {
        if self.disk_states[failed_disk] != DiskState::Resilvering {
            return Err(DraidError::InvalidConfig("disk is not resilvering"));
        }
        let progress = *self.rebuild_progress.get(&failed_disk).unwrap_or(&0);
        let mut rebuilt = 0u64;
        // Full scan of every row/group; only groups that actually use the
        // failed disk and lie past the saved cursor are reflowed.
        for row in 0..self.blocks_per_disk.max(1) {
            if rebuilt >= batch_size {
                break;
            }
            for group_id in 0..self.config.groups_per_row() {
                let group = self.get_group(row, group_id)?;
                let uses_disk = group.data_columns.contains(&failed_disk)
                    || group.parity_columns.contains(&failed_disk);
                if !uses_disk {
                    continue;
                }
                // Linear block number used as the rebuild cursor.
                let block_num = row * self.config.groups_per_row() as u64 + group_id as u64;
                if block_num < progress {
                    continue;
                }
                if let Ok(()) = self.reflow_block(&group, failed_disk) {
                    rebuilt += 1;
                    self.stats.blocks_rebuilt += 1;
                }
            }
        }
        self.rebuild_progress
            .insert(failed_disk, progress + rebuilt);
        // No work done (or cursor past the end) means the rebuild is done.
        if rebuilt == 0 || progress + rebuilt >= self.blocks_per_disk.max(1) {
            crate::lcpfs_println!("[ dRAID ] Rebuild complete for disk {}", failed_disk);
            self.disk_states[failed_disk] = DiskState::Offline;
            self.rebuild_progress.remove(&failed_disk);
        }
        Ok(rebuilt)
    }
    /// Reconstructs the failed disk's column for one group and writes it
    /// to distributed-spare space on a surviving disk.
    fn reflow_block(&mut self, group: &DraidGroup, failed_disk: usize) -> Result<(), DraidError> {
        // Logical column (data first, then parity) held by the failed disk.
        let failed_col = group
            .data_columns
            .iter()
            .position(|&d| d == failed_disk)
            .or_else(|| {
                group
                    .parity_columns
                    .iter()
                    .position(|&d| d == failed_disk)
                    .map(|p| self.config.data_disks + p)
            });
        let failed_col = match failed_col {
            Some(c) => c,
            // This group doesn't touch the failed disk: nothing to do.
            None => return Ok(()),
        };
        // NOTE(review): blocks are stored keyed by the stripe's logical
        // write offset, but this derives `row_id * block_size`; the two
        // only agree at row 0 / offset 0 — verify the keying scheme.
        let offset = group.row_id * self.config.block_size as u64;
        let mut surviving: Vec<(Vec<u8>, usize)> = Vec::new();
        let mut parities: Vec<Vec<u8>> = Vec::new();
        for (col, &disk) in group.data_columns.iter().enumerate() {
            if disk != failed_disk {
                if let Ok(Some(block)) = self.read_disk_block(disk, offset) {
                    surviving.push((block, col));
                }
            }
        }
        for &disk in &group.parity_columns {
            if disk != failed_disk {
                if let Ok(Some(block)) = self.read_disk_block(disk, offset) {
                    parities.push(block);
                }
            }
        }
        let block_size = self.config.block_size;
        let failed_indices = vec![failed_col];
        let reconstructed =
            self.reconstruct_data(&surviving, &parities, &failed_indices, block_size)?;
        if reconstructed.is_empty() {
            return Ok(());
        }
        // High bit tags the offset as living in distributed-spare space.
        let spare_offset = offset | 0x8000_0000_0000_0000;
        let surviving_disks: Vec<usize> = (0..self.config.children)
            .filter(|&d| {
                d != failed_disk
                    && matches!(self.disk_states[d], DiskState::Online | DiskState::Degraded)
            })
            .collect();
        if surviving_disks.is_empty() {
            return Err(DraidError::TooManyFailures);
        }
        // Deterministically spread spare blocks across the survivors.
        let target_disk = surviving_disks[(spare_offset as usize) % surviving_disks.len()];
        self.write_disk_block(target_disk, spare_offset, &reconstructed[0])?;
        Ok(())
    }
pub fn get_rebuild_progress(&self, disk_id: usize) -> Option<f64> {
self.rebuild_progress.get(&disk_id).map(|&progress| {
if self.blocks_per_disk == 0 {
100.0
} else {
(progress as f64 / self.blocks_per_disk as f64) * 100.0
}
})
}
pub fn rebuild_speedup(&self) -> f64 {
let failed_count = self
.disk_states
.iter()
.filter(|&&s| matches!(s, DiskState::Faulted | DiskState::Resilvering))
.count();
let active_disks = self.config.children - failed_count;
let traditional_disks = self.config.group_width() - 1;
if traditional_disks == 0 {
1.0
} else {
active_disks as f64 / traditional_disks as f64
}
}
fn split_data_to_columns(&self, data: &[u8]) -> Vec<Vec<u8>> {
let block_size = self.config.block_size;
let mut chunks = Vec::with_capacity(self.config.data_disks);
for i in 0..self.config.data_disks {
let start = i * block_size;
let end = ((i + 1) * block_size).min(data.len());
if start < data.len() {
let mut chunk = data[start..end].to_vec();
chunk.resize(block_size, 0);
chunks.push(chunk);
} else {
chunks.push(vec![0u8; block_size]);
}
}
chunks
}
    /// Computes P (and Q/R for double/triple parity) over the data
    /// columns. P is a plain XOR; Q and R are GF(2^8) sums with
    /// per-column coefficients 2^col and 4^col respectively.
    fn compute_parity(&self, data_columns: &[Vec<u8>]) -> Result<Vec<Vec<u8>>, DraidError> {
        let block_size = self.config.block_size;
        let parity_level = self.config.parity_level as usize;
        let mut parities = Vec::with_capacity(parity_level);
        // P: XOR of all data columns.
        let mut p = vec![0u8; block_size];
        for col in data_columns {
            for (i, &byte) in col.iter().enumerate() {
                p[i] ^= byte;
            }
        }
        parities.push(p);
        if parity_level >= 2 {
            // Q: Reed-Solomon row with generator 2.
            let mut q = vec![0u8; block_size];
            for (col_idx, col) in data_columns.iter().enumerate() {
                let coeff = gf_pow_2(col_idx);
                for (i, &byte) in col.iter().enumerate() {
                    q[i] ^= GfAlgo::multiply(byte, coeff);
                }
            }
            parities.push(q);
        }
        if parity_level >= 3 {
            // R: Reed-Solomon row with generator 4.
            let mut r = vec![0u8; block_size];
            for (col_idx, col) in data_columns.iter().enumerate() {
                let coeff = gf_pow_4(col_idx);
                for (i, &byte) in col.iter().enumerate() {
                    r[i] ^= GfAlgo::multiply(byte, coeff);
                }
            }
            parities.push(r);
        }
        Ok(parities)
    }
fn write_disk_block(
&mut self,
disk: usize,
offset: u64,
data: &[u8],
) -> Result<(), DraidError> {
let disk_map = self.block_data.entry(disk).or_default();
disk_map.insert(offset, data.to_vec());
let max_offset = offset / self.config.block_size as u64 + 1;
if max_offset > self.blocks_per_disk {
self.blocks_per_disk = max_offset;
}
Ok(())
}
fn read_disk_block(&self, disk: usize, offset: u64) -> Result<Option<Vec<u8>>, DraidError> {
if matches!(
self.disk_states[disk],
DiskState::Faulted | DiskState::Offline
) {
return Err(DraidError::DiskFailed(disk));
}
Ok(self
.block_data
.get(&disk)
.and_then(|m| m.get(&offset))
.cloned())
}
pub fn can_tolerate_failure(&self) -> bool {
let current_failures = self
.disk_states
.iter()
.filter(|&&s| matches!(s, DiskState::Faulted | DiskState::Offline))
.count();
current_failures < self.config.parity_level as usize
}
pub fn get_status(&self) -> (usize, usize, usize, usize, usize) {
let online = self
.disk_states
.iter()
.filter(|&&s| s == DiskState::Online)
.count();
let degraded = self
.disk_states
.iter()
.filter(|&&s| s == DiskState::Degraded)
.count();
let faulted = self
.disk_states
.iter()
.filter(|&&s| s == DiskState::Faulted)
.count();
let resilvering = self
.disk_states
.iter()
.filter(|&&s| s == DiskState::Resilvering)
.count();
let offline = self
.disk_states
.iter()
.filter(|&&s| s == DiskState::Offline)
.count();
(online, degraded, faulted, resilvering, offline)
}
}
/// Cursor and counters for an in-flight scrub pass.
#[derive(Debug, Clone)]
pub struct ScrubState {
    // Next row to verify.
    pub current_row: u64,
    // Next group within that row.
    pub current_group: usize,
    // Total rows this pass will cover.
    pub total_rows: u64,
    // Parity mismatches detected so far.
    pub errors_found: u64,
    // Mismatches successfully repaired.
    pub errors_repaired: u64,
    // Start timestamp; always set to 0 here (no clock source wired up).
    pub started: u64,
}
impl DraidVdev {
pub fn start_scrub(&mut self) -> Result<(), DraidError> {
if self.scrub_state.is_some() {
return Err(DraidError::ScrubError("scrub already in progress"));
}
let total_rows = self.blocks_per_disk.max(1);
self.scrub_state = Some(ScrubState {
current_row: 0,
current_group: 0,
total_rows,
errors_found: 0,
errors_repaired: 0,
started: 0, });
crate::lcpfs_println!(
"[ dRAID ] Starting scrub ({} rows, {} groups/row)",
total_rows,
self.config.groups_per_row()
);
Ok(())
}
    /// Verifies up to `batch_size` groups starting at the saved scrub
    /// cursor, attempting repair on mismatches; returns how many groups
    /// were checked.
    pub fn scrub_step(&mut self, batch_size: u64) -> Result<u64, DraidError> {
        // Copy the cursor out so `self` isn't borrowed during the scan.
        let (current_row, current_group, total_rows) = {
            let state = match &self.scrub_state {
                Some(s) => s,
                None => return Err(DraidError::ScrubError("no scrub in progress")),
            };
            (state.current_row, state.current_group, state.total_rows)
        };
        let mut row = current_row;
        let mut group_idx = current_group;
        let mut checked = 0u64;
        let mut errors_found = 0u64;
        let mut errors_repaired = 0u64;
        let groups_per_row = self.config.groups_per_row();
        while checked < batch_size {
            if row >= total_rows {
                break;
            }
            let group = self.get_group(row, group_idx)?;
            // NOTE(review): same `row * block_size` keying as reflow_block
            // — it only matches what write_block stored at offset 0;
            // verify the keying scheme.
            let offset = row * self.config.block_size as u64;
            if self.verify_group(&group, offset).is_err() {
                errors_found += 1;
                // Only attempt repair while parity headroom remains.
                if self.can_tolerate_failure() && self.repair_group(&group, offset).is_ok() {
                    errors_repaired += 1;
                }
            }
            checked += 1;
            group_idx += 1;
            if group_idx >= groups_per_row {
                group_idx = 0;
                row += 1;
            }
        }
        // Persist the advanced cursor and accumulated counters.
        if let Some(state) = &mut self.scrub_state {
            state.current_row = row;
            state.current_group = group_idx;
            state.errors_found += errors_found;
            state.errors_repaired += errors_repaired;
        }
        Ok(checked)
    }
fn verify_group(&self, group: &DraidGroup, offset: u64) -> Result<(), DraidError> {
let block_size = self.config.block_size;
let mut data_columns: Vec<Vec<u8>> = Vec::new();
for &disk in &group.data_columns {
match self.read_disk_block(disk, offset) {
Ok(Some(block)) => data_columns.push(block),
Ok(None) => data_columns.push(vec![0u8; block_size]),
Err(_) => return Err(DraidError::IoError),
}
}
let mut parity_columns: Vec<Vec<u8>> = Vec::new();
for &disk in &group.parity_columns {
match self.read_disk_block(disk, offset) {
Ok(Some(block)) => parity_columns.push(block),
Ok(None) => parity_columns.push(vec![0u8; block_size]),
Err(_) => return Err(DraidError::IoError),
}
}
let mut computed_p = vec![0u8; block_size];
for col in &data_columns {
for (i, &byte) in col.iter().enumerate() {
computed_p[i] ^= byte;
}
}
if !parity_columns.is_empty() && computed_p != parity_columns[0] {
return Err(DraidError::ScrubError("P parity mismatch"));
}
Ok(())
}
    /// Attempts to repair one group at `offset` after a verify failure:
    /// if a member data disk errors on read, its column is reconstructed
    /// from the survivors and written back.
    fn repair_group(&mut self, group: &DraidGroup, offset: u64) -> Result<(), DraidError> {
        let mut surviving: Vec<(Vec<u8>, usize)> = Vec::new();
        let mut parities: Vec<Vec<u8>> = Vec::new();
        // Last data disk that failed to read, if any.
        let mut bad_disk: Option<usize> = None;
        for (col, &disk) in group.data_columns.iter().enumerate() {
            match self.read_disk_block(disk, offset) {
                Ok(Some(block)) => surviving.push((block, col)),
                // Unwritten locations count as zero blocks.
                Ok(None) => surviving.push((vec![0u8; self.config.block_size], col)),
                Err(_) => bad_disk = Some(disk),
            }
        }
        for &disk in &group.parity_columns {
            match self.read_disk_block(disk, offset) {
                Ok(Some(block)) => parities.push(block),
                Ok(None) => parities.push(vec![0u8; self.config.block_size]),
                // Unreadable parity simply isn't used for the solve.
                Err(_) => {}
            }
        }
        if let Some(disk) = bad_disk {
            // Temporarily fault the disk so reconstruction treats it as lost.
            let old_state = self.disk_states[disk];
            self.disk_states[disk] = DiskState::Faulted;
            let failed_col = group.data_columns.iter().position(|&d| d == disk);
            if let Some(col) = failed_col {
                let reconstructed =
                    self.reconstruct_data(&surviving, &parities, &[col], self.config.block_size)?;
                if !reconstructed.is_empty() {
                    // Restore the state before writing so the write isn't
                    // rejected for targeting a faulted disk.
                    self.disk_states[disk] = old_state;
                    self.write_disk_block(disk, offset, &reconstructed[0])?;
                }
            }
            // NOTE(review): if reconstruction fails (the `?` above) or the
            // disk held no data column, the disk is left Faulted — confirm
            // that escalation is intended.
        }
        Ok(())
    }
pub fn get_scrub_progress(&self) -> Option<f64> {
self.scrub_state.as_ref().map(|state| {
let total = state.total_rows * self.config.groups_per_row() as u64;
if total == 0 {
100.0
} else {
let current = state.current_row * self.config.groups_per_row() as u64
+ state.current_group as u64;
(current as f64 / total as f64) * 100.0
}
})
}
pub fn finish_scrub(&mut self) -> Option<(u64, u64)> {
self.scrub_state.take().map(|state| {
crate::lcpfs_println!(
"[ dRAID ] Scrub complete: {} errors found, {} repaired",
state.errors_found,
state.errors_repaired
);
(state.errors_found, state.errors_repaired)
})
}
}
/// 2^n in GF(2^8), computed by repeated field multiplication.
fn gf_pow_2(n: usize) -> u8 {
    (0..n).fold(1u8, |acc, _| GfAlgo::multiply(acc, 2))
}

/// 4^n in GF(2^8); the base 4 is derived as 2*2 in the field.
fn gf_pow_4(n: usize) -> u8 {
    let four = GfAlgo::multiply(2, 2);
    (0..n).fold(1u8, |acc, _| GfAlgo::multiply(acc, four))
}
/// Thin pool wrapper around a single dRAID vdev.
pub struct DraidPool {
    vdev: DraidVdev,
}

impl DraidPool {
    /// Creates a pool backed by a fresh vdev built from `config`.
    pub fn new(config: DraidConfig) -> Self {
        Self {
            vdev: DraidVdev::new(config),
        }
    }

    /// Disk permutation used for `stripe_id`'s row.
    pub fn get_stripe_layout(&mut self, stripe_id: u64) -> Vec<usize> {
        self.vdev.perm_gen.get_permutation(stripe_id)
    }

    /// Marks `disk_id` as faulted.
    pub fn mark_failed(&mut self, disk_id: usize) -> Result<(), &'static str> {
        self.vdev
            .mark_disk_failed(disk_id)
            .map_err(|_| "Invalid disk ID")
    }

    /// Starts a rebuild of `failed_disk_id`; on success returns the id of
    /// the next disk round-robin (the nominal reflow target).
    pub fn start_rebuild(&mut self, failed_disk_id: usize) -> Result<usize, &'static str> {
        self.vdev
            .start_rebuild(failed_disk_id)
            .map_err(|_| "Cannot start rebuild")?;
        Ok((failed_disk_id + 1) % self.vdev.config.children)
    }

    /// Records rebuild progress for `disk_id` as a percentage; at 100%
    /// the disk is returned to service and its cursor dropped.
    ///
    /// Fix: an out-of-range `disk_id` previously panicked on the direct
    /// `disk_states[disk_id]` index — every other entry point validates
    /// the id first. Invalid ids are now ignored.
    pub fn update_rebuild_progress(&mut self, disk_id: usize, progress: f64) {
        if disk_id >= self.vdev.config.children {
            return;
        }
        if progress >= 100.0 {
            self.vdev.disk_states[disk_id] = DiskState::Online;
            self.vdev.rebuild_progress.remove(&disk_id);
        } else {
            let blocks = ((progress / 100.0) * self.vdev.blocks_per_disk as f64) as u64;
            self.vdev.rebuild_progress.insert(disk_id, blocks);
        }
    }

    /// Number of disks currently faulted or offline.
    pub fn failed_disk_count(&self) -> usize {
        self.vdev
            .disk_states
            .iter()
            .filter(|&&s| matches!(s, DiskState::Faulted | DiskState::Offline))
            .count()
    }

    /// True while one more disk loss can still be absorbed by parity.
    pub fn can_tolerate_failure(&self) -> bool {
        self.vdev.can_tolerate_failure()
    }

    /// Distributed-rebuild speedup factor versus a traditional rebuild.
    pub fn rebuild_speedup(&self) -> f64 {
        self.vdev.rebuild_speedup()
    }

    /// Per-state disk counts: `(online, degraded, faulted, resilvering, offline)`.
    pub fn get_stats(&self) -> (usize, usize, usize, usize, usize) {
        self.vdev.get_status()
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_draid_config_valid() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
assert_eq!(config.data_disks, 4);
assert_eq!(config.parity_level, 1);
assert_eq!(config.spare_disks, 1);
assert_eq!(config.children, 8);
assert_eq!(config.group_width(), 5); }
#[test]
fn test_draid_config_draid2() {
let config = DraidConfig::new(4, 2, 2, 12).expect("should create valid config");
assert_eq!(config.group_width(), 6); assert_eq!(config.fault_tolerance(), 2);
}
#[test]
fn test_draid_config_draid3() {
let config = DraidConfig::new(8, 3, 3, 24).expect("should create valid config");
assert_eq!(config.group_width(), 11); assert_eq!(config.fault_tolerance(), 3);
}
#[test]
fn test_draid_config_invalid_parity() {
assert!(DraidConfig::new(4, 0, 1, 8).is_err());
assert!(DraidConfig::new(4, 4, 1, 8).is_err());
}
#[test]
fn test_draid_config_not_enough_disks() {
assert!(DraidConfig::new(4, 1, 1, 5).is_err()); }
#[test]
fn test_draid_config_no_spare() {
assert!(DraidConfig::new(4, 1, 0, 8).is_err());
}
#[test]
fn test_draid_config_parse() {
let config = DraidConfig::parse("draid2:4d:10c:2s").expect("should parse");
assert_eq!(config.parity_level, 2);
assert_eq!(config.data_disks, 4);
assert_eq!(config.children, 10);
assert_eq!(config.spare_disks, 2);
}
#[test]
fn test_draid_config_efficiency() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
let eff = config.efficiency();
assert!(eff > 0.5 && eff < 0.6);
}
#[test]
fn test_permutation_deterministic() {
let perm_gen = PermutationGenerator::new(10, 0xDEADBEEF);
let perm1 = perm_gen.get_permutation(42);
let perm2 = perm_gen.get_permutation(42);
assert_eq!(perm1, perm2, "Same row should give same permutation");
}
#[test]
fn test_permutation_different_rows() {
let perm_gen = PermutationGenerator::new(10, 0xDEADBEEF);
let perm1 = perm_gen.get_permutation(0);
let perm2 = perm_gen.get_permutation(1);
assert_ne!(
perm1, perm2,
"Different rows should give different permutations"
);
}
#[test]
fn test_permutation_covers_all_disks() {
let perm_gen = PermutationGenerator::new(10, 0xDEADBEEF);
let perm = perm_gen.get_permutation(0);
assert_eq!(perm.len(), 10);
let mut sorted = perm.clone();
sorted.sort();
let expected: Vec<usize> = (0..10).collect();
assert_eq!(sorted, expected);
}
#[test]
fn test_map_to_disk() {
let perm_gen = PermutationGenerator::new(8, 0xCAFE);
let disk = perm_gen.map_to_disk(5, 3);
assert!(disk < 8);
let disk2 = perm_gen.map_to_disk(5, 3);
assert_eq!(disk, disk2);
}
#[test]
fn test_get_group() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
let vdev = DraidVdev::new(config);
let group = vdev.get_group(0, 0).expect("should get group");
assert_eq!(group.data_columns.len(), 4);
assert_eq!(group.parity_columns.len(), 1);
assert_eq!(group.width(), 5);
}
#[test]
fn test_group_disk_mapping() {
let config = DraidConfig::new(4, 2, 1, 10).expect("should create valid config");
let vdev = DraidVdev::new(config);
let group = vdev.get_group(0, 0).expect("should get group");
for &disk in &group.data_columns {
assert!(disk < 10);
}
for &disk in &group.parity_columns {
assert!(disk < 10);
}
let mut all_disks = group.data_columns.clone();
all_disks.extend(&group.parity_columns);
all_disks.sort();
all_disks.dedup();
assert_eq!(all_disks.len(), 6); }
#[test]
fn test_write_and_read_block() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
let mut vdev = DraidVdev::new(config);
let data: Vec<u8> = (0..512).map(|i| (i % 256) as u8).collect();
vdev.write_block(&data, 0).expect("should write");
let read_data = vdev.read_block(0, 512).expect("should read");
assert_eq!(read_data, data);
}
#[test]
fn test_write_and_read_multiple_blocks() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
let mut vdev = DraidVdev::new(config);
let block_size = config.data_disks * config.block_size;
for i in 0..5 {
let data: Vec<u8> = (0..block_size).map(|j| ((i + j) % 256) as u8).collect();
let offset = i as u64 * block_size as u64;
vdev.write_block(&data, offset).expect("should write");
}
for i in 0..5 {
let expected: Vec<u8> = (0..block_size).map(|j| ((i + j) % 256) as u8).collect();
let offset = i as u64 * block_size as u64;
let read_data = vdev.read_block(offset, block_size).expect("should read");
assert_eq!(read_data, expected);
}
}
#[test]
fn test_degraded_read_draid1() {
let config = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
let mut vdev = DraidVdev::new(config);
let data: Vec<u8> = (0..512).map(|i| (i % 256) as u8).collect();
vdev.write_block(&data, 0).expect("should write");
let group = vdev.get_group(0, 0).expect("should get group");
let failed_disk = group.data_columns[0];
vdev.mark_disk_failed(failed_disk)
.expect("should mark failed");
let read_data = vdev.read_block(0, 512).expect("should read degraded");
assert_eq!(read_data, data);
}
#[test]
fn test_degraded_read_draid2() {
    // Double parity must survive two simultaneous data-column failures.
    let cfg = DraidConfig::new(4, 2, 1, 10).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    let group = vdev.get_group(0, 0).expect("should get group");
    for col in 0..2 {
        vdev.mark_disk_failed(group.data_columns[col])
            .expect("should mark failed");
    }
    let recovered = vdev.read_block(0, 512).expect("should read degraded");
    assert_eq!(recovered, payload);
}
#[test]
fn test_too_many_failures() {
    // Single parity cannot reconstruct after two data-column failures,
    // so the read must report an error instead of returning garbage.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    let group = vdev.get_group(0, 0).expect("should get group");
    for col in 0..2 {
        vdev.mark_disk_failed(group.data_columns[col])
            .expect("should mark failed");
    }
    assert!(vdev.read_block(0, 512).is_err());
}
#[test]
fn test_start_rebuild() {
    // Starting a rebuild on a failed disk should move it into the
    // Resilvering state.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    let failed = 3;
    vdev.mark_disk_failed(failed).expect("should mark failed");
    vdev.start_rebuild(failed).expect("should start rebuild");
    assert_eq!(vdev.disk_states[failed], DiskState::Resilvering);
}
#[test]
fn test_rebuild_speedup() {
    // The distributed-spare layout should yield a rebuild speedup in a
    // narrow expected band for this geometry (strictly between 1.7 and 1.8).
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    vdev.mark_disk_failed(0).expect("should mark failed");
    let speedup = vdev.rebuild_speedup();
    assert!(speedup > 1.7, "speedup too low: {}", speedup);
    assert!(speedup < 1.8, "speedup too high: {}", speedup);
}
#[test]
fn test_can_tolerate_failure() {
    // Double parity tolerates up to two failures; the second failure
    // exhausts the redundancy budget.
    let cfg = DraidConfig::new(4, 2, 1, 10).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    assert!(vdev.can_tolerate_failure());
    vdev.mark_disk_failed(0).expect("should mark failed");
    assert!(vdev.can_tolerate_failure());
    vdev.mark_disk_failed(1).expect("should mark failed");
    assert!(!vdev.can_tolerate_failure());
}
#[test]
fn test_scrub_start_and_progress() {
    // Once a scrub is running, progress must be queryable.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    vdev.start_scrub().expect("should start scrub");
    assert!(vdev.get_scrub_progress().is_some());
}
#[test]
fn test_scrub_double_start_fails() {
    // A second start_scrub while one is in flight must be rejected.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    vdev.start_scrub().expect("should start scrub");
    assert!(vdev.start_scrub().is_err());
}
#[test]
fn test_scrub_complete() {
    // Drive a scrub to completion in 100-block steps; a freshly written,
    // healthy vdev should finish with zero errors and zero repairs.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    vdev.start_scrub().expect("should start scrub");
    // scrub_step returns how many blocks it checked; 0 means done.
    while vdev.scrub_step(100).expect("should scrub") != 0 {}
    let (errors, repaired) = vdev.finish_scrub().expect("should have scrub state");
    assert_eq!(errors, 0);
    assert_eq!(repaired, 0);
}
#[test]
fn test_legacy_draid_pool() {
    // Stripe layouts cover all children and differ between rows
    // (the permutation actually rotates the mapping).
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    let first = pool.get_stripe_layout(0);
    let second = pool.get_stripe_layout(1);
    assert_eq!(first.len(), 10);
    assert_eq!(second.len(), 10);
    assert_ne!(first, second);
}
#[test]
fn test_legacy_disk_failure() {
    // Marking one disk failed must be reflected in the failure count.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    pool.mark_failed(5).expect("should mark failed");
    assert_eq!(pool.failed_disk_count(), 1);
}
#[test]
fn test_legacy_rebuild() {
    // Rebuild should hand back a spare slot: a valid child index that is
    // not the failed disk itself.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    let failed = 5;
    pool.mark_failed(failed).expect("should mark failed");
    let spare = pool.start_rebuild(failed).expect("should start rebuild");
    assert!(spare < 10);
    assert_ne!(spare, failed);
}
#[test]
fn test_legacy_failure_tolerance() {
    // Parity 2: tolerant with zero or one failure, not after the second.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    assert!(pool.can_tolerate_failure());
    pool.mark_failed(0).expect("should mark failed");
    assert!(pool.can_tolerate_failure());
    pool.mark_failed(1).expect("should mark failed");
    assert!(!pool.can_tolerate_failure());
}
#[test]
fn test_legacy_rebuild_speedup() {
    // For this geometry the expected rebuild speedup sits strictly
    // between 1.4 and 1.5.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let pool = DraidPool::new(cfg);
    let speedup = pool.rebuild_speedup();
    assert!(speedup > 1.4, "speedup too low: {}", speedup);
    assert!(speedup < 1.5, "speedup too high: {}", speedup);
}
#[test]
fn test_legacy_stats() {
    // After one failure plus a started rebuild, the per-state counters
    // must still account for every child, with exactly one resilvering.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    pool.mark_failed(5).expect("should mark failed");
    pool.start_rebuild(5).expect("should start rebuild");
    let (online, degraded, faulted, resilvering, offline) = pool.get_stats();
    assert_eq!(online + degraded + faulted + resilvering + offline, 10);
    assert_eq!(resilvering, 1);
}
#[test]
fn test_legacy_permutation_deterministic() {
    // Asking for the same row twice must yield an identical layout —
    // the permutation is a pure function of the row index.
    let cfg = DraidConfig::new(6, 2, 2, 10).expect("should create valid config");
    let mut pool = DraidPool::new(cfg);
    let row = 42;
    assert_eq!(pool.get_stripe_layout(row), pool.get_stripe_layout(row));
}
#[test]
fn test_parity_computation_z1() {
    // Single parity is a plain XOR: 0x11 ^ 0x22 ^ 0x33 ^ 0x44 == 0x44.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let vdev = DraidVdev::new(cfg);
    let columns: Vec<Vec<u8>> = [0x11u8, 0x22, 0x33, 0x44]
        .iter()
        .map(|&fill| vec![fill; 128 * 1024])
        .collect();
    let parities = vdev.compute_parity(&columns).expect("should compute");
    assert_eq!(parities.len(), 1);
    assert_eq!(parities[0][0], 0x44);
}
#[test]
fn test_parity_computation_z2() {
    // Double parity: P is the XOR row (0x44 here); Q is a distinct
    // Galois-field combination that must be non-zero for this input.
    let cfg = DraidConfig::new(4, 2, 1, 10).expect("should create valid config");
    let vdev = DraidVdev::new(cfg);
    let columns: Vec<Vec<u8>> = [0x11u8, 0x22, 0x33, 0x44]
        .iter()
        .map(|&fill| vec![fill; 128 * 1024])
        .collect();
    let parities = vdev.compute_parity(&columns).expect("should compute");
    assert_eq!(parities.len(), 2);
    assert_eq!(parities[0][0], 0x44);
    assert_ne!(parities[1][0], 0);
}
#[test]
fn test_parity_computation_z3() {
    // Triple parity must yield three parity columns.
    let cfg = DraidConfig::new(4, 3, 1, 12).expect("should create valid config");
    let vdev = DraidVdev::new(cfg);
    let columns: Vec<Vec<u8>> = [0x11u8, 0x22, 0x33, 0x44]
        .iter()
        .map(|&fill| vec![fill; 128 * 1024])
        .collect();
    let parities = vdev.compute_parity(&columns).expect("should compute");
    assert_eq!(parities.len(), 3);
}
#[test]
fn test_stats_tracking() {
    // One write then one read must bump the matching op counters and
    // record non-zero byte totals.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    assert_eq!(vdev.stats().writes, 1);
    assert!(vdev.stats().bytes_written > 0);
    vdev.read_block(0, 512).expect("should read");
    assert_eq!(vdev.stats().reads, 1);
    assert!(vdev.stats().bytes_read > 0);
}
#[test]
fn test_reconstruction_stats() {
    // A degraded read should be counted as a reconstruction read.
    let cfg = DraidConfig::new(4, 1, 1, 8).expect("should create valid config");
    let mut vdev = DraidVdev::new(cfg);
    let payload: Vec<u8> = (0..512).map(|b| (b % 256) as u8).collect();
    vdev.write_block(&payload, 0).expect("should write");
    let victim = vdev.get_group(0, 0).expect("should get group").data_columns[0];
    vdev.mark_disk_failed(victim).expect("should mark failed");
    vdev.read_block(0, 512).expect("should read degraded");
    assert_eq!(vdev.stats().reconstruction_reads, 1);
}
}