use alloc::{vec, vec::Vec};
use crate::{
block_read::{read_exact, BlockRead, PoolMember},
cksum::verify_block,
compress::decompress,
error::{Error, Location, Result},
phys::{BlockPointer, BlockPointerRegular, Dva},
};
// Bytes added to every on-device offset before a read is issued (4 MiB).
// NOTE(review): presumably the front label/boot area of a ZFS leaf vdev — confirm.
const LABEL_RESERVE: u64 = 4 * 1024 * 1024;
// log2 of the 512-byte unit in which DVA offsets and block-pointer sizes are expressed.
const SECTOR_SHIFT: u32 = 9;
// Largest `ashift` (log2 of the device sector size) that `raidz_map` accepts.
const ASHIFT_MAX: u8 = 16;
/// Layout of the pool members that block reads are directed at.
///
/// Every `usize` stored here is used as an index into the `members` slice
/// handed to `read_block_pointer`.
pub(crate) enum Topology {
/// One device; the value is its index in `members`.
Single(usize),
/// Several devices that are tried one after another, in the stored order,
/// until one yields a copy that passes the checksum.
Mirror(Vec<usize>),
/// Single-parity RAID-Z.
RaidZ1 {
/// One entry per RAID-Z column device. `None` entries are treated as
/// unreadable columns by the RAID-Z read path.
children: Vec<Option<usize>>,
/// log2 of the device sector size used to compute the column layout.
ashift: u8,
},
}
impl Topology {
fn read_order(&self) -> &[usize] {
match self {
Topology::Single(idx) => core::slice::from_ref(idx),
Topology::Mirror(children) => children,
Topology::RaidZ1 { .. } => &[],
}
}
}
/// Converts a DVA offset counted in 512-byte sectors into the byte offset
/// to read on the device, i.e. `sector_offset * 512 + LABEL_RESERVE`.
///
/// Returns `None` when either the multiplication or the addition would
/// overflow `u64`.
fn dva_byte_offset(sector_offset: u64) -> Option<u64> {
    let payload_bytes = sector_offset.checked_mul(1 << SECTOR_SHIFT)?;
    payload_bytes.checked_add(LABEL_RESERVE)
}
pub(crate) fn read_block_pointer<R: BlockRead>(
members: &mut [PoolMember<R>],
topo: &Topology,
bp: &BlockPointer,
) -> Result<Vec<u8>> {
match bp {
BlockPointer::Embedded(e) => {
let src = e
.payload
.get(..e.physical_size)
.ok_or(Error::Inconsistent {
token: "embedded_short",
where_: Location::Mos,
})?;
decompress(e.compression, src, e.logical_size)
}
BlockPointer::Encrypted(_) => Err(Error::UnsupportedFeature("encryption")),
BlockPointer::Regular(r) => {
let lsize = (r.logical_sectors as usize) << SECTOR_SHIFT;
let psize = (r.physical_sectors as usize) << SECTOR_SHIFT;
if psize == 0 || psize > crate::compress::MAX_BLOCK_LSIZE {
return Err(Error::Inconsistent {
token: "bp_bad_psize",
where_: Location::Mos,
});
}
if let Topology::RaidZ1 { children, ashift } = topo {
return raidz1_read(members, children, *ashift, r, psize, lsize);
}
let mut last = Error::Inconsistent {
token: "bp_no_readable_copy",
where_: Location::Mos,
};
for dva in r.dvas.iter().flatten() {
let Some(byte) = dva_byte_offset(dva.offset) else {
last = Error::Inconsistent {
token: "dva_out_of_range",
where_: Location::Mos,
};
continue;
};
for &mi in topo.read_order() {
let Some(member) = members.get_mut(mi) else {
continue;
};
let mut raw = vec![0u8; psize];
if read_exact(
&mut member.reader,
byte,
&mut raw,
u64::from(dva.vdev),
"io_block",
)
.is_err()
{
last = Error::Io {
token: "io_block",
vdev: u64::from(dva.vdev),
offset: byte,
};
continue;
}
match verify_block(
r.checksum_type,
r.order,
&raw,
&r.checksum_value,
u64::from(dva.vdev),
dva.offset,
) {
Ok(()) => return decompress(r.compression, &raw, lsize),
Err(e) => last = e,
}
}
}
Err(last)
}
}
}
/// One column of a RAID-Z stripe as laid out by `raidz_map`.
struct RaidzCol {
// Index into the RAID-Z `children` list naming the device holding this column.
devidx: usize,
// Byte offset of the column on that device, before `LABEL_RESERVE` is added.
offset: u64,
// Number of bytes stored in this column.
size: usize,
// True for parity columns; these are skipped when the data is assembled.
parity: bool,
}
/// Computes the RAID-Z column layout of the block addressed by `dva`.
///
/// The arithmetic follows OpenZFS `vdev_raidz_map_alloc`: the block starts at
/// device-sector `b`, occupies `s` device sectors, begins on column `f` at
/// per-device byte offset `o`, and is spread over `q` full rows plus `r`
/// leftover sectors. The first `nparity` returned columns are parity.
///
/// For single parity the device and offset of the first two columns are
/// exchanged whenever bit 20 of the block's byte offset is set, matching the
/// parity rotation OpenZFS applies every 1 MiB. Column 0 stays the parity
/// column; only the place it is read from changes.
///
/// * `dva` - address of the block; `dva.offset` is in 512-byte sectors.
/// * `ashift` - log2 of the device sector size, accepted in `9..=ASHIFT_MAX`.
/// * `dcols` - total number of columns (devices) in the RAID-Z vdev.
/// * `nparity` - number of parity columns; must be smaller than `dcols`.
/// * `psize` - physical size of the block in bytes.
///
/// # Errors
///
/// Returns `Error::Inconsistent` with token `raidz_geom` when the geometry is
/// invalid, the block is empty, or any derived offset or size does not fit
/// its integer type. Inputs come from on-disk structures and are not trusted,
/// so every potentially overflowing step is checked instead of wrapping or
/// panicking.
#[expect(
    clippy::many_single_char_names,
    reason = "b/s/f/o/q/r mirror the variable names in ZFS vdev_raidz_map_alloc"
)]
fn raidz_map(
    dva: &Dva,
    ashift: u8,
    dcols: u64,
    nparity: u64,
    psize: usize,
) -> Result<Vec<RaidzCol>> {
    // Bit 20 of the byte offset, expressed in 512-byte sector units.
    const PARITY_ROTATE_BIT: u64 = 1 << (20 - 9);

    let geom = || Error::Inconsistent {
        token: "raidz_geom",
        where_: Location::Vdev { guid: 0 },
    };
    if dcols <= nparity || !(9..=ASHIFT_MAX).contains(&ashift) {
        return Err(geom());
    }
    let ash = u32::from(ashift);
    let secsize = 1u64 << ash;

    // Starting device sector of the block.
    let b = dva.offset >> (ash - 9);
    // Block size in device sectors, rounded up.
    let padded = u64::try_from(psize)
        .ok()
        .and_then(|bytes| bytes.checked_add(secsize - 1))
        .ok_or_else(geom)?;
    let s = padded >> ash;
    if s == 0 {
        return Err(geom());
    }

    // First column of the stripe and the byte offset of its row on each device.
    let f = b % dcols;
    let o = (b / dcols).checked_mul(secsize).ok_or_else(geom)?;

    // Full rows, leftover sectors, and the number of "big" columns that carry
    // one extra sector because of the leftover.
    let data_cols = dcols - nparity;
    let q = s / data_cols;
    let r = s - q * data_cols;
    let bc = if r == 0 { 0 } else { r + nparity };
    let acols = if q == 0 { bc } else { dcols };

    let mut cols = Vec::with_capacity(usize::try_from(acols).map_err(|_| geom())?);
    for c in 0..acols {
        let col = f + c;
        // Columns past the last device wrap to the first device, one sector
        // further down.
        let (dev, coff) = if col >= dcols {
            (col - dcols, o.checked_add(secsize).ok_or_else(geom)?)
        } else {
            (col, o)
        };
        let sectors = if c < bc { q + 1 } else { q };
        let size = sectors
            .checked_mul(secsize)
            .and_then(|bytes| usize::try_from(bytes).ok())
            .ok_or_else(geom)?;
        cols.push(RaidzCol {
            devidx: usize::try_from(dev).map_err(|_| geom())?,
            offset: coff,
            size,
            parity: c < nparity,
        });
    }

    // Single-parity rotation: swap where columns 0 and 1 live on disk.
    if nparity == 1 && (dva.offset & PARITY_ROTATE_BIT) != 0 {
        if let [parity, first_data, ..] = cols.as_mut_slice() {
            core::mem::swap(&mut parity.devidx, &mut first_data.devidx);
            core::mem::swap(&mut parity.offset, &mut first_data.offset);
        }
    }
    Ok(cols)
}
/// Reads every column in `cols` from its backing member.
///
/// The result has one entry per column, in the same order. An entry is
/// `None` when the column cannot be read: its device index has no entry in
/// `children`, that entry is `None`, the byte offset overflows once
/// `LABEL_RESERVE` is added, the member index is out of range, or
/// `read_exact` fails.
///
/// `vdev` is only forwarded to `read_exact`.
fn raidz1_read_columns<R: BlockRead>(
    members: &mut [PoolMember<R>],
    children: &[Option<usize>],
    cols: &[RaidzCol],
    vdev: u32,
) -> Vec<Option<Vec<u8>>> {
    let vdev_id = u64::from(vdev);
    let mut out = Vec::with_capacity(cols.len());
    for col in cols {
        // Resolve which member to read and where, before touching any device.
        let target = children
            .get(col.devidx)
            .copied()
            .flatten()
            .zip(LABEL_RESERVE.checked_add(col.offset));
        let fetched = target.and_then(|(mi, byte)| {
            let member = members.get_mut(mi)?;
            let mut buf = vec![0u8; col.size];
            match read_exact(&mut member.reader, byte, &mut buf, vdev_id, "io_raidz") {
                Ok(_) => Some(buf),
                Err(_) => None,
            }
        });
        out.push(fetched);
    }
    out
}
/// Concatenates the data columns of a stripe into one buffer of exactly
/// `psize` bytes.
///
/// Parity columns are skipped. The column at `replace_idx`, if any, takes
/// its bytes from `replacement` instead of `colbufs`.
///
/// Returns `None` when a needed column buffer is missing, when
/// `replace_idx` names a data column but `replacement` is `None`, or when
/// the concatenated data is shorter than `psize`. Bytes beyond `psize` are
/// dropped.
fn raidz1_assemble(
    cols: &[RaidzCol],
    colbufs: &[Option<Vec<u8>>],
    psize: usize,
    replace_idx: Option<usize>,
    replacement: Option<&[u8]>,
) -> Option<Vec<u8>> {
    let mut data = Vec::with_capacity(psize);
    let data_columns = cols.iter().enumerate().filter(|(_, col)| !col.parity);
    for (idx, _) in data_columns {
        let chunk: &[u8] = if replace_idx == Some(idx) {
            replacement?
        } else {
            colbufs.get(idx)?.as_deref()?
        };
        data.extend_from_slice(chunk);
    }
    (data.len() >= psize).then(|| {
        data.truncate(psize);
        data
    })
}
fn raidz1_reconstruct(
cols: &[RaidzCol],
colbufs: &[Option<Vec<u8>>],
target: usize,
) -> Option<Vec<u8>> {
let parity = colbufs.first()?.as_deref()?;
let size = cols.get(target)?.size;
let mut rebuilt = vec![0u8; size];
let n = size.min(parity.len());
rebuilt[..n].copy_from_slice(&parity[..n]);
for (i, _col) in cols.iter().enumerate() {
if i == 0 || i == target {
continue; }
let other = colbufs.get(i)?.as_deref()?;
for (k, b) in other.iter().enumerate().take(size) {
rebuilt[k] ^= b;
}
}
Some(rebuilt)
}
/// Reads a regular block from a single-parity RAID-Z vdev.
///
/// For each DVA in turn the column layout is computed with `raidz_map` and
/// all columns are read. The data columns are first tried exactly as read;
/// if that does not pass `verify_block`, each data column in order is
/// rebuilt from parity and the block is re-checked. The first candidate that
/// verifies is decompressed to `lsize` bytes and returned.
///
/// # Errors
///
/// Returns the failure of the last DVA tried: the `raidz_map` error, or
/// `Error::ChecksumMismatch` with `what: "raidz_unrecoverable"` when no
/// candidate verified. Returns `Error::Inconsistent` with token
/// `raidz_no_copy` when the pointer has no DVA at all. Errors from
/// `decompress` on a verified candidate are returned as they are.
fn raidz1_read<R: BlockRead>(
    members: &mut [PoolMember<R>],
    children: &[Option<usize>],
    ashift: u8,
    r: &BlockPointerRegular,
    psize: usize,
    lsize: usize,
) -> Result<Vec<u8>> {
    let width = children.len() as u64;
    let mut last = Error::Inconsistent {
        token: "raidz_no_copy",
        where_: Location::Vdev { guid: 0 },
    };

    for dva in r.dvas.iter().flatten() {
        let cols = match raidz_map(dva, ashift, width, 1, psize) {
            Ok(layout) => layout,
            Err(e) => {
                last = e;
                continue;
            }
        };
        let colbufs = raidz1_read_columns(members, children, &cols, dva.vdev);

        let checksum_ok = |raw: &[u8]| {
            verify_block(
                r.checksum_type,
                r.order,
                raw,
                &r.checksum_value,
                u64::from(dva.vdev),
                dva.offset,
            )
            .is_ok()
        };

        // Fast path: the data columns exactly as they came off the devices.
        let as_read = raidz1_assemble(&cols, &colbufs, psize, None, None)
            .filter(|raw| checksum_ok(raw.as_slice()));
        if let Some(raw) = as_read {
            return decompress(r.compression, &raw, lsize);
        }

        // Slow path: assume one data column is bad or missing, rebuild it
        // from parity, and see whether the result verifies.
        let repaired = cols
            .iter()
            .enumerate()
            .filter(|(_, col)| !col.parity)
            .find_map(|(i, _)| {
                let rebuilt = raidz1_reconstruct(&cols, &colbufs, i)?;
                raidz1_assemble(&cols, &colbufs, psize, Some(i), Some(&rebuilt))
                    .filter(|raw| checksum_ok(raw.as_slice()))
            });
        if let Some(raw) = repaired {
            return decompress(r.compression, &raw, lsize);
        }

        last = Error::ChecksumMismatch {
            vdev: u64::from(dva.vdev),
            offset: dva.offset,
            what: "raidz_unrecoverable",
        };
    }

    Err(last)
}
// Unit tests for the pure helpers in this file; nothing here touches a device.
#[cfg(test)]
mod tests {
use super::*;
// Sector offsets are scaled by 512 and shifted past LABEL_RESERVE;
// an offset whose byte value does not fit in u64 yields None.
#[test]
fn dva_offset_maps_past_the_label_reserve() {
assert_eq!(dva_byte_offset(0), Some(LABEL_RESERVE));
assert_eq!(dva_byte_offset(1), Some(LABEL_RESERVE + 512));
assert_eq!(dva_byte_offset(1 << 60), None);
}
// Single and Mirror expose their member indices unchanged and in order.
#[test]
fn read_order_single_and_mirror() {
assert_eq!(Topology::Single(2).read_order(), &[2]);
assert_eq!(Topology::Mirror(vec![1, 3, 5]).read_order(), &[1, 3, 5]);
}
// Out-of-range ashift values and dcols <= nparity are rejected;
// a valid 3-wide single-parity layout has exactly one parity column.
#[test]
fn raidz_map_rejects_hostile_geometry() {
let dva = Dva {
allocated: 8,
offset: 1024,
is_gang: false,
vdev: 0,
};
// ashift above ASHIFT_MAX.
assert!(raidz_map(&dva, 200, 3, 1, 4096).is_err());
// ashift below 9.
assert!(raidz_map(&dva, 8, 3, 1, 4096).is_err());
// No room for a data column: dcols == nparity.
assert!(raidz_map(&dva, 12, 1, 1, 4096).is_err());
let cols = raidz_map(&dva, 12, 3, 1, 4096).unwrap();
assert_eq!(cols.iter().filter(|c| c.parity).count(), 1);
}
}