use std::{
collections::{BTreeMap, HashMap, HashSet},
mem::size_of,
num::NonZeroUsize,
os::unix::ffi::OsStrExt,
};
use log::trace;
use xxhash_rust::xxh32::xxh32;
use zerocopy::{Immutable, IntoBytes};
use crate::{
erofs::{
composefs::OverlayMetacopy,
format::{self, FormatEpoch},
reader::round_up,
},
fsverity::FsVerityHashValue,
generic_tree::LeafId,
tree,
};
pub struct ValidatedFileSystem<ObjectID: FsVerityHashValue>(pub(crate) tree::FileSystem<ObjectID>);
impl<ObjectID: FsVerityHashValue + std::fmt::Debug> std::fmt::Debug
for ValidatedFileSystem<ObjectID>
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_tuple("ValidatedFileSystem").field(&self.0).finish()
}
}
impl<ObjectID: FsVerityHashValue> ValidatedFileSystem<ObjectID> {
pub fn new(fs: tree::FileSystem<ObjectID>) -> anyhow::Result<Self> {
validate_filesystem(&fs)?;
Ok(Self(fs))
}
}
impl<ObjectID: FsVerityHashValue> std::ops::Deref for ValidatedFileSystem<ObjectID> {
type Target = tree::FileSystem<ObjectID>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
pub(crate) fn validate_filesystem<ObjectID: FsVerityHashValue>(
fs: &tree::FileSystem<ObjectID>,
) -> anyhow::Result<()> {
fs.fsck()
.map_err(|e| anyhow::anyhow!("invalid composefs filesystem: {e}"))?;
let nlinks = fs.nlinks();
for (idx, leaf) in fs.leaves.iter().enumerate() {
if matches!(leaf.content, tree::LeafContent::CharacterDevice(0)) {
let nlink = nlinks[idx];
if nlink > 1 {
anyhow::bail!("invalid composefs filesystem: whiteout inode has nlink > 1");
}
}
}
Ok(())
}
const INODE_SLOT_SIZE: usize = 32;
const XATTR_WORD_SIZE: usize = size_of::<u32>();
const INODE_XATTR_HEADER_SIZE: usize = size_of::<format::InodeXAttrHeader>();
fn block_offset(pos: u64) -> u64 {
pos % u64::from(format::BLOCK_SIZE)
}
fn bytes_to_block_boundary(pos: u64) -> Option<u64> {
let offset = block_offset(pos);
if offset == 0 {
return None;
}
let block_size = u64::from(format::BLOCK_SIZE);
let padding = block_size
.checked_sub(offset)
.expect("block_offset(pos) < BLOCK_SIZE by construction");
debug_assert!(padding >= 1 && padding < block_size);
Some(padding)
}
#[cfg(test)]
pub(crate) struct WriterFaults {
rng: rand::rngs::SmallRng,
pub skip_symlink_pad_rate: f64,
decisions: Vec<bool>,
replay_idx: usize,
replaying: bool,
}
#[cfg(test)]
impl WriterFaults {
pub fn new(seed: u64) -> Self {
use rand::SeedableRng;
Self {
rng: rand::rngs::SmallRng::seed_from_u64(seed),
skip_symlink_pad_rate: 0.0,
decisions: Vec::new(),
replay_idx: 0,
replaying: false,
}
}
pub(crate) fn start_replay(&mut self) {
self.replaying = true;
self.replay_idx = 0;
}
fn should_skip_symlink_pad(&mut self) -> bool {
if self.replaying {
let decision = self.decisions[self.replay_idx];
self.replay_idx += 1;
decision
} else {
use rand::RngExt;
let decision = self.rng.random::<f64>() < self.skip_symlink_pad_rate;
self.decisions.push(decision);
decision
}
}
}
struct WriteContext {
version: format::FormatVersion,
min_mtime: (u64, u32),
header_flags: u32,
composefs_version: u32,
#[cfg(test)]
faults: Option<WriterFaults>,
}
trait Output {
fn note_header_emitted(&mut self);
fn note_superblock_emitted(&mut self);
fn note_inode(&mut self);
fn note_inodes_end(&mut self);
fn note_xattr(&mut self);
fn note_block(&mut self);
fn note_end(&mut self);
fn get_inode_offset(&self, idx: usize) -> Option<NonZeroUsize>;
fn get_inodes_end(&self) -> Option<NonZeroUsize>;
fn get_xattr_offset(&self, idx: usize) -> Option<NonZeroUsize>;
fn get_block_offset(&self, idx: usize) -> Option<NonZeroUsize>;
fn get_end(&self) -> Option<NonZeroUsize>;
fn write(&mut self, data: &[u8]);
fn pad(&mut self, alignment: usize);
fn len(&self) -> usize;
fn write_zeros(&mut self, n: usize) {
const BUF: [u8; 1024] = [0u8; 1024];
let mut remaining = n;
while remaining > 0 {
let chunk = remaining.min(BUF.len());
self.write(&BUF[..chunk]);
remaining -= chunk;
}
}
fn write_composefs_header(&mut self, hdr: format::ComposefsHeader) {
self.note_header_emitted();
self.write(hdr.as_bytes());
self.pad(1024);
}
fn write_superblock(&mut self, sb: format::Superblock) {
self.note_superblock_emitted();
self.write(sb.as_bytes());
}
fn write_struct(&mut self, st: impl IntoBytes + Immutable) {
self.write(st.as_bytes());
}
fn get_nid(&self, idx: usize) -> u64 {
let Some(offset) = self.get_inode_offset(idx) else {
return 0;
};
assert_eq!(offset.get() % INODE_SLOT_SIZE, 0);
(offset.get() / INODE_SLOT_SIZE) as u64
}
fn get_xattr_v1(&self, idx: usize) -> u32 {
let (Some(absolute_offset), Some(inodes_end)) =
(self.get_xattr_offset(idx), self.get_inodes_end())
else {
return 0;
};
let (absolute_offset, inodes_end) = (absolute_offset.get(), inodes_end.get());
let offset_within_block = inodes_end % format::BLOCK_SIZE as usize;
let xattr_offset_from_inodes_end = absolute_offset
.checked_sub(inodes_end)
.expect("shared xattr offset must be >= inode table end");
let raw_ref = (offset_within_block + xattr_offset_from_inodes_end) / XATTR_WORD_SIZE;
raw_ref
.try_into()
.expect("xattr reference index exceeds u32::MAX")
}
fn get_xattr_v2(&self, idx: usize) -> u32 {
let Some(offset) = self.get_xattr_offset(idx) else {
return 0;
};
assert_eq!(offset.get() % XATTR_WORD_SIZE, 0);
(offset.get() / XATTR_WORD_SIZE)
.try_into()
.expect("xattr reference index exceeds u32::MAX")
}
fn get_block_start(&self, idx: usize) -> usize {
self.get_block_offset(idx).map_or(0, NonZeroUsize::get)
}
fn get_xattr_blkaddr(&self) -> u32 {
self.get_inodes_end()
.map_or(0, |end| (end.get() / format::BLOCK_SIZE as usize) as u32)
}
fn get_block_count(&self) -> u32 {
self.get_end()
.map_or(0, |end| (end.get() / format::BLOCK_SIZE as usize) as u32)
}
}
#[derive(PartialOrd, PartialEq, Eq, Ord, Clone)]
struct XAttr {
prefix: u8,
suffix: Box<[u8]>,
value: Box<[u8]>,
}
impl XAttr {
fn cmp_by_full_key(&self, other: &Self) -> std::cmp::Ordering {
let self_key = format::XATTR_PREFIXES[self.prefix as usize]
.iter()
.chain(self.suffix.iter());
let other_key = format::XATTR_PREFIXES[other.prefix as usize]
.iter()
.chain(other.suffix.iter());
self_key.cmp(other_key).then_with(|| {
self.value
.len()
.cmp(&other.value.len())
.then_with(|| self.value.cmp(&other.value))
})
}
}
#[derive(Clone, Default)]
struct InodeXAttrs {
shared: Vec<usize>,
local: Vec<XAttr>,
filter: u32,
}
type InodeIdx = usize;
#[derive(Debug, Clone, Copy)]
enum InodeRef {
Known(InodeIdx),
Deferred(LeafId),
}
#[derive(Debug)]
struct DirEnt<'a> {
name: &'a [u8],
inode: InodeRef,
file_type: format::FileType,
}
struct InodeMeta {
layout: format::DataLayout,
i_u: u32,
size: u64,
nlink: usize,
}
#[derive(Debug, Default)]
struct Directory<'a> {
blocks: Box<[Box<[DirEnt<'a>]>]>,
inline: Box<[DirEnt<'a>]>,
size: u64,
nlink: usize,
}
#[derive(Debug)]
struct Leaf<'a, ObjectID: FsVerityHashValue> {
content: &'a tree::LeafContent<ObjectID>,
nlink: usize,
}
#[derive(Debug)]
enum InodeContent<'a, ObjectID: FsVerityHashValue> {
Directory(Directory<'a>),
Leaf(Leaf<'a, ObjectID>),
}
struct Inode<'a, ObjectID: FsVerityHashValue> {
stat: &'a tree::Stat,
xattrs: InodeXAttrs,
content: InodeContent<'a, ObjectID>,
escaped_whiteout: bool,
}
impl XAttr {
pub fn write(&self, output: &mut impl Output) {
output.write_struct(format::XAttrHeader {
name_len: self.suffix.len() as u8,
name_index: self.prefix,
value_size: (self.value.len() as u16).into(),
});
output.write(&self.suffix);
output.write(&self.value);
output.pad(XATTR_WORD_SIZE);
}
}
impl InodeXAttrs {
fn byte_size(&self, version: format::FormatVersion) -> usize {
let mut counter = FirstPass::default();
self.write(&mut counter, version);
counter.offset
}
fn add(&mut self, name: &[u8], value: &[u8], version: format::FormatVersion) {
for (idx, prefix) in format::XATTR_PREFIXES.iter().enumerate().rev() {
if version.epoch() == FormatEpoch::Epoch1 && idx == 5 {
continue;
}
if let Some(suffix) = name.strip_prefix(*prefix) {
self.filter |= 1 << (xxh32(suffix, format::XATTR_FILTER_SEED + idx as u32) % 32);
self.local.push(XAttr {
prefix: idx as u8,
suffix: Box::from(suffix),
value: Box::from(value),
});
return;
}
}
unreachable!("{:?}", std::str::from_utf8(name)); }
fn write(&self, output: &mut impl Output, version: format::FormatVersion) {
if self.filter != 0 {
trace!(" write xattrs block");
output.write_struct(format::InodeXAttrHeader {
name_filter: (!self.filter).into(),
shared_count: self.shared.len() as u8,
..Default::default()
});
for idx in &self.shared {
trace!(" shared {} @{}", idx, output.len());
let xattr_ref = match version.epoch() {
FormatEpoch::Epoch1 => output.get_xattr_v1(*idx),
FormatEpoch::Epoch2 => output.get_xattr_v2(*idx),
};
output.write(&xattr_ref.to_le_bytes());
}
for attr in &self.local {
trace!(" local @{}", output.len());
attr.write(output);
}
}
}
}
impl<'a> Directory<'a> {
pub fn from_entries(entries: Vec<DirEnt<'a>>) -> Self {
let mut blocks = vec![];
let mut rest = vec![];
let mut n_bytes = 0u64;
let mut nlink = 0;
trace!("Directory with {} items", entries.len());
for entry in entries.into_iter() {
let entry_size: u64 = (size_of::<format::DirectoryEntryHeader>() + entry.name.len())
.try_into()
.unwrap();
assert!(entry_size <= 4096);
trace!(" {:?}", entry.file_type);
if matches!(entry.file_type, format::FileType::Directory) {
nlink += 1;
}
n_bytes += entry_size;
if n_bytes <= 4096 {
rest.push(entry);
} else {
trace!(" block {}", rest.len());
blocks.push(rest.into_boxed_slice());
rest = vec![entry];
n_bytes = entry_size;
}
}
if n_bytes > 2048 {
blocks.push(rest.into_boxed_slice());
rest = vec![];
n_bytes = 0;
}
trace!(
" blocks {} inline {} inline_size {n_bytes}",
blocks.len(),
rest.len()
);
let block_size: u64 = format::BLOCK_SIZE.into();
let size = block_size * blocks.len() as u64 + n_bytes;
Self {
blocks: blocks.into_boxed_slice(),
inline: rest.into_boxed_slice(),
size,
nlink,
}
}
fn write_block(&self, output: &mut impl Output, block: &[DirEnt]) {
trace!(" write dir block {} @{}", block.len(), output.len());
let mut nameofs = size_of::<format::DirectoryEntryHeader>() * block.len();
for entry in block {
trace!(
" entry {:?} name {} @{}",
entry.file_type,
nameofs,
output.len()
);
let inode_idx = match entry.inode {
InodeRef::Known(idx) => idx,
InodeRef::Deferred(_) => panic!("all inodes must be resolved before writing"),
};
output.write_struct(format::DirectoryEntryHeader {
name_offset: (nameofs as u16).into(),
inode_offset: output.get_nid(inode_idx).into(),
file_type: entry.file_type.into(),
..Default::default()
});
nameofs += entry.name.len();
}
for entry in block {
trace!(" name @{}", output.len());
output.write(entry.name.as_bytes());
}
}
fn write_inline(&self, output: &mut impl Output) {
trace!(
" write inline len {} expected size {} of {}",
self.inline.len(),
self.size % 4096,
self.size
);
self.write_block(output, &self.inline);
}
fn write_blocks(&self, output: &mut impl Output) {
let block_size: usize = format::BLOCK_SIZE.into();
for block in &self.blocks {
assert_eq!(output.len() % block_size, 0);
self.write_block(output, block);
output.pad(block_size);
}
}
fn inode_meta(&self, block_offset: usize) -> InodeMeta {
let blkaddr: u32 = (block_offset / 4096)
.try_into()
.expect("block address exceeds u32::MAX");
let (layout, i_u) = if self.inline.is_empty() {
(format::DataLayout::FlatPlain, blkaddr)
} else if !self.blocks.is_empty() {
(format::DataLayout::FlatInline, blkaddr)
} else {
(format::DataLayout::FlatInline, 0)
};
InodeMeta {
layout,
i_u,
size: self.size,
nlink: self.nlink,
}
}
}
fn compute_chunk_format(file_size: u64) -> u32 {
const BLOCK_BITS: u32 = format::BLOCK_BITS as u32;
const CHUNK_FORMAT_BLKBITS_MASK: u32 = 0x001F;
let mut chunkbits = if file_size > 1 {
64 - (file_size - 1).leading_zeros()
} else {
1
};
if chunkbits < BLOCK_BITS {
chunkbits = BLOCK_BITS;
}
if chunkbits - BLOCK_BITS > CHUNK_FORMAT_BLKBITS_MASK {
chunkbits = CHUNK_FORMAT_BLKBITS_MASK + BLOCK_BITS;
}
chunkbits - BLOCK_BITS
}
impl<ObjectID: FsVerityHashValue> Leaf<'_, ObjectID> {
fn inode_meta(&self, version: format::FormatVersion) -> InodeMeta {
let (layout, i_u, size) = match &self.content {
tree::LeafContent::Regular(tree::RegularFile::Inline(data)) => {
if data.is_empty() {
(format::DataLayout::FlatPlain, 0, data.len() as u64)
} else {
(format::DataLayout::FlatInline, 0, data.len() as u64)
}
}
tree::LeafContent::Regular(tree::RegularFile::External(.., size)) => {
let chunk_format = match version.epoch() {
FormatEpoch::Epoch1 => compute_chunk_format(*size),
FormatEpoch::Epoch2 => 31,
};
(format::DataLayout::ChunkBased, chunk_format, *size)
}
tree::LeafContent::CharacterDevice(rdev) | tree::LeafContent::BlockDevice(rdev) => {
let rdev32: u32 = (*rdev)
.try_into()
.expect("device number exceeds EROFS u32 limit");
(format::DataLayout::FlatPlain, rdev32, 0)
}
tree::LeafContent::Fifo | tree::LeafContent::Socket => {
(format::DataLayout::FlatPlain, 0, 0)
}
tree::LeafContent::Symlink(target) => {
assert!(
target.len() <= crate::SYMLINK_MAX,
"symlink target is {} bytes (max {})",
target.len(),
crate::SYMLINK_MAX,
);
(format::DataLayout::FlatInline, 0, target.len() as u64)
}
};
InodeMeta {
layout,
i_u,
size,
nlink: self.nlink,
}
}
fn write_inline(&self, output: &mut impl Output) {
output.write(match self.content {
tree::LeafContent::Regular(tree::RegularFile::Inline(data)) => data,
tree::LeafContent::Regular(tree::RegularFile::External(..)) => b"\xff\xff\xff\xff", tree::LeafContent::Symlink(target) => target.as_bytes(),
_ => &[],
});
}
}
impl<ObjectID: FsVerityHashValue> Inode<'_, ObjectID> {
fn file_type(&self) -> format::FileType {
if self.escaped_whiteout {
return format::FileType::RegularFile;
}
match &self.content {
InodeContent::Directory(..) => format::FileType::Directory,
InodeContent::Leaf(leaf) => match &leaf.content {
tree::LeafContent::Regular(..) => format::FileType::RegularFile,
tree::LeafContent::CharacterDevice(..) => format::FileType::CharacterDevice,
tree::LeafContent::BlockDevice(..) => format::FileType::BlockDevice,
tree::LeafContent::Fifo => format::FileType::Fifo,
tree::LeafContent::Socket => format::FileType::Socket,
tree::LeafContent::Symlink(..) => format::FileType::Symlink,
},
}
}
fn fits_in_compact(&self, min_mtime: (u64, u32), size: u64, nlink: usize) -> bool {
if self.stat.st_mtim_sec as u64 != min_mtime.0 {
return false;
}
if self.stat.st_mtim_nsec != min_mtime.1 {
return false;
}
if nlink > u16::MAX as usize {
return false;
}
if self.stat.st_uid > u16::MAX as u32 || self.stat.st_gid > u16::MAX as u32 {
return false;
}
if size > u32::MAX as u64 {
return false;
}
true
}
fn pad_inline_tail_v1(
&self,
output: &mut impl Output,
inode_and_xattr_size: u64,
size: u64,
#[cfg(test)] ctx: &mut WriteContext,
#[cfg(not(test))] _ctx: &mut WriteContext,
) {
let block_size = u64::from(format::BLOCK_SIZE);
let current_pos: u64 = output.len().try_into().unwrap();
let inline_size = size % block_size;
if matches!(self.file_type(), format::FileType::Symlink) {
#[cfg(test)]
let skip_pad = ctx
.faults
.as_mut()
.map(|f| f.should_skip_symlink_pad())
.unwrap_or(false);
#[cfg(not(test))]
let skip_pad = false;
if !skip_pad {
let total_size = inode_and_xattr_size + inline_size;
if block_offset(current_pos) + total_size > block_size {
if let Some(pad_size) = bytes_to_block_boundary(current_pos) {
output.write_zeros(pad_size as usize);
}
}
}
} else {
let inline_start = current_pos
.checked_add(inode_and_xattr_size)
.expect("image position + inode header size cannot overflow u64");
if let Some(block_remainder) = bytes_to_block_boundary(inline_start)
&& block_remainder < inline_size
{
let pad_size = (block_remainder.div_ceil(INODE_SLOT_SIZE as u64)
* INODE_SLOT_SIZE as u64) as usize;
output.write_zeros(pad_size);
}
}
}
fn pad_inline_tail_v2(&self, output: &mut impl Output, inode_and_xattr_size: u64, size: u64) {
let block_size = u64::from(format::BLOCK_SIZE);
let inline_start: u64 = output.len().try_into().unwrap();
let inline_start = inline_start
.checked_add(inode_and_xattr_size)
.expect("image position + inode header size cannot overflow u64");
let end_of_metadata = inline_start - 1;
let inline_end = inline_start + (size % block_size);
if end_of_metadata / block_size != inline_end / block_size {
let pad_size = (block_size - end_of_metadata % block_size) as usize;
output.write_zeros(pad_size);
output.pad(INODE_SLOT_SIZE);
}
}
fn write_inode(&self, output: &mut impl Output, idx: usize, ctx: &mut WriteContext) {
let version = ctx.version;
let min_mtime = ctx.min_mtime;
let meta = match &self.content {
InodeContent::Directory(dir) => dir.inode_meta(output.get_block_start(idx)),
InodeContent::Leaf(leaf) => leaf.inode_meta(version),
};
let InodeMeta {
layout,
i_u: u,
size,
nlink,
} = meta;
let xattr_size = self.xattrs.byte_size(version);
let use_compact =
version.epoch() == FormatEpoch::Epoch1 && self.fits_in_compact(min_mtime, size, nlink);
let inode_header_size = if use_compact {
size_of::<format::CompactInodeHeader>()
} else {
size_of::<format::ExtendedInodeHeader>()
};
output.pad(INODE_SLOT_SIZE);
if matches!(layout, format::DataLayout::FlatInline) {
let inode_and_xattr_size: u64 = (inode_header_size + xattr_size).try_into().unwrap();
match version.epoch() {
FormatEpoch::Epoch1 => {
self.pad_inline_tail_v1(output, inode_and_xattr_size, size, ctx);
}
FormatEpoch::Epoch2 => {
self.pad_inline_tail_v2(output, inode_and_xattr_size, size);
}
}
}
let xattr_icount: u16 = match xattr_size {
0 => 0,
n => {
let word_count = n
.checked_sub(INODE_XATTR_HEADER_SIZE)
.expect("non-empty xattr block must be >= header size")
/ XATTR_WORD_SIZE;
(1 + word_count) as u16
}
};
output.note_inode();
if use_compact {
let format = format::InodeLayout::Compact | layout;
let ino = idx as u32;
output.write_struct(format::CompactInodeHeader {
format,
xattr_icount: xattr_icount.into(),
mode: self.file_type() | self.stat.st_mode,
nlink: (nlink as u16).into(),
size: (size as u32).into(),
reserved: 0.into(),
u: u.into(),
ino: ino.into(),
uid: (self.stat.st_uid as u16).into(),
gid: (self.stat.st_gid as u16).into(),
reserved2: [0; 4],
});
} else {
let format = format::InodeLayout::Extended | layout;
let ino = match version.epoch() {
FormatEpoch::Epoch1 => idx as u32,
FormatEpoch::Epoch2 => (output.len() / INODE_SLOT_SIZE) as u32,
};
let mtime_nsec: u32 = match version.epoch() {
FormatEpoch::Epoch1 => self.stat.st_mtim_nsec,
FormatEpoch::Epoch2 => 0,
};
output.write_struct(format::ExtendedInodeHeader {
format,
xattr_icount: xattr_icount.into(),
mode: self.file_type() | self.stat.st_mode,
size: size.into(),
u: u.into(),
ino: ino.into(),
uid: self.stat.st_uid.into(),
gid: self.stat.st_gid.into(),
mtime: (self.stat.st_mtim_sec as u64).into(),
mtime_nsec: mtime_nsec.into(),
nlink: (nlink as u32).into(),
..Default::default()
});
}
self.xattrs.write(output, version);
match &self.content {
InodeContent::Directory(dir) => dir.write_inline(output),
InodeContent::Leaf(leaf) => leaf.write_inline(output),
};
output.pad(INODE_SLOT_SIZE);
}
fn write_blocks(&self, output: &mut impl Output) {
if let InodeContent::Directory(dir) = &self.content {
dir.write_blocks(output);
}
}
}
struct InodeCollector<'a, ObjectID: FsVerityHashValue> {
inodes: Vec<Inode<'a, ObjectID>>,
hardlinks: HashMap<LeafId, InodeIdx>,
fs: &'a tree::FileSystem<ObjectID>,
nlink_map: Vec<u32>,
version: format::FormatVersion,
}
impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> {
fn push_inode(
&mut self,
stat: &'a tree::Stat,
content: InodeContent<'a, ObjectID>,
) -> InodeIdx {
let mut xattrs = InodeXAttrs::default();
if let InodeContent::Leaf(Leaf {
content: tree::LeafContent::Regular(tree::RegularFile::External(id, ..)),
..
}) = content
{
xattrs.add(
format::XATTR_OVERLAY_METACOPY,
OverlayMetacopy::new(id).as_bytes(),
self.version,
);
let redirect = format!("/{}", id.to_object_pathname());
xattrs.add(
format::XATTR_OVERLAY_REDIRECT,
redirect.as_bytes(),
self.version,
);
}
for (name, value) in stat.xattrs.iter() {
let name = name.as_bytes();
if let Some(escapee) = name.strip_prefix(format::XATTR_OVERLAY_PREFIX) {
let escaped = [format::XATTR_OVERLAY_ESCAPED_PREFIX, escapee].concat();
xattrs.add(&escaped, value, self.version);
} else {
xattrs.add(name, value, self.version);
}
}
let inode = self.inodes.len();
self.inodes.push(Inode {
stat,
xattrs,
content,
escaped_whiteout: false,
});
inode
}
fn collect_leaf(&mut self, leaf_id: LeafId) -> InodeIdx {
let nlink = self.nlink_map[leaf_id.0] as usize;
if nlink > 1
&& let Some(inode) = self.hardlinks.get(&leaf_id)
{
return *inode;
}
let leaf = self.fs.leaf(leaf_id);
debug_assert!(
!(matches!(leaf.content, tree::LeafContent::CharacterDevice(0)) && nlink > 1),
"ValidatedFileSystem guarantees whiteout nlink == 1"
);
let inode = self.push_inode(
&leaf.stat,
InodeContent::Leaf(Leaf {
content: &leaf.content,
nlink,
}),
);
if nlink > 1 {
self.hardlinks.insert(leaf_id, inode);
}
inode
}
fn collect_dir(&mut self, dir: &'a tree::Directory<ObjectID>, parent: InodeIdx) -> InodeIdx {
let me = self.push_inode(&dir.stat, InodeContent::Directory(Directory::default()));
let mut entries = vec![
DirEnt {
name: b".",
inode: InodeRef::Known(me),
file_type: format::FileType::Directory,
},
DirEnt {
name: b"..",
inode: InodeRef::Known(parent),
file_type: format::FileType::Directory,
},
];
for (name, inode) in dir.sorted_entries() {
let child = match inode {
tree::Inode::Directory(dir) => self.collect_dir(dir, me),
tree::Inode::Leaf(leaf_id, _) => self.collect_leaf(*leaf_id),
};
entries.push(DirEnt {
name: name.as_bytes(),
inode: InodeRef::Known(child),
file_type: self.inodes[child].file_type(),
});
}
entries.sort_unstable_by_key(|e| e.name);
self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries));
me
}
fn is_overlay_whiteout_stub(
&self,
name: &[u8],
leaf_id: LeafId,
me: InodeIdx,
root_inode: InodeIdx,
) -> bool {
let root_stat = &self.fs.root.stat;
let leaf_stat = &self.fs.leaf(leaf_id).stat;
let selinux_key = std::ffi::OsStr::new("security.selinux");
let expected_xattrs = if root_stat.xattrs.contains_key(selinux_key) {
1
} else {
0
};
let has_correct_xattrs = leaf_stat.xattrs.len() == expected_xattrs
&& (expected_xattrs == 0
|| leaf_stat.xattrs.get(selinux_key) == root_stat.xattrs.get(selinux_key));
me == root_inode
&& name.len() == 2
&& name
.iter()
.all(|b| b.is_ascii_digit() || matches!(b, b'a'..=b'f'))
&& leaf_stat.st_mode == 0o644
&& leaf_stat.st_uid == root_stat.st_uid
&& leaf_stat.st_gid == root_stat.st_gid
&& leaf_stat.st_mtim_sec == root_stat.st_mtim_sec
&& leaf_stat.st_mtim_nsec == root_stat.st_mtim_nsec
&& has_correct_xattrs
}
fn is_v1_whiteout(content: &tree::LeafContent<ObjectID>) -> bool {
matches!(content, tree::LeafContent::CharacterDevice(0))
}
fn collect_tree(&mut self, root: &'a tree::Directory<ObjectID>) {
use std::collections::VecDeque;
let canonical_dirs = Self::find_canonical_dirs(root, &self.nlink_map);
let root_inode = self.push_inode(&root.stat, InodeContent::Directory(Directory::default()));
let mut queue: VecDeque<(&'a tree::Directory<ObjectID>, InodeIdx, InodeIdx)> =
VecDeque::new();
queue.push_back((root, root_inode, root_inode));
let mut dir_entries: Vec<(InodeIdx, InodeIdx, Vec<DirEnt<'a>>)> = vec![];
while let Some((dir, parent, me)) = queue.pop_front() {
let mut entries = vec![
DirEnt {
name: b".",
inode: InodeRef::Known(me),
file_type: format::FileType::Directory,
},
DirEnt {
name: b"..",
inode: InodeRef::Known(parent),
file_type: format::FileType::Directory,
},
];
let mut dir_has_whiteout = false;
for (name, inode) in dir.sorted_entries() {
match inode {
tree::Inode::Directory(subdir) => {
let child = self.push_inode(
&subdir.stat,
InodeContent::Directory(Directory::default()),
);
queue.push_back((subdir, me, child));
entries.push(DirEnt {
name: name.as_bytes(),
inode: InodeRef::Known(child),
file_type: format::FileType::Directory,
});
}
tree::Inode::Leaf(leaf_id, _) => {
let name_bytes = name.as_bytes();
let is_stub =
self.is_overlay_whiteout_stub(name_bytes, *leaf_id, me, root_inode);
let nlink = self.nlink_map[leaf_id.0];
let is_canonical = if nlink > 1 {
canonical_dirs
.get(leaf_id)
.is_some_and(|&p| std::ptr::eq(p, dir))
} else {
true
};
let child_ref = if is_canonical {
InodeRef::Known(self.collect_leaf(*leaf_id))
} else if let Some(&nid) = self.hardlinks.get(leaf_id) {
InodeRef::Known(nid)
} else {
InodeRef::Deferred(*leaf_id)
};
if is_canonical
&& matches!(child_ref, InodeRef::Known(_))
&& self.version.epoch() == FormatEpoch::Epoch1
&& !is_stub
&& Self::is_v1_whiteout(&self.fs.leaf(*leaf_id).content)
{
let InodeRef::Known(child) = child_ref else {
unreachable!()
};
if !self.inodes[child].escaped_whiteout {
self.inodes[child].escaped_whiteout = true;
self.inodes[child].xattrs.add(
format::XATTR_OVERLAY_WHITEOUT,
b"",
self.version,
);
self.inodes[child].xattrs.add(
format::XATTR_USERXATTR_WHITEOUT,
b"",
self.version,
);
dir_has_whiteout = true;
}
}
let file_type = if let InodeRef::Known(child) = child_ref {
self.inodes[child].file_type()
} else {
format::FileType::RegularFile
};
entries.push(DirEnt {
name: name.as_bytes(),
inode: child_ref,
file_type,
});
}
}
}
if self.version.epoch() == FormatEpoch::Epoch1 && dir_has_whiteout {
self.inodes[me]
.xattrs
.add(format::XATTR_OVERLAY_WHITEOUTS, b"", self.version);
self.inodes[me]
.xattrs
.add(format::XATTR_USERXATTR_WHITEOUTS, b"", self.version);
self.inodes[me]
.xattrs
.add(format::XATTR_OVERLAY_OPAQUE, b"x", self.version);
self.inodes[me]
.xattrs
.add(format::XATTR_USERXATTR_OPAQUE, b"x", self.version);
}
entries.sort_unstable_by_key(|e| e.name);
dir_entries.push((me, parent, entries));
}
for (_me, _parent, entries) in &mut dir_entries {
for entry in entries.iter_mut() {
if let InodeRef::Deferred(leaf_id) = entry.inode {
let nid = *self
.hardlinks
.get(&leaf_id)
.expect("canonical leaf must have been assigned a nid during BFS");
entry.inode = InodeRef::Known(nid);
entry.file_type = self.inodes[nid].file_type();
}
}
}
for (me, _parent, entries) in dir_entries {
self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries));
}
}
fn find_canonical_dirs(
root: &'a tree::Directory<ObjectID>,
nlink_map: &[u32],
) -> HashMap<LeafId, *const tree::Directory<ObjectID>> {
let mut seen: HashSet<LeafId> = HashSet::new();
let mut canonical_dirs: HashMap<LeafId, *const tree::Directory<ObjectID>> = HashMap::new();
Self::dfs_find_canonical(root, nlink_map, &mut seen, &mut canonical_dirs);
canonical_dirs
}
fn dfs_find_canonical(
dir: &'a tree::Directory<ObjectID>,
nlink_map: &[u32],
seen: &mut HashSet<LeafId>,
canonical_dirs: &mut HashMap<LeafId, *const tree::Directory<ObjectID>>,
) {
let dir_ptr: *const tree::Directory<ObjectID> = dir;
for (_, inode) in dir.sorted_entries() {
match inode {
tree::Inode::Directory(subdir) => {
Self::dfs_find_canonical(subdir, nlink_map, seen, canonical_dirs);
}
tree::Inode::Leaf(leaf_id, _) => {
if nlink_map[leaf_id.0] > 1 && seen.insert(*leaf_id) {
canonical_dirs.insert(*leaf_id, dir_ptr);
}
}
}
}
}
pub fn collect(
fs: &'a tree::FileSystem<ObjectID>,
version: format::FormatVersion,
) -> Vec<Inode<'a, ObjectID>> {
let mut this = Self {
inodes: vec![],
hardlinks: HashMap::new(),
fs,
nlink_map: fs.nlinks(),
version,
};
match version.epoch() {
FormatEpoch::Epoch1 => this.collect_tree(&fs.root),
FormatEpoch::Epoch2 => {
let root_inode = this.collect_dir(&fs.root, 0);
assert_eq!(root_inode, 0);
}
}
this.inodes
}
}
fn share_xattrs(
inodes: &mut [Inode<impl FsVerityHashValue>],
version: format::FormatVersion,
) -> Vec<XAttr> {
let mut xattrs: BTreeMap<XAttr, usize> = BTreeMap::new();
if version.epoch() == FormatEpoch::Epoch1 {
for inode in inodes.iter_mut() {
inode.xattrs.local.sort_by(|a, b| a.cmp_by_full_key(b));
}
}
for inode in inodes.iter() {
for attr in &inode.xattrs.local {
if let Some(count) = xattrs.get_mut(attr) {
*count += 1;
} else {
xattrs.insert(attr.clone(), 1);
}
}
}
xattrs.retain(|_k, v| *v > 1);
let (xattrs, shared): (BTreeMap<XAttr, usize>, Vec<XAttr>) = match version.epoch() {
FormatEpoch::Epoch1 => {
let mut sorted: Vec<_> = xattrs.into_iter().collect();
sorted.sort_by(|(a, _), (b, _)| a.cmp_by_full_key(b));
let n_shared = sorted.len();
let xattrs_map: BTreeMap<XAttr, usize> = sorted
.iter()
.enumerate()
.map(|(i, (k, _))| (k.clone(), n_shared - 1 - i))
.collect();
let mut out = sorted;
out.reverse();
let shared_vec = out.into_iter().map(|(k, _)| k).collect();
(xattrs_map, shared_vec)
}
FormatEpoch::Epoch2 => {
for (idx, value) in xattrs.values_mut().enumerate() {
*value = idx;
}
let shared_vec = xattrs.keys().cloned().collect();
(xattrs, shared_vec)
}
};
for inode in inodes.iter_mut() {
inode.xattrs.local.retain(|attr| {
if let Some(idx) = xattrs.get(attr) {
inode.xattrs.shared.push(*idx);
false
} else {
true
}
});
}
shared
}
fn write_erofs(
output: &mut impl Output,
inodes: &[Inode<impl FsVerityHashValue>],
xattrs: &[XAttr],
ctx: &mut WriteContext,
) {
let version = ctx.version;
let min_mtime = ctx.min_mtime;
let header_flags = ctx.header_flags;
let composefs_version: u32 = ctx.composefs_version;
let (build_time, build_time_nsec) = match version.epoch() {
FormatEpoch::Epoch1 => min_mtime,
FormatEpoch::Epoch2 => (0, 0),
};
output.write_composefs_header(format::ComposefsHeader {
magic: format::COMPOSEFS_MAGIC,
version: format::VERSION,
flags: header_flags.into(),
composefs_version: composefs_version.into(),
..Default::default()
});
let xattr_blkaddr = match version.epoch() {
FormatEpoch::Epoch1 => output.get_xattr_blkaddr(),
FormatEpoch::Epoch2 => 0,
};
output.write_superblock(format::Superblock {
magic: format::MAGIC_V1,
blkszbits: format::BLOCK_BITS,
feature_compat: (format::FEATURE_COMPAT_MTIME | format::FEATURE_COMPAT_XATTR_FILTER).into(),
root_nid: (output.get_nid(0) as u16).into(),
inos: (inodes.len() as u64).into(),
blocks: output.get_block_count().into(),
build_time: build_time.into(),
build_time_nsec: build_time_nsec.into(),
xattr_blkaddr: xattr_blkaddr.into(),
..Default::default()
});
for (idx, inode) in inodes.iter().enumerate() {
inode.write_inode(output, idx, ctx);
}
output.pad(INODE_SLOT_SIZE);
output.note_inodes_end();
for xattr in xattrs {
output.note_xattr();
xattr.write(output);
}
output.pad(4096);
for inode in inodes.iter() {
output.note_block();
inode.write_blocks(output);
}
output.note_end();
}
#[derive(Default)]
struct Layout {
inodes: Vec<NonZeroUsize>,
inodes_end: Option<NonZeroUsize>,
xattrs: Vec<NonZeroUsize>,
blocks: Vec<NonZeroUsize>,
end: Option<NonZeroUsize>,
}
#[derive(Default)]
struct FirstPass {
offset: usize,
layout: Layout,
header_emitted: bool,
superblock_emitted: bool,
}
struct SecondPass {
output: Vec<u8>,
layout: Layout,
}
impl Output for FirstPass {
fn note_header_emitted(&mut self) {
assert!(!self.header_emitted, "composefs header written twice");
self.header_emitted = true;
}
fn note_superblock_emitted(&mut self) {
assert!(!self.superblock_emitted, "superblock written twice");
self.superblock_emitted = true;
}
fn note_inode(&mut self) {
self.layout
.inodes
.push(NonZeroUsize::new(self.offset).expect("inode recorded at offset 0"));
}
fn note_inodes_end(&mut self) {
assert!(
self.layout.inodes_end.is_none(),
"inodes_end recorded twice"
);
self.layout.inodes_end = NonZeroUsize::new(self.offset);
}
fn note_xattr(&mut self) {
self.layout
.xattrs
.push(NonZeroUsize::new(self.offset).expect("xattr recorded at offset 0"));
}
fn note_block(&mut self) {
debug_assert_eq!(
self.offset % format::BLOCK_SIZE as usize,
0,
"block data must start at a block-aligned offset"
);
self.layout
.blocks
.push(NonZeroUsize::new(self.offset).expect("block recorded at offset 0"));
}
fn note_end(&mut self) {
assert!(self.layout.end.is_none(), "end recorded twice");
self.layout.end = NonZeroUsize::new(self.offset);
}
fn get_inode_offset(&self, _idx: usize) -> Option<NonZeroUsize> {
None
}
fn get_inodes_end(&self) -> Option<NonZeroUsize> {
None
}
fn get_xattr_offset(&self, _idx: usize) -> Option<NonZeroUsize> {
None
}
fn get_block_offset(&self, _idx: usize) -> Option<NonZeroUsize> {
None
}
fn get_end(&self) -> Option<NonZeroUsize> {
None
}
fn write(&mut self, data: &[u8]) {
self.offset += data.len();
}
fn pad(&mut self, alignment: usize) {
self.offset = round_up(self.offset, alignment);
}
fn len(&self) -> usize {
self.offset
}
}
impl Output for SecondPass {
fn note_header_emitted(&mut self) {}
fn note_superblock_emitted(&mut self) {}
fn note_inode(&mut self) {}
fn note_inodes_end(&mut self) {
debug_assert_eq!(
self.output.len(),
self.layout
.inodes_end
.expect("inodes_end not recorded")
.get(),
"second pass diverged from first at inodes_end"
);
}
fn note_xattr(&mut self) {}
fn note_block(&mut self) {}
fn note_end(&mut self) {
debug_assert_eq!(
self.output.len(),
self.layout.end.expect("end not recorded").get(),
"second pass diverged from first at end"
);
}
fn get_inode_offset(&self, idx: usize) -> Option<NonZeroUsize> {
Some(self.layout.inodes[idx])
}
fn get_inodes_end(&self) -> Option<NonZeroUsize> {
Some(self.layout.inodes_end.expect("inodes_end not recorded"))
}
fn get_xattr_offset(&self, idx: usize) -> Option<NonZeroUsize> {
Some(self.layout.xattrs[idx])
}
fn get_block_offset(&self, idx: usize) -> Option<NonZeroUsize> {
Some(self.layout.blocks[idx])
}
fn get_end(&self) -> Option<NonZeroUsize> {
Some(self.layout.end.expect("end not recorded"))
}
fn write(&mut self, data: &[u8]) {
self.output.extend_from_slice(data);
}
fn pad(&mut self, alignment: usize) {
self.output
.resize(round_up(self.output.len(), alignment), 0);
}
fn len(&self) -> usize {
self.output.len()
}
}
fn calculate_min_mtime(inodes: &[Inode<impl FsVerityHashValue>]) -> (u64, u32) {
inodes
.iter()
.map(|inode| (inode.stat.st_mtim_sec as u64, inode.stat.st_mtim_nsec))
.reduce(|(a_sec, a_nsec), (b_sec, b_nsec)| {
if (b_sec, b_nsec) < (a_sec, a_nsec) {
(b_sec, b_nsec)
} else {
(a_sec, a_nsec)
}
})
.unwrap_or((0, 0))
}
type PreparedInodes<'a, ObjectID> = (Vec<Inode<'a, ObjectID>>, Vec<XAttr>, (u64, u32), u32, u32);
fn prepare_erofs_inodes<'a, ObjectID: FsVerityHashValue>(
fs: &'a tree::FileSystem<ObjectID>,
version: format::FormatVersion,
) -> PreparedInodes<'a, ObjectID> {
let mut inodes = InodeCollector::collect(fs, version);
if version.epoch() == FormatEpoch::Epoch1 && !inodes.is_empty() {
inodes[0]
.xattrs
.add(format::XATTR_OVERLAY_OPAQUE_ROOT, b"y", version);
}
let (header_flags, composefs_version) = match version.epoch() {
FormatEpoch::Epoch1 => {
let has_acl = inodes.iter().any(|inode| {
inode.xattrs.local.iter().any(|xattr| {
xattr.prefix == format::XATTR_INDEX_POSIX_ACL_ACCESS
|| xattr.prefix == format::XATTR_INDEX_POSIX_ACL_DEFAULT
})
});
let flags = if has_acl {
format::COMPOSEFS_FLAGS_HAS_ACL.get()
} else {
0
};
let cfs_ver = match version {
format::FormatVersion::V0 => {
let has_user_whiteout = inodes.iter().any(|inode| inode.escaped_whiteout);
if has_user_whiteout { 1u32 } else { 0u32 }
}
_ => version.composefs_version().get(),
};
(flags, cfs_ver)
}
FormatEpoch::Epoch2 => (0u32, format::COMPOSEFS_VERSION.get()),
};
let xattrs = share_xattrs(&mut inodes, version);
let min_mtime = calculate_min_mtime(&inodes);
(inodes, xattrs, min_mtime, header_flags, composefs_version)
}
pub fn mkfs_erofs<ObjectID: FsVerityHashValue>(fs: &ValidatedFileSystem<ObjectID>) -> Box<[u8]> {
mkfs_erofs_inner(
&fs.0,
format::FormatVersion::default(),
#[cfg(test)]
None,
)
}
pub(crate) fn mkfs_erofs_inner<ObjectID: FsVerityHashValue>(
fs: &tree::FileSystem<ObjectID>,
version: format::FormatVersion,
#[cfg(test)] faults: Option<WriterFaults>,
) -> Box<[u8]> {
let fs_with_whiteouts;
let fs = if version.epoch() == FormatEpoch::Epoch1 {
fs_with_whiteouts = {
let mut cloned = fs.clone();
cloned.add_overlay_whiteouts();
cloned
};
&fs_with_whiteouts
} else {
fs
};
let (inodes, xattrs, min_mtime, header_flags, composefs_version) =
prepare_erofs_inodes(fs, version);
let mut ctx = WriteContext {
version,
min_mtime,
header_flags,
composefs_version,
#[cfg(test)]
faults,
};
let mut first_pass = FirstPass::default();
write_erofs(&mut first_pass, &inodes, &xattrs, &mut ctx);
#[cfg(test)]
if let Some(ref mut f) = ctx.faults {
f.start_replay();
}
let mut second_pass = SecondPass {
output: vec![],
layout: first_pass.layout,
};
write_erofs(&mut second_pass, &inodes, &xattrs, &mut ctx);
second_pass.output.into_boxed_slice()
}
pub fn mkfs_erofs_versioned<ObjectID: FsVerityHashValue>(
fs: &ValidatedFileSystem<ObjectID>,
version: format::FormatVersion,
) -> Box<[u8]> {
mkfs_erofs_inner(
&fs.0,
version,
#[cfg(test)]
None,
)
}
#[cfg(test)]
pub(crate) fn mkfs_erofs_with_faults<ObjectID: FsVerityHashValue>(
fs: &ValidatedFileSystem<ObjectID>,
version: format::FormatVersion,
faults: WriterFaults,
) -> Box<[u8]> {
mkfs_erofs_inner(&fs.0, version, Some(faults))
}
#[cfg(test)]
mod tests {
use super::compute_chunk_format;
#[test]
fn test_compute_chunk_format_boundary_values() {
assert_eq!(compute_chunk_format(1), 0, "size=1");
assert_eq!(compute_chunk_format(2), 0, "size=2");
assert_eq!(compute_chunk_format(4096), 0, "size=4096");
assert_eq!(compute_chunk_format(4097), 1, "size=4097");
assert_eq!(compute_chunk_format(1 << 20), 8, "size=1<<20");
assert_eq!(compute_chunk_format((1 << 20) + 1), 9, "size=(1<<20)+1");
}
#[test]
fn test_v2_digest_unaffected_by_prior_v1_generation() {
use crate::{
dumpfile::dumpfile_to_filesystem,
erofs::{
format::FormatVersion,
writer::{ValidatedFileSystem, mkfs_erofs_inner},
},
fsverity::Sha256HashValue,
};
let dumpfile = concat!(
"/ 0 40755 2 0 0 0 1000.0 - - -\n",
"/usr 0 40755 2 0 0 0 1000.0 - - -\n",
"/usr/lib 0 40755 2 0 0 0 1000.0 - - -\n",
"/usr/lib/libfoo.so 5 100644 1 0 0 0 1000.0 - hello -\n",
);
let fs_v2_only = dumpfile_to_filesystem::<Sha256HashValue>(dumpfile).unwrap();
let v2_alone = mkfs_erofs_inner(
&ValidatedFileSystem::new(fs_v2_only).unwrap().0,
FormatVersion::V2,
None,
);
let fs_both = dumpfile_to_filesystem::<Sha256HashValue>(dumpfile).unwrap();
let inner = ValidatedFileSystem::new(fs_both).unwrap().0;
let root_entries_before = inner.root.entries.len();
let leaves_before = inner.leaves.len();
let _v1 = mkfs_erofs_inner(&inner, FormatVersion::V1, None);
assert_eq!(
inner.root.entries.len(),
root_entries_before,
"V1 generation unexpectedly modified the root directory"
);
assert_eq!(
inner.leaves.len(),
leaves_before,
"V1 generation unexpectedly modified the leaves table"
);
let v2_after_v1 = mkfs_erofs_inner(&inner, FormatVersion::V2, None);
assert_eq!(
v2_alone, v2_after_v1,
"V2 image differs depending on whether V1 was generated first"
);
}
}