supermachine 0.7.82

//! In-process, journal-less ext4 image synthesis for data volumes.
//!
//! Replaces the host `mke2fs` shell-out (the last host-binary dependency on
//! the volume path, on BOTH backends — brew e2fsprogs on macOS, e2fsprogs on
//! Linux). An *empty* ext4 filesystem is a closed-form artifact: superblock,
//! group descriptors, bitmaps, the reserved inodes, root + `lost+found` —
//! everything else is zeros a sparse file already provides (including the
//! inode tables, which is exactly what `mke2fs -E lazy_itable_init=1` defers
//! to first mount; here they're simply holes).
//!
//! Deliberately **no journal** (`mke2fs -O ^has_journal` equivalent): the
//! volume durability contract makes it pure overhead. Pool-cycle writes are
//! ephemeral by design (per-cycle COW isolation, see `with_volume`); durable
//! writes happen at snapshot capture, where the pre-capture guest `sync` is
//! the consistency point. The journal would cost (a) a double-write on every
//! guest metadata commit — an `npm install` issues tens of thousands — and
//! (b) up to 64 MiB of touched journal blocks per volume dragged into every
//! capture-materialized `volumes/<i>.img`. Crash-mid-bake recovery is a
//! rebake, same as every other bake artifact.
//!
//! Also deliberately **deterministic**: fixed UUID/label/timestamps/hash
//! seed, so equal-sized volumes are byte-identical. Host mke2fs version
//! differences silently change defaults; this never drifts, and identical
//! blank images share page cache across VMs for free.
//!
//! Feature set (chosen for plain kernel-driver mounting, no checksumming):
//!   compat:    DIR_INDEX | EXT_ATTR
//!   incompat:  FILETYPE | EXTENTS
//!   ro_compat: SPARSE_SUPER | LARGE_FILE | DIR_NLINK | EXTRA_ISIZE
//! No resize_inode either — volumes are fixed-size by design (growing or
//! shrinking the backing file would invalidate the filesystem anyway).
//!
//! Validated by `e2fsck -fn` in unit tests (test-only host dependency,
//! skipped when absent) and by the volume integration suites, which mount,
//! exercise, snapshot, and cycle these filesystems on both backends.

use std::io::{Seek, SeekFrom, Write};
use std::path::Path;

/// Filesystem block size in bytes (recorded in the superblock as
/// log_block_size = 2).
const BLOCK_SIZE: u64 = 4096;
/// Blocks per block group: as many as a single block-sized bitmap has bits.
const BLOCKS_PER_GROUP: u64 = 8 * BLOCK_SIZE; // one 4 KiB bitmap block of bits
/// mke2fs's default inode ratio (one inode per 16 KiB of space).
const INODE_RATIO: u64 = 16384;
/// Inodes per block group, derived from the group's byte size and the ratio.
const INODES_PER_GROUP: u64 = BLOCKS_PER_GROUP * BLOCK_SIZE / INODE_RATIO; // 8192
/// On-disk size of one inode in bytes.
const INODE_SIZE: u64 = 256;
/// Blocks occupied by one group's inode table.
const ITABLE_BLOCKS: u64 = INODES_PER_GROUP * INODE_SIZE / BLOCK_SIZE; // 512

/// Superblock magic number (written at superblock offset 56).
const EXT4_MAGIC: u16 = 0xEF53;
/// Inode number of the root directory.
const ROOT_INO: u32 = 2;
/// First non-reserved inode number; this file assigns it to `lost+found`
/// and also records it in the superblock (offset 84).
const FIRST_NON_RESERVED_INO: u32 = 11; // lost+found
/// Number of data blocks pre-allocated to `lost+found`.
const LOST_FOUND_BLOCKS: u64 = 4; // 16 KiB, mke2fs convention

// Feature flags. The COMPAT_* bits are OR-ed into the superblock word at
// offset 92, INCOMPAT_* at offset 96 and RO_COMPAT_* at offset 100.
const COMPAT_EXT_ATTR: u32 = 0x0008;
const COMPAT_DIR_INDEX: u32 = 0x0020;
const INCOMPAT_FILETYPE: u32 = 0x0002;
const INCOMPAT_EXTENTS: u32 = 0x0040;
const RO_COMPAT_SPARSE_SUPER: u32 = 0x0001;
const RO_COMPAT_LARGE_FILE: u32 = 0x0002;
const RO_COMPAT_DIR_NLINK: u32 = 0x0020;
const RO_COMPAT_EXTRA_ISIZE: u32 = 0x0040;

/// Fixed identity for determinism. The guest mounts volumes by device path
/// (`sm.volume=/dev/vdX:...`), never by UUID, so a shared UUID is harmless.
const VOLUME_UUID: [u8; 16] = *b"supermachine.vol";
/// Fixed volume label; exactly fills the 16-byte array written at
/// superblock offset 120.
const VOLUME_LABEL: &[u8; 16] = b"supermachine-vol";
/// 2020-09-13T12:26:40Z — any fixed PAST instant keeps e2fsck quiet
/// ("in the future" is what it warns about) while staying deterministic.
const FIXED_TIME: u32 = 1_600_000_000;
/// Fixed directory-hash seed (four words written at superblock offsets
/// 236..252), constant so that images stay byte-deterministic.
const HASH_SEED: [u32; 4] = [0x5375_7065, 0x724d_6163, 0x6869_6e65, 0x4558_5434];

/// Zero-initialised byte buffer with helpers for storing little-endian
/// integers and raw byte runs at explicit offsets.
///
/// Every writer panics if the destination range falls outside the buffer.
struct Buf(Vec<u8>);

impl Buf {
    /// Allocate a buffer of `len` zero bytes.
    fn new(len: usize) -> Self {
        Self(vec![0; len])
    }

    /// Store `v` as two little-endian bytes starting at `off`.
    fn u16(&mut self, off: usize, v: u16) {
        self.bytes(off, &v.to_le_bytes());
    }

    /// Store `v` as four little-endian bytes starting at `off`.
    fn u32(&mut self, off: usize, v: u32) {
        self.bytes(off, &v.to_le_bytes());
    }

    /// Copy `v` verbatim into the buffer starting at `off`.
    fn bytes(&mut self, off: usize, v: &[u8]) {
        let end = off + v.len();
        self.0[off..end].copy_from_slice(v);
    }
}

/// Does group `g` carry a superblock + GDT backup under sparse_super
/// (groups 0, 1, and powers of 3, 5, 7)?
///
/// Total over the whole `u64` range: the power search stops as soon as the
/// next power would overflow, so very large group numbers simply yield
/// `false` instead of panicking (debug builds) or wrapping (release builds).
fn has_super(g: u64) -> bool {
    if g <= 1 {
        return true;
    }
    [3u64, 5, 7].iter().any(|&base| {
        let mut power = base;
        while power < g {
            match power.checked_mul(base) {
                Some(next) => power = next,
                // The next power does not fit in u64, so it cannot equal `g`.
                None => return false,
            }
        }
        power == g
    })
}

/// Block-group geometry of the filesystem, derived from the backing file
/// size.
struct Layout {
    /// Total blocks covered by the filesystem. May be smaller than the
    /// device's block count when a runt tail group was dropped.
    blocks_count: u64,
    /// Number of block groups.
    groups: u64,
    /// Blocks occupied by one copy of the group descriptor table
    /// (32 bytes per group, rounded up to whole blocks).
    gdt_blocks: u64,
}

impl Layout {
    /// Compute the geometry for a backing file of `size_bytes`.
    ///
    /// # Errors
    ///
    /// Fails when the volume is smaller than 8 MiB, or when it holds more
    /// blocks than a 32-bit block number can address. The format uses no
    /// 64bit feature and every on-disk block count/number is a 32-bit
    /// field, so larger sizes would be silently truncated when serialized.
    fn new(size_bytes: u64) -> std::io::Result<Layout> {
        // Group 0 must hold sb + GDT + bitmaps + itable + root + lost+found
        // with room to spare; 8 MiB is a comfortable floor.
        if size_bytes < 8 * 1024 * 1024 {
            return Err(std::io::Error::other(format!(
                "volume too small to format: {size_bytes} bytes (minimum 8 MiB)"
            )));
        }
        let device_blocks = size_bytes / BLOCK_SIZE;
        // Reject rather than truncate: the superblock and group descriptors
        // store block counts and block numbers in 32-bit fields.
        if device_blocks > u64::from(u32::MAX) {
            return Err(std::io::Error::other(format!(
                "volume too large to format: {size_bytes} bytes \
                 (32-bit block numbers limit the filesystem to just under 16 TiB)"
            )));
        }
        let mut groups = device_blocks.div_ceil(BLOCKS_PER_GROUP);
        let mut gdt_blocks = (groups * 32).div_ceil(BLOCK_SIZE);
        let mut blocks_count = device_blocks;
        // A runt tail group that can't hold its own metadata plus a little
        // data is dropped — the filesystem is simply smaller than the device
        // (perfectly valid; nothing requires the fs to fill the file).
        while groups > 1 {
            let tail = blocks_count - (groups - 1) * BLOCKS_PER_GROUP;
            let overhead = Self::group_overhead(groups - 1, gdt_blocks);
            if tail >= overhead + 8 {
                break;
            }
            groups -= 1;
            blocks_count = groups * BLOCKS_PER_GROUP;
            gdt_blocks = (groups * 32).div_ceil(BLOCK_SIZE);
        }
        Ok(Layout {
            blocks_count,
            groups,
            gdt_blocks,
        })
    }

    /// Metadata blocks at the start of group `g`: the optional superblock +
    /// GDT copy, the two bitmaps, and the inode table.
    fn group_overhead(g: u64, gdt_blocks: u64) -> u64 {
        let sb = if has_super(g) { 1 + gdt_blocks } else { 0 };
        sb + 1 /* block bitmap */ + 1 /* inode bitmap */ + ITABLE_BLOCKS
    }

    /// Absolute block number of the first block of group `g`.
    fn group_start(&self, g: u64) -> u64 {
        g * BLOCKS_PER_GROUP
    }

    /// Number of blocks in group `g`; only the last group may be partial.
    fn blocks_in_group(&self, g: u64) -> u64 {
        if g == self.groups - 1 {
            self.blocks_count - g * BLOCKS_PER_GROUP
        } else {
            BLOCKS_PER_GROUP
        }
    }

    /// (block bitmap, inode bitmap, inode table) absolute block numbers.
    fn group_meta(&self, g: u64) -> (u64, u64, u64) {
        let base = self.group_start(g) + if has_super(g) { 1 + self.gdt_blocks } else { 0 };
        (base, base + 1, base + 2)
    }

    /// First data block of group 0 (right after its inode table): the root
    /// directory block, then lost+found's blocks.
    fn first_data_block(&self) -> u64 {
        let (_, _, itable) = self.group_meta(0);
        itable + ITABLE_BLOCKS
    }
}

/// Create `path` as a sparse file of `size_bytes` containing an empty,
/// journal-less ext4 filesystem. Byte-deterministic for a given size.
/// Refuses to touch an existing file (the caller's create-if-missing
/// contract lives with the caller; this is the format step only).
///
/// # Errors
///
/// Returns the layout error for sizes that cannot be formatted (before any
/// file is created), the `create_new` error when `path` already exists, or
/// the I/O error from sizing/writing the image. In the last case the file
/// created by this call is removed again.
pub fn make_empty_ext4(path: &Path, size_bytes: u64) -> std::io::Result<()> {
    let layout = Layout::new(size_bytes)?;
    let mut f = std::fs::OpenOptions::new()
        .read(true)
        .write(true)
        .create_new(true)
        .open(path)?;
    // Sizing the file is part of the format step: if it fails (disk full,
    // size limit) the freshly created empty file must be cleaned up too.
    let formatted = f
        .set_len(size_bytes)
        .and_then(|()| write_fs(&mut f, &layout));
    if let Err(e) = formatted {
        // Never leave a half-formatted file for the "already exists →
        // reuse" branch of the caller to trip over on retry.
        drop(f);
        let _ = std::fs::remove_file(path);
        return Err(e);
    }
    f.flush()?;
    Ok(())
}

/// Serialize the empty filesystem described by `l` into `f`.
///
/// Writes, in order: the primary superblock and group descriptor table
/// (plus their backups in every group for which `has_super` is true), each
/// group's block and inode bitmaps, the root and `lost+found` inodes, and
/// the directory blocks of those two directories. Inode table blocks other
/// than the two live inodes are never written.
///
/// Returns the first I/O error hit by a seek or write; earlier writes are
/// not rolled back here.
fn write_fs(f: &mut std::fs::File, l: &Layout) -> std::io::Result<()> {
    // Root takes the first data block of group 0; lost+found's blocks
    // follow immediately after it.
    let root_block = l.first_data_block();
    let lf_block = root_block + 1;

    // ---- accounting ----------------------------------------------------
    let mut free_blocks_total = 0u64;
    let mut group_free_blocks = Vec::with_capacity(l.groups as usize);
    for g in 0..l.groups {
        let mut free = l.blocks_in_group(g) - Layout::group_overhead(g, l.gdt_blocks);
        if g == 0 {
            free -= 1 + LOST_FOUND_BLOCKS; // root dir block + lost+found
        }
        group_free_blocks.push(free);
        free_blocks_total += free;
    }
    let inodes_count = l.groups * INODES_PER_GROUP;
    let free_inodes = inodes_count - FIRST_NON_RESERVED_INO as u64;

    // ---- superblock ------------------------------------------------------
    // NOTE(review): the counts below are narrowed with `as u32`, which
    // truncates silently — this assumes the layout stays below 2^32 blocks;
    // confirm the size is bounded before this point.
    let mut sb = Buf::new(1024);
    sb.u32(0, inodes_count as u32);
    sb.u32(4, l.blocks_count as u32);
    sb.u32(8, 0); // reserved blocks: none — guests run as root anyway
    sb.u32(12, free_blocks_total as u32);
    sb.u32(16, free_inodes as u32);
    sb.u32(20, 0); // first_data_block (0 for 4 KiB blocks)
    sb.u32(24, 2); // log_block_size: 4096
    sb.u32(28, 2); // log_cluster_size (= block size, no bigalloc)
    sb.u32(32, BLOCKS_PER_GROUP as u32);
    sb.u32(36, BLOCKS_PER_GROUP as u32); // clusters_per_group
    sb.u32(40, INODES_PER_GROUP as u32);
    sb.u32(44, 0); // mtime: never mounted
    sb.u32(48, FIXED_TIME); // wtime
    sb.u16(52, 0); // mnt_count
    sb.u16(54, 0xFFFF); // max_mnt_count: disabled
    sb.u16(56, EXT4_MAGIC);
    sb.u16(58, 1); // state: clean
    sb.u16(60, 1); // errors: continue
    sb.u16(62, 0); // minor_rev
    sb.u32(64, FIXED_TIME); // lastcheck
    sb.u32(68, 0); // checkinterval: disabled
    sb.u32(72, 0); // creator_os: linux
    sb.u32(76, 1); // rev_level: dynamic
    sb.u32(84, FIRST_NON_RESERVED_INO);
    sb.u16(88, INODE_SIZE as u16);
    sb.u16(90, 0); // block_group_nr (primary; backups patch this)
    sb.u32(92, COMPAT_EXT_ATTR | COMPAT_DIR_INDEX);
    sb.u32(96, INCOMPAT_FILETYPE | INCOMPAT_EXTENTS);
    sb.u32(
        100,
        RO_COMPAT_SPARSE_SUPER | RO_COMPAT_LARGE_FILE | RO_COMPAT_DIR_NLINK | RO_COMPAT_EXTRA_ISIZE,
    );
    sb.bytes(104, &VOLUME_UUID);
    sb.bytes(120, VOLUME_LABEL);
    // s_journal_uuid (208) / s_journal_inum (224) / s_journal_dev (228) stay
    // zero — that, plus no HAS_JOURNAL compat bit, is "journal-less".
    sb.u32(236, HASH_SEED[0]); // s_hash_seed
    sb.u32(240, HASH_SEED[1]);
    sb.u32(244, HASH_SEED[2]);
    sb.u32(248, HASH_SEED[3]);
    sb.0[252] = 1; // s_def_hash_version: half_md4
    sb.u16(254, 0); // s_desc_size: 0 → 32 (no 64bit)
    sb.u32(264, FIXED_TIME); // s_mkfs_time
    sb.u16(348, 32); // s_min_extra_isize
    sb.u16(350, 32); // s_want_extra_isize

    // ---- group descriptor table -----------------------------------------
    // One 32-byte descriptor per group: block bitmap (+0), inode bitmap
    // (+4), inode table (+8), free blocks (+12), free inodes (+14), used
    // directories (+16). The remaining descriptor bytes stay zero.
    let mut gdt = Buf::new((l.gdt_blocks * BLOCK_SIZE) as usize);
    for g in 0..l.groups {
        let (bb, ib, it) = l.group_meta(g);
        let off = (g * 32) as usize;
        gdt.u32(off, bb as u32);
        gdt.u32(off + 4, ib as u32);
        gdt.u32(off + 8, it as u32);
        gdt.u16(off + 12, group_free_blocks[g as usize] as u16);
        gdt.u16(
            off + 14,
            (INODES_PER_GROUP
                - if g == 0 {
                    FIRST_NON_RESERVED_INO as u64
                } else {
                    0
                }) as u16,
        );
        gdt.u16(off + 16, if g == 0 { 2 } else { 0 }); // used dirs: /, /lost+found
    }

    // ---- per-group on-disk metadata --------------------------------------
    for g in 0..l.groups {
        let start = l.group_start(g);
        let (bb, ib, _it) = l.group_meta(g);

        if has_super(g) {
            if g == 0 {
                // Primary superblock lives at byte offset 1024 inside block 0.
                write_at(f, 1024, &sb.0)?;
            } else {
                // Backup copies sit at the very start of their group and
                // differ from the primary only in the group number field.
                let mut backup = Buf(sb.0.clone());
                backup.u16(90, g as u16); // s_block_group_nr
                write_at(f, start * BLOCK_SIZE, &backup.0)?;
            }
            // The descriptor table copy starts in the block right after the
            // superblock's block.
            write_at(f, (start + 1) * BLOCK_SIZE, &gdt.0)?;
        }

        // Block bitmap: metadata (+ root/lost+found in group 0) used; the
        // tail group's padding bits past blocks_in_group are 1.
        let mut bbm = Buf::new(BLOCK_SIZE as usize);
        let used_meta = Layout::group_overhead(g, l.gdt_blocks)
            + if g == 0 { 1 + LOST_FOUND_BLOCKS } else { 0 };
        for bit in 0..used_meta {
            set_bit(&mut bbm.0, bit as usize);
        }
        for bit in l.blocks_in_group(g)..BLOCKS_PER_GROUP {
            set_bit(&mut bbm.0, bit as usize);
        }
        write_at(f, bb * BLOCK_SIZE, &bbm.0)?;

        // Inode bitmap: group 0 reserves inodes 1..=11; every group pads the
        // bits past inodes_per_group with 1.
        let mut ibm = Buf::new(BLOCK_SIZE as usize);
        if g == 0 {
            for bit in 0..FIRST_NON_RESERVED_INO as usize {
                set_bit(&mut ibm.0, bit);
            }
        }
        for bit in INODES_PER_GROUP..(BLOCK_SIZE * 8) {
            set_bit(&mut ibm.0, bit as usize);
        }
        write_at(f, ib * BLOCK_SIZE, &ibm.0)?;

        // Inode tables stay holes: zeroed-on-read, exactly what
        // lazy_itable_init would leave for the kernel.
    }

    // ---- the two live inodes ---------------------------------------------
    // Inode numbers are 1-based, so inode N lives in slot N - 1 of group 0's
    // inode table. Root has 3 links, lost+found has 2.
    let (_, _, itable0) = l.group_meta(0);
    write_at(
        f,
        itable0 * BLOCK_SIZE + (ROOT_INO as u64 - 1) * INODE_SIZE,
        &dir_inode(3, root_block, 1).0,
    )?;
    write_at(
        f,
        itable0 * BLOCK_SIZE + (FIRST_NON_RESERVED_INO as u64 - 1) * INODE_SIZE,
        &dir_inode(2, lf_block, LOST_FOUND_BLOCKS).0,
    )?;

    // ---- directory data ---------------------------------------------------
    // Root: ".", "..", "lost+found". The last entry's rec_len is stretched
    // to cover the rest of the block.
    let mut root = Buf::new(BLOCK_SIZE as usize);
    let mut off = dirent(&mut root, 0, ROOT_INO, b".", 12);
    off += dirent(&mut root, off, ROOT_INO, b"..", 12);
    dirent(
        &mut root,
        off,
        FIRST_NON_RESERVED_INO,
        b"lost+found",
        BLOCK_SIZE as usize - off,
    );
    write_at(f, root_block * BLOCK_SIZE, &root.0)?;

    // lost+found: ".", ".." in its first block; the rest are empty dir
    // blocks (a single unused entry spanning the block).
    let mut lf = Buf::new(BLOCK_SIZE as usize);
    let off = dirent(&mut lf, 0, FIRST_NON_RESERVED_INO, b".", 12);
    dirent(&mut lf, off, ROOT_INO, b"..", BLOCK_SIZE as usize - off);
    write_at(f, lf_block * BLOCK_SIZE, &lf.0)?;
    let mut empty = Buf::new(BLOCK_SIZE as usize);
    empty.u16(4, BLOCK_SIZE as u16); // inode 0, rec_len = whole block
    for b in 1..LOST_FOUND_BLOCKS {
        write_at(f, (lf_block + b) * BLOCK_SIZE, &empty.0)?;
    }
    Ok(())
}

/// Build the raw on-disk image of a directory inode (mode 0755) with
/// `links` hard links, whose data is the `nblocks` contiguous blocks
/// beginning at block `start`, mapped by a single depth-0 extent stored
/// inline in `i_block`.
fn dir_inode(links: u16, start: u64, nblocks: u64) -> Buf {
    let data_bytes = nblocks * BLOCK_SIZE;
    let mut raw = Buf::new(INODE_SIZE as usize);

    // Core inode fields.
    raw.u16(0, 0o040755);
    raw.u32(4, data_bytes as u32); // size_lo
    // atime, ctime and mtime all carry the same fixed timestamp.
    for stamp_off in [8usize, 12, 16] {
        raw.u32(stamp_off, FIXED_TIME);
    }
    raw.u16(26, links);
    raw.u32(28, (data_bytes / 512) as u32); // i_blocks in 512s
    raw.u32(32, 0x0008_0000); // EXT4_EXTENTS_FL

    // Extent tree in i_block (offset 40): the header first...
    raw.u16(40, 0xF30A); // eh_magic
    raw.u16(42, 1); // eh_entries
    raw.u16(44, 4); // eh_max
    raw.u16(46, 0); // eh_depth
    // ...then its single leaf extent.
    raw.u32(52, 0); // ee_block (logical 0)
    raw.u16(56, nblocks as u16); // ee_len
    raw.u16(58, 0); // ee_start_hi
    raw.u32(60, start as u32); // ee_start_lo

    raw.u16(128, 32); // i_extra_isize
    raw
}

/// Emit a single directory entry at `off` inside `buf` and hand back the
/// `rec_len` that was passed in, so callers can advance their cursor.
///
/// The entry is an 8-byte header (inode, rec_len, name length, file type)
/// followed by the raw name bytes; the file type is always "directory".
fn dirent(buf: &mut Buf, off: usize, ino: u32, name: &[u8], rec_len: usize) -> usize {
    buf.bytes(off, &ino.to_le_bytes());
    buf.bytes(off + 4, &(rec_len as u16).to_le_bytes());
    buf.0[off + 6] = name.len() as u8;
    buf.0[off + 7] = 2; // file_type: directory
    buf.bytes(off + 8, name);
    rec_len
}

/// Set bit number `bit` in `bytes`, counting from the least-significant bit
/// of byte 0. Panics if the bit lies beyond the end of the slice.
fn set_bit(bytes: &mut [u8], bit: usize) {
    let (byte_idx, shift) = (bit / 8, bit % 8);
    bytes[byte_idx] |= 1 << shift;
}

/// Position `f` at absolute byte `offset` and write all of `data` there,
/// returning the first seek or write error.
fn write_at(f: &mut std::fs::File, offset: u64, data: &[u8]) -> std::io::Result<()> {
    let target = SeekFrom::Start(offset);
    f.seek(target).and_then(|_| f.write_all(data))
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;
    use std::process::Command;
    use std::time::{SystemTime, UNIX_EPOCH};

    const MIB: u64 = 1024 * 1024;

    /// Fresh scratch path in the system temp dir, made unique by the test
    /// name, the process id and the current time in nanoseconds. Any stale
    /// file at that path is removed first.
    fn tmp(name: &str) -> PathBuf {
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let file_name = format!("sm-ext4-test-{name}-{}-{nanos}", std::process::id());
        let path = std::env::temp_dir().join(file_name);
        let _ = std::fs::remove_file(&path);
        path
    }

    /// e2fsck if the host has it (brew e2fsprogs on mac, e2fsprogs on
    /// linux); otherwise skip — the volume integration suites still mount
    /// and exercise the filesystem in a real guest on both backends.
    fn e2fsck() -> Option<PathBuf> {
        let well_known = [
            "/usr/sbin/e2fsck",
            "/sbin/e2fsck",
            "/opt/homebrew/sbin/e2fsck",
            "/usr/local/sbin/e2fsck",
        ];
        // Android platform-tools ship e2fsprogs on macOS dev boxes —
        // the same place `locate_mke2fs` historically found mke2fs.
        let android = std::env::var("HOME")
            .ok()
            .map(|home| PathBuf::from(home).join("Library/Android/sdk/platform-tools/e2fsck"));

        let installed = well_known
            .iter()
            .map(|p| PathBuf::from(*p))
            .chain(android)
            .find(|p| p.is_file());
        if installed.is_some() {
            return installed;
        }

        // Last resort: ask the shell's PATH lookup.
        let which = Command::new("which").arg("e2fsck").output().ok()?;
        if !which.status.success() {
            return None;
        }
        let located = String::from_utf8_lossy(&which.stdout).trim().to_string();
        Some(PathBuf::from(located))
    }

    /// Assert that `e2fsck -fn` accepts the image at `path`; a no-op (with a
    /// note on stderr) when no e2fsck binary can be found.
    fn fsck_clean(path: &Path) {
        let fsck = match e2fsck() {
            Some(bin) => bin,
            None => {
                eprintln!(
                    "[skip] e2fsck not on this host — relying on guest-mount integration tests"
                );
                return;
            }
        };
        let out = Command::new(&fsck)
            .arg("-fn")
            .arg(path)
            .output()
            .expect("run e2fsck");
        let stdout = String::from_utf8_lossy(&out.stdout);
        let stderr = String::from_utf8_lossy(&out.stderr);
        assert!(
            out.status.success(),
            "e2fsck -fn failed on {}:\n{}\n{}",
            path.display(),
            stdout,
            stderr
        );
    }

    #[test]
    fn fsck_clean_across_sizes() {
        // Exercise: single group, multi-group, sparse_super backups (g 3,5,7,9),
        // a runt tail that gets dropped, and the common 1 GiB default.
        let sizes = [
            8 * MIB,
            128 * MIB,
            128 * MIB + 4096, // 1-block runt tail group
            1024 * MIB,
            1297 * MIB, // odd size, partial tail group
        ];
        for size in sizes {
            let image = tmp(&format!("sz{size}"));
            make_empty_ext4(&image, size).unwrap();
            fsck_clean(&image);
            let _ = std::fs::remove_file(&image);
        }
    }

    #[test]
    fn deterministic_output() {
        let first = tmp("det-a");
        let second = tmp("det-b");
        for image in [&first, &second] {
            make_empty_ext4(image, 256 * MIB).unwrap();
        }
        let first_bytes = std::fs::read(&first).unwrap();
        let second_bytes = std::fs::read(&second).unwrap();
        assert_eq!(
            first_bytes, second_bytes,
            "same-size volumes must be byte-identical"
        );
        let _ = std::fs::remove_file(&first);
        let _ = std::fs::remove_file(&second);
    }

    #[test]
    fn refuses_existing_file_and_tiny_sizes() {
        // An existing file must be left exactly as it was.
        let existing = tmp("exists");
        std::fs::write(&existing, b"precious").unwrap();
        let outcome = make_empty_ext4(&existing, 64 * MIB);
        assert!(outcome.is_err());
        assert_eq!(std::fs::read(&existing).unwrap(), b"precious");
        let _ = std::fs::remove_file(&existing);

        // A too-small request must fail without creating anything.
        let tiny = tmp("tiny");
        let outcome = make_empty_ext4(&tiny, 4 * MIB);
        assert!(outcome.is_err());
        assert!(!tiny.exists());
    }

    #[test]
    fn superblock_invariants() {
        let image = tmp("sbinv");
        make_empty_ext4(&image, 64 * MIB).unwrap();
        let data = std::fs::read(&image).unwrap();
        let sb = &data[1024..2048];
        let le32 =
            |off: usize| u32::from_le_bytes([sb[off], sb[off + 1], sb[off + 2], sb[off + 3]]);

        let magic = u16::from_le_bytes([sb[56], sb[57]]);
        assert_eq!(magic, EXT4_MAGIC);
        let compat = le32(92);
        assert_eq!(compat & 0x0004, 0, "HAS_JOURNAL must NOT be set");
        let blocks = le32(4) as u64;
        assert_eq!(blocks, 64 * MIB / BLOCK_SIZE);
        let _ = std::fs::remove_file(&image);
    }
}