supermachine 0.7.82

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
//! In-process, journal-less ext4 image synthesis for data volumes.
//!
//! Replaces the host `mke2fs` shell-out (the last host-binary dependency on
//! the volume path, on BOTH backends — brew e2fsprogs on macOS, e2fsprogs on
//! Linux). An *empty* ext4 filesystem is a closed-form artifact: superblock,
//! group descriptors, bitmaps, the reserved inodes, root + `lost+found` —
//! everything else is zeros a sparse file already provides (including the
//! inode tables, which is exactly what `mke2fs -E lazy_itable_init=1` defers
//! to first mount; here they're simply holes).
//!
//! Deliberately **no journal** (`mke2fs -O ^has_journal` equivalent): the
//! volume durability contract makes it pure overhead. Pool-cycle writes are
//! ephemeral by design (per-cycle COW isolation, see `with_volume`); durable
//! writes happen at snapshot capture, where the pre-capture guest `sync` is
//! the consistency point. The journal would cost (a) a double-write on every
//! guest metadata commit — an `npm install` issues tens of thousands — and
//! (b) up to 64 MiB of touched journal blocks per volume dragged into every
//! capture-materialized `volumes/<i>.img`. Crash-mid-bake recovery is a
//! rebake, same as every other bake artifact.
//!
//! Also deliberately **deterministic**: fixed UUID/label/timestamps/hash
//! seed, so equal-sized volumes are byte-identical. Host mke2fs version
//! differences silently change defaults; this never drifts, and identical
//! blank images share page cache across VMs for free.
//!
//! Feature set (chosen for plain kernel-driver mounting, no checksumming):
//!   compat:    DIR_INDEX | EXT_ATTR
//!   incompat:  FILETYPE | EXTENTS
//!   ro_compat: SPARSE_SUPER | LARGE_FILE | DIR_NLINK | EXTRA_ISIZE
//! No resize_inode either — volumes are fixed-size by design (growing or
//! shrinking the backing file would invalidate the filesystem anyway).
//!
//! Validated by `e2fsck -fn` in unit tests (test-only host dependency,
//! skipped when absent) and by the volume integration suites, which mount,
//! exercise, snapshot, and cycle these filesystems on both backends.

use std::io::{Seek, SeekFrom, Write};
use std::path::Path;

/// Filesystem block size in bytes (4 KiB; recorded in the superblock as
/// `log_block_size = 2`).
const BLOCK_SIZE: u64 = 4096;
/// Blocks per block group: as many as a single block-sized bitmap can track.
const BLOCKS_PER_GROUP: u64 = 8 * BLOCK_SIZE; // one 4 KiB bitmap block of bits
/// mke2fs's default inode ratio (one inode per 16 KiB of space).
const INODE_RATIO: u64 = 16384;
/// Inodes per block group, derived from the group's byte size and the ratio.
const INODES_PER_GROUP: u64 = BLOCKS_PER_GROUP * BLOCK_SIZE / INODE_RATIO; // 8192
/// On-disk size of one inode record in bytes.
const INODE_SIZE: u64 = 256;
/// Blocks occupied by one group's inode table.
const ITABLE_BLOCKS: u64 = INODES_PER_GROUP * INODE_SIZE / BLOCK_SIZE; // 512

/// Superblock magic number (`s_magic`).
const EXT4_MAGIC: u16 = 0xEF53;
/// Inode number of the root directory.
const ROOT_INO: u32 = 2;
/// First non-reserved inode number; also the inode given to `lost+found`.
const FIRST_NON_RESERVED_INO: u32 = 11; // lost+found
/// Number of blocks pre-allocated to `lost+found`.
const LOST_FOUND_BLOCKS: u64 = 4; // 16 KiB, mke2fs convention

// Feature flags: bit values OR-ed into the superblock's compat / incompat /
// ro_compat feature words.
const COMPAT_EXT_ATTR: u32 = 0x0008;
const COMPAT_DIR_INDEX: u32 = 0x0020;
const INCOMPAT_FILETYPE: u32 = 0x0002;
const INCOMPAT_EXTENTS: u32 = 0x0040;
const RO_COMPAT_SPARSE_SUPER: u32 = 0x0001;
const RO_COMPAT_LARGE_FILE: u32 = 0x0002;
const RO_COMPAT_DIR_NLINK: u32 = 0x0020;
const RO_COMPAT_EXTRA_ISIZE: u32 = 0x0040;

/// Fixed identity for determinism. The guest mounts volumes by device path
/// (`sm.volume=/dev/vdX:...`), never by UUID, so a shared UUID is harmless.
const VOLUME_UUID: [u8; 16] = *b"supermachine.vol";
/// Fixed 16-byte volume label, written verbatim into the superblock.
const VOLUME_LABEL: &[u8; 16] = b"supermachine-vol";
/// 2020-09-13T12:26:40Z — any fixed PAST instant keeps e2fsck quiet
/// ("in the future" is what it warns about) while staying deterministic.
const FIXED_TIME: u32 = 1_600_000_000;
/// Fixed directory-hash seed (`s_hash_seed`). Read as big-endian ASCII the
/// four words spell `Supe` `rMac` `hine` `EXT4`; the value itself is arbitrary,
/// it only has to be constant.
const HASH_SEED: [u32; 4] = [0x5375_7065, 0x724d_6163, 0x6869_6e65, 0x4558_5434];

/// Zero-initialised byte buffer with little-endian field setters addressed by
/// byte offset. The inner `Vec` is exposed as `.0` for direct byte access.
///
/// Every setter panics if the field would extend past the end of the buffer.
struct Buf(Vec<u8>);
impl Buf {
    /// Allocate a buffer of `len` zero bytes.
    fn new(len: usize) -> Self {
        Buf(vec![0u8; len])
    }
    /// Store `v` as two little-endian bytes starting at `off`.
    fn u16(&mut self, off: usize, v: u16) {
        let le = v.to_le_bytes();
        self.bytes(off, &le);
    }
    /// Store `v` as four little-endian bytes starting at `off`.
    fn u32(&mut self, off: usize, v: u32) {
        let le = v.to_le_bytes();
        self.bytes(off, &le);
    }
    /// Copy `v` verbatim into the buffer starting at `off`.
    fn bytes(&mut self, off: usize, v: &[u8]) {
        let end = off + v.len();
        self.0[off..end].copy_from_slice(v);
    }
}

/// Does group `g` carry a superblock + GDT backup under sparse_super
/// (groups 0, 1, and powers of 3, 5, 7)?
///
/// The power test strips factors of each base by division instead of
/// multiplying a candidate up towards `g`, so it cannot overflow for any
/// `u64` input.
fn has_super(g: u64) -> bool {
    if g <= 1 {
        return true;
    }
    [3u64, 5, 7].iter().any(|&base| {
        // For g >= 2, g is a power of `base` exactly when dividing out every
        // factor of `base` leaves 1.
        let mut rest = g;
        while rest % base == 0 {
            rest /= base;
        }
        rest == 1
    })
}

/// Block-group geometry of the filesystem to be written.
struct Layout {
    /// Filesystem size in blocks; may be smaller than the device when a runt
    /// tail group was dropped.
    blocks_count: u64,
    /// Number of block groups.
    groups: u64,
    /// Blocks occupied by one copy of the group descriptor table.
    gdt_blocks: u64,
}

impl Layout {
    /// Largest block count representable on disk: block numbers and the
    /// superblock block count are stored in 32-bit fields (no 64bit feature).
    const MAX_BLOCKS: u64 = u32::MAX as u64;

    /// Compute the geometry for a device of `size_bytes`.
    ///
    /// # Errors
    ///
    /// Fails when the device is smaller than 8 MiB, or when it holds more
    /// blocks than the 32-bit on-disk fields can describe (16 TiB and up).
    /// The latter would otherwise be truncated silently into a corrupt
    /// filesystem.
    fn new(size_bytes: u64) -> std::io::Result<Layout> {
        // Group 0 must hold sb + GDT + bitmaps + itable + root + lost+found
        // with room to spare; 8 MiB is a comfortable floor.
        if size_bytes < 8 * 1024 * 1024 {
            return Err(std::io::Error::other(format!(
                "volume too small to format: {size_bytes} bytes (minimum 8 MiB)"
            )));
        }
        let device_blocks = size_bytes / BLOCK_SIZE;
        if device_blocks > Self::MAX_BLOCKS {
            return Err(std::io::Error::other(format!(
                "volume too large to format: {size_bytes} bytes (must be under 16 TiB)"
            )));
        }
        let mut groups = device_blocks.div_ceil(BLOCKS_PER_GROUP);
        let mut gdt_blocks = (groups * 32).div_ceil(BLOCK_SIZE);
        let mut blocks_count = device_blocks;
        // A runt tail group that can't hold its own metadata plus a little
        // data is dropped — the filesystem is simply smaller than the device
        // (perfectly valid; nothing requires the fs to fill the file).
        while groups > 1 {
            let tail = blocks_count - (groups - 1) * BLOCKS_PER_GROUP;
            let overhead = Self::group_overhead(groups - 1, gdt_blocks);
            if tail >= overhead + 8 {
                break;
            }
            groups -= 1;
            blocks_count = groups * BLOCKS_PER_GROUP;
            gdt_blocks = (groups * 32).div_ceil(BLOCK_SIZE);
        }
        Ok(Layout {
            blocks_count,
            groups,
            gdt_blocks,
        })
    }

    /// Metadata blocks at the start of group `g`.
    fn group_overhead(g: u64, gdt_blocks: u64) -> u64 {
        let sb = if has_super(g) { 1 + gdt_blocks } else { 0 };
        sb + 1 /* block bitmap */ + 1 /* inode bitmap */ + ITABLE_BLOCKS
    }

    /// Absolute block number of the first block of group `g`.
    fn group_start(&self, g: u64) -> u64 {
        g * BLOCKS_PER_GROUP
    }

    /// Number of blocks in group `g`; only the last group may be partial.
    fn blocks_in_group(&self, g: u64) -> u64 {
        if g == self.groups - 1 {
            self.blocks_count - g * BLOCKS_PER_GROUP
        } else {
            BLOCKS_PER_GROUP
        }
    }

    /// (block bitmap, inode bitmap, inode table) absolute block numbers.
    fn group_meta(&self, g: u64) -> (u64, u64, u64) {
        // Groups carrying a superblock copy place it and the GDT first.
        let base = self.group_start(g) + if has_super(g) { 1 + self.gdt_blocks } else { 0 };
        (base, base + 1, base + 2)
    }

    /// First data block of group 0 (right after its inode table): the root
    /// directory block, then lost+found's blocks.
    fn first_data_block(&self) -> u64 {
        let (_, _, itable) = self.group_meta(0);
        itable + ITABLE_BLOCKS
    }
}

/// Create `path` as a sparse file of `size_bytes` containing an empty,
/// journal-less ext4 filesystem. Byte-deterministic for a given size.
/// Refuses to touch an existing file (the caller's create-if-missing
/// contract lives with the caller; this is the format step only).
///
/// # Errors
///
/// Returns an error when the size cannot be laid out, when `path` already
/// exists or cannot be created, or when sizing, writing, or flushing the
/// image fails. If any step after the file is created fails, the file is
/// removed again (best effort) before the error is returned.
pub fn make_empty_ext4(path: &Path, size_bytes: u64) -> std::io::Result<()> {
    // Validate the geometry first so an unusable size never creates a file.
    let layout = Layout::new(size_bytes)?;
    let mut f = std::fs::OpenOptions::new()
        .read(true)
        .write(true)
        .create_new(true)
        .open(path)?;
    // Every step past this point shares one cleanup path: a failed `set_len`
    // or `flush` must not strand a partial file any more than a failed
    // format would.
    let formatted = f
        .set_len(size_bytes)
        .and_then(|()| write_fs(&mut f, &layout))
        .and_then(|()| f.flush());
    if let Err(e) = formatted {
        // Never leave a half-formatted file for the "already exists →
        // reuse" branch of the caller to trip over on retry.
        drop(f);
        let _ = std::fs::remove_file(path);
        return Err(e);
    }
    Ok(())
}

/// Write every non-zero metadata structure of the empty filesystem described
/// by `l` into `f`.
///
/// In order: the primary superblock and group descriptor table (plus a copy
/// of both in every group for which `has_super` is true), each group's block
/// and inode bitmaps, the root and `lost+found` inodes, and their directory
/// blocks. Inode tables are otherwise left unwritten. All writes are
/// positioned with `write_at`; the first I/O error aborts and is returned.
///
/// NOTE(review): block numbers and counts are narrowed with `as u32` /
/// `as u16`, which truncates silently — this assumes the layout stays below
/// 2^32 blocks; confirm the caller guarantees that.
fn write_fs(f: &mut std::fs::File, l: &Layout) -> std::io::Result<()> {
    let root_block = l.first_data_block();
    let lf_block = root_block + 1;

    // ---- accounting ----------------------------------------------------
    // Per-group and total free-block counts: group size minus its metadata,
    // minus the root and lost+found directory blocks in group 0.
    let mut free_blocks_total = 0u64;
    let mut group_free_blocks = Vec::with_capacity(l.groups as usize);
    for g in 0..l.groups {
        let mut free = l.blocks_in_group(g) - Layout::group_overhead(g, l.gdt_blocks);
        if g == 0 {
            free -= 1 + LOST_FOUND_BLOCKS; // root dir block + lost+found
        }
        group_free_blocks.push(free);
        free_blocks_total += free;
    }
    let inodes_count = l.groups * INODES_PER_GROUP;
    // Inodes 1..=FIRST_NON_RESERVED_INO are all in use (reserved + lost+found).
    let free_inodes = inodes_count - FIRST_NON_RESERVED_INO as u64;

    // ---- superblock ------------------------------------------------------
    // Numeric first arguments are byte offsets within the 1024-byte superblock.
    let mut sb = Buf::new(1024);
    sb.u32(0, inodes_count as u32);
    sb.u32(4, l.blocks_count as u32);
    sb.u32(8, 0); // reserved blocks: none — guests run as root anyway
    sb.u32(12, free_blocks_total as u32);
    sb.u32(16, free_inodes as u32);
    sb.u32(20, 0); // first_data_block (0 for 4 KiB blocks)
    sb.u32(24, 2); // log_block_size: 4096
    sb.u32(28, 2); // log_cluster_size (= block size, no bigalloc)
    sb.u32(32, BLOCKS_PER_GROUP as u32);
    sb.u32(36, BLOCKS_PER_GROUP as u32); // clusters_per_group
    sb.u32(40, INODES_PER_GROUP as u32);
    sb.u32(44, 0); // mtime: never mounted
    sb.u32(48, FIXED_TIME); // wtime
    sb.u16(52, 0); // mnt_count
    sb.u16(54, 0xFFFF); // max_mnt_count: disabled
    sb.u16(56, EXT4_MAGIC);
    sb.u16(58, 1); // state: clean
    sb.u16(60, 1); // errors: continue
    sb.u16(62, 0); // minor_rev
    sb.u32(64, FIXED_TIME); // lastcheck
    sb.u32(68, 0); // checkinterval: disabled
    sb.u32(72, 0); // creator_os: linux
    sb.u32(76, 1); // rev_level: dynamic
    sb.u32(84, FIRST_NON_RESERVED_INO);
    sb.u16(88, INODE_SIZE as u16);
    sb.u16(90, 0); // block_group_nr (primary; backups patch this)
    sb.u32(92, COMPAT_EXT_ATTR | COMPAT_DIR_INDEX);
    sb.u32(96, INCOMPAT_FILETYPE | INCOMPAT_EXTENTS);
    sb.u32(
        100,
        RO_COMPAT_SPARSE_SUPER | RO_COMPAT_LARGE_FILE | RO_COMPAT_DIR_NLINK | RO_COMPAT_EXTRA_ISIZE,
    );
    sb.bytes(104, &VOLUME_UUID);
    sb.bytes(120, VOLUME_LABEL);
    // s_journal_uuid (208) / s_journal_inum (224) / s_journal_dev (228) stay
    // zero — that, plus no HAS_JOURNAL compat bit, is "journal-less".
    sb.u32(236, HASH_SEED[0]); // s_hash_seed
    sb.u32(240, HASH_SEED[1]);
    sb.u32(244, HASH_SEED[2]);
    sb.u32(248, HASH_SEED[3]);
    sb.0[252] = 1; // s_def_hash_version: half_md4
    sb.u16(254, 0); // s_desc_size: 0 → 32 (no 64bit)
    sb.u32(264, FIXED_TIME); // s_mkfs_time
    sb.u16(348, 32); // s_min_extra_isize
    sb.u16(350, 32); // s_want_extra_isize

    // ---- group descriptor table -----------------------------------------
    // One 32-byte descriptor per group, packed into gdt_blocks whole blocks;
    // the unused tail of the last block stays zero.
    let mut gdt = Buf::new((l.gdt_blocks * BLOCK_SIZE) as usize);
    for g in 0..l.groups {
        let (bb, ib, it) = l.group_meta(g);
        let off = (g * 32) as usize;
        gdt.u32(off, bb as u32);
        gdt.u32(off + 4, ib as u32);
        gdt.u32(off + 8, it as u32);
        gdt.u16(off + 12, group_free_blocks[g as usize] as u16);
        gdt.u16(
            off + 14,
            (INODES_PER_GROUP
                - if g == 0 {
                    FIRST_NON_RESERVED_INO as u64
                } else {
                    0
                }) as u16,
        );
        gdt.u16(off + 16, if g == 0 { 2 } else { 0 }); // used dirs: /, /lost+found
    }

    // ---- per-group on-disk metadata --------------------------------------
    for g in 0..l.groups {
        let start = l.group_start(g);
        let (bb, ib, _it) = l.group_meta(g);

        if has_super(g) {
            if g == 0 {
                // Primary superblock lives at byte offset 1024 inside block 0.
                write_at(f, 1024, &sb.0)?;
            } else {
                // Backup copies sit at the very start of their group and
                // differ from the primary only in s_block_group_nr.
                let mut backup = Buf(sb.0.clone());
                backup.u16(90, g as u16); // s_block_group_nr
                write_at(f, start * BLOCK_SIZE, &backup.0)?;
            }
            // The GDT copy occupies the blocks right after the superblock's.
            write_at(f, (start + 1) * BLOCK_SIZE, &gdt.0)?;
        }

        // Block bitmap: metadata (+ root/lost+found in group 0) used; the
        // tail group's padding bits past blocks_in_group are 1.
        let mut bbm = Buf::new(BLOCK_SIZE as usize);
        let used_meta = Layout::group_overhead(g, l.gdt_blocks)
            + if g == 0 { 1 + LOST_FOUND_BLOCKS } else { 0 };
        for bit in 0..used_meta {
            set_bit(&mut bbm.0, bit as usize);
        }
        for bit in l.blocks_in_group(g)..BLOCKS_PER_GROUP {
            set_bit(&mut bbm.0, bit as usize);
        }
        write_at(f, bb * BLOCK_SIZE, &bbm.0)?;

        // Inode bitmap: group 0 reserves inodes 1..=11; every group pads the
        // bits past inodes_per_group with 1.
        let mut ibm = Buf::new(BLOCK_SIZE as usize);
        if g == 0 {
            for bit in 0..FIRST_NON_RESERVED_INO as usize {
                set_bit(&mut ibm.0, bit);
            }
        }
        for bit in INODES_PER_GROUP..(BLOCK_SIZE * 8) {
            set_bit(&mut ibm.0, bit as usize);
        }
        write_at(f, ib * BLOCK_SIZE, &ibm.0)?;

        // Inode tables stay holes: zeroed-on-read, exactly what
        // lazy_itable_init would leave for the kernel.
    }

    // ---- the two live inodes ---------------------------------------------
    // Inode numbers are 1-based, so inode N lives at slot N - 1 of the table.
    let (_, _, itable0) = l.group_meta(0);
    write_at(
        f,
        itable0 * BLOCK_SIZE + (ROOT_INO as u64 - 1) * INODE_SIZE,
        &dir_inode(3, root_block, 1).0,
    )?;
    write_at(
        f,
        itable0 * BLOCK_SIZE + (FIRST_NON_RESERVED_INO as u64 - 1) * INODE_SIZE,
        &dir_inode(2, lf_block, LOST_FOUND_BLOCKS).0,
    )?;

    // ---- directory data ---------------------------------------------------
    // Root: ".", "..", "lost+found".
    // The last entry's rec_len absorbs the remainder of the block.
    let mut root = Buf::new(BLOCK_SIZE as usize);
    let mut off = dirent(&mut root, 0, ROOT_INO, b".", 12);
    off += dirent(&mut root, off, ROOT_INO, b"..", 12);
    dirent(
        &mut root,
        off,
        FIRST_NON_RESERVED_INO,
        b"lost+found",
        BLOCK_SIZE as usize - off,
    );
    write_at(f, root_block * BLOCK_SIZE, &root.0)?;

    // lost+found: ".", ".." in its first block; the rest are empty dir
    // blocks (a single unused entry spanning the block).
    let mut lf = Buf::new(BLOCK_SIZE as usize);
    let off = dirent(&mut lf, 0, FIRST_NON_RESERVED_INO, b".", 12);
    dirent(&mut lf, off, ROOT_INO, b"..", BLOCK_SIZE as usize - off);
    write_at(f, lf_block * BLOCK_SIZE, &lf.0)?;
    let mut empty = Buf::new(BLOCK_SIZE as usize);
    empty.u16(4, BLOCK_SIZE as u16); // inode 0, rec_len = whole block
    for b in 1..LOST_FOUND_BLOCKS {
        write_at(f, (lf_block + b) * BLOCK_SIZE, &empty.0)?;
    }
    Ok(())
}

/// Build the on-disk record of a mode-0755 directory inode with `links`
/// hard links whose contents are the `nblocks` contiguous blocks beginning
/// at block `start`, mapped by a single depth-0 extent.
fn dir_inode(links: u16, start: u64, nblocks: u64) -> Buf {
    let byte_len = nblocks * BLOCK_SIZE;
    let mut inode = Buf::new(INODE_SIZE as usize);

    // 16-bit fields as (byte offset, value).
    let halfwords: [(usize, u16); 9] = [
        (0, 0o040755), // i_mode: directory, rwxr-xr-x
        (26, links),   // i_links_count
        // Extent tree in i_block (offset 40): header + one leaf extent.
        (40, 0xF30A),         // eh_magic
        (42, 1),              // eh_entries
        (44, 4),              // eh_max
        (46, 0),              // eh_depth
        (56, nblocks as u16), // ee_len
        (58, 0),              // ee_start_hi
        (128, 32),            // i_extra_isize
    ];
    // 32-bit fields as (byte offset, value).
    let words: [(usize, u32); 8] = [
        (4, byte_len as u32),          // size_lo
        (8, FIXED_TIME),               // atime
        (12, FIXED_TIME),              // ctime
        (16, FIXED_TIME),              // mtime
        (28, (byte_len / 512) as u32), // i_blocks in 512s
        (32, 0x0008_0000),             // EXT4_EXTENTS_FL
        (52, 0),                       // ee_block (logical 0)
        (60, start as u32),            // ee_start_lo
    ];

    // The fields do not overlap, so the write order is irrelevant.
    for &(at, value) in halfwords.iter() {
        inode.u16(at, value);
    }
    for &(at, value) in words.iter() {
        inode.u32(at, value);
    }
    inode
}

/// Emit a single directory entry for inode `ino` named `name` at byte offset
/// `off` of `buf`, claiming `rec_len` bytes of the block. The entry's file
/// type is always "directory". Returns `rec_len` so callers can advance
/// their write offset.
fn dirent(buf: &mut Buf, off: usize, ino: u32, name: &[u8], rec_len: usize) -> usize {
    const FILE_TYPE_DIR: u8 = 2;

    // Fixed 8-byte header: inode, rec_len, name_len, file_type.
    buf.u32(off, ino);
    buf.u16(off + 4, rec_len as u16);
    let name_len = name.len() as u8;
    buf.0[off + 6] = name_len;
    buf.0[off + 7] = FILE_TYPE_DIR;

    // The name follows the header directly.
    let name_at = off + 8;
    buf.bytes(name_at, name);
    rec_len
}

/// Set bit number `bit` in `bytes`, where bit 0 is the least-significant bit
/// of byte 0. Panics if the bit lies beyond the end of the slice.
fn set_bit(bytes: &mut [u8], bit: usize) {
    let (index, shift) = (bit / 8, bit % 8);
    let mask = 1u8 << shift;
    bytes[index] |= mask;
}

/// Position `f` at absolute byte `offset` and write all of `data` there,
/// returning the first seek or write error encountered.
fn write_at(f: &mut std::fs::File, offset: u64, data: &[u8]) -> std::io::Result<()> {
    let target = SeekFrom::Start(offset);
    f.seek(target).and_then(|_| f.write_all(data))
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// A fresh path in the system temp dir, made unique by `name`, the
    /// process id, and the current time in nanoseconds. Any stale file at
    /// that path is removed first.
    fn tmp(name: &str) -> PathBuf {
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let file_name = format!("sm-ext4-test-{name}-{}-{}", std::process::id(), nanos);
        let path = std::env::temp_dir().join(file_name);
        let _ = std::fs::remove_file(&path);
        path
    }

    /// e2fsck if the host has it (brew e2fsprogs on mac, e2fsprogs on
    /// linux); otherwise skip — the volume integration suites still mount
    /// and exercise the filesystem in a real guest on both backends.
    fn e2fsck() -> Option<PathBuf> {
        let well_known = [
            "/usr/sbin/e2fsck",
            "/sbin/e2fsck",
            "/opt/homebrew/sbin/e2fsck",
            "/usr/local/sbin/e2fsck",
        ];
        // Android platform-tools ship e2fsprogs on macOS dev boxes —
        // the same place `locate_mke2fs` historically found mke2fs.
        let android = std::env::var("HOME")
            .ok()
            .map(|home| PathBuf::from(home).join("Library/Android/sdk/platform-tools/e2fsck"));

        let installed = well_known
            .iter()
            .map(|p| PathBuf::from(*p))
            .chain(android)
            .find(|p| p.is_file());
        if installed.is_some() {
            return installed;
        }

        // Last resort: ask the shell's PATH lookup.
        let out = std::process::Command::new("which")
            .arg("e2fsck")
            .output()
            .ok()?;
        if !out.status.success() {
            return None;
        }
        let located = String::from_utf8_lossy(&out.stdout).trim().to_string();
        Some(PathBuf::from(located))
    }

    /// Assert that `e2fsck -fn` accepts the image at `path`; a no-op (with a
    /// note on stderr) when e2fsck is unavailable.
    fn fsck_clean(path: &Path) {
        let fsck = match e2fsck() {
            Some(bin) => bin,
            None => {
                eprintln!(
                    "[skip] e2fsck not on this host — relying on guest-mount integration tests"
                );
                return;
            }
        };
        let out = std::process::Command::new(&fsck)
            .arg("-fn")
            .arg(path)
            .output()
            .expect("run e2fsck");
        let stdout = String::from_utf8_lossy(&out.stdout);
        let stderr = String::from_utf8_lossy(&out.stderr);
        assert!(
            out.status.success(),
            "e2fsck -fn failed on {}:\n{}\n{}",
            path.display(),
            stdout,
            stderr
        );
    }

    #[test]
    fn fsck_clean_across_sizes() {
        const MIB: u64 = 1024 * 1024;
        // Exercise: single group, multi-group, sparse_super backups (g 3,5,7,9),
        // a runt tail that gets dropped, and the common 1 GiB default.
        let sizes = [
            8 * MIB,
            128 * MIB,
            128 * MIB + 4096, // 1-block runt tail group
            1024 * MIB,
            1297 * MIB, // odd size, partial tail group
        ];
        for &size in sizes.iter() {
            let image = tmp(&format!("sz{size}"));
            make_empty_ext4(&image, size).unwrap();
            fsck_clean(&image);
            let _ = std::fs::remove_file(&image);
        }
    }

    #[test]
    fn deterministic_output() {
        let size = 256 * 1024 * 1024;
        let first = tmp("det-a");
        let second = tmp("det-b");
        make_empty_ext4(&first, size).unwrap();
        make_empty_ext4(&second, size).unwrap();
        let first_bytes = std::fs::read(&first).unwrap();
        let second_bytes = std::fs::read(&second).unwrap();
        assert_eq!(
            first_bytes, second_bytes,
            "same-size volumes must be byte-identical"
        );
        let _ = std::fs::remove_file(&first);
        let _ = std::fs::remove_file(&second);
    }

    #[test]
    fn refuses_existing_file_and_tiny_sizes() {
        // An existing file is neither overwritten nor modified.
        let existing = tmp("exists");
        std::fs::write(&existing, b"precious").unwrap();
        let outcome = make_empty_ext4(&existing, 64 * 1024 * 1024);
        assert!(outcome.is_err());
        assert_eq!(std::fs::read(&existing).unwrap(), b"precious");
        let _ = std::fs::remove_file(&existing);

        // A size below the minimum fails without creating anything.
        let tiny = tmp("tiny");
        let outcome = make_empty_ext4(&tiny, 4 * 1024 * 1024);
        assert!(outcome.is_err());
        assert!(!tiny.exists());
    }

    #[test]
    fn superblock_invariants() {
        let image = tmp("sbinv");
        make_empty_ext4(&image, 64 * 1024 * 1024).unwrap();
        let data = std::fs::read(&image).unwrap();
        // The primary superblock occupies bytes 1024..2048 of the image.
        let sb = &data[1024..2048];
        let le16 = |at: usize| u16::from_le_bytes([sb[at], sb[at + 1]]);
        let le32 = |at: usize| u32::from_le_bytes([sb[at], sb[at + 1], sb[at + 2], sb[at + 3]]);

        assert_eq!(le16(56), EXT4_MAGIC);
        assert_eq!(le32(92) & 0x0004, 0, "HAS_JOURNAL must NOT be set");
        assert_eq!(le32(4) as u64, 64 * 1024 * 1024 / BLOCK_SIZE);
        let _ = std::fs::remove_file(&image);
    }
}