supermachine 0.7.70

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
//! In-process squashfs writer (backhand), replacing the `mksquashfs`
//! subprocess. `mksquashfs` was the one bake-path tool NOT bundled with
//! macOS — it required a separate `brew install squashfs`. backhand builds
//! the filesystem in-process with zstd compression at level 3, matching the
//! exact image the guest kernel already mounts (squashfs-zstd), so this is a
//! drop-in with zero guest-side change.
//!
//! Node ownership is set directly on each node (uid/gid/perms), which folds
//! away the old `-all-root` + `-pseudo-override` + pseudo-file dance: the OCI
//! layer's non-root ownership is recovered from the tar headers and applied
//! to the matching nodes.

use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::os::unix::fs::{FileTypeExt, MetadataExt};
use std::path::{Path, PathBuf};

use backhand::compression::Compressor;
use backhand::{FilesystemCompressor, FilesystemWriter, NodeHeader};

/// Where each node's uid/gid/perms come from.
///
/// The policy is plain data (at most a path → `(uid, gid, perms)` map), so it
/// derives `Debug`/`Clone`/`PartialEq`/`Eq` to be loggable and comparable in
/// callers and tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum Ownership {
    /// uid=gid=0; perms+mtime from the on-disk file (mksquashfs `-all-root`).
    AllRoot,
    /// uid=gid=0 + on-disk perms by default, with per-relative-path overrides
    /// `(uid, gid, perms)` recovered from an OCI layer's tar headers (replaces
    /// `-all-root` + `-pseudo-override`). Map keys are paths relative to the
    /// source root, with no leading `./` or `/`.
    OciLayer(HashMap<String, (u32, u32, u16)>),
    /// uid/gid/perms/mtime all from the on-disk file (the supermachine delta
    /// layer: mksquashfs there ran without `-all-root`, recording the
    /// build-host uid for its own injected files).
    FromMetadata,
}

/// Build `out_path` as a zstd(level 3) squashfs of everything under `src_dir`.
/// No subprocess; bounded open FDs (one file streamed at a time).
///
/// * `src_dir` — root of the tree to pack; its contents (not the directory
///   itself) become the image root.
/// * `out_path` — image file to create (truncated if it already exists).
/// * `ownership` — policy deciding each node's uid/gid/permissions.
///
/// # Errors
///
/// Returns a human-readable message when the compressor cannot be built, a
/// symlink target cannot be read, a node cannot be added to the image, or the
/// output file cannot be created, written or flushed. Paths that disappear
/// between the directory walk and the `lstat` are skipped, not reported.
pub(crate) fn write_squashfs(
    src_dir: &Path,
    out_path: &Path,
    ownership: &Ownership,
) -> Result<(), String> {
    let mut fs = FilesystemWriter::default();
    // `Compressor::Zstd` with no explicit options compresses at level 3 —
    // identical to the bake's `mksquashfs -comp zstd -Xcompression-level 3`.
    let compressor = FilesystemCompressor::new(Compressor::Zstd, None)
        .map_err(|e| format!("squashfs zstd compressor: {e}"))?;
    fs.set_compressor(compressor);

    // Sorted walk so a directory is always pushed before its children
    // (a path is a strict prefix of, and sorts before, anything beneath it).
    let mut entries: Vec<PathBuf> = Vec::new();
    collect_tree(src_dir, &mut entries)?;
    entries.sort();

    for abs in &entries {
        let rel = abs
            .strip_prefix(src_dir)
            .map_err(|e| format!("strip prefix {}: {e}", abs.display()))?;
        // `symlink_metadata` (lstat) so symlinks are recorded as symlinks
        // rather than being followed.
        let meta = match std::fs::symlink_metadata(abs) {
            Ok(m) => m,
            // A path that vanished mid-walk (e.g. a racing cleanup) is skipped
            // rather than aborting the whole squashfs.
            Err(_) => continue,
        };
        let header = node_header(rel, &meta, ownership);
        // squashfs paths are absolute from the image root.
        let sq_path = Path::new("/").join(rel);
        let ft = meta.file_type();

        let push = if ft.is_dir() {
            fs.push_dir(&sq_path, header)
        } else if ft.is_symlink() {
            let target = std::fs::read_link(abs)
                .map_err(|e| format!("read symlink {}: {e}", abs.display()))?;
            fs.push_symlink(target, &sq_path, header)
        } else if ft.is_file() {
            // Lazy reader: the source file is only opened while it is being
            // streamed into the image.
            fs.push_file(LazyFile::new(abs.clone()), &sq_path, header)
        } else if ft.is_char_device() {
            // NOTE(review): `rdev` is 64-bit; the cast keeps only the low 32
            // bits — confirm that matches the device encoding backhand expects.
            fs.push_char_device(meta.rdev() as u32, &sq_path, header)
        } else if ft.is_block_device() {
            fs.push_block_device(meta.rdev() as u32, &sq_path, header)
        } else if ft.is_fifo() {
            fs.push_fifo(&sq_path, header)
        } else if ft.is_socket() {
            fs.push_socket(&sq_path, header)
        } else {
            // Unknown node type — nothing mksquashfs could record either.
            continue;
        };
        push.map_err(|e| format!("squashfs add {}: {e}", sq_path.display()))?;
    }

    let out = File::create(out_path).map_err(|e| format!("create {}: {e}", out_path.display()))?;
    let mut bw = std::io::BufWriter::new(out);
    fs.write(&mut bw)
        .map_err(|e| format!("write squashfs {}: {e}", out_path.display()))?;
    // Flush explicitly: dropping a `BufWriter` discards any error from writing
    // its buffered tail, which would report a truncated image as success.
    bw.into_inner().map_err(|e| {
        format!(
            "flush squashfs {}: {}",
            out_path.display(),
            e.error()
        )
    })?;
    Ok(())
}

/// Build the squashfs header — permissions, uid, gid and mtime — for one node.
///
/// Permissions default to the low 12 mode bits of the on-disk file, and the
/// mtime is the on-disk mtime clamped into the `u32` range. Owner and group
/// follow `ownership`: root for `AllRoot`, the on-disk ids for `FromMetadata`,
/// and for `OciLayer` the override registered under `rel` (which also replaces
/// the permissions), falling back to root + on-disk permissions when there is
/// no entry for that path.
fn node_header(rel: &Path, meta: &std::fs::Metadata, ownership: &Ownership) -> NodeHeader {
    let disk_mode = (meta.mode() & 0o7777) as u16;
    let stamp = meta.mtime().clamp(0, i64::from(u32::MAX)) as u32;
    // Shared fallback: root-owned with whatever permissions the file has on disk.
    let root_owned = (disk_mode, 0u32, 0u32);

    let (mode, owner, group) = match ownership {
        Ownership::AllRoot => root_owned,
        Ownership::FromMetadata => (disk_mode, meta.uid(), meta.gid()),
        Ownership::OciLayer(overrides) => {
            // Override keys are relative paths rendered as (lossy) UTF-8.
            let key = rel.to_string_lossy();
            overrides
                .get(&*key)
                .map_or(root_owned, |&(uid, gid, perms)| (perms, uid, gid))
        }
    };
    NodeHeader::new(mode, owner, group, stamp)
}

/// Recursively collect every path under `root` (excluding `root` itself).
///
/// Paths are appended to `out` in traversal order (unsorted). Symlinks are
/// recorded but never followed, so a link to a directory is a leaf.
///
/// # Errors
///
/// Fails if `root` itself cannot be listed — otherwise a missing or unreadable
/// source directory would silently produce an empty tree. Nested directories
/// that cannot be listed, and individual entries that fail to read, are
/// skipped on a best-effort basis (they may have vanished mid-walk).
fn collect_tree(root: &Path, out: &mut Vec<PathBuf>) -> Result<(), String> {
    let mut stack = vec![root.to_path_buf()];
    while let Some(dir) = stack.pop() {
        let rd = match std::fs::read_dir(&dir) {
            Ok(rd) => rd,
            // The root must be readable; anything deeper is best-effort.
            Err(e) if dir.as_path() == root => {
                return Err(format!("read dir {}: {e}", root.display()));
            }
            Err(_) => continue,
        };
        for entry in rd.flatten() {
            let path = entry.path();
            // Recurse into real directories only (not symlinks to dirs):
            // `DirEntry::file_type` does not follow symlinks.
            if matches!(entry.file_type(), Ok(t) if t.is_dir()) {
                stack.push(path.clone());
            }
            out.push(path);
        }
    }
    Ok(())
}

/// A `Read` over a backing file that opens lazily on first read and releases
/// the FD at EOF — so during `write()` at most one source file is open at a
/// time, and a layer with 100k files can't exhaust the FD table.
///
/// Once EOF is reached the reader STAYS at EOF (returns 0 forever); it never
/// reopens. backhand consumes each `push_file` reader exactly once
/// (`add_bytes` → `SquashfsFileWriter::Consumed`), and its block loop calls
/// `read_chunk()` one extra time past the file's end expecting an empty chunk
/// to terminate. An earlier version reopened-from-start on that post-EOF read,
/// replaying the whole file as a fresh block so the loop never terminated — any
/// file ≥ one 128 KiB block (with a partial tail) ballooned without end (an
/// ~8 MB alpine rootfs → >11 GB). A consumed reader must read as consumed.
///
/// A read into a zero-length buffer is a no-op: it returns 0 without opening
/// the file and without latching EOF, since a 0-byte result there says nothing
/// about the end of the file.
struct LazyFile {
    /// Path of the backing file; opened on the first non-empty read.
    path: PathBuf,
    /// Open handle while the file is being streamed, `None` before and after.
    file: Option<File>,
    /// Latched once the backing file reported end-of-file.
    eof: bool,
}

impl LazyFile {
    /// Create a reader for `path` without touching the filesystem.
    fn new(path: PathBuf) -> Self {
        Self {
            path,
            file: None,
            eof: false,
        }
    }
}

impl Read for LazyFile {
    /// Read from the backing file, opening it on demand.
    ///
    /// Returns `Ok(0)` once the file is exhausted (and on every later call),
    /// or when `buf` is empty. Errors from opening or reading the file are
    /// propagated; the handle, if any, is kept so the caller may retry.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        // An empty buffer always yields 0 bytes; that must not be mistaken
        // for EOF, or the reader would be marked consumed before any data
        // was delivered.
        if self.eof || buf.is_empty() {
            return Ok(0);
        }
        let n = match &mut self.file {
            Some(open) => open.read(buf)?,
            // Store the handle before reading so a failed read keeps it open.
            None => self.file.insert(File::open(&self.path)?).read(buf)?,
        };
        if n == 0 {
            // EOF — release the FD and latch EOF (never reopen + replay).
            self.file = None;
            self.eof = true;
        }
        Ok(n)
    }
}

/// Unit tests for the squashfs writer. Each test builds a small tree under the
/// system temp directory, writes it with `write_squashfs`, and reads the image
/// back with backhand's `FilesystemReader` to check paths and ownership;
/// `LazyFile` is additionally exercised directly.
#[cfg(test)]
mod tests {
    use super::*;
    use backhand::FilesystemReader;

    /// Regression for the `LazyFile` reopen balloon: a multi-block file (≥ one
    /// 128 KiB block, with a partial tail) plus symlinks must produce a small,
    /// finite squashfs. backhand's block loop reads one chunk past the file's
    /// end expecting empty to terminate; the old reopen-on-EOF replayed the
    /// whole file there, looping until it ballooned (an ~8 MB alpine rootfs →
    /// >11 GB, never finished). Latching EOF fixes it.
    #[test]
    fn multiblock_file_and_symlinks_do_not_balloon() {
        let dir = std::env::temp_dir().join(format!("sm-sqfs-many-{}", std::process::id()));
        let _ = std::fs::remove_dir_all(&dir);
        std::fs::create_dir_all(dir.join("bin")).unwrap();
        // > 1 squashfs block so backhand fragments the tail (triggers re-read).
        std::fs::write(dir.join("bin/busybox"), vec![0xabu8; 800 * 1024]).unwrap();
        for i in 0..400 {
            std::os::unix::fs::symlink("/bin/busybox", dir.join(format!("bin/applet{i}"))).unwrap();
        }
        std::fs::create_dir_all(dir.join("var")).unwrap();
        std::os::unix::fs::symlink("../run", dir.join("var/run")).unwrap();

        // The image is written next to the source dir, as `<dir>.squashfs`.
        let out = dir.with_extension("squashfs");
        let _ = std::fs::remove_file(&out);
        write_squashfs(&dir, &out, &Ownership::AllRoot).unwrap();
        let sz = std::fs::metadata(&out).unwrap().len();
        // 800 KiB of one-byte-repeated data compresses tiny; a balloon is GBs.
        assert!(sz < 8 * 1024 * 1024, "squashfs ballooned to {sz} bytes");

        // The image must still be a valid, readable squashfs with the file +
        // symlinks present (not corrupt/truncated).
        let bytes = std::fs::read(&out).unwrap();
        let reader = FilesystemReader::from_reader(std::io::Cursor::new(bytes)).unwrap();
        let paths: Vec<String> = reader
            .files()
            .map(|n| n.fullpath.to_string_lossy().into_owned())
            .collect();
        assert!(paths.iter().any(|p| p == "/bin/busybox"), "file present");
        assert!(paths.iter().any(|p| p == "/bin/applet0"), "symlink present");
        let _ = std::fs::remove_dir_all(&dir);
        let _ = std::fs::remove_file(&out);
    }

    /// `OciLayer` round-trip: a path with an override gets the override's
    /// uid/gid/perms, a path without one defaults to root, and a symlink is
    /// present in the image.
    #[test]
    fn roundtrip_tree_with_ownership_and_symlink() {
        let dir = std::env::temp_dir().join(format!("sm-sqfs-{}", std::process::id()));
        let _ = std::fs::remove_dir_all(&dir);
        std::fs::create_dir_all(dir.join("etc")).unwrap();
        std::fs::write(dir.join("etc/hello.txt"), b"hi there").unwrap();
        std::fs::write(dir.join("rootfile"), b"root-owned").unwrap();
        std::os::unix::fs::symlink("hello.txt", dir.join("etc/link")).unwrap();

        // One non-root override (uid/gid 1000, perms 0640).
        let mut overrides = HashMap::new();
        overrides.insert("etc/hello.txt".to_owned(), (1000u32, 1000u32, 0o640u16));

        let out = dir.with_extension("squashfs");
        let _ = std::fs::remove_file(&out);
        write_squashfs(&dir, &out, &Ownership::OciLayer(overrides)).unwrap();

        // Read it back and verify structure + ownership.
        let bytes = std::fs::read(&out).unwrap();
        let reader = FilesystemReader::from_reader(std::io::Cursor::new(bytes)).unwrap();
        let mut saw_hello = false;
        let mut saw_root = false;
        let mut saw_link = false;
        for node in reader.files() {
            let p = node.fullpath.to_string_lossy().into_owned();
            match p.as_str() {
                "/etc/hello.txt" => {
                    assert_eq!(node.header.uid, 1000, "override uid");
                    assert_eq!(node.header.gid, 1000, "override gid");
                    assert_eq!(node.header.permissions & 0o7777, 0o640);
                    saw_hello = true;
                }
                "/rootfile" => {
                    assert_eq!(node.header.uid, 0, "default root uid");
                    assert_eq!(node.header.gid, 0, "default root gid");
                    saw_root = true;
                }
                "/etc/link" => saw_link = true,
                _ => {}
            }
        }
        assert!(
            saw_hello && saw_root && saw_link,
            "expected all nodes present"
        );
        let _ = std::fs::remove_dir_all(&dir);
        let _ = std::fs::remove_file(&out);
    }

    /// Create a fresh, empty directory under the system temp dir. The name
    /// combines `tag`, the process id and a per-process counter, so tests
    /// running in parallel threads each get their own directory; any stale
    /// directory of the same name is removed first.
    fn unique_dir(tag: &str) -> PathBuf {
        static N: std::sync::atomic::AtomicU32 = std::sync::atomic::AtomicU32::new(0);
        let n = N.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        let d = std::env::temp_dir().join(format!("sm-sqfs-{tag}-{}-{n}", std::process::id()));
        let _ = std::fs::remove_dir_all(&d);
        std::fs::create_dir_all(&d).unwrap();
        d
    }

    /// Parse the squashfs at `out` and return one
    /// `(full path, uid, gid, permission bits masked to 0o7777)` tuple per
    /// node. Panics if the file cannot be read or parsed.
    fn read_back(out: &Path) -> Vec<(String, u32, u32, u16)> {
        let bytes = std::fs::read(out).unwrap();
        let reader = FilesystemReader::from_reader(std::io::Cursor::new(bytes)).unwrap();
        reader
            .files()
            .map(|n| {
                (
                    n.fullpath.to_string_lossy().into_owned(),
                    n.header.uid,
                    n.header.gid,
                    n.header.permissions & 0o7777,
                )
            })
            .collect()
    }

    /// `FromMetadata` (the delta layer) records the on-disk uid/gid, not root.
    #[test]
    fn from_metadata_uses_on_disk_ownership() {
        let dir = unique_dir("meta");
        std::fs::write(dir.join("f"), b"x").unwrap();
        // Whatever owner the test process gave the file is the expected value.
        let me = std::fs::metadata(dir.join("f")).unwrap();
        let (uid, gid) = (me.uid(), me.gid());

        let out = dir.with_extension("squashfs");
        write_squashfs(&dir, &out, &Ownership::FromMetadata).unwrap();
        let f = read_back(&out)
            .into_iter()
            .find(|(p, ..)| p == "/f")
            .expect("/f present");
        assert_eq!((f.1, f.2), (uid, gid), "FromMetadata keeps on-disk owner");
        let _ = std::fs::remove_dir_all(&dir);
        let _ = std::fs::remove_file(&out);
    }

    /// Deeply nested dirs round-trip — the sorted walk must push every parent
    /// before its children (push_dir requires the parent to exist).
    #[test]
    fn deep_nesting_roundtrips() {
        let dir = unique_dir("deep");
        let nested = dir.join("a/b/c/d/e/f/g");
        std::fs::create_dir_all(&nested).unwrap();
        std::fs::write(nested.join("leaf"), b"deep").unwrap();

        let out = dir.with_extension("squashfs");
        write_squashfs(&dir, &out, &Ownership::AllRoot).unwrap();
        let paths: Vec<String> = read_back(&out).into_iter().map(|(p, ..)| p).collect();
        assert!(
            paths.iter().any(|p| p == "/a/b/c/d/e/f/g/leaf"),
            "deep leaf present, got {paths:?}"
        );
        let _ = std::fs::remove_dir_all(&dir);
        let _ = std::fs::remove_file(&out);
    }

    /// A non-ASCII / multi-byte filename survives the walk + squashfs path
    /// round-trip (macOS may store it NFD on disk; we just preserve the bytes).
    #[test]
    fn unicode_path_roundtrips() {
        let dir = unique_dir("uni");
        std::fs::write(dir.join("café-η-日本.txt"), b"u").unwrap();
        let out = dir.with_extension("squashfs");
        write_squashfs(&dir, &out, &Ownership::AllRoot).unwrap();
        let paths: Vec<String> = read_back(&out).into_iter().map(|(p, ..)| p).collect();
        assert!(
            // NFC as written, or NFD (é → e + combining accent) as macOS may store it.
            paths
                .iter()
                .any(|p| p.contains("日本.txt") && p.contains("caf")),
            "unicode file present, got {paths:?}"
        );
        let _ = std::fs::remove_dir_all(&dir);
        let _ = std::fs::remove_file(&out);
    }

    /// The FD-bound claim: `LazyFile` opens on first read and releases the FD at
    /// EOF, so backhand (which reads files sequentially at write() time) keeps
    /// at most ~1 open — a 100k-file layer can't EMFILE. And critically, once
    /// EOF is reached it STAYS at EOF (the reader is consumed once; it must not
    /// reopen + replay, which previously ballooned the writer — see
    /// `multiblock_file_and_symlinks_do_not_balloon`).
    #[test]
    fn lazy_file_releases_fd_at_eof_and_stays_eof() {
        let dir = unique_dir("lazy");
        let p = dir.join("data");
        std::fs::write(&p, b"hello world").unwrap();

        let mut lf = LazyFile::new(p.clone());
        assert!(lf.file.is_none(), "no FD before first read");
        let mut buf = Vec::new();
        lf.read_to_end(&mut buf).unwrap();
        assert_eq!(buf, b"hello world");
        assert!(lf.file.is_none(), "FD released at EOF");
        // A consumed reader stays empty — it does NOT reopen and replay (the
        // backhand block loop relies on a post-EOF read returning empty to
        // terminate).
        let mut buf2 = Vec::new();
        lf.read_to_end(&mut buf2).unwrap();
        assert!(buf2.is_empty(), "consumed reader stays at EOF (no replay)");
        let _ = std::fs::remove_dir_all(&dir);
    }

    /// Many files in one squashfs build without error (exercises the bulk path
    /// the lazy-FD design protects).
    #[test]
    fn many_files_build() {
        let dir = unique_dir("many");
        for i in 0..400u32 {
            std::fs::write(dir.join(format!("f{i:04}")), format!("file {i}")).unwrap();
        }
        let out = dir.with_extension("squashfs");
        write_squashfs(&dir, &out, &Ownership::AllRoot).unwrap();
        // Every generated file is named `fNNNN`, so count nodes under `/f`.
        let n = read_back(&out)
            .into_iter()
            .filter(|(p, ..)| p.starts_with("/f"))
            .count();
        assert_eq!(n, 400, "all 400 files recorded");
        let _ = std::fs::remove_dir_all(&dir);
        let _ = std::fs::remove_file(&out);
    }
}