supermachine 0.7.69

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
//! OCI layer materialisation: layer-SHA enumeration (image-index /
//! legacy manifest), the build plan, per-layer ownership recovery (for the
//! in-process squashfs writer), OCI whiteout handling, and parallel layer
//! extraction → squashfs.
//! Split out of the bake monolith; shared via `use super::*`.

use super::*;

/// Loads a cached layer index (one layer SHA per line) and returns it only
/// when every listed layer already has a `<sha>.squashfs` file in `cache_dir`.
///
/// Returns `Ok(None)` when the index file does not exist, lists no SHAs, or
/// names at least one layer that is absent from the cache.
///
/// # Errors
/// Any read failure other than "not found" is reported with the index path.
pub(super) fn read_layer_index(
    path: &Path,
    cache_dir: &Path,
) -> Result<Option<Vec<String>>, String> {
    let contents = match std::fs::read_to_string(path) {
        Ok(contents) => contents,
        Err(e) => {
            if e.kind() == std::io::ErrorKind::NotFound {
                return Ok(None);
            }
            return Err(format!("read layer index {}: {e}", path.display()));
        }
    };

    // Blank (or whitespace-only) lines carry no SHA.
    let mut shas = Vec::new();
    for raw in contents.lines() {
        let sha = raw.trim();
        if !sha.is_empty() {
            shas.push(sha.to_owned());
        }
    }
    if shas.is_empty() {
        return Ok(None);
    }

    // A single uncached layer makes the whole index unusable.
    for sha in &shas {
        if !cache_dir.join(format!("{sha}.squashfs")).is_file() {
            return Ok(None);
        }
    }
    Ok(Some(shas))
}

/// Enumerates the layer SHAs of `image` from an image save directory,
/// preferring the OCI `index.json` route and falling back to the legacy
/// Docker `manifest.json` when that route fails for any reason.
///
/// # Errors
/// When both routes fail, the returned message carries both failures.
pub(super) fn layer_shas_from_save_dir(
    image: &str,
    save_dir: &Path,
    want_arch: &str,
) -> Result<Vec<String>, String> {
    // OCI first. Docker Desktop's `docker save` of a multi-arch image ships
    // the layer blobs but omits the per-arch manifest blob its own OCI index
    // references, so a failure here is expected for such saves (see
    // `inspect_oci_layout` for the parallel fallback on the metadata side).
    let index_err = match layer_shas_from_save_dir_via_index(image, save_dir, want_arch) {
        Ok(shas) => return Ok(shas),
        Err(e) => e,
    };

    match layer_shas_from_legacy_manifest(image, save_dir, want_arch) {
        Ok(shas) => Ok(shas),
        Err(legacy_err) => Err(format!(
            "OCI layer enumeration failed ({index_err}); legacy \
             manifest.json fallback also failed: {legacy_err}"
        )),
    }
}

pub(super) fn layer_shas_from_save_dir_via_index(
    image: &str,
    save_dir: &Path,
    want_arch: &str,
) -> Result<Vec<String>, String> {
    let index_text = std::fs::read_to_string(save_dir.join("index.json"))
        .map_err(|e| format!("read image save index.json: {e}"))?;
    let index: serde_json::Value =
        serde_json::from_str(&index_text).map_err(|e| format!("image save index JSON: {e}"))?;
    // Walk nested OCI image-index entries. Apple's
    // `container image save` wraps even single-arch images in an
    // outer `oci.image.index.v1+json` whose only `manifests[]` entry
    // points at *another* image-index, and the actual
    // `platform.architecture` lives one level down. Docker's
    // `docker save --platform linux/amd64` does the same nesting.
    // Plain manifest-list registries land all platforms at the top
    // level. The shared helper handles all three shapes by recursing
    // on `image.index` / `manifest.list` descriptors — either when
    // the descriptor has no `platform` field (nesting wrapper), or
    // when its platform matches `want_arch`.
    let descriptor = find_oci_manifest_descriptor(save_dir, &index, want_arch, 0)
        .map_err(|_| format!("no {want_arch} manifest in image {image}"))?;
    let manifest_digest = descriptor
        .get("digest")
        .and_then(|v| v.as_str())
        .ok_or_else(|| format!("no {want_arch} manifest in image {image}"))
        .and_then(sha256_path_component)?;
    let manifest_path = save_dir.join("blobs/sha256").join(&manifest_digest);
    let manifest_text = std::fs::read_to_string(&manifest_path)
        .map_err(|e| format!("read image save manifest {}: {e}", manifest_path.display()))?;
    let manifest: serde_json::Value = serde_json::from_str(&manifest_text)
        .map_err(|e| format!("image save manifest JSON: {e}"))?;
    let shas: Vec<String> = manifest
        .get("layers")
        .and_then(|v| v.as_array())
        .ok_or_else(|| "image save manifest missing layers".to_owned())?
        .iter()
        .map(|layer| {
            layer
                .get("digest")
                .and_then(|v| v.as_str())
                .ok_or_else(|| "image save layer missing digest".to_owned())
                .and_then(sha256_path_component)
        })
        .collect::<Result<Vec<_>, _>>()?;
    if shas.is_empty() {
        return Err(format!("image {image} has no {want_arch} layers"));
    }
    Ok(shas)
}

/// Fallback layer enumeration via legacy Docker `manifest.json`.
/// Used when the OCI image-index path fails because Docker Desktop's
/// `docker save` omits the per-arch manifest blob it references.
///
/// `manifest.json` shape:
/// ```json
/// [ { "Config": "blobs/sha256/<digest>", "RepoTags": ["..."],
///     "Layers": [ "blobs/sha256/<sha>", ... ] } ]
/// ```
/// The Layers entries are paths relative to `save_dir`, already
/// prefixed with `blobs/sha256/` — we strip that to return just
/// the SHA strings so the caller's existing
/// `layer_cache_dir/<sha>.squashfs` logic works unchanged.
///
/// `want_arch` is enforced by reading the referenced config blob's
/// `architecture` field. If the saved image is the wrong arch, we
/// fail with a clear pointer at the right `docker save --platform`
/// invocation (rather than silently booting wrong-arch binaries).
///
/// # Errors
/// Fails when the manifest entry cannot be loaded, `Config` or `Layers` is
/// missing, the config blob cannot be read or parsed, the architecture does
/// not match, a `Layers` entry is not a string or is rejected by
/// `sha256_path_component`, or the layer list is empty.
pub(super) fn layer_shas_from_legacy_manifest(
    image: &str,
    save_dir: &Path,
    want_arch: &str,
) -> Result<Vec<String>, String> {
    let entry = read_legacy_docker_manifest_entry(image, save_dir)?;
    let config_rel = entry
        .get("Config")
        .and_then(|v| v.as_str())
        .ok_or_else(|| "legacy manifest entry missing Config".to_owned())?;
    // `Config` is a manifest-controlled relative path; confine it so a
    // hostile `"Config": "../../etc/passwd"` can't read outside save_dir.
    let config_path = confined_join(save_dir, config_rel)?;
    let config_text = std::fs::read_to_string(&config_path)
        .map_err(|e| format!("read legacy config {}: {e}", config_path.display()))?;
    let config: serde_json::Value =
        serde_json::from_str(&config_text).map_err(|e| format!("parse legacy config: {e}"))?;
    // A config without an `architecture` string compares as "" and is
    // therefore rejected by the mismatch check below.
    let actual_arch = config
        .get("architecture")
        .and_then(|v| v.as_str())
        .unwrap_or("");
    if actual_arch != want_arch {
        return Err(format!(
            "legacy manifest.json describes a linux/{actual_arch} image but \
             the request asked for linux/{want_arch}; re-run `docker save` \
             with `--platform linux/{want_arch}` against a multi-arch source"
        ));
    }
    let layers = entry
        .get("Layers")
        .and_then(|v| v.as_array())
        .ok_or_else(|| "legacy manifest entry missing Layers".to_owned())?;
    // Every entry must be a string. Silently dropping a malformed entry would
    // produce a rootfs with a layer missing, so fail instead — matching how
    // the OCI-index path treats a layer without a digest.
    let shas = layers
        .iter()
        .map(|layer| -> Result<String, String> {
            let rel = layer
                .as_str()
                .ok_or_else(|| "legacy manifest Layers entry is not a string".to_owned())?;
            sha256_path_component(rel.strip_prefix("blobs/sha256/").unwrap_or(rel))
        })
        .collect::<Result<Vec<String>, String>>()?;
    if shas.is_empty() {
        return Err(format!(
            "legacy manifest.json for image {image} has no layers"
        ));
    }
    Ok(shas)
}

/// Works out which layers `plan.image` consists of and how many of them are
/// already present in the layer cache as `<sha>.squashfs`.
///
/// Returns `Ok(None)` for any runtime other than `"supermachine"`. Otherwise
/// the layer list comes from the per-image index file when one exists and is
/// complete (`manifest_cache_hit = true`); if not, the image is saved through
/// `source`, its layers are enumerated, and — when the image id is known —
/// the index file is (re)written. The save directories are handed back in the
/// plan only when a save actually happened.
///
/// # Errors
/// Propagates failures from creating the cache directories, reading or
/// writing the index, saving the image, and enumerating its layers.
pub(super) fn plan_layers(
    plan: &BakePlan<'_>,
    resolution: &ImageResolution,
    source: &dyn ImageSource,
) -> Result<Option<LayerPlan>, String> {
    if plan.runtime != "supermachine" {
        return Ok(None);
    }
    let started = Instant::now();
    let cache_dir = layer_cache_dir();
    let index_dir = cache_dir.join("images");
    if let Err(e) = std::fs::create_dir_all(&index_dir) {
        return Err(format!(
            "create layer index dir {}: {e}",
            index_dir.display()
        ));
    }
    if let Err(e) = std::fs::create_dir_all(cache_dir.join("deltas")) {
        return Err(format!("create layer delta cache dir: {e}"));
    }

    let arch = plan.arch();
    // The index is keyed by image id + arch; without an id there is no index.
    let index_path = resolution
        .image_id
        .as_deref()
        .map(|image_id| index_dir.join(format!("{image_id}.{arch}.layers")));

    let indexed_shas = match index_path.as_deref() {
        Some(path) => read_layer_index(path, &cache_dir)?,
        None => None,
    };
    let (layer_shas, save_work_dir, save_dir, manifest_cache_hit) = match indexed_shas {
        Some(shas) => (shas, None, None, true),
        None => {
            let work_dir = temp_work_dir("supermachine-layer-plan")?;
            let saved_dir = source.save(plan.image, &work_dir)?;
            let shas = layer_shas_from_save_dir(plan.image, &saved_dir, arch)?;
            if let Some(path) = index_path.as_deref() {
                std::fs::write(path, format!("{}\n", shas.join("\n")))
                    .map_err(|e| format!("write layer index {}: {e}", path.display()))?;
            }
            (shas, Some(work_dir), Some(saved_dir), false)
        }
    };

    let mut cached_layers = 0usize;
    for sha in &layer_shas {
        if cache_dir.join(format!("{sha}.squashfs")).is_file() {
            cached_layers += 1;
        }
    }
    let missing_layers = layer_shas.len().saturating_sub(cached_layers);

    Ok(Some(LayerPlan {
        cache_dir,
        index_path,
        layer_shas,
        save_work_dir,
        save_dir,
        cached_layers,
        missing_layers,
        manifest_cache_hit,
        plan_ms: elapsed_ms(started),
    }))
}

/// Recover the original uid / gid / perms from a layer tarball's headers —
/// only for paths the tar marks as non-root. Returned as a `relative-path →
/// (uid, gid, perms)` map the squashfs writer applies directly to the
/// matching node (it pushes every other node as root by default).
///
/// **Why:** macOS BSD `tar -xf` cannot preserve in-tar ownership
/// unless run as root — `chown()` is rejected for non-root callers,
/// so tar silently falls back to the running user's uid/gid (~501
/// on a typical macOS dev machine). The resulting `layer_extract/`
/// directory thus has *every* system file owned by uid 501 instead
/// of root. If that uid reached the squashfs, the guest would see
/// `/etc/passwd`, `/bin/su`, every setuid binary owned by an unknown
/// user — `sudo` refuses to run (`/etc/sudo.conf` not root-owned),
/// busybox `su` setuid-check fails, postgres entrypoints running
/// `sudo -u postgres ...` exit early, package managers refuse to run.
///
/// **Fix shape:** the squashfs writer defaults every node to uid 0
/// gid 0 — correct for ~99% of OCI base-image rootfs paths, and in
/// particular for every system file (`/etc`, `/usr`, `/bin`, setuid
/// binaries, …). This map then *overrides* only the paths the tar
/// header marks as non-root (typical example: the postgres image has
/// `/var/lib/postgresql` owned by the `postgres` user):
///
///  1. The map is tiny — root-owned paths (Unicode certs etc.) carry
///     no entry, so an NFC/NFD readdir-vs-tar byte mismatch on those
///     simply leaves them root-owned (the right answer).
///  2. Failure modes are *safe*: a missed override means the file
///     stays root-owned, correct for every system path.
///
/// When a path occurs more than once in the tar, the **last** occurrence
/// decides: a later root-owned (or unrepresentable-id) entry drops any
/// override recorded for an earlier occurrence.
///
/// Whiteout markers (`.wh.*`) are skipped — the existing
/// `remove_oci_whiteouts` step removes their targets from the
/// extracted tree.
///
/// # Errors
/// Fails when the blob cannot be opened or the tar entry iterator cannot be
/// created. Individual malformed entries are skipped, not reported.
pub(super) fn recover_layer_ownership(
    blob_path: &Path,
    extracted_root: &Path,
) -> Result<std::collections::HashMap<String, (u32, u32, u16)>, String> {
    use std::collections::HashMap;
    use std::fs::File;
    use std::io::{BufRead, BufReader, Read};

    // Detect gzip via magic bytes (OCI layers are typically
    // `tar+gzip` but `tar+zstd` and uncompressed `tar` are also
    // valid per the OCI image-layer media-type spec). We only
    // handle raw tar and gzip-wrapped tar; anything else is fed
    // to the tar reader as-is.
    let file = File::open(blob_path)
        .map_err(|e| format!("open layer blob {}: {e}", blob_path.display()))?;
    let mut buffered = BufReader::new(file);
    // `fill_buf` peeks without consuming, so the same reader is handed on
    // with the stream still positioned at byte 0 — no second open needed.
    // A failed or short peek is treated as "not gzip", as before.
    let is_gzip = buffered
        .fill_buf()
        .map(|head| head.starts_with(&[0x1f, 0x8b]))
        .unwrap_or(false);
    let reader: Box<dyn Read> = if is_gzip {
        Box::new(flate2::read::GzDecoder::new(buffered))
    } else {
        Box::new(buffered)
    };

    let mut archive = tar::Archive::new(reader);
    // Keyed by normalized relative path so a path that appears several
    // times in one layer (uncommon but valid per OCI) ends up with a single,
    // predictable outcome: whatever its last tar entry says.
    //
    // The value tuple is `(uid, gid, perms)`, applied directly to the
    // matching squashfs node. We `to_string_lossy` the path once at read
    // time so the HashMap key is owned.
    let mut entries_by_path: HashMap<String, (u32, u32, u16)> = HashMap::new();

    for entry_result in archive
        .entries()
        .map_err(|e| format!("read tar entries from {}: {e}", blob_path.display()))?
    {
        // A malformed entry shouldn't kill the whole bake — move on; that
        // path simply keeps the writer's default (root) ownership.
        let Ok(entry) = entry_result else {
            continue;
        };
        let header = entry.header();
        let Ok(path) = entry.path() else {
            continue;
        };
        let Some(name_str) = path.file_name().and_then(|s| s.to_str()) else {
            continue;
        };
        // Whiteout markers are gone from the extracted dir by now (see
        // `remove_oci_whiteouts`), so they can never need an override.
        if name_str.starts_with(".wh.") {
            continue;
        }
        // Hardlinks share metadata with their target, and the GNU long-name /
        // pax extension records aren't filesystem entries at all.
        let entry_type = header.entry_type();
        if entry_type.is_hard_link()
            || entry_type.is_gnu_longname()
            || entry_type.is_gnu_longlink()
            || entry_type.is_pax_global_extensions()
            || entry_type.is_pax_local_extensions()
        {
            continue;
        }

        // Map key: the path relative to the extracted root. OCI tar entries
        // are typically `path/to/file` (no leading slash) but we
        // defensively strip `./`, `/` and a trailing `/`.
        let path_str = path.to_string_lossy();
        let normalized = path_str
            .trim_start_matches("./")
            .trim_start_matches('/')
            .trim_end_matches('/');
        if normalized.is_empty() {
            // Root entry — `.` or `/` — never overridden.
            continue;
        }

        let mode = header.mode().unwrap_or(0o644) & 0o7777;
        let uid = header.uid().unwrap_or(0);
        let gid = header.gid().unwrap_or(0);
        // Root-owned entries need no override: root is the writer's default.
        // Keeping them out of the map also sidesteps tar/disk byte mismatches
        // (Unicode filenames extracted as NFD on macOS, paths removed by
        // `remove_oci_whiteouts`, etc.). But if an *earlier* entry for this
        // same path recorded a non-root owner, that override is now stale —
        // the extracted file is this later entry — so drop it.
        if uid == 0 && gid == 0 {
            entries_by_path.remove(normalized);
            continue;
        }
        // Tar headers carry 64-bit ids; the override map holds 32-bit ones.
        // An id that doesn't fit must not be silently truncated into some
        // unrelated user — fall back to the safe default (root) instead.
        let (Ok(uid), Ok(gid)) = (u32::try_from(uid), u32::try_from(gid)) else {
            entries_by_path.remove(normalized);
            continue;
        };

        // Paths with control characters (newline, NUL, etc.) are left at the
        // default ownership. NOTE(review): this guard dates from the
        // line-oriented pseudo-file format; it is retained unchanged.
        if normalized.bytes().any(|b| b < 0x20 || b == 0x7f) {
            continue;
        }
        // Only record overrides for paths that actually exist in the
        // extracted dir. The tar may carry paths that `remove_oci_whiteouts`
        // deleted (file `X` removed because the same layer contained
        // `.wh.X`) or paths whose containing dir was pruned.
        //
        // We use `symlink_metadata` so we DON'T follow symlinks (a symlink
        // pointing at a non-existent target is still a valid node — the
        // entry IS the symlink itself).
        //
        // Confine the existence probe: an interior `..` (leading `/`/`./`
        // are stripped above but not `foo/../../bar`) would otherwise let a
        // crafted layer stat paths outside the extracted dir. Such an entry
        // can't legitimately exist in-tree anyway — skip it.
        let Ok(host_path) = confined_join(extracted_root, normalized) else {
            continue;
        };
        if std::fs::symlink_metadata(&host_path).is_err() {
            continue;
        }
        // `mode` was masked to 12 bits above, so the narrowing is lossless.
        entries_by_path.insert(normalized.to_owned(), (uid, gid, mode as u16));
    }
    Ok(entries_by_path)
}

/// Recursively applies and strips OCI whiteout markers under `root`.
///
/// For every `.wh.<name>` file, the sibling `<name>` (file or directory
/// tree) is removed and then the marker itself is deleted. The opaque marker
/// `.wh..wh..opq` is deleted without touching its siblings.
///
/// The target name is taken by stripping the `.wh.` prefix exactly once, and
/// a marker whose target would be empty, `.` or `..` (i.e. a file literally
/// named `.wh.`, `.wh..` or `.wh...`) is treated as having no target: only
/// the marker is removed. Without that guard such a name would resolve to the
/// containing directory or its parent and wipe it.
///
/// A `root` that no longer exists is not an error (see the comment on the
/// `NotFound` arm).
///
/// # Errors
/// Fails when a directory cannot be listed, an entry cannot be stat'ed, a
/// whiteout target cannot be removed, or a marker cannot be deleted.
pub(super) fn remove_oci_whiteouts(root: &Path) -> Result<(), String> {
    let entries = match std::fs::read_dir(root) {
        Ok(e) => e,
        // The directory was removed by a `.wh.<dir>` whiteout processed earlier
        // in the parent's (arbitrary-order) readdir, before we recursed into the
        // now-stale entry. That's a successful deletion, not an error — making
        // whiteout processing order-independent.
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(()),
        Err(e) => return Err(format!("read layer dir {}: {e}", root.display())),
    };
    for entry in entries {
        let entry = entry.map_err(|e| format!("read layer dir entry {}: {e}", root.display()))?;
        let path = entry.path();
        let file_type = entry
            .file_type()
            .map_err(|e| format!("stat layer path {}: {e}", path.display()))?;
        if file_type.is_dir() {
            remove_oci_whiteouts(&path)?;
        }
        let name = entry.file_name();
        let Some(name) = name.to_str() else {
            continue;
        };
        // Strip the prefix once: `.wh.<name>` whites out exactly `<name>`.
        let Some(target_name) = name.strip_prefix(".wh.") else {
            continue;
        };
        let is_opaque = name == ".wh..wh..opq";
        // An empty / `.` / `..` target would make `with_file_name` point at
        // this directory or its parent; never follow that.
        let has_valid_target = !matches!(target_name, "" | "." | "..");
        if !is_opaque && has_valid_target {
            let target = path.with_file_name(target_name);
            match std::fs::remove_dir_all(&target) {
                Ok(()) => {}
                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
                // Not removable as a directory tree — retry as a plain file
                // and report both failures if that does not work either.
                Err(dir_err) => {
                    std::fs::remove_file(&target).map_err(|e2| {
                        format!(
                            "remove whiteout target {}: {dir_err}; {e2}",
                            target.display()
                        )
                    })?;
                }
            }
        }
        // The marker itself must never reach the squashfs.
        std::fs::remove_file(&path)
            .map_err(|e| format!("remove OCI whiteout {}: {e}", path.display()))?;
    }
    Ok(())
}

/// Builds the `<sha>.squashfs` cache file for every layer of `image` that is
/// not in the layer cache yet, using one scoped worker thread per such layer.
///
/// Returns immediately (no image save, no threads) when
/// `layer_plan.missing_layers` is zero. Otherwise the save directory recorded
/// in `layer_plan` is reused; when the plan carries none, the image is saved
/// via `source.save` into a temporary work dir that is removed again before
/// this function returns, on success and on failure alike.
///
/// The returned `LayerMaterialization` reports how many layers were built and
/// how many were reused (already cached, or installed by someone else while
/// this call was running), plus the elapsed time.
///
/// # Errors
/// - the temporary work dir or the image save fails;
/// - the layer SHAs enumerated from the save dir differ from
///   `layer_plan.layer_shas`;
/// - a layer blob is missing from the save, or extraction, whiteout
///   stripping, ownership recovery, squashfs writing or the final rename
///   fails for a layer;
/// - a worker thread panics.
///
/// All workers are joined before any error is returned; the first error in
/// layer order wins.
pub(super) fn materialize_missing_layers(
    image: &str,
    layer_plan: &LayerPlan,
    source: &dyn ImageSource,
) -> Result<LayerMaterialization, String> {
    let t0 = Instant::now();
    if layer_plan.missing_layers == 0 {
        return Ok(LayerMaterialization {
            materialize_ms: elapsed_ms(t0),
            built_layers: 0,
            reused_layers: layer_plan.cached_layers,
        });
    }

    // `owned_work_dir` is set only when this call created the save itself;
    // it is what gets cleaned up at the end.
    let mut owned_work_dir = None;
    let save_dir = if let Some(save_dir) = layer_plan.save_dir.as_deref() {
        save_dir.to_path_buf()
    } else {
        let work_dir = temp_work_dir("supermachine-layer-materialize")?;
        let save_dir = source.save(image, &work_dir)?;
        owned_work_dir = Some(work_dir);
        save_dir
    };
    let arch = source.arch().to_owned();
    // The real work runs inside a closure so that every early `?` return
    // still falls through to the work-dir cleanup below.
    let result = (|| {
        // Guard against the image having changed between planning and now.
        let saved_shas = layer_shas_from_save_dir(image, &save_dir, &arch)?;
        if saved_shas != layer_plan.layer_shas {
            return Err(format!(
                "image layer set changed while materializing {image}; retry the command"
            ));
        }

        // 0.7.44+ parallel layer materialization. Each layer is
        // independent (distinct sha → distinct cache file, distinct
        // tmp paths via TEMP_COUNTER). The serial pre-0.7.44 path
        // was a measurable cold-pull bottleneck on multi-layer
        // images (rust:1-slim ≈ 5 layers, python:slim ≈ 4 layers).
        //
        // Each worker thread does: tar extract → whiteout strip →
        // ownership recovery → in-process squashfs (backhand) → atomic
        // rename. backhand's squashfs write is single-threaded PER layer,
        // so the cross-layer concurrency here is exactly what keeps all
        // cores busy: N layers compress on N threads. A single giant
        // layer is the one case with no intra-layer parallelism.
        //
        // Already-built layers (cache hits) short-circuit without
        // spawning a thread — common case where the user re-bakes
        // an image with the same base layers.
        use std::sync::atomic::AtomicUsize;
        let built_layers = AtomicUsize::new(0);
        let reused_layers = AtomicUsize::new(0);
        let layer_results: Vec<Result<(), String>> = std::thread::scope(|s| {
            let mut handles = Vec::with_capacity(layer_plan.layer_shas.len());
            for sha in &layer_plan.layer_shas {
                let layer_squashfs = layer_plan.cache_dir.join(format!("{sha}.squashfs"));
                if layer_squashfs.is_file() {
                    reused_layers.fetch_add(1, Ordering::Relaxed);
                    continue;
                }
                let blob = save_dir.join("blobs/sha256").join(sha);
                if !blob.is_file() {
                    // Capture as a "result" to avoid breaking the
                    // scope. Other in-flight threads will finish;
                    // we propagate the first error after join.
                    handles.push(s.spawn(move || -> Result<(), String> {
                        Err(format!("layer blob {} missing from image save", sha))
                    }));
                    continue;
                }
                let built_ref = &built_layers;
                let reused_ref = &reused_layers;
                let cache_dir = &layer_plan.cache_dir;
                let blob = blob.clone();
                let layer_squashfs = layer_squashfs.clone();
                let sha_owned = sha.clone();
                handles.push(s.spawn(move || -> Result<(), String> {
                    // Per-worker scratch paths: pid + counter keep them
                    // distinct from other workers' paths.
                    let unique = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed);
                    let layer_extract = cache_dir.join(format!(
                        "{sha_owned}.extract.{}.{}",
                        std::process::id(),
                        unique
                    ));
                    let tmp_squashfs = cache_dir.join(format!(
                        ".{sha_owned}.squashfs.{}.{}.tmp",
                        std::process::id(),
                        unique
                    ));
                    // Best-effort removal of leftovers at these exact paths.
                    let _ = std::fs::remove_dir_all(&layer_extract);
                    let _ = std::fs::remove_file(&tmp_squashfs);
                    std::fs::create_dir_all(&layer_extract).map_err(|e| {
                        format!("create layer extract {}: {e}", layer_extract.display())
                    })?;

                    // `Ok(true)` = this worker installed the layer;
                    // `Ok(false)` = the cache file showed up in the meantime.
                    let built: Result<bool, String> = (|| {
                        extract_layer_tar(&blob, &layer_extract)?;
                        remove_oci_whiteouts(&layer_extract)?;

                        if layer_squashfs.is_file() {
                            return Ok(false);
                        }

                        // Walk the layer tar's headers a second time to
                        // recover original uid/gid for the rare entries the
                        // image marks as non-root. The common case (every
                        // system file = uid 0 — `/etc`, `/usr`, `/bin`, …) is
                        // the default, so the override map only carries the
                        // non-root paths; backhand applies the ownership
                        // directly to each node (no pseudo-file / `-all-root`
                        // dance). See `recover_layer_ownership`.
                        let overrides = recover_layer_ownership(&blob, &layer_extract)?;
                        squashfs::write_squashfs(
                            &layer_extract,
                            &tmp_squashfs,
                            &squashfs::Ownership::OciLayer(overrides),
                        )
                        .map_err(|e| format!("squashfs layer {sha_owned}: {e}"))?;

                        // Re-check after the (slow) write: if the final file
                        // exists by now, discard ours instead of replacing it.
                        if layer_squashfs.is_file() {
                            let _ = std::fs::remove_file(&tmp_squashfs);
                            return Ok(false);
                        }
                        std::fs::rename(&tmp_squashfs, &layer_squashfs).map_err(|e| {
                            format!(
                                "install layer squashfs {} -> {}: {e}",
                                tmp_squashfs.display(),
                                layer_squashfs.display()
                            )
                        })?;
                        Ok(true)
                    })();
                    // The extract dir is scratch either way; the tmp squashfs
                    // only lingers on failure (on success it was renamed or
                    // already removed).
                    let _ = std::fs::remove_dir_all(&layer_extract);
                    if built.is_err() {
                        let _ = std::fs::remove_file(&tmp_squashfs);
                    }
                    match built? {
                        true => built_ref.fetch_add(1, Ordering::Relaxed),
                        false => reused_ref.fetch_add(1, Ordering::Relaxed),
                    };
                    Ok(())
                }));
            }
            // Join in spawn order; a panicked worker becomes an error result.
            handles
                .into_iter()
                .map(|h| {
                    h.join()
                        .unwrap_or_else(|_| Err("layer thread panicked".to_owned()))
                })
                .collect()
        });
        for r in layer_results {
            r?;
        }
        let built_layers = built_layers.load(Ordering::Relaxed);
        let reused_layers = reused_layers.load(Ordering::Relaxed);

        Ok(LayerMaterialization {
            materialize_ms: elapsed_ms(t0),
            built_layers,
            reused_layers,
        })
    })();
    // Only remove a work dir this call created; a plan-provided save dir is
    // left alone.
    if let Some(work_dir) = owned_work_dir {
        let _ = std::fs::remove_dir_all(work_dir);
    }
    result
}

#[cfg(test)]
mod whiteout_tests {
    use super::*;
    use std::fs;
    use std::sync::atomic::{AtomicU64, Ordering};

    /// Creates a fresh, empty directory under the system temp dir whose name
    /// is unique per process and per call.
    fn scratch_dir(label: &str) -> std::path::PathBuf {
        static SEQ: AtomicU64 = AtomicU64::new(0);
        let pid = std::process::id();
        let seq = SEQ.fetch_add(1, Ordering::Relaxed);
        let dir = std::env::temp_dir().join(format!("sm-wh-{label}-{pid}-{seq}"));
        let _ = fs::remove_dir_all(&dir);
        fs::create_dir_all(&dir).unwrap();
        dir
    }

    /// Writes a file with the given contents, panicking on failure.
    fn put(path: std::path::PathBuf, contents: &[u8]) {
        fs::write(path, contents).unwrap();
    }

    /// Best-effort removal of a scratch directory.
    fn discard(dir: &std::path::Path) {
        let _ = fs::remove_dir_all(dir);
    }

    #[test]
    fn removes_target_and_marker_keeps_siblings() {
        let root = scratch_dir("basic");
        put(root.join("keep.txt"), b"k");
        put(root.join("gone.txt"), b"g");
        put(root.join(".wh.gone.txt"), b"");

        remove_oci_whiteouts(&root).unwrap();

        assert!(root.join("keep.txt").exists());
        assert!(!root.join("gone.txt").exists(), "whiteout target removed");
        assert!(!root.join(".wh.gone.txt").exists(), "marker removed");
        discard(&root);
    }

    #[test]
    fn directory_whiteout_coexisting_with_target_is_order_independent() {
        // A populated `d/` sits next to its own `.wh.d` marker — the usual
        // shape when an upper layer deletes a lower layer's directory. The
        // OS is free to list them in either order, and both orders must end
        // with `d` gone, the marker gone, and no error from descending into
        // a directory that was just deleted. Repeating the scenario gives
        // the (arbitrary) readdir order a few chances to vary.
        for _attempt in 0..8 {
            let root = scratch_dir("dir");
            fs::create_dir_all(root.join("d/sub")).unwrap();
            put(root.join("d/sub/f"), b"x");
            put(root.join(".wh.d"), b"");

            remove_oci_whiteouts(&root).expect("must not error on either order");

            assert!(!root.join("d").exists(), "directory target removed");
            assert!(!root.join(".wh.d").exists(), "marker removed");
            discard(&root);
        }
    }

    #[test]
    fn opaque_marker_removed_without_deleting_siblings() {
        let root = scratch_dir("opq");
        fs::create_dir_all(root.join("d")).unwrap();
        put(root.join("d/keep"), b"x");
        put(root.join("d/.wh..wh..opq"), b"");

        remove_oci_whiteouts(&root).unwrap();

        assert!(
            root.join("d/keep").exists(),
            "opaque marker must not delete siblings"
        );
        assert!(
            !root.join("d/.wh..wh..opq").exists(),
            "opaque marker itself removed"
        );
        discard(&root);
    }

    #[test]
    fn nested_whiteouts_are_processed() {
        let root = scratch_dir("nested");
        fs::create_dir_all(root.join("a/b")).unwrap();
        put(root.join("a/b/gone"), b"x");
        put(root.join("a/b/.wh.gone"), b"");

        remove_oci_whiteouts(&root).unwrap();

        assert!(!root.join("a/b/gone").exists());
        assert!(!root.join("a/b/.wh.gone").exists());
        assert!(root.join("a/b").exists(), "the containing dir stays");
        discard(&root);
    }

    #[test]
    fn whiteout_for_absent_target_is_tolerated() {
        // The marker names something that is not there at all; the only
        // expected effect is that the marker disappears without an error.
        let root = scratch_dir("absent");
        put(root.join(".wh.ghost"), b"");

        remove_oci_whiteouts(&root).unwrap();

        assert!(!root.join(".wh.ghost").exists());
        discard(&root);
    }
}