supermachine 0.5.0

//! High-level public API: [`Image`], [`Vm`], [`VmConfig`], [`Error`].
//!
//! These types wrap the lower-level [`crate::vmm`] primitives
//! (`WarmPool`, `VmResources`, …) into a small, stable surface
//! for embedders: load an image, start a VM, talk to its guest,
//! stop. The lower-level types remain available under
//! `#[doc(hidden)]` for the CLI / router / bench crates that
//! pre-date the narrowing.

use std::collections::VecDeque;
use std::io::{Read, Write};
use std::net::{SocketAddr, TcpListener, TcpStream};
use std::os::unix::net::UnixStream;
use std::path::{Path, PathBuf};
use std::process::{Child, Command, Stdio};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Condvar, Mutex};
use std::thread::JoinHandle;
use std::time::{Duration, Instant};

use crate::assets::AssetPaths;
use crate::vmm::pool::{PoolClientError, WarmPool, WarmPoolError};
use crate::vmm::resources::VmResources;
use crate::vmm::runner::RunOptions;

/// All errors the high-level API can return. Designed to be
/// `match`able: the variants name *what failed*, not which
/// internal type produced it.
///
/// Most variants carry a human-readable `msg` plus an optional
/// typed `source: Option<Box<dyn std::error::Error + Send + Sync>>`
/// so callers can downcast to the underlying error
/// (`io::Error`, `WarmPoolError`, `PoolClientError`, ...) when
/// they need typed handling. `std::error::Error::source()` walks
/// this chain so `?` propagation preserves it through downstream
/// `Box<dyn Error>` conversions.
///
/// Exceptions to that shape:
///
/// - [`Error::Io`] wraps the `io::Error` directly and has no `msg`.
/// - [`Error::CacheMiss`], [`Error::CacheInvalid`] and
///   [`Error::PoolExhausted`] carry only a `msg` (no `source`).
/// - [`Error::ImageNotFound`] and [`Error::RegistryAuth`]
///   additionally carry the `image` reference.
///
/// `#[non_exhaustive]` so future versions can add variants without
/// breaking exhaustive matches in consumer code.
#[non_exhaustive]
pub enum Error {
    /// The image / snapshot couldn't be loaded — bad path, bad
    /// magic bytes, or version mismatch.
    Image {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// VM start / restore failed. Includes `WarmPool` setup errors,
    /// HVF entitlement issues, missing assets, and pool-spawn
    /// failures. Downcast `source` to the typed cause where
    /// applicable.
    Vm {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// The configured assets (kernel, init shim) couldn't be
    /// located. Set [`VmConfig::with_assets`] explicitly to
    /// override auto-discovery.
    Assets {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// I/O on a vsock socket / file. The original `io::Error` is
    /// the variant payload — match `Error::Io(e)` and inspect
    /// `e.kind()` for typed handling.
    Io(std::io::Error),
    /// Registry pull failed — image manifest fetch, layer download,
    /// or auth handshake. Surface message includes the registry
    /// HTTP status / response body where available.
    Network {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// [`PullPolicy::Never`] was set but no usable cache exists.
    /// Switch to [`PullPolicy::Missing`] (the default) to allow a
    /// pull, or pre-bake via the `supermachine` CLI.
    CacheMiss {
        msg: String,
    },
    /// A cached snapshot was found but isn't loadable on this
    /// binary — runtime SHA mismatch, snapshot format version
    /// mismatch, or corrupt/missing layer files. The error message
    /// names the specific reason. With [`PullPolicy::Missing`] /
    /// [`PullPolicy::Always`] the library auto-rebakes; only
    /// [`PullPolicy::Never`] surfaces this.
    CacheInvalid {
        msg: String,
    },
    /// The bake step itself failed — snapshot capture timed out,
    /// the workload didn't bind a port within the readiness window,
    /// or the worker exited mid-bake. See `bake.log` in the
    /// snapshot dir for details.
    Bake {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// Registry returned 404 / "manifest unknown" for the
    /// requested image+tag — typo, deleted upstream, or wrong
    /// registry. The `image` field holds the full `host/repo:tag`
    /// reference as the user passed it, so callers can do
    /// `match e { Error::ImageNotFound { image, .. } => ... }`
    /// without re-parsing the string.
    ImageNotFound {
        image: String,
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// Registry returned 401 / 403 — bad credentials, expired
    /// token, or repo requires login. Set creds via
    /// `~/.docker/config.json` or `--registry-auth USER:PASS`.
    RegistryAuth {
        // NOTE(review): presumably the same full `host/repo:tag`
        // reference as `ImageNotFound::image` — confirm at the
        // construction sites.
        image: String,
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// Registry HTTP request failed at the network layer — DNS
    /// failure, connection refused, TLS handshake error, timeout
    /// on the wire. Distinct from [`Error::ImageNotFound`] /
    /// [`Error::RegistryAuth`] which surface registry-side errors
    /// after a successful connection.
    RegistryUnreachable {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// Pool has reached its `max` worker count and the
    /// `acquire_timeout` elapsed before any peer dropped its
    /// `PooledVm`. Increase `max`, increase `acquire_timeout`,
    /// or retry. Carries the elapsed timeout + max in the
    /// message for diagnostics.
    PoolExhausted {
        msg: String,
    },
}

// Constructor helpers that callers in this crate use. Keeping
// the public surface field-style (struct variants) means new
// fields are non-breaking; the callers below all funnel through
// these so we can evolve the construction shape later without
// touching every call site.
//
// Public API is the variants themselves; these are pub(crate).
impl Error {
    pub(crate) fn image_msg(msg: impl Into<String>) -> Self {
        Error::Image { msg: msg.into(), source: None }
    }
    pub(crate) fn vm_msg(msg: impl Into<String>) -> Self {
        Error::Vm { msg: msg.into(), source: None }
    }
    pub(crate) fn assets_msg(msg: impl Into<String>) -> Self {
        Error::Assets { msg: msg.into(), source: None }
    }
    pub(crate) fn network_msg(msg: impl Into<String>) -> Self {
        Error::Network { msg: msg.into(), source: None }
    }
    pub(crate) fn bake_msg(msg: impl Into<String>) -> Self {
        Error::Bake { msg: msg.into(), source: None }
    }
    pub(crate) fn cache_miss(msg: impl Into<String>) -> Self {
        Error::CacheMiss { msg: msg.into() }
    }
    pub(crate) fn cache_invalid(msg: impl Into<String>) -> Self {
        Error::CacheInvalid { msg: msg.into() }
    }
    pub(crate) fn image_not_found(image: impl Into<String>, msg: impl Into<String>) -> Self {
        Error::ImageNotFound { image: image.into(), msg: msg.into(), source: None }
    }
    pub(crate) fn registry_auth(image: impl Into<String>, msg: impl Into<String>) -> Self {
        Error::RegistryAuth { image: image.into(), msg: msg.into(), source: None }
    }
    pub(crate) fn registry_unreachable(msg: impl Into<String>) -> Self {
        Error::RegistryUnreachable { msg: msg.into(), source: None }
    }
    pub(crate) fn pool_exhausted(msg: impl Into<String>) -> Self {
        Error::PoolExhausted { msg: msg.into() }
    }

    /// True when the message looks like the agent-probe symptom
    /// of a multi-vCPU restore RCU-stall (probe timed out / EAGAIN
    /// in the underlying vsock read). Heuristic — used to
    /// downgrade probe failures to a warning when the snapshot
    /// was baked with vcpus > 1. Definitive stale-agent signals
    /// (the agent ack with an old protocol number) report a
    /// different message and are NOT covered.
    pub(crate) fn is_likely_multi_vcpu_restore_stall(&self) -> bool {
        let Error::Vm { msg, .. } = self else {
            return false;
        };
        // The probe's two failure modes that are indistinguishable
        // from a multi-vCPU RCU-stall:
        //   1. send_control_with_ack returns io::ErrorKind::Other
        //      with the underlying socket error string (most often
        //      "Resource temporarily unavailable" / EAGAIN, or a
        //      timeout phrase).
        //   2. The probe wrapper turned that into the
        //      "agent in this snapshot is from an older …" string;
        //      check for the embedded "(probe failed: …)".
        msg.contains("Resource temporarily unavailable")
            || msg.contains("os error 35")
            || msg.contains("os error 60") // ETIMEDOUT
            || msg.contains("timed out")
            || (msg.contains("probe failed") && !msg.contains("speaks protocol v"))
    }
}

impl std::fmt::Debug for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::Image { msg, source } => f
                .debug_struct("Image")
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::Vm { msg, source } => f
                .debug_struct("Vm")
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::Assets { msg, source } => f
                .debug_struct("Assets")
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::Io(e) => f.debug_tuple("Io").field(e).finish(),
            Error::Network { msg, source } => f
                .debug_struct("Network")
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::CacheMiss { msg } => f.debug_struct("CacheMiss").field("msg", msg).finish(),
            Error::CacheInvalid { msg } => {
                f.debug_struct("CacheInvalid").field("msg", msg).finish()
            }
            Error::Bake { msg, source } => f
                .debug_struct("Bake")
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::ImageNotFound { image, msg, source } => f
                .debug_struct("ImageNotFound")
                .field("image", image)
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::RegistryAuth { image, msg, source } => f
                .debug_struct("RegistryAuth")
                .field("image", image)
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::RegistryUnreachable { msg, source } => f
                .debug_struct("RegistryUnreachable")
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::PoolExhausted { msg } => {
                f.debug_struct("PoolExhausted").field("msg", msg).finish()
            }
        }
    }
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::Image { msg, .. } => write!(f, "image: {msg}"),
            Error::Vm { msg, .. } => write!(f, "vm: {msg}"),
            Error::Assets { msg, .. } => write!(f, "assets: {msg}"),
            Error::Io(e) => write!(f, "io: {e}"),
            Error::Network { msg, .. } => write!(f, "network: {msg}"),
            Error::CacheMiss { msg } => write!(f, "cache miss: {msg}"),
            Error::CacheInvalid { msg } => write!(f, "cache invalid: {msg}"),
            Error::Bake { msg, .. } => write!(f, "bake: {msg}"),
            Error::ImageNotFound { image, msg, .. } => {
                write!(f, "image not found ({image}): {msg}")
            }
            Error::RegistryAuth { image, msg, .. } => {
                write!(f, "registry auth failed for {image}: {msg}")
            }
            Error::RegistryUnreachable { msg, .. } => {
                write!(f, "registry unreachable: {msg}")
            }
            Error::PoolExhausted { msg } => write!(f, "pool exhausted: {msg}"),
        }
    }
}

impl std::error::Error for Error {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Error::Image { source, .. }
            | Error::Vm { source, .. }
            | Error::Assets { source, .. }
            | Error::Network { source, .. }
            | Error::Bake { source, .. }
            | Error::ImageNotFound { source, .. }
            | Error::RegistryAuth { source, .. }
            | Error::RegistryUnreachable { source, .. } => {
                source.as_ref().map(|s| s.as_ref() as &(dyn std::error::Error + 'static))
            }
            Error::Io(e) => Some(e),
            Error::CacheMiss { .. }
            | Error::CacheInvalid { .. }
            | Error::PoolExhausted { .. } => None,
        }
    }
}

impl From<std::io::Error> for Error {
    fn from(e: std::io::Error) -> Self {
        Error::Io(e)
    }
}

impl From<WarmPoolError> for Error {
    fn from(e: WarmPoolError) -> Self {
        Error::Vm {
            msg: e.to_string(),
            source: Some(Box::new(e)),
        }
    }
}

impl From<PoolClientError> for Error {
    fn from(e: PoolClientError) -> Self {
        Error::Vm {
            msg: e.to_string(),
            source: Some(Box::new(e)),
        }
    }
}

/// How [`Image::from_oci`] decides whether to talk to the registry
/// or use a locally-cached snapshot. Same semantics as Docker's
/// `--pull` flag.
///
/// **The default is [`PullPolicy::Missing`]** — use the cache if
/// it exists; pull only if absent. Right for pinned tags or digest
/// references. For `:latest`-style mutable tags use
/// [`PullPolicy::Always`].
///
/// `Default` is derived; the `#[default]` attribute on
/// [`PullPolicy::Missing`] is the single source of truth for which
/// variant `PullPolicy::default()` returns.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
#[non_exhaustive]
pub enum PullPolicy {
    /// Pull manifest from the registry every time; rebake if the
    /// digest changed since the last bake. Right for `:latest`-style
    /// mutable tags.
    Always,
    /// Use the cached snapshot if it exists locally and is valid.
    /// Don't talk to the registry at all unless the cache is
    /// missing or invalid. **The default.**
    #[default]
    Missing,
    /// Use the cache or fail. Never pull. Right for offline /
    /// air-gapped environments.
    Never,
}

impl PullPolicy {
    /// String form the underlying `bake` pipeline accepts. Mirror
    /// of the CLI's `--pull` argument values.
    fn as_bake_str(self) -> &'static str {
        match self {
            Self::Always => "always",
            Self::Missing => "missing",
            Self::Never => "never",
        }
    }
}

/// A baked OCI image: its restore snapshot plus the metadata
/// describing which kernel + virtio-blk layers it needs. Cheap
/// to clone.
///
/// Two ways to construct one:
///
/// - [`Image::from_oci`] — pull (or reuse cache) from a registry,
///   bake into a snapshot, return the resulting image. The
///   high-level "I have an image reference" entry point.
/// - [`Image::from_snapshot`] — load an already-baked snapshot
///   directory directly. Useful when you want to keep snapshots
///   under your own management or share one across processes.
#[derive(Debug, Clone)]
pub struct Image {
    // NOTE(review): presumably the on-disk location of the restore
    // snapshot this image was loaded from — confirm whether this is
    // the snapshot file or its containing directory.
    snapshot_path: PathBuf,
    /// Default memory; can be overridden via [`VmConfig::with_memory_mib`].
    pub(crate) memory_mib: u32,
    /// Default vCPUs; can be overridden via [`VmConfig::with_vcpus`].
    pub(crate) vcpus: u32,
    /// 16-hex-prefix of the supermachine-worker SHA256 that baked
    /// this snapshot, parsed from `metadata.json["runtime_sha16"]`.
    /// Used to skip the agent-protocol probe when a fresh-bake
    /// invariant holds (snapshot baked by the same lib version
    /// the host is now running). `None` means metadata didn't
    /// record one — older snapshots, third-party bakes, etc; the
    /// probe runs unconditionally in that case.
    pub(crate) baker_runtime_sha16: Option<String>,
    /// `metadata.json["balloon_target_pages"]` — bake-time
    /// recommended balloon inflation target for idle workers.
    /// Plumbed through SpawnConfig → worker CLI → runner so each
    /// pool worker requests inflate after restore. None disables
    /// ballooning. Defaults to 75% of `memory_mib` in 4 KiB
    /// pages (set by bake.rs).
    pub(crate) balloon_target_pages: Option<u32>,
    /// virtio-blk layer file paths in the order the bake step
    /// produced them. The microVM needs all of them attached at
    /// restore time (the OverlayFS in the guest is mounted on top).
    pub(crate) layers: Vec<PathBuf>,
    /// Optional per-image delta layer applied after `layers`.
    pub(crate) delta_squashfs: Option<PathBuf>,
    /// virtio-fs DAX mounts persisted in metadata.json. The bake
    /// captures the snapshot with these mounts wired into the FDT,
    /// so restore must re-create the same VirtioFs devices at the
    /// same MMIO addresses for the guest kernel to find them.
    pub(crate) mounts: Vec<crate::vmm::resources::MountSpec>,
    /// Bundled kernel path, if the snapshot dir ships one alongside.
    /// Lets a self-contained bundle (e.g. `MyApp.app/Contents/
    /// Resources/<image>/kernel`) start a VM without requiring the
    /// embedder's host to have supermachine assets installed
    /// system-wide. `None` means [`Vm::start`] falls back to
    /// [`AssetPaths::discover`].
    pub(crate) bundled_kernel: Option<PathBuf>,
    /// Hidden warm pool, lazy-initialized on first
    /// [`Image::acquire`]. The pool holds a single long-lived
    /// `WarmPool` (in-process worker + the snapshot mmap'd in)
    /// behind a mutex; each `acquire` calls `restore` to reset
    /// the worker to clean snapshot state, returns a [`PooledVm`]
    /// that holds the lock, and re-enters the pool on `Drop`.
    /// Per-acquire cost is just the snapshot restore (~5 ms),
    /// not the full VM spawn (~50–100 ms).
    ///
    /// Wrapped in `Arc` so cloning an `Image` shares the same
    /// pool instance — useful when multiple parts of an app hold
    /// `Image` references but should share a single warm worker.
    ///
    /// NOTE(review): the description above (one in-process
    /// `WarmPool` behind a mutex) does not match the
    /// [`HiddenPool`] type docs, which describe a pool of
    /// `supermachine-worker` subprocesses — one of the two looks
    /// stale; confirm which.
    ///
    /// NOTE(review): the `Arc` lives *inside* the `OnceLock`, and
    /// `#[derive(Clone)]` clones the cell by value. A clone taken
    /// after the pool is initialized shares it; a clone taken
    /// before first use presumably gets its own empty cell and
    /// would lazily create a separate pool — confirm that is
    /// intended.
    pub(crate) hidden_pool: std::sync::OnceLock<Arc<HiddenPool>>,
    /// Warm handoff: the bake-time worker, kept alive after the
    /// pipelined bake completes (`Image::builder().with_warmup(…)`)
    /// so the FIRST `Pool::acquire()` can use it directly instead
    /// of paying ~50 ms spawn + ~5 ms restore for a fresh worker.
    /// Subsequent acquires fall through to the normal spawn-from-
    /// disk path.
    ///
    /// Atomic claim semantics: the consumer (`PoolBuilder::build` or
    /// `HiddenPool::ensure_min_workers`) calls
    /// `warm_baked_worker.lock().take()`. Only one party gets
    /// `Some`; everyone else gets `None` and falls through cleanly.
    /// This also guards against multi-pool-from-same-Image: only
    /// the first pool gets the warm worker.
    ///
    /// Drop semantics: if the Image is dropped without anyone
    /// claiming, `Drop for Image` (below) takes the value and
    /// shuts down the worker via QUIT (drains in-flight saves)
    /// + `child.wait()` + socks_dir cleanup. The bake driver
    /// already drained the bg base save before returning the
    /// BakedWorker, so QUIT is a fast (~10 ms) drain-noop in this
    /// case — no `.partial` leaks.
    ///
    /// Held in an `Arc`, so every clone of this `Image` refers to
    /// the same stash.
    pub(crate) warm_baked_worker: Arc<crate::bake::WarmStash>,
}

impl Image {
    /// Test-only peephole: reports whether a warm-handoff worker
    /// is currently sitting in the stash. The warm-handoff
    /// integration tests use it to check that the bake populated
    /// the stash and that the first `Pool::build()` consumed it.
    /// A poisoned stash lock reads as "not present".
    ///
    /// Not part of the public API contract — the underlying field
    /// is `pub(crate)` and may move, hence `#[doc(hidden)]`.
    #[doc(hidden)]
    pub fn _warm_handoff_present(&self) -> bool {
        match self.warm_baked_worker.inner.lock() {
            Ok(stash) => stash.is_some(),
            Err(_) => false,
        }
    }
    /// Test-only: PID of the stashed warm worker, or `None` when
    /// the stash is empty or its lock is poisoned. The R1
    /// integration test (Image-dropped-without-claim) records it
    /// so it can `kill -0` the pid after dropping the Image and
    /// confirm the worker was reaped. Not stable.
    #[doc(hidden)]
    pub fn _warm_handoff_pid(&self) -> Option<u32> {
        match self.warm_baked_worker.inner.lock() {
            Ok(stash) => stash.as_ref().map(|baked| baked.child.id()),
            Err(_) => None,
        }
    }
}

/// Internal state for the hidden subprocess pool an [`Image`]
/// manages for [`Image::acquire`] users. Spawns N
/// `supermachine-worker` subprocesses up front, each pre-restored
/// from the snapshot — so `acquire` is just "pop an idle worker
/// off the queue" (~1 ms) and N concurrent acquires really run N
/// VMs in parallel (each in its own subprocess, each its own
/// `hv_vm_create` singleton).
///
/// On `Drop`, kills every worker and unlinks every socket.
///
/// Most fields are individually `Arc`'d so the housekeeping
/// threads can hold just the pieces they need without keeping the
/// `HiddenPool` itself alive.
#[doc(hidden)]
pub struct HiddenPool {
    /// Per-worker state: idle queue + counts. Arc'd separately
    /// from the pool so housekeeping threads can hold it across
    /// condvar waits without keeping `HiddenPool` itself alive
    /// (otherwise the user-side drop never fires).
    state: Arc<Mutex<PoolState>>,
    /// Wakes `acquire()` callers blocked on an empty idle queue
    /// AND wakes housekeeping threads on shutdown. Signalled when
    /// a worker re-enters idle, when the pool is shutting down,
    /// or when a wait_timeout window in a housekeeper expires.
    available: Arc<Condvar>,
    /// Workers handed back from `PooledVm::drop` waiting to be
    /// restored to clean snapshot state before going back into
    /// the idle queue. The restorer thread drains this. Arc'd
    /// so the restorer can wait on the condvar without keeping
    /// `HiddenPool` alive.
    ///
    /// NOTE(review): presumably `None` when the pool runs without
    /// a restorer (e.g. `restore_on_release == false`) — confirm
    /// against the constructor.
    dirty: Option<Arc<Mutex<VecDeque<Worker>>>>,
    /// Wakes any restorer thread when a worker lands on dirty
    /// (or on shutdown).
    ///
    /// NOTE(review): presumably `Some` exactly when `dirty` is —
    /// confirm against the constructor.
    dirty_pending: Option<Arc<Condvar>>,
    /// Where each worker's vsock mux/exec sockets live.
    socks_dir: PathBuf,
    /// True once `Drop` has started; replenisher / restorer /
    /// janitor exit the next time around their wait loops. Arc'd
    /// so housekeeping threads can poll it after dropping their
    /// strong `HiddenPool` reference.
    shutting_down: Arc<AtomicBool>,
    /// Image-derived spawn config, copied so the pool is self-
    /// contained.
    spawn_cfg: Arc<SpawnConfig>,
    /// Static pool policy. Read from many threads; never mutated
    /// after construction so we can stash by value (no Mutex).
    policy: PoolPolicy,
}

/// Static policy for an auto-scaling pool. Set at builder time;
/// immutable thereafter.
///
/// The `Default` impl yields a lazy pool (`min = 0`) capped at 64
/// workers, with 60-second idle and acquire timeouts and
/// restore-on-release enabled.
#[derive(Debug, Clone, Copy)]
struct PoolPolicy {
    /// Always-warm baseline. The replenisher keeps `alive >= min`.
    /// `min == 0` means lazy: the first acquire spawns the first
    /// worker. Default: `0`.
    min: usize,
    /// Hard concurrency cap. Acquire blocks (with timeout) when
    /// `alive == max`. `max == usize::MAX` means uncapped.
    /// Default: `64`.
    max: usize,
    /// Idle workers above `min` that have sat unused for longer
    /// than this get evicted by the janitor. `Duration::MAX`
    /// disables eviction (fixed-size pool). Default: 60 seconds.
    idle_timeout: Duration,
    /// Caller's `acquire()` blocks at most this long when the
    /// pool is at `max` and no worker is idle. After that the
    /// call returns `Error::PoolExhausted`. `None` = block forever.
    /// Default: `Some(60 seconds)`.
    acquire_timeout: Option<Duration>,
    /// When `true` (default), `PooledVm::drop` queues the worker
    /// for restoration via the supervisor RESTORE RPC before
    /// it goes back to idle. Each cycle starts with a clean
    /// snapshot-state guest. Costs ~3 ms per cycle (off the
    /// critical path if the pool has a buddy slot).
    ///
    /// When `false`, drop pushes the worker DIRECTLY back to
    /// idle without restoring. The next acquire gets the same
    /// guest in whatever state the previous user left it. The
    /// per-cycle restore cost vanishes, AND the guest's page
    /// cache stays warm — for workloads like rustc that re-read
    /// the same sysroot/deps every invocation, this is a HUGE
    /// win (300 ms cold compile → ~50–100 ms warm-cache compile).
    ///
    /// Caveats: workloads must be tolerant of leftover state
    /// in `/tmp` etc. The integrator's pattern (write_file
    /// `main.rs`, run `rustc -o /tmp/m && /tmp/m`) is safe — the
    /// `&&` short-circuits on compile failure, and outputs are
    /// always overwritten. Pair with periodic full pool drain
    /// + rebuild if you need bounded resource accumulation over
    /// long runs.
    restore_on_release: bool,
}

impl Default for PoolPolicy {
    fn default() -> Self {
        Self {
            min: 0,
            max: 64,
            idle_timeout: Duration::from_secs(60),
            acquire_timeout: Some(Duration::from_secs(60)),
            restore_on_release: true,
        }
    }
}

/// Per-Pool internal state under one mutex. Held only briefly
/// during acquire/release; the long-running work (spawn /
/// restore / kill) happens outside the lock.
struct PoolState {
    /// Idle workers ready for the next `acquire()`. **LIFO** —
    /// the most recently used worker has the hottest host page
    /// cache (squashfs layer pages, kernel mmap pages), so it
    /// pays the cheapest restore on the next acquire. Both ops
    /// (push on release, pop on acquire) are at the back of the
    /// `Vec`.
    idle: Vec<IdleEntry>,
    /// Total workers in the pool right now: idle + currently
    /// checked-out + currently being spawned/restored. The
    /// auto-grow path bumps this *before* spawning so a
    /// concurrent `acquire` can see "we already promised someone
    /// a worker, don't double-spawn." Consequently this can be
    /// larger than `idle.len()` plus the number of live processes
    /// while a spawn is in flight.
    alive: usize,
    /// Currently-blocked acquire callers (for stats /
    /// observability). Incremented at the wait site, decremented
    /// when the wait returns.
    waiting: usize,
}

/// An idle worker plus the timestamp it returned to the queue,
/// used by the janitor to evict above-`min` workers that have
/// sat unused longer than `idle_timeout`.
struct IdleEntry {
    /// The parked worker subprocess handle.
    worker: Worker,
    /// When this worker went back onto the idle queue; compared
    /// against `PoolPolicy::idle_timeout` for eviction.
    last_used: Instant,
}

/// One spawned `supermachine-worker` subprocess + its vsock
/// socket paths + the lib-side end of the supervisor control
/// socket.
///
/// Workers are launched in `--pool-worker` mode (see
/// `bin/worker.rs`): each one connects back to a unix socket the
/// lib listens on, accepts text-line commands (`RESTORE <path>`,
/// `QUIT`), and writes `DONE …` after each restore. This lets
/// the same worker process serve many `acquire`/`drop` cycles
/// — each cycle is a snapshot restore (~3 ms) instead of a full
/// process spawn (~10 ms) plus a kill.
///
/// The control socket is bidirectional: lib writes commands,
/// worker writes responses. We hold separate read/write halves
/// (see `ControlChannel`) behind a `Mutex` so the restorer thread
/// (which sends RESTORE during release-handling) doesn't race
/// with the QUIT path during pool teardown.
struct Worker {
    /// Handle to the worker subprocess.
    child: Child,
    /// Path of this worker's vsock mux socket.
    vsock_mux_path: PathBuf,
    /// Path of this worker's vsock exec socket.
    vsock_exec_path: PathBuf,
    /// Path of the unix listener the lib is using to talk to
    /// this worker. Cleaned up on drop alongside the vsock paths.
    control_path: PathBuf,
    /// Live control connection. The `ControlChannel` holds both
    /// the buffered read half and the write half of the socket.
    /// Wrapped in `Mutex` so concurrent code paths (RESTORE
    /// during release vs. QUIT during shutdown) are linearized
    /// cleanly: a request and its response are exchanged under
    /// one lock acquisition.
    control: Arc<Mutex<ControlChannel>>,
    /// The snapshot file this worker was last restored from.
    /// Used as the `base` hint on cycle-snapshot RPCs so the
    /// runner can use APFS clonefile + diff pwrite instead of
    /// the plain streaming sync save (~3× speedup on the
    /// cycle-snapshot path).
    last_restore_path: PathBuf,
}

/// Stats returned from a successful subprocess snapshot RPC.
///
/// Field names mirror the `key=value` pairs on the worker's
/// `DONE_SNAPSHOT bytes_written=… capture_us=… save_us=…`
/// response line.
struct SnapshotStats {
    /// Value of the `bytes_written=` field of the response.
    bytes_written: u64,
    /// Value of the `capture_us=` field of the response.
    /// NOTE(review): presumably the capture phase duration in
    /// microseconds — confirm against the worker.
    capture_us: u64,
    /// Value of the `save_us=` field of the response.
    /// NOTE(review): presumably the save-to-disk phase duration
    /// in microseconds — confirm against the worker.
    save_us: u64,
}

/// Lib-side bookkeeping for the `--pool-worker` text protocol.
///
/// Holds two `UnixStream` handles: one wrapped in a `BufReader`
/// for line-oriented reads, one used directly for writes.
struct ControlChannel {
    /// Buffered reader so we can read line-at-a-time without
    /// over-consuming bytes. Must be long-lived: bytes buffered
    /// past one line stay here for the next read.
    reader: std::io::BufReader<std::os::unix::net::UnixStream>,
    /// Direct write half. The `BufReader` owns its own stream
    /// handle, so writes go through this separate one.
    /// NOTE(review): presumably both handles refer to the same
    /// socket (e.g. via `try_clone`) — confirm at the
    /// construction site.
    writer: std::os::unix::net::UnixStream,
}

impl ControlChannel {
    /// Write one protocol line to the worker, appending the
    /// terminating `\n` when the caller did not supply one, then
    /// flush.
    fn send_line(&mut self, line: &str) -> std::io::Result<()> {
        use std::io::Write;
        let sock = &mut self.writer;
        sock.write_all(line.as_bytes())?;
        let already_terminated = line.ends_with('\n');
        if !already_terminated {
            sock.write_all(b"\n")?;
        }
        sock.flush()
    }

    /// Read the next protocol line from the worker (line
    /// terminator included), failing with `UnexpectedEof` if the
    /// socket was closed.
    ///
    /// `SAVE_DONE <path>` / `SAVE_FAIL <path> ...` lines are
    /// skipped transparently. The worker's bg async-save thread
    /// emits them on this same supervisor channel as "I finished
    /// a save you asked for earlier" announcements; they are never
    /// the response to the request in flight, so handing one to a
    /// caller expecting (say) DONE_SNAPSHOT would break its
    /// line-oriented parse.
    ///
    /// The bake pipeline keeps its own copy of this filter
    /// (`read_supervisor_line_skip_save_notifications` in bake.rs)
    /// because the bake driver owns the BufReader directly; this
    /// one serves the `ControlChannel`-wrapped pooled path.
    fn read_line(&mut self) -> std::io::Result<String> {
        use std::io::BufRead;
        const SAVE_NOTIFICATION_TAGS: [&str; 2] = ["SAVE_DONE ", "SAVE_FAIL "];
        loop {
            let mut line = String::new();
            if self.reader.read_line(&mut line)? == 0 {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "worker control socket closed",
                ));
            }
            let body = line.trim_start();
            let is_save_notification = SAVE_NOTIFICATION_TAGS
                .iter()
                .any(|tag| body.starts_with(*tag));
            if !is_save_notification {
                return Ok(line);
            }
            tracing::debug!(line = %line.trim_end(), "skipping orthogonal bg-save notification");
        }
    }
}

/// Re-wrap a `bake::BakedWorker` — the handle the pipelined-bake
/// driver hands back when `keep_alive=true` — as an ordinary
/// `Worker` that the `HiddenPool` can park as an idle entry.
///
/// Pure handle transfer: no I/O and no liveness check (the caller
/// is expected to have done its `try_wait` already). The new
/// Worker's `last_restore_path` is the warm snapshot path, so later
/// cycle SNAPSHOT RPCs use it as the diff-via-clone `base=` hint,
/// exactly like a normal pool worker that just restored.
#[cfg(target_os = "macos")]
fn warm_baked_to_worker(bw: crate::bake::BakedWorker) -> Worker {
    // Pair the two socket halves into the pooled control channel.
    let channel = ControlChannel {
        reader: std::io::BufReader::new(bw.control_reader),
        writer: bw.control_writer,
    };
    Worker {
        child: bw.child,
        vsock_mux_path: bw.vsock_mux_path,
        vsock_exec_path: bw.vsock_exec_path,
        control_path: bw.control_path,
        control: Arc::new(Mutex::new(channel)),
        last_restore_path: bw.last_restore_path,
    }
}

impl Worker {
    /// Send a `RESTORE <path>` command and block until the
    /// worker writes `DONE …`. Returns `Ok` on success, `Err`
    /// if the protocol broke (worker crashed, socket closed,
    /// malformed response). Caller treats `Err` as "worker is
    /// unusable" and respawns.
    fn send_restore(&self, snapshot_path: &Path) -> Result<(), Error> {
        let path_str = snapshot_path
            .to_str()
            .ok_or_else(|| Error::vm_msg("snapshot path is not valid UTF-8".to_owned()))?;
        let mut ctl = self
            .control
            .lock()
            .map_err(|_| Error::vm_msg("worker control mutex poisoned".to_owned()))?;
        ctl.send_line(&format!("RESTORE {path_str}"))
            .map_err(Error::Io)?;
        let line = ctl.read_line().map_err(Error::Io)?;
        if line.starts_with("DONE") {
            Ok(())
        } else {
            Err(Error::vm_msg(format!(
                "worker RESTORE: expected DONE response, got: {}",
                line.trim()
            )))
        }
    }

    /// Send `SNAPSHOT <out_path>` and block on the response.
    /// The worker pauses the guest, captures snapshot, writes to
    /// `out_path`, then writes either `DONE_SNAPSHOT
    /// bytes_written=N capture_us=… save_us=…` or
    /// `ERR_SNAPSHOT <reason>` on the supervisor socket. We
    /// parse the response and surface it as `Result`.
    fn send_snapshot(&self, out_path: &Path) -> Result<SnapshotStats, Error> {
        let path_str = out_path
            .to_str()
            .ok_or_else(|| Error::vm_msg("snapshot path is not valid UTF-8".to_owned()))?;
        let mut ctl = self
            .control
            .lock()
            .map_err(|_| Error::vm_msg("worker control mutex poisoned".to_owned()))?;
        ctl.send_line(&format!("SNAPSHOT {path_str}"))
            .map_err(Error::Io)?;
        let line = ctl.read_line().map_err(Error::Io)?;
        if let Some(rest) = line.strip_prefix("DONE_SNAPSHOT") {
            let mut stats = SnapshotStats {
                bytes_written: 0,
                capture_us: 0,
                save_us: 0,
            };
            for kv in rest.split_ascii_whitespace() {
                if let Some(v) = kv.strip_prefix("bytes_written=") {
                    stats.bytes_written = v.parse().unwrap_or(0);
                } else if let Some(v) = kv.strip_prefix("capture_us=") {
                    stats.capture_us = v.parse().unwrap_or(0);
                } else if let Some(v) = kv.strip_prefix("save_us=") {
                    stats.save_us = v.parse().unwrap_or(0);
                }
            }
            Ok(stats)
        } else if let Some(rest) = line.strip_prefix("ERR_SNAPSHOT ") {
            Err(Error::vm_msg(format!(
                "worker SNAPSHOT failed: {}",
                rest.trim()
            )))
        } else {
            Err(Error::vm_msg(format!(
                "worker SNAPSHOT: unexpected response: {}",
                line.trim()
            )))
        }
    }

    /// Differential snapshot variant. Sends
    /// `SNAPSHOT <out_path> base=<base_path>`. The worker, if it
    /// has a matching in-flight async save to `base_path` (i.e.
    /// the snapshot is still in memory from a recent
    /// SNAPSHOT_ASYNC), uses APFS `clonefile` + diff `pwrite` to
    /// land the warm snapshot — usually ~10x faster than the
    /// plain streaming sync path.
    ///
    /// Falls back to the plain streaming sync save inside the
    /// runner on any of: no matching in-flight save, clonefile
    /// EXDEV (different filesystems), warm meta overflows base's
    /// ram_offset slack. Caller never sees this distinction; the
    /// returned `SnapshotStats` is well-formed in both cases.
    ///
    /// Used by the pipelined bake flow's warm capture.
    #[allow(dead_code)]
    fn send_snapshot_with_base(
        &self,
        out_path: &Path,
        base_path: &Path,
    ) -> Result<SnapshotStats, Error> {
        let path_str = out_path
            .to_str()
            .ok_or_else(|| Error::vm_msg("snapshot out path is not valid UTF-8".to_owned()))?;
        let base_str = base_path
            .to_str()
            .ok_or_else(|| Error::vm_msg("snapshot base path is not valid UTF-8".to_owned()))?;
        let mut ctl = self
            .control
            .lock()
            .map_err(|_| Error::vm_msg("worker control mutex poisoned".to_owned()))?;
        ctl.send_line(&format!("SNAPSHOT {path_str} base={base_str}"))
            .map_err(Error::Io)?;
        let line = ctl.read_line().map_err(Error::Io)?;
        if let Some(rest) = line.strip_prefix("DONE_SNAPSHOT") {
            let mut stats = SnapshotStats {
                bytes_written: 0,
                capture_us: 0,
                save_us: 0,
            };
            for kv in rest.split_ascii_whitespace() {
                if let Some(v) = kv.strip_prefix("bytes_written=") {
                    stats.bytes_written = v.parse().unwrap_or(0);
                } else if let Some(v) = kv.strip_prefix("capture_us=") {
                    stats.capture_us = v.parse().unwrap_or(0);
                } else if let Some(v) = kv.strip_prefix("save_us=") {
                    stats.save_us = v.parse().unwrap_or(0);
                }
            }
            Ok(stats)
        } else if let Some(rest) = line.strip_prefix("ERR_SNAPSHOT ") {
            Err(Error::vm_msg(format!(
                "worker SNAPSHOT (with base) failed: {}",
                rest.trim()
            )))
        } else {
            Err(Error::vm_msg(format!(
                "worker SNAPSHOT (with base): unexpected response: {}",
                line.trim()
            )))
        }
    }

    /// Async-save variant of [`Worker::send_snapshot`]. Sends
    /// `SNAPSHOT_ASYNC <out_path>`; the worker pauses, captures
    /// into a compact in-memory buffer, kicks off a background
    /// save thread, and returns `DONE_SNAPSHOT_ASYNC` immediately.
    /// The on-disk file appears asynchronously — drain via
    /// [`Worker::shutdown`] (which sends QUIT and waits for the
    /// worker process to exit) before relying on the file existing.
    ///
    /// Used by the pipelined bake flow.
    #[allow(dead_code)]
    fn send_snapshot_async(&self, out_path: &Path) -> Result<SnapshotStats, Error> {
        let path_str = out_path
            .to_str()
            .ok_or_else(|| Error::vm_msg("snapshot path is not valid UTF-8".to_owned()))?;
        let mut ctl = self
            .control
            .lock()
            .map_err(|_| Error::vm_msg("worker control mutex poisoned".to_owned()))?;
        ctl.send_line(&format!("SNAPSHOT_ASYNC {path_str}"))
            .map_err(Error::Io)?;
        let line = ctl.read_line().map_err(Error::Io)?;
        if let Some(rest) = line.strip_prefix("DONE_SNAPSHOT_ASYNC") {
            let mut stats = SnapshotStats {
                bytes_written: 0,
                capture_us: 0,
                save_us: 0,
            };
            for kv in rest.split_ascii_whitespace() {
                if let Some(v) = kv.strip_prefix("bytes_written=") {
                    stats.bytes_written = v.parse().unwrap_or(0);
                } else if let Some(v) = kv.strip_prefix("capture_us=") {
                    stats.capture_us = v.parse().unwrap_or(0);
                } else if let Some(v) = kv.strip_prefix("save_us=") {
                    stats.save_us = v.parse().unwrap_or(0);
                }
            }
            Ok(stats)
        } else if let Some(rest) = line.strip_prefix("ERR_SNAPSHOT ") {
            Err(Error::vm_msg(format!(
                "worker SNAPSHOT_ASYNC failed: {}",
                rest.trim()
            )))
        } else {
            Err(Error::vm_msg(format!(
                "worker SNAPSHOT_ASYNC: unexpected response: {}",
                line.trim()
            )))
        }
    }

    /// Best-effort `smpark_park` CONTROL RPC to the in-guest agent.
    /// Drives all secondary vCPUs into a known parked-WFI state via
    /// `ioctl(/dev/smpark, PARK)`. Returns `Ok(true)` on success,
    /// `Ok(false)` if the module isn't available (single-vCPU bake
    /// or smpark.ko not loaded), `Err` only on transport failure.
    /// Caller treats `Ok(false)` as "skip park, fall back to the
    /// existing rendezvous-only path".
    #[cfg_attr(not(all(target_os = "macos", target_arch = "aarch64")), allow(dead_code))]
    fn send_smpark_park(&self) -> Result<bool, Error> {
        let body = serde_json::json!({ "action": "smpark_park" });
        match crate::exec::send_control_with_ack(
            &self.vsock_exec_path,
            &body,
            Some(Duration::from_secs(5)),
        ) {
            Ok(_) => Ok(true),
            Err(e) => {
                // Agent reports `ok=false` for "module not loaded /
                // /dev/smpark not available" — surfaces as
                // io::ErrorKind::Other with the agent's message.
                // Treat any transport-level failure as "skip park"
                // so callers don't break the snapshot path on the
                // 1-vCPU common case (where smpark is a no-op the
                // module reports as such).
                tracing::debug!(error = %e, "smpark_park unavailable; skipping");
                Ok(false)
            }
        }
    }

    /// Best-effort `smpark_unpark` CONTROL RPC. Wakes the parked
    /// secondaries. Same fallback semantics as
    /// [`Worker::send_smpark_park`] — returns `Ok(false)` if the
    /// module isn't there.
    #[cfg_attr(not(all(target_os = "macos", target_arch = "aarch64")), allow(dead_code))]
    fn send_smpark_unpark(&self) -> Result<bool, Error> {
        let body = serde_json::json!({ "action": "smpark_unpark" });
        match crate::exec::send_control_with_ack(
            &self.vsock_exec_path,
            &body,
            Some(Duration::from_secs(5)),
        ) {
            Ok(_) => Ok(true),
            Err(e) => {
                tracing::debug!(error = %e, "smpark_unpark unavailable; skipping");
                Ok(false)
            }
        }
    }

    /// Best-effort QUIT. Tries the supervisor protocol first —
    /// gives the worker a chance to flush state cleanly — then
    /// falls back to SIGKILL if the worker doesn't exit within
    /// the grace window.
    fn shutdown(&mut self) {
        // Send QUIT — best effort, ignore failures (worker may
        // already be dead, mutex may be poisoned).
        if let Ok(mut ctl) = self.control.lock() {
            let _ = ctl.send_line("QUIT");
        }
        // Give the worker ~100 ms to exit cleanly. Most exits
        // happen in <10 ms; the upper bound is for slow restore-
        // cleanup paths.
        let deadline = Instant::now() + Duration::from_millis(100);
        loop {
            match self.child.try_wait() {
                Ok(Some(_)) => break,
                Ok(None) if Instant::now() < deadline => {
                    std::thread::sleep(Duration::from_millis(2));
                }
                _ => {
                    let _ = self.child.kill();
                    let _ = self.child.wait();
                    break;
                }
            }
        }
        let _ = std::fs::remove_file(&self.vsock_mux_path);
        let _ = std::fs::remove_file(&self.vsock_exec_path);
        let _ = std::fs::remove_file(&self.control_path);
        let mut h = self.vsock_mux_path.clone();
        h.set_extension("handoff");
        let _ = std::fs::remove_file(&h);
    }
}

/// Resolved + reusable spawn config for one Image's pool.
///
/// `spawn_one` turns these fields into the worker subprocess's
/// command line and the socket paths it listens on.
struct SpawnConfig {
    /// Worker executable to launch. Also hashed for the
    /// probe-skip comparison against `baker_runtime_sha16`.
    worker_bin: PathBuf,
    /// Snapshot file passed as `--restore-from` and named in the
    /// initial `RESTORE` command. May not exist yet when spawning
    /// starts; `spawn_one` polls for it up to `spawn_timeout`.
    snapshot_path: PathBuf,
    /// Each entry is passed as a `--virtio-blk` argument, in order.
    layers: Vec<PathBuf>,
    /// Optional extra `--virtio-blk` argument, passed after `layers`.
    delta_squashfs: Option<PathBuf>,
    /// virtio-fs mounts to re-create on the worker side. Snapshot's
    /// metadata.json `mounts` field, propagated through `Image`.
    mounts: Vec<crate::vmm::resources::MountSpec>,
    /// Value for the worker's `--memory` argument.
    memory_mib: u32,
    /// Value for the worker's `--vcpus` argument. Values > 1 also
    /// shorten the agent-probe timeout and downgrade a probe
    /// stall to a warning during spawn.
    vcpus: u32,
    /// Directory in which the per-worker socket files are created.
    socks_dir: PathBuf,
    /// Identifier folded into the socket file names. Just for
    /// readability when looking at /tmp.
    name_prefix: String,
    /// Honored when waiting for a freshly spawned worker's vsock
    /// socket to appear.
    spawn_timeout: Duration,
    /// `metadata.json["runtime_sha16"]` of the snapshot being
    /// restored. When this matches the current `worker_bin`'s
    /// SHA16, we know the in-guest agent shipped with this
    /// snapshot is the same agent the lib expects, so the
    /// agent-protocol probe can be skipped. ~25 ms saved per
    /// acquire.
    baker_runtime_sha16: Option<String>,
    /// `metadata.json["balloon_target_pages"]` from the snapshot
    /// — number of 4 KiB pages the host asks the guest to
    /// inflate via virtio-balloon after restore. Drops idle
    /// worker RSS from `~memory_mib` to ~25% of memory_mib on
    /// rust:1-slim and similar workloads. Plumbed through to
    /// the worker binary as `--balloon-target-pages N`.
    balloon_target_pages: Option<u32>,
}

impl SpawnConfig {
    /// Spawn ONE worker subprocess in `--pool-worker` mode and
    /// return it ready to serve. The worker handles its initial
    /// snapshot restore on boot and writes a `DONE …` line on
    /// the supervisor control socket once that completes — we
    /// block on that line so callers can treat the returned
    /// `Worker` as fully restored.
    fn spawn_one(&self) -> Result<Worker, Error> {
        use std::os::unix::net::UnixListener;

        // 64-bit hex suffix from `unique_suffix()` (nanos +
        // monotonic counter). Distinct across parallel spawns;
        // collision-free in practice.
        let suffix = unique_suffix();
        let vsock_mux_path = self
            .socks_dir
            .join(format!("{}-{:016x}.sock", self.name_prefix, suffix));
        let vsock_exec_path = {
            let mut p = vsock_mux_path.clone();
            let mut name = p.file_name().unwrap().to_owned();
            name.push("-exec");
            p.set_file_name(name);
            p
        };
        let control_path = {
            let mut p = vsock_mux_path.clone();
            let mut name = p.file_name().unwrap().to_owned();
            name.push("-ctl");
            p.set_file_name(name);
            p
        };
        let _ = std::fs::remove_file(&vsock_mux_path);
        let _ = std::fs::remove_file(&vsock_exec_path);
        let _ = std::fs::remove_file(&control_path);

        // Lib listens on the control socket BEFORE spawning so
        // the worker's `connect()` always finds it.
        let ctl_listener = UnixListener::bind(&control_path).map_err(|e| {
            Error::vm_msg(format!(
                "bind control socket {}: {e}",
                control_path.display()
            ))
        })?;

        // ATOMIC-RENAME SENTINEL POLL: when the Image was returned
        // from an always-pipelined-skip-warm `.build()`, the bg
        // save of `restore.snap` may still be in flight. The worker
        // would fail-fast on `--restore-from` pointing at a missing
        // file. `save_compact_to_file` writes to `<path>.partial`
        // and atomic-renames to `<path>` on completion, so file
        // existence ↔ save complete.
        //
        // Bound at `spawn_timeout` (default 30s; configurable via
        // `VmConfig::restore_timeout`). Steady-state save lands in
        // ~50–200 ms on typical hardware; the cap is well over the
        // p99.99. On timeout, surface a clear error rather than
        // letting the worker fail with a confusing "no such file"
        // message.
        //
        // No-op when the file already exists (the common case for
        // workers spawned after the first acquire, or for Images
        // loaded via `Image::from_snapshot`).
        if !self.snapshot_path.is_file() {
            let poll_t0 = Instant::now();
            let poll_deadline = poll_t0 + self.spawn_timeout;
            let mut backoff = Duration::from_millis(2);
            loop {
                if self.snapshot_path.is_file() {
                    if std::env::var_os("SUPERMACHINE_TIMINGS").is_some() {
                        eprintln!(
                            "[spawn_one] waited {:?} for snapshot file to land at {}",
                            poll_t0.elapsed(),
                            self.snapshot_path.display(),
                        );
                    }
                    break;
                }
                if Instant::now() > poll_deadline {
                    let _ = std::fs::remove_file(&control_path);
                    return Err(Error::vm_msg(format!(
                        "worker spawn: snapshot file {} did not appear within {:?} \
                         (bg save in flight from a pipelined bake — increase \
                         `VmConfig::restore_timeout` if your disk is slow)",
                        self.snapshot_path.display(),
                        self.spawn_timeout
                    )));
                }
                std::thread::sleep(backoff);
                backoff = (backoff * 2).min(Duration::from_millis(50));
            }
        }

        let mut cmd = Command::new(&self.worker_bin);
        for layer in &self.layers {
            cmd.arg("--virtio-blk").arg(layer);
        }
        if let Some(delta) = &self.delta_squashfs {
            cmd.arg("--virtio-blk").arg(delta);
        }
        // virtio-fs mounts persisted in the snapshot metadata. The
        // worker re-constructs the VirtioFs devices at the same MMIO
        // slots so the guest's baked-in FDT entries find them.
        for m in &self.mounts {
            cmd.arg("--mount")
                .arg(format!("{}:{}", m.host_path, m.guest_tag));
        }
        cmd.arg("--memory").arg(self.memory_mib.to_string());
        cmd.arg("--vcpus").arg(self.vcpus.to_string());
        cmd.arg("--restore-from").arg(&self.snapshot_path);
        cmd.arg("--cow-restore");
        cmd.arg("--vsock-mux").arg(&vsock_mux_path);
        cmd.arg("--vsock-exec").arg(&vsock_exec_path);
        cmd.arg("--pool-worker").arg(&control_path);
        if let Some(pages) = self.balloon_target_pages {
            cmd.arg("--balloon-target-pages").arg(pages.to_string());
        }
        // Quiet by default — embedders don't want VM kernel logs
        // on their stdout. SUPERMACHINE_WORKER_LOG=1 to opt in.
        let log_to_stdio = std::env::var("SUPERMACHINE_WORKER_LOG")
            .map(|v| v == "1" || v == "true")
            .unwrap_or(false);
        if !log_to_stdio {
            cmd.stdout(Stdio::null()).stderr(Stdio::null());
        }
        let __t0 = Instant::now();
        let child = cmd
            .spawn()
            .map_err(|e| Error::vm_msg(format!("spawn worker {}: {e}", self.worker_bin.display())))?;
        let __t_spawned = __t0.elapsed();

        // Worker connects to ctl_listener once it's far enough
        // through main() to call UnixStream::connect — well
        // before any expensive work. We just accept and read the
        // initial DONE line, which the worker writes after its
        // snapshot restore completes. That means callers get a
        // worker that's truly ready to serve, no further polling.
        ctl_listener
            .set_nonblocking(true)
            .map_err(|e| Error::vm_msg(format!("set control listener nonblocking: {e}")))?;
        let deadline = Instant::now() + self.spawn_timeout;
        let mut backoff = Duration::from_millis(1);
        let stream = loop {
            match ctl_listener.accept() {
                Ok((s, _)) => break s,
                Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => {
                    if Instant::now() > deadline {
                        let _ = std::fs::remove_file(&control_path);
                        return Err(Error::vm_msg(format!(
                            "worker spawn: control connect did not arrive within {:?}",
                            self.spawn_timeout
                        )));
                    }
                    std::thread::sleep(backoff);
                    backoff = (backoff * 2).min(Duration::from_millis(10));
                }
                Err(e) => {
                    let _ = std::fs::remove_file(&control_path);
                    return Err(Error::vm_msg(format!(
                        "worker spawn: control accept: {e}"
                    )));
                }
            }
        };
        // Switch back to blocking — the rest is line-oriented and
        // synchronous.
        stream
            .set_nonblocking(false)
            .map_err(|e| Error::vm_msg(format!("set control stream blocking: {e}")))?;
        let writer = stream
            .try_clone()
            .map_err(|e| Error::vm_msg(format!("clone control stream: {e}")))?;
        let mut control = ControlChannel {
            reader: std::io::BufReader::new(stream),
            writer,
        };

        // The supervisor protocol opens with a "READY\n" line
        // from the worker (it writes that immediately after
        // connecting, before doing any HVF setup). We then send
        // RESTORE <snap_path>, the worker boots+restores, and
        // writes back DONE us=… host_port=… …
        let ready = match control.read_line() {
            Ok(l) => l,
            Err(e) => {
                let _ = std::fs::remove_file(&control_path);
                return Err(Error::vm_msg(format!(
                    "worker spawn: read READY: {e}"
                )));
            }
        };
        if ready.trim() != "READY" {
            let _ = std::fs::remove_file(&control_path);
            return Err(Error::vm_msg(format!(
                "worker spawn: expected READY, got: {}",
                ready.trim()
            )));
        }
        let snap_path_str = self.snapshot_path.to_string_lossy().to_string();
        if let Err(e) = control.send_line(&format!("RESTORE {snap_path_str}")) {
            let _ = std::fs::remove_file(&control_path);
            return Err(Error::vm_msg(format!(
                "worker spawn: send initial RESTORE: {e}"
            )));
        }
        let done = match control.read_line() {
            Ok(l) => l,
            Err(e) => {
                let _ = std::fs::remove_file(&control_path);
                return Err(Error::vm_msg(format!(
                    "worker spawn: read initial DONE: {e}"
                )));
            }
        };
        if !done.starts_with("DONE") {
            let _ = std::fs::remove_file(&control_path);
            return Err(Error::vm_msg(format!(
                "worker spawn: expected initial DONE, got: {}",
                done.trim()
            )));
        }
        if std::env::var_os("SUPERMACHINE_TIMINGS").is_some() {
            eprintln!(
                "[spawn_one] spawn={:?} accept_to_done={:?} total={:?}",
                __t_spawned,
                __t0.elapsed() - __t_spawned,
                __t0.elapsed()
            );
        }

        // Probe the in-guest agent's wire-protocol version.
        // Catches the failure mode where a snapshot was baked
        // against a previous supermachine release whose agent
        // doesn't speak the current protocol — without this,
        // unknown JSON fields (stage_files / chain) silently
        // hit serde's `#[serde(default)]` ignore-unknown path
        // and the host sees "exec ran with exit 0" but the
        // staged file was never written.
        //
        // Fast path: skip the probe when the snapshot's recorded
        // `runtime_sha16` matches the current worker binary's
        // SHA16. By construction the in-guest agent shipped at
        // bake time matches what the lib expects — the probe
        // would always succeed, just round-trip for nothing.
        // ~25 ms saved per acquire on the hot path.
        //
        // Multi-vCPU caveat: HVF can't round-trip ICH_LR_EL2
        // (GIC List Registers) cleanly across snapshot/restore,
        // so a multi-vCPU restored guest sometimes RCU-stalls
        // on boot — see docs/design/multi-vcpu-snapshot-
        // intermittency-2026-04-27.md and the deferral note. In
        // that state the agent never responds and the probe
        // times out, masquerading as a stale-agent error.
        // For vcpus > 1 we downgrade probe timeouts to a soft
        // warning instead of failing spawn — the workload's
        // first exec() will surface the real error if the
        // guest is genuinely hung. Definitive stale-agent
        // signals (probe ack with old protocol number) still
        // hard-fail.
        // (Removed: post-restore smpark_unpark RPC. Was needed when
        // bake-time park left secondaries in WFI inside
        // smpark_park_routine with unpark_signal=0; SNAPSHOT_VERSION
        // 9's full ICH round-trip means we no longer drive
        // secondaries into the synthetic parked state, so there's
        // nothing to unstick post-restore.)

        let probe_skip = self
            .baker_runtime_sha16
            .as_deref()
            .and_then(|stored| {
                current_worker_sha16(&self.worker_bin)
                    .map(|current| stored == current.as_str())
            })
            .unwrap_or(false);
        if !probe_skip {
            if let Err(e) = probe_agent_protocol(&vsock_exec_path, self.vcpus) {
                if self.vcpus > 1 && e.is_likely_multi_vcpu_restore_stall() {
                    eprintln!(
                        "supermachine: WARNING multi-vCPU ({}vCPU) snapshot agent probe \
                         timed out — guest may be RCU-stalled after restore. \
                         Continuing (multi-vCPU is unsupported per design); \
                         workload exec will surface a real error if the guest is hung.",
                        self.vcpus
                    );
                } else {
                    let _ = std::fs::remove_file(&control_path);
                    return Err(e);
                }
            }
        }

        Ok(Worker {
            child,
            vsock_mux_path,
            vsock_exec_path,
            control_path,
            control: Arc::new(Mutex::new(control)),
            last_restore_path: self.snapshot_path.clone(),
        })
    }
}

/// Minimum agent wire-protocol version this lib understands.
/// MUST match `AGENT_PROTOCOL` in
/// `crates/supermachine-guest-agent/src/main.rs`. Bump in
/// lockstep when the protocol gains a feature the lib expects
/// the agent to support (e.g. stage_files, chain).
///
/// `probe_agent_protocol` rejects an agent whose reported
/// `protocol` value is below this constant; a missing or
/// non-integer `protocol` field is treated as 0.
///
/// Version log:
///   1: pre-stage_file. ExecRequest = {argv, env, cwd, tty,
///      cols, rows}. CONTROL actions: signal, write_file,
///      read_file.
///   2: + stage_files + chain on ExecRequest. + probe CONTROL.
const HOST_AGENT_PROTOCOL_MIN: u32 = 2;

/// First 16 hex characters of the SHA-256 of the file at
/// `worker_bin`, or `None` if the file can't be stat'ed or the
/// digest can't be computed.
///
/// The spawn-time probe-skip fast path compares this against the
/// snapshot's stored `runtime_sha16`.
///
/// The result is memoized in a process-wide, single-entry cache
/// keyed on `(path, file_size, mtime_ns)`: a re-installed worker
/// (same path, new size or mtime) misses the cache and is
/// re-hashed, and so does a call for a different path, which then
/// replaces the cached entry.
fn current_worker_sha16(worker_bin: &Path) -> Option<String> {
    type CacheEntry = (PathBuf, u64, u128, String);
    static CACHE: std::sync::Mutex<Option<CacheEntry>> = std::sync::Mutex::new(None);

    let stat = std::fs::metadata(worker_bin).ok()?;
    let size = stat.len();
    let modified = stat.modified().ok()?;
    let mtime_ns = modified
        .duration_since(std::time::UNIX_EPOCH)
        .ok()?
        .as_nanos();

    // Cache hit only when path, size and mtime all agree. A
    // poisoned lock is treated as a miss.
    if let Ok(slot) = CACHE.lock() {
        match slot.as_ref() {
            Some((path, cached_size, cached_mtime, sha))
                if path == worker_bin && *cached_size == size && *cached_mtime == mtime_ns =>
            {
                return Some(sha.clone());
            }
            _ => {}
        }
    }

    // Compute SHA-256 via the `shasum` shell utility — same
    // path as `bake::sha256_file`, so the digest matches what
    // the bake step recorded into metadata.json. The fork is
    // paid only on a cache miss, not per acquire.
    let output = std::process::Command::new("shasum")
        .args(["-a", "256"])
        .arg(worker_bin)
        .output()
        .ok()
        .filter(|o| o.status.success())?;
    let stdout = String::from_utf8_lossy(&output.stdout);
    // The digest is the first whitespace-separated token; keep at
    // most its first 16 bytes.
    let mut sha16 = stdout.split_whitespace().next()?.to_owned();
    sha16.truncate(16);

    if let Ok(mut slot) = CACHE.lock() {
        *slot = Some((worker_bin.to_path_buf(), size, mtime_ns, sha16.clone()));
    }
    Some(sha16)
}

/// First-contact handshake with an in-guest agent. Sends a
/// `probe` CONTROL action; verifies the returned protocol
/// version is at least `HOST_AGENT_PROTOCOL_MIN`. Mismatch →
/// surface a typed `Error` that tells the caller exactly what
/// to do (rebake the snapshot).
///
/// `vcpus` is the snapshot's vCPU count. For vcpus > 1 we use a
/// short timeout (1 s) — a multi-vCPU restored guest either
/// answers within milliseconds or has RCU-stalled and is never
/// going to answer; waiting the full 10 s only delays the
/// downgrade-to-warning path in the spawn caller.
fn probe_agent_protocol(vsock_exec_path: &Path, vcpus: u32) -> Result<(), Error> {
    let timeout = if vcpus > 1 {
        Duration::from_secs(1)
    } else {
        Duration::from_secs(10)
    };
    let body = serde_json::json!({ "action": "probe" });
    let ack = match crate::exec::send_control_with_ack(
        vsock_exec_path,
        &body,
        Some(timeout),
    ) {
        Ok(a) => a,
        Err(e) => {
            // Old agents (pre-protocol-2) don't know "probe"
            // and return ack with ok=false; send_control_with_ack
            // surfaces that as an io::Error with the agent's
            // error message. Any error here = stale agent.
            return Err(Error::vm_msg(format!(
                "agent in this snapshot is from an older supermachine release \
                 (probe failed: {e}). Rebake the snapshot to pick up the new \
                 agent: rm -rf the snapshot dir and re-run your bake (e.g. \
                 `supermachine pull <image> --name <name>`)."
            )));
        }
    };
    let proto = ack.get("protocol").and_then(|v| v.as_u64()).unwrap_or(0) as u32;
    if proto < HOST_AGENT_PROTOCOL_MIN {
        return Err(Error::vm_msg(format!(
            "agent in this snapshot speaks protocol v{proto} but this \
             supermachine library expects v{HOST_AGENT_PROTOCOL_MIN}+. The \
             snapshot was baked against a previous release; rebake to pick up \
             the new agent (rm -rf the snapshot dir and re-run your bake)."
        )));
    }
    Ok(())
}

impl std::fmt::Debug for HiddenPool {
    /// Prints the socket directory plus the pool's `alive` count
    /// and idle-queue length. If the state mutex is poisoned both
    /// counters are shown as `usize::MAX`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (alive, idle) = match self.state.lock() {
            Ok(state) => (state.alive, state.idle.len()),
            Err(_) => (usize::MAX, usize::MAX),
        };
        let mut out = f.debug_struct("HiddenPool");
        out.field("socks_dir", &self.socks_dir);
        out.field("alive", &alive);
        out.field("idle", &idle);
        out.finish()
    }
}

impl Drop for HiddenPool {
    /// Tears the pool down: flags shutdown, wakes every waiter,
    /// shuts down all idle and dirty workers, then removes the
    /// socket directory (best effort).
    fn drop(&mut self) {
        // Housekeeping threads only hold clones of the wait-state
        // Arcs (`PoolWaitHandles`), never an `Arc<HiddenPool>`, so
        // they cannot keep this value alive. Raising the flag and
        // waking both condvars lets them notice the shutdown; each
        // one re-checks `shutting_down` within ~100 ms of its next
        // condvar wait timeout.
        self.shutting_down.store(true, Ordering::SeqCst);
        self.available.notify_all();
        if let Some(pending) = &self.dirty_pending {
            pending.notify_all();
        }

        // Idle workers first (sends QUIT, kills on timeout), then
        // whatever is still parked in the dirty queue. The state
        // lock is taken before the dirty-queue lock.
        if let Ok(mut state) = self.state.lock() {
            while let Some(mut entry) = state.idle.pop() {
                entry.worker.shutdown();
                state.alive = state.alive.saturating_sub(1);
            }
            if let Some(Ok(mut queue)) = self.dirty.as_ref().map(|d| d.lock()) {
                while let Some(mut worker) = queue.pop_front() {
                    worker.shutdown();
                    state.alive = state.alive.saturating_sub(1);
                }
            }
        }

        // Failure to remove the per-pool socks dir is ignored.
        let _ = std::fs::remove_dir_all(&self.socks_dir);
    }
}

impl HiddenPool {
    /// Acquire a worker. Three paths:
    ///
    /// 1. **Idle hit (hot path).** A worker is in the idle queue
    ///    — pop and return. ~µs.
    /// 2. **Auto-grow.** No idle worker, but `alive < max`. Bump
    ///    the alive count (reserves a slot before we drop the
    ///    lock), spawn a fresh worker outside the lock, return
    ///    it. ~10-15 ms cold spawn cost.
    /// 3. **Wait.** No idle worker, `alive == max`. Block on the
    ///    `available` condvar with `acquire_timeout`. On timeout
    ///    return `Error::PoolExhausted`.
    ///
    /// # Errors
    ///
    /// - `Error::vm_msg` if the state mutex or the `available`
    ///   condvar is poisoned, or if `shutting_down` is set and no
    ///   live idle worker was found.
    /// - The error from `spawn_cfg.spawn_one()` if the auto-grow
    ///   spawn fails; the reserved `alive` slot is released first
    ///   (best effort — skipped if the mutex is poisoned).
    /// - `Error::pool_exhausted` once `policy.acquire_timeout`
    ///   has elapsed with the pool at `policy.max`. A timeout of
    ///   `None` waits without a deadline.
    fn acquire(&self) -> Result<Worker, Error> {
        // The deadline is measured from entry, so time spent in
        // earlier loop iterations counts against the budget.
        let acquire_t0 = Instant::now();
        let acquire_timeout = self.policy.acquire_timeout;
        // The guard is carried across loop iterations; it is given
        // up only for the auto-grow spawn (explicit `drop`) and
        // while parked on the condvar.
        let mut state = self
            .state
            .lock()
            .map_err(|_| Error::vm_msg("pool mutex poisoned".to_owned()))?;
        loop {
            // Liveness check at pop time. Dead workers are a real
            // failure mode with `restore_on_release(false)`: the
            // worker is returned to idle on Vm drop WITHOUT a cycle
            // restore, so if its process died (HVF crash, SIGKILL,
            // jetsam, etc.) we'd hand the caller a doomed handle
            // whose first `vm.exec` fails with "Connection refused"
            // on the closed vsock socket. Try_wait is non-blocking
            // and cheap; discarding the dead entry + decrementing
            // alive lets the auto-grow path below spawn a fresh
            // worker on the same acquire call.
            //
            // `restore_on_release(true)` workers normally can't
            // reach this branch alive-but-dead because the
            // restorer thread already RESTOREs them on drop (a
            // failed restore would have removed them from the
            // pool). Belt-and-suspenders: same check applies.
            while let Some(mut entry) = state.idle.pop() {
                match entry.worker.child.try_wait() {
                    Ok(None) => return Ok(entry.worker), // alive
                    Ok(Some(_status)) => {
                        // Already exited. Decrement alive so the
                        // auto-grow branch (or replenisher) replaces
                        // it. The worker handle's Drop will tidy
                        // sockets without trying to send QUIT to a
                        // dead process.
                        state.alive = state.alive.saturating_sub(1);
                        continue; // try next idle entry
                    }
                    Err(_) => {
                        // Stat failed (e.g. permission, OS weirdness).
                        // Treat as dead — same handling.
                        state.alive = state.alive.saturating_sub(1);
                        continue;
                    }
                }
            }
            // Checked only after the idle queue came up empty: a
            // live idle worker is still handed out during shutdown.
            if self.shutting_down.load(Ordering::SeqCst) {
                return Err(Error::vm_msg("pool is shutting down".to_owned()));
            }
            // Auto-grow: spawn a worker if we're under the cap.
            // Reserve the alive slot inside the lock so concurrent
            // acquires don't both spawn past max.
            if state.alive < self.policy.max {
                state.alive += 1;
                drop(state);
                let spawned = self.spawn_cfg.spawn_one();
                match spawned {
                    Ok(w) => return Ok(w),
                    Err(e) => {
                        // Roll back the reservation so the pool
                        // doesn't permanently lose the slot.
                        if let Ok(mut s) = self.state.lock() {
                            s.alive = s.alive.saturating_sub(1);
                        }
                        // Wake any other waiters so they can retry
                        // (they may not need to spawn — restorer
                        // could push to idle in the meantime).
                        self.available.notify_all();
                        return Err(e);
                    }
                }
            }
            // At max: wait on condvar with timeout (or forever).
            // If we've already exceeded the budget, return now —
            // don't bother re-entering wait.
            if let Some(total) = acquire_timeout {
                if acquire_t0.elapsed() >= total {
                    return Err(Error::pool_exhausted(format!(
                        "acquire timed out after {total:?}; pool at max ({})",
                        self.policy.max
                    )));
                }
            }
            // Count this caller as blocked for the duration of the
            // wait; the matching decrement follows the wake-up.
            // NOTE(review): the poisoned-condvar early returns
            // below leave `waiting` incremented.
            state.waiting += 1;
            let (new_state, timed_out) = match acquire_timeout {
                None => match self.available.wait(state) {
                    Ok(s) => (s, false),
                    Err(_) => {
                        return Err(Error::vm_msg("pool condvar poisoned".to_owned()))
                    }
                },
                Some(total) => {
                    // Only the unspent part of the budget is waited.
                    let remaining = total.saturating_sub(acquire_t0.elapsed());
                    match self.available.wait_timeout(state, remaining) {
                        Ok((s, r)) => (s, r.timed_out()),
                        Err(_) => {
                            return Err(Error::vm_msg(
                                "pool condvar poisoned".to_owned(),
                            ))
                        }
                    }
                }
            };
            state = new_state;
            state.waiting = state.waiting.saturating_sub(1);
            // NOTE(review): this returns without looking at `idle`
            // again, so a worker released at the same moment the
            // timeout fires is not picked up by this call.
            if timed_out {
                return Err(Error::pool_exhausted(format!(
                    "acquire timed out after {:?}; pool at max ({})",
                    acquire_timeout.unwrap_or_default(),
                    self.policy.max
                )));
            }
            // Woken (or spuriously returned) without a timeout:
            // loop back and re-check the idle queue.
        }
    }

    /// `PooledVm::drop` calls this with the now-dirty worker.
    /// Pushes onto the dirty queue and signals the restorer
    /// thread, which sends a RESTORE command to reset the
    /// worker's guest state and then returns it to the idle
    /// queue. Drop returns to the user immediately — the ~3 ms
    /// restore happens off the user's thread.
    ///
    /// Returns the worker to the dirty queue rather than
    /// killing it. The same supervisor-mode worker process
    /// serves many cycles; we only kill on shutdown or on a
    /// genuine RESTORE protocol failure (then the replenisher
    /// spawns a fresh worker to keep the pool at target size).
    ///
    /// This function never reports failure. NOTE(review): when
    /// the relevant mutex is poisoned the worker is simply dropped
    /// here and `alive` is left unchanged.
    fn release(&self, worker: Worker) {
        // Skip-restore mode: push directly to idle without
        // queueing for the restorer. Per-cycle cost vanishes
        // and the guest's page cache stays warm across uses.
        // Caller opted in via `PoolBuilder::restore_on_release(false)`.
        if !self.policy.restore_on_release {
            if let Ok(mut s) = self.state.lock() {
                s.idle.push(IdleEntry {
                    worker,
                    last_used: Instant::now(),
                });
                // Wake blocked `acquire` callers while still
                // holding the state lock.
                self.available.notify_all();
            }
            return;
        }
        if let Some(d) = self.dirty.as_ref() {
            if let Ok(mut q) = d.lock() {
                q.push_back(worker);
            }
            // Signalled after the queue guard is released, and
            // regardless of whether the push succeeded.
            if let Some(c) = self.dirty_pending.as_ref() {
                c.notify_all();
            }
        } else {
            // Defensive fallback: if for some reason the dirty
            // queue isn't wired up (shouldn't happen with the
            // current init path), fall back to the old "kill +
            // ask replenisher" behaviour so we never strand a
            // worker.
            let mut w = worker;
            w.shutdown();
            if let Ok(mut s) = self.state.lock() {
                s.alive = s.alive.saturating_sub(1);
            }
            self.available.notify_all();
        }
    }
}

impl Image {
    /// Open an image from the artifacts that
    /// `supermachine run IMAGE` leaves on disk. `path` may point
    /// at either of:
    ///
    /// - the directory holding `metadata.json` and `restore.snap`
    ///   (typically `~/.local/supermachine-snapshots/<name>/`), or
    /// - the `restore.snap` file itself, in which case
    ///   `metadata.json` is read from the file's parent directory.
    ///
    /// ```sh
    /// supermachine run nginx:1.27-alpine --detach && supermachine run --stop
    /// # snapshot dir: ~/.local/supermachine-snapshots/nginx_1_27-alpine/
    /// ```
    ///
    /// Layout of that directory:
    ///
    /// ```text
    /// metadata.json    # layers, memory, vcpus, etc.
    /// restore.snap     # captured VM state (CoW-mappable)
    /// delta.squashfs   # writable overlay layer (optional)
    /// ```
    ///
    /// Unlike [`Self::from_snapshot_pending`], the snapshot file
    /// must already exist.
    pub fn from_snapshot(path: impl Into<PathBuf>) -> Result<Self, Error> {
        let snapshot_location: PathBuf = path.into();
        Self::from_snapshot_inner(snapshot_location, false)
    }

    /// Variant of [`Self::from_snapshot`] that accepts a missing
    /// `restore.snap`. It serves the always-pipelined plain
    /// `build()` path, where the background
    /// `save_compact_to_file` may not have finished by the time
    /// `build()` returns. The Image is built from `metadata.json`
    /// alone (that file IS written synchronously), and the stashed
    /// warm `BakedWorker` serves the first acquire out of
    /// in-memory state. Later `Pool::spawn_one` calls poll
    /// `snapshot_path.is_file()` before invoking the worker;
    /// because `save_compact_to_file` writes `<path>.partial` and
    /// then atomic-renames it, file existence ↔ save complete.
    pub(crate) fn from_snapshot_pending(path: impl Into<PathBuf>) -> Result<Self, Error> {
        let snapshot_location: PathBuf = path.into();
        Self::from_snapshot_inner(snapshot_location, true)
    }

    /// Shared loader behind [`Self::from_snapshot`] and
    /// [`Self::from_snapshot_pending`].
    ///
    /// `path` is either the snapshot directory or the snapshot
    /// file; `metadata.json` is read from that directory (or from
    /// the file's parent). With `allow_pending` set, a missing
    /// `restore.snap` is tolerated, but the directory and its
    /// `metadata.json` must still exist.
    ///
    /// Numeric metadata that is absent, not an unsigned integer,
    /// or too large for `u32` falls back to the default
    /// (`memory_mib` = 256, `vcpus` = 1) rather than being
    /// truncated.
    ///
    /// # Errors
    ///
    /// `Error::image_msg` when `path` is neither a directory nor
    /// a file, when the snapshot file is required but missing,
    /// when `metadata.json` is missing or unreadable, or when it
    /// is not valid JSON.
    fn from_snapshot_inner(path: PathBuf, allow_pending: bool) -> Result<Self, Error> {
        // Resolve to a (snapshot_path, metadata_path) pair. Under
        // `allow_pending`, the path may be a directory whose
        // `restore.snap` doesn't exist yet — accept that and let
        // spawn_one poll for the file later.
        let (snapshot_path, metadata_path) = if path.is_dir() {
            (path.join("restore.snap"), path.join("metadata.json"))
        } else if path.is_file() {
            let parent = path.parent().ok_or_else(|| {
                Error::image_msg(format!("snapshot path has no parent dir: {}", path.display()))
            })?;
            let metadata_path = parent.join("metadata.json");
            (path, metadata_path)
        } else {
            // Path exists as neither dir nor file. Even with
            // `allow_pending` the directory is required, because
            // metadata.json must live there.
            return Err(Error::image_msg(format!(
                "snapshot path not found: {}",
                path.display()
            )));
        };

        if !allow_pending && !snapshot_path.is_file() {
            return Err(Error::image_msg(format!(
                "snapshot file not found: {}",
                snapshot_path.display()
            )));
        }
        if !metadata_path.is_file() {
            return Err(Error::image_msg(format!(
                "metadata.json not found alongside snapshot at {}",
                metadata_path.display()
            )));
        }

        let meta_text = std::fs::read_to_string(&metadata_path)
            .map_err(|e| Error::image_msg(format!("read {}: {e}", metadata_path.display())))?;
        let meta: serde_json::Value = serde_json::from_str(&meta_text)
            .map_err(|e| Error::image_msg(format!("parse {}: {e}", metadata_path.display())))?;

        // Detect snapshots baked under a previous binary version. The
        // bake driver writes the kernel path as
        // `…/supermachine/v<VERSION>/kernel`; if that <VERSION> doesn't
        // match the current crate version, the snapshot's pinned init
        // shim and kernel may be missing fixes (e.g. the loopback
        // bring-up that landed in 0.4.29). Emit a single-line warning
        // so the next `supermachine run` makes the cause obvious.
        warn_if_snapshot_version_mismatch(&meta, &snapshot_path);

        // Checked conversions, matching `balloon_target_pages`
        // below: an out-of-range value is treated like a missing
        // one instead of wrapping modulo 2^32.
        let memory_mib = meta
            .get("memory_mib")
            .and_then(|v| v.as_u64())
            .and_then(|v| u32::try_from(v).ok())
            .unwrap_or(256);
        let vcpus = meta
            .get("vcpus")
            .and_then(|v| v.as_u64())
            .and_then(|v| u32::try_from(v).ok())
            .unwrap_or(1);

        // metadata.json paths may be absolute (default for native
        // bakes that store paths under ~/.local/...) or relative
        // (used by `supermachine bundle --image NAME`, which writes
        // a self-contained dir with `./layers/<sha>.squashfs` style
        // entries). Resolve relative paths against the metadata
        // dir so a bundle works after `cp -r` to a different host.
        let metadata_dir = metadata_path
            .parent()
            .map(Path::to_path_buf)
            .unwrap_or_else(|| PathBuf::from("."));
        let resolve_path = |s: &str| -> PathBuf {
            let p = PathBuf::from(s);
            if p.is_absolute() {
                p
            } else {
                metadata_dir.join(p)
            }
        };

        // Non-string entries in `layers` are skipped.
        let layers: Vec<PathBuf> = meta
            .get("layers")
            .and_then(|v| v.as_array())
            .map(|arr| {
                arr.iter()
                    .filter_map(|x| x.as_str().map(&resolve_path))
                    .collect()
            })
            .unwrap_or_default();
        let delta_squashfs = meta
            .get("delta_squashfs")
            .and_then(|v| v.as_str())
            .map(&resolve_path);

        // virtio-fs mounts: persisted by the bake driver as
        // `metadata.mounts = [{host_path, guest_tag}, ...]`. Missing
        // / empty for legacy snapshots; the runtime treats both the
        // same. Entries lacking either key are skipped.
        let mounts: Vec<crate::vmm::resources::MountSpec> = meta
            .get("mounts")
            .and_then(|v| v.as_array())
            .map(|arr| {
                arr.iter()
                    .filter_map(|x| {
                        let host = x.get("host_path").and_then(|v| v.as_str())?;
                        let tag = x.get("guest_tag").and_then(|v| v.as_str())?;
                        Some(crate::vmm::resources::MountSpec::new(host, tag))
                    })
                    .collect()
            })
            .unwrap_or_default();

        // Bundled kernel discovery: a self-contained bundle puts
        // the kernel image next to the snapshot. Prefer that over
        // host-wide AssetPaths so a shipped `.app` doesn't depend
        // on the user having supermachine installed.
        let bundled_kernel = {
            let cand = metadata_dir.join("kernel");
            cand.is_file().then_some(cand)
        };

        let baker_runtime_sha16 = meta
            .get("runtime_sha16")
            .and_then(|v| v.as_str())
            .map(str::to_owned);

        // Zero and values that don't fit in `u32` both mean "unset".
        let balloon_target_pages = meta
            .get("balloon_target_pages")
            .and_then(|v| v.as_u64())
            .and_then(|n| u32::try_from(n).ok())
            .filter(|n| *n > 0);

        Ok(Self {
            snapshot_path,
            memory_mib,
            vcpus,
            baker_runtime_sha16,
            balloon_target_pages,
            layers,
            delta_squashfs,
            mounts,
            bundled_kernel,
            hidden_pool: std::sync::OnceLock::new(),
            warm_baked_worker: Arc::new(crate::bake::WarmStash::new(None)),
        })
    }

    /// Pull an image by registry reference, bake it, and hand
    /// back the loadable [`Image`]. This is the in-process
    /// counterpart of `supermachine run <image_ref> --no-detach`
    /// without the daemon: you receive the [`Image`] and call
    /// [`Vm::start`] yourself.
    ///
    /// The policy is [`PullPolicy::default`], i.e.
    /// [`PullPolicy::Missing`] (cache-first). Use
    /// [`Image::from_oci_with_policy`] to choose another one.
    ///
    /// ```no_run
    /// # use supermachine::{Image, Vm, VmConfig};
    /// let image = Image::from_oci("nginx:1.27-alpine")?;
    /// let vm = Vm::start(&image, &VmConfig::new())?;
    /// # let _ = vm; Ok::<(), supermachine::Error>(())
    /// ```
    pub fn from_oci(image_ref: &str) -> Result<Self, Error> {
        let default_policy = PullPolicy::default();
        Self::from_oci_with_policy(image_ref, default_policy)
    }

    /// Same as [`Image::from_oci`], except the caller picks the
    /// [`PullPolicy`]. The snapshot is stored under the default
    /// snapshots directory with an image-derived name. The
    /// [`PullPolicy`] docs carry the cache + registry interaction
    /// table.
    pub fn from_oci_with_policy(
        image_ref: &str,
        policy: PullPolicy,
    ) -> Result<Self, Error> {
        Self::from_oci_to_dir(image_ref, policy, &default_snapshots_dir(), None)
    }

    /// Most explicit constructor: pull/bake into a specific
    /// snapshots directory, with an optional explicit name.
    /// Lets you keep multiple "supermachine snapshot stores"
    /// (e.g. per-project), or pin a snapshot under a name that
    /// differs from the image-derived default.
    ///
    /// The snapshot lives at `snapshots_dir/<name>`, where the
    /// name is `name` if given and otherwise derived from
    /// `image_ref`. The cached snapshot is loaded at most once
    /// per call and that same [`Image`] is returned on a cache
    /// hit.
    ///
    /// # Errors
    ///
    /// - `Error::cache_invalid` under [`PullPolicy::Never`] when
    ///   `restore.snap` exists but the snapshot fails to load.
    /// - `Error::cache_miss` under [`PullPolicy::Never`] when no
    ///   snapshot file exists.
    /// - A bake error (mapped through `map_bake_error`), or the
    ///   error from loading the freshly baked snapshot.
    pub fn from_oci_to_dir(
        image_ref: &str,
        policy: PullPolicy,
        snapshots_dir: &Path,
        name: Option<&str>,
    ) -> Result<Self, Error> {
        // 1. Compute where the cached snapshot would live and
        //    short-circuit on hit (Missing) or miss (Never).
        let derived = name
            .map(str::to_owned)
            .unwrap_or_else(|| crate::bake::snapshot_name_for_image(image_ref));
        let snap_dir = snapshots_dir.join(&derived);
        // Load once and keep the result: probing with `.is_ok()`
        // and then loading again would parse metadata.json (and
        // run the version-mismatch check) twice per cache hit.
        let cached = Self::from_snapshot(&snap_dir).ok();

        match policy {
            PullPolicy::Never => {
                if let Some(image) = cached {
                    return Ok(image);
                }
                let restore_snap = snap_dir.join("restore.snap");
                if restore_snap.is_file() {
                    return Err(Error::cache_invalid(format!(
                        "snapshot present at {} but not loadable on this binary; \
                         rebake required (PullPolicy::Never won't auto-rebake)",
                        snap_dir.display()
                    )));
                }
                return Err(Error::cache_miss(format!(
                    "no cached snapshot for {image_ref} at {} (PullPolicy::Never)",
                    snap_dir.display()
                )));
            }
            PullPolicy::Missing => {
                if let Some(image) = cached {
                    return Ok(image);
                }
                // Missing + unloadable cache: fall through to bake.
            }
            // Any other policy (Always): fall through to bake.
            _ => {}
        }

        // 2. Bake. This shells out to the existing bake pipeline:
        //    registry pull (or reuse cached layers) → squashfs →
        //    boot worker once → capture snapshot.
        let root = repo_root_for_bake()?;
        let request = crate::bake::BakeRequest {
            image: image_ref.to_owned(),
            name: name.map(str::to_owned),
            runtime: "supermachine".to_owned(),
            guest_port: 80,
            memory_mib: 256,
            vcpus: 1,
            pull_policy: policy.as_bake_str().to_owned(),
            snapshots_dir: snapshots_dir.to_path_buf(),
            cmd_override: None,
            extra_args: Vec::new(),
        };
        let bake_t0 = std::time::Instant::now();
        crate::bake::run_push(&request, bake_t0, &root)
            .map_err(|e| map_bake_error(&request.image, e))?;

        // 3. Load the freshly-baked snapshot.
        Self::from_snapshot(&snap_dir)
    }

    /// Entry point for configurable bakes: env vars, a command
    /// override, custom memory / port, or a custom snapshot name.
    ///
    /// ```no_run
    /// # use supermachine::Image;
    /// let image = Image::builder("nginx:1.27-alpine")
    ///     .with_name("nginx-prod")
    ///     .with_memory_mib(512)
    ///     .with_env("FOO", "bar")
    ///     .with_cmd(["nginx", "-g", "daemon off;"])
    ///     .build()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    ///
    /// Every distinct configuration yields its own snapshot,
    /// because bake-time inputs are part of the snapshot
    /// fingerprint. Reusing one name for several configurations
    /// invalidates the earlier snapshot; give them distinct names
    /// if both must coexist.
    pub fn builder(image_ref: impl Into<String>) -> OciImageBuilder {
        let image_ref: String = image_ref.into();
        OciImageBuilder::new(image_ref)
    }

    /// Return the [`Image`] called `name`, baking it from
    /// `image_ref` only when no compatible snapshot exists yet.
    ///
    /// Intended for app startup. The first run pays the one-time
    /// bake (registry pull + snapshot build, e.g. ~12 s for
    /// `rust:1-slim`); later runs find the cached snapshot and
    /// return in microseconds. After a `cargo update` that bumps
    /// the supermachine version, the cached snapshot's bake-key
    /// stops matching the current worker binary and
    /// `ensure_baked` rebakes on its own — no shell scripts, no
    /// manual `rm -rf snapshots/`.
    ///
    /// `configure` receives an [`OciImageBuilder`] that already
    /// has `name` applied; chain methods such as
    /// `with_memory_mib`, `with_cmd`, `with_env` to customize the
    /// bake, or pass `|b| b` to keep the defaults.
    ///
    /// ```no_run
    /// use std::time::Duration;
    /// use supermachine::{Image, VmConfig};
    ///
    /// // Bake once on first run, reuse forever after — including
    /// // across supermachine version upgrades.
    /// let image = Image::ensure_baked("rust_1_slim", "rust:1-slim", |b| {
    ///     b.with_memory_mib(2048)
    /// })?;
    /// // Configure pool: 5 always-warm, scale to 50 under burst.
    /// let pool = image.pool().min(5).max(50).build()?;
    ///
    /// // Per-task path:
    /// let vm = pool.acquire()?;
    /// vm.write_file("/tmp/main.rs", b"fn main() { println!(\"hi\"); }")?;
    /// let out = vm.exec_builder()
    ///     .argv(["sh", "-c", "rustc /tmp/main.rs -o /tmp/m && /tmp/m"])
    ///     .timeout(Duration::from_secs(30))
    ///     .output()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    pub fn ensure_baked<F>(
        name: impl Into<String>,
        image_ref: impl Into<String>,
        configure: F,
    ) -> Result<Image, Error>
    where
        F: FnOnce(OciImageBuilder) -> OciImageBuilder,
    {
        let seeded = OciImageBuilder::new(image_ref).with_name(name);
        configure(seeded).build()
    }

    /// Location of the snapshot file this image is backed by.
    pub fn snapshot_path(&self) -> &Path {
        self.snapshot_path.as_path()
    }

    /// Memory, in MiB, the snapshot was baked with. [`Vm::start`]
    /// uses this if [`VmConfig::with_memory_mib`] isn't set.
    ///
    /// For images loaded from disk this is the `memory_mib`
    /// entry of `metadata.json`, or 256 when that entry is
    /// absent.
    pub fn memory_mib(&self) -> u32 {
        self.memory_mib
    }

    /// Number of vCPUs the snapshot was baked with.
    ///
    /// For images loaded from disk this is the `vcpus` entry of
    /// `metadata.json`, or 1 when that entry is absent.
    pub fn vcpus(&self) -> u32 {
        self.vcpus
    }

    /// Test-only accessor for `balloon_target_pages` — the
    /// integration tests assert the metadata round-trip. Hidden
    /// from rustdoc; not part of the stable surface.
    ///
    /// Returns `None` when the loaded metadata had no usable
    /// `balloon_target_pages` entry (missing, zero, or too large
    /// for `u32`).
    #[doc(hidden)]
    pub fn balloon_target_pages_for_test(&self) -> Option<u32> {
        self.balloon_target_pages
    }

    /// Start a one-shot microVM from this image. Equivalent to
    /// [`Vm::start(self, config)`][Vm::start] but reads more
    /// naturally at the call site:
    ///
    /// ```no_run
    /// # use supermachine::{Image, VmConfig};
    /// let image = Image::from_snapshot("path/to/snapshot")?;
    /// let vm = image.start(&VmConfig::new())?;
    /// // ... use vm ...
    /// vm.stop()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    ///
    /// Use [`Image::acquire`] instead if you want a `PooledVm`
    /// that returns to a (hidden) pool on `Drop` for cheaper
    /// reuse — typical for evaluation harnesses, CI verifiers,
    /// or any code that runs many short-lived VMs of the same
    /// image back-to-back.
    ///
    /// # Errors
    ///
    /// Returns whatever error [`Vm::start`] reports; this method
    /// adds no handling of its own.
    pub fn start(&self, config: &VmConfig) -> Result<Vm, Error> {
        // Pure delegation: no pool is involved on this path.
        Vm::start(self, config)
    }

    /// Take a microVM from this image's hidden pool. The result
    /// is a [`PooledVm`], which `Deref`s to [`Vm`] and goes back
    /// to the pool on `Drop`. This fits the common
    /// "spin up a VM, do one task, throw it away, do another"
    /// loop: the pool keeps re-restoring from the same snapshot
    /// in the background, so each iteration costs about the
    /// snapshot-restore floor (~5 ms on Apple Silicon).
    ///
    /// ```no_run
    /// # use supermachine::{Image, VmConfig};
    /// # use std::time::Duration;
    /// let image = Image::from_snapshot("path/to/rust-slim")?;
    /// for src in ["fn main() {}", "fn main() { panic!() }"] {
    ///     let vm = image.acquire()?;
    ///     vm.write_file("/tmp/main.rs", src.as_bytes())?;
    ///     let out = vm.exec_builder()
    ///         .argv(["sh", "-c", "rustc /tmp/main.rs -o /tmp/m && /tmp/m"])
    ///         .timeout(Duration::from_secs(30))
    ///         .output()?;
    ///     println!("status={:?} out={:?}", out.status.code(), out.stdout);
    ///     // vm dropped here — returned to pool, restored from snapshot
    /// }
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    ///
    /// ## Pool sizing
    ///
    /// `Image::acquire` relies on an ambient pool with the
    /// default policy (`min=0`, `max=64`, `idle_timeout=60s`,
    /// `acquire_timeout=60s`). When you need an explicit policy,
    /// build a [`Pool`] through [`Image::pool`] and call
    /// `pool.acquire()` on it.
    ///
    /// With an idle worker available, an acquire costs one
    /// snapshot restore (~3 ms on Apple Silicon); a cold spawn
    /// on the lazy-grow path is ~15 ms. Under burst the pool
    /// grows up to `max`, and workers above `min` are evicted
    /// once they have been idle for `idle_timeout`.
    pub fn acquire(&self) -> Result<PooledVm<'_>, Error> {
        let default_config = VmConfig::new();
        self.acquire_with(&default_config)
    }

    /// [`Image::acquire`] with a caller-supplied [`VmConfig`]
    /// (overrides for memory, vCPUs, asset paths, **pool size**,
    /// etc.). The config takes effect on the **first** acquire,
    /// which is when the pool gets built; later acquires reuse
    /// that pool whatever `config` says. That is fine for most
    /// uses — create a fresh `Image` if you need a different
    /// config without restarting your app.
    pub fn acquire_with(&self, config: &VmConfig) -> Result<PooledVm<'_>, Error> {
        let _span = tracing::info_span!(
            "supermachine.acquire",
            memory_mib = self.memory_mib,
            vcpus = self.vcpus,
        )
        .entered();

        let shared_pool = self.ensure_default_pool(config)?;
        let worker = shared_pool.acquire()?;

        // `Vm::snapshot` needs the source image's metadata to
        // write metadata.json. The capture itself goes through
        // the worker subprocess's supervisor protocol (see the
        // pool-worker dispatch path in `Vm::snapshot`), so the
        // calling binary needs no HVF entitlement.
        let meta = ImageMeta {
            memory_mib: config.memory_mib.unwrap_or(self.memory_mib),
            vcpus: config.vcpus.unwrap_or(self.vcpus),
            layers: self.layers.clone(),
            delta_squashfs: self.delta_squashfs.clone(),
        };

        // The Vm only copies the worker's socket paths; the
        // worker itself travels alongside it in the PooledVm.
        let vm = Vm {
            pool: None,
            vsock_mux_path: worker.vsock_mux_path.clone(),
            vsock_exec_path: worker.vsock_exec_path.clone(),
            own_vsock_mux_dir: None,
            skip_cleanup: true,
            image_meta: Some(Arc::new(meta)),
        };

        Ok(PooledVm {
            vm: Some(vm),
            worker: Some(worker),
            pool_arc: Arc::clone(shared_pool),
            _image: std::marker::PhantomData,
        })
    }

    /// Begin configuring an explicit pool for this image. Reach
    /// for this when you need auto-scaling or precise control of
    /// the min/max/idle/acquire timeouts; in the simple case
    /// `image.acquire()` already runs an ambient default-policy
    /// pool for you.
    ///
    /// The returned builder starts from `PoolPolicy::default()`
    /// and `VmConfig::new()`.
    ///
    /// ```no_run
    /// # use std::time::Duration;
    /// # use supermachine::Image;
    /// let image = Image::ensure_baked("rust_warm", "rust:1-slim", |b| b)?;
    /// // Auto-scale 5..=50, evict idle workers after 60s,
    /// // fail acquire if pool stays at max for >10s:
    /// let pool = image.pool()
    ///     .min(5)
    ///     .max(50)
    ///     .idle_timeout(Duration::from_secs(60))
    ///     .acquire_timeout(Duration::from_secs(10))
    ///     .build()?;
    /// let vm = pool.acquire()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    pub fn pool(&self) -> PoolBuilder<'_> {
        let policy = PoolPolicy::default();
        let vm_config = VmConfig::new();
        PoolBuilder {
            image: self,
            policy,
            vm_config,
        }
    }

    /// Build a fresh standalone `HiddenPool` for this image from
    /// `config` and `policy`, and return it as an owned Arc. The
    /// caller decides what to do with the Arc — `acquire()` parks
    /// it in the per-Image `hidden_pool` OnceLock; `pool().build()`
    /// returns it as a `Pool` to the caller.
    fn build_pool_arc(
        &self,
        config: &VmConfig,
        policy: PoolPolicy,
    ) -> Result<Arc<HiddenPool>, Error> {
        // Find supermachine-worker. Tries env override, sibling-
        // of-current-exe (cargo install layout), dev-tree
        // target/release (workspace).
        #[cfg(target_os = "macos")]
        let worker_bin = crate::codesign::locate_worker_bin().ok_or_else(|| {
            Error::assets_msg(
                "supermachine-worker binary not found (looked for sibling of \
                 current_exe and target/release/supermachine-worker). Set \
                 SUPERMACHINE_WORKER_BIN if you have it elsewhere."
                    .to_owned(),
            )
        })?;
        #[cfg(not(target_os = "macos"))]
        let worker_bin: PathBuf = std::env::var_os("SUPERMACHINE_WORKER_BIN")
            .map(PathBuf::from)
            .ok_or_else(|| {
                Error::assets_msg(
                    "SUPERMACHINE_WORKER_BIN must be set on this platform".to_owned(),
                )
            })?;
        // HARD: verify the worker binary matches the library
        // version. The supervisor protocol evolves between
        // releases; a stale ~/.cargo/bin/supermachine-worker from
        // an older `cargo install supermachine` deadlocks
        // pipelined-bake silently otherwise. See
        // codesign::verify_worker_version for the diagnostic
        // signature and the upgrade hint we surface to the user.
        #[cfg(target_os = "macos")]
        {
            crate::codesign::verify_worker_version(&worker_bin)
                .map_err(Error::vm_msg)?;
            // Best-effort: ensure worker is HVF-entitled. No-op on
            // already-signed. Ok to ignore — `hv_vm_create` will
            // surface its own diagnostic if the entitlement is
            // missing, and we'd rather not block on a transient
            // codesign issue.
            let _ = crate::codesign::ensure_worker_signed(&worker_bin);
        }

        // Unix socket paths are capped at 104 bytes on macOS
        // (SUN_LEN). Default to /tmp instead of $TMPDIR, which on
        // macOS resolves to /var/folders/.../T/ and burns ~50
        // characters before we even start. /tmp/sm-pool-<pid>/
        // leaves room for a meaningful socket name underneath.
        let socks_dir = match &config.vsock_mux_dir {
            Some(d) => d.clone(),
            None => PathBuf::from(format!(
                "/tmp/sm-pool-{}-{:x}",
                std::process::id(),
                unique_suffix(),
            )),
        };
        std::fs::create_dir_all(&socks_dir).map_err(Error::Io)?;
        let memory_mib = config.memory_mib.unwrap_or(self.memory_mib);
        let vcpus = config.vcpus.unwrap_or(self.vcpus);
        let spawn_timeout = config
            .restore_timeout
            .unwrap_or_else(|| Duration::from_secs(30));
        // Same SUN_LEN concern: use a small token instead of the
        // full snapshot dir name. Per-pool counter would be even
        // shorter; for now an 8-char hash is enough.
        let name_prefix = "w".to_owned();
        let spawn_cfg = Arc::new(SpawnConfig {
            worker_bin,
            snapshot_path: self.snapshot_path.clone(),
            layers: self.layers.clone(),
            delta_squashfs: self.delta_squashfs.clone(),
            mounts: self.mounts.clone(),
            memory_mib,
            vcpus,
            socks_dir: socks_dir.clone(),
            name_prefix,
            spawn_timeout,
            baker_runtime_sha16: self.baker_runtime_sha16.clone(),
            balloon_target_pages: self.balloon_target_pages,
        });
        // Spawn `min` workers in parallel up front so the pool
        // is ready to serve immediately. `max - min` more can be
        // spawned later by the auto-grow path in `acquire`.
        //
        // WARM HANDOFF: claim the bake-time worker (if any) and
        // use it as one of the initial idle entries — saves spawn
        // (~50 ms) + restore (~5 ms) for the FIRST acquire. This
        // is the per-Image atomic claim: only the first
        // PoolBuilder::build() to run on a given Image gets the
        // warm worker; concurrent / subsequent calls take None
        // and spawn fresh as today.
        //
        // If the warm worker's child has already died (e.g. user
        // kept the Image around so long the worker exited on its
        // own, or HVF returned an error), we fall through cleanly
        // to spawn-from-disk.
        let claimed_warm = self
            .warm_baked_worker
            .take()
            .and_then(|mut bw| {
                // Liveness check — if the child reaped itself, ditch it.
                match bw.child.try_wait() {
                    Ok(None) => Some(bw),       // still running, claim it
                    Ok(Some(_)) => None,        // exited, fall through
                    Err(_) => None,             // EBADF or similar
                }
            });
        let initial = policy.min;
        let mut idle: Vec<IdleEntry> = Vec::with_capacity(initial.max(1));
        let extra_warm: usize;
        if let Some(bw) = claimed_warm {
            idle.push(IdleEntry {
                worker: warm_baked_to_worker(bw),
                last_used: Instant::now(),
            });
            extra_warm = 1;
        } else {
            extra_warm = 0;
        }
        // We need `initial` idle entries total. The warm worker (if
        // claimed) covers one slot; spawn (initial - 1) fresh ones.
        // For min=0 with a warm worker, we still keep the warm one
        // around (it's effectively a free min=1).
        let to_spawn = initial.saturating_sub(extra_warm);
        if to_spawn == 1 {
            idle.push(IdleEntry {
                worker: spawn_cfg.spawn_one()?,
                last_used: Instant::now(),
            });
        } else if to_spawn > 1 {
            let mut handles = Vec::with_capacity(to_spawn);
            for _ in 0..to_spawn {
                let cfg = Arc::clone(&spawn_cfg);
                handles.push(std::thread::spawn(move || cfg.spawn_one()));
            }
            for h in handles {
                let w = h
                    .join()
                    .map_err(|_| Error::vm_msg("pool spawn thread panicked".to_owned()))??;
                idle.push(IdleEntry {
                    worker: w,
                    last_used: Instant::now(),
                });
            }
        }
        // Pool's `alive` accounting: total idle count (incl. warm
        // handoff entry).
        let initial = idle.len();
        let pool = Arc::new(HiddenPool {
            state: Arc::new(Mutex::new(PoolState {
                idle,
                alive: initial,
                waiting: 0,
            })),
            available: Arc::new(Condvar::new()),
            dirty: Some(Arc::new(Mutex::new(VecDeque::new()))),
            dirty_pending: Some(Arc::new(Condvar::new())),
            socks_dir,
            shutting_down: Arc::new(AtomicBool::new(false)),
            spawn_cfg: Arc::clone(&spawn_cfg),
            policy,
        });
        // Spawn the housekeeping threads: replenisher (handles
        // genuine worker death + maintains `min`), restorer
        // (recycles dirty workers via RESTORE), janitor (evicts
        // idle-too-long workers above `min` to free RAM). All
        // detach naturally when the pool Arc drops via
        // `Weak::upgrade` failure.
        let wait_handles = pool.wait_handles();
        let h_replenish = {
            let h = wait_handles.clone();
            std::thread::Builder::new()
                .name("supermachine-pool-replenish".into())
                .spawn(move || replenisher_loop(h))
                .map_err(|e| Error::vm_msg(format!("spawn replenisher thread: {e}")))?
        };
        // Restorer threads only run when `restore_on_release`
        // is on. With it off, drop pushes workers straight to
        // idle and the restorer would have nothing to do.
        let mut handles = vec![h_replenish];
        if policy.restore_on_release {
            // Multi-restorer: scale the recycle thread count
            // with pool size. Each RESTORE RPC is ~3 ms in-
            // place; under bursty drop patterns a single
            // restorer becomes the serialization point and
            // shows up as a phantom delay on the *next* acquire
            // (acquire pops the idle queue, restorer's still
            // chewing through dirty, idle is empty → user
            // blocks ~3 ms × queue depth).
            //
            // Formula: ⌈max/2⌉ clamped [1, 4], capped at `max`.
            // 1 restorer for max=2, 3 for max=5, 4 for max=8+.
            let restorer_count = ((policy.max + 1) / 2)
                .clamp(1, 4)
                .min(policy.max.max(1));
            for _ in 0..restorer_count {
                let h = wait_handles.clone();
                let h_restore = std::thread::Builder::new()
                    .name("supermachine-pool-restore".into())
                    .spawn(move || restorer_loop(h))
                    .map_err(|e| Error::vm_msg(format!("spawn restorer thread: {e}")))?;
                handles.push(h_restore);
            }
        }
        // Janitor: only spawn if eviction is enabled. Saves a
        // sleeping thread for fixed-size pools.
        if pool.policy.idle_timeout != Duration::MAX {
            let h = wait_handles.clone();
            let h_janitor = std::thread::Builder::new()
                .name("supermachine-pool-janitor".into())
                .spawn(move || janitor_loop(h))
                .map_err(|e| Error::vm_msg(format!("spawn janitor thread: {e}")))?;
            handles.push(h_janitor);
        }
        // Detach the handles. Threads exit on their own when
        // `shutting_down` is set in `HiddenPool::drop`; we don't
        // need to join them — they hold no `Arc<HiddenPool>` and
        // don't block the user-side drop from firing.
        drop(handles);
        Ok(pool)
    }

    /// Return the per-Image default pool that backs
    /// `image.acquire()`, building it on first use with
    /// `PoolPolicy::default()`. Once a pool is installed, every
    /// later call returns that same pool and `config` is ignored.
    /// Unlike `pool().build()`, this never hands out a second pool.
    fn ensure_default_pool(
        &self,
        config: &VmConfig,
    ) -> Result<&Arc<HiddenPool>, Error> {
        if self.hidden_pool.get().is_none() {
            let fresh = self.build_pool_arc(config, PoolPolicy::default())?;
            // Losing a race is fine: if another thread installed its
            // pool first, `set` hands ours back and it is dropped on
            // the spot. Either way every caller ends up observing
            // the single pool that won.
            let _ = self.hidden_pool.set(fresh);
        }
        let installed = self
            .hidden_pool
            .get()
            .expect("hidden pool just initialized");
        Ok(installed)
    }
}

/// Bundle of clonable handles into the pool's wait state. Each
/// housekeeping thread owns one of these; no one holds a strong
/// `Arc<HiddenPool>` across a condvar wait, so user-side `Pool`
/// + `Image` drops fire immediately and the threads exit on
/// `shutting_down` after their next short timed wait.
///
/// Built by `HiddenPool::wait_handles`, which clones each shared
/// handle out of the pool.
#[derive(Clone)]
struct PoolWaitHandles {
    // Shared idle list + `alive` / `waiting` counters.
    state: Arc<Mutex<PoolState>>,
    // Condvar paired with `state`; notified when a worker is pushed
    // to idle or `alive` changes.
    available: Arc<Condvar>,
    // Queue of released workers awaiting RESTORE (drained by the
    // restorer threads). `None` means there is no restore queue.
    dirty: Option<Arc<Mutex<VecDeque<Worker>>>>,
    // Condvar paired with `dirty`; the restorer waits on it.
    dirty_pending: Option<Arc<Condvar>>,
    // Teardown flag polled by every housekeeping loop.
    shutting_down: Arc<AtomicBool>,
    // Everything needed to spawn (or restore) a worker.
    spawn_cfg: Arc<SpawnConfig>,
    // Copy of the pool policy (`min`, `idle_timeout`, ...).
    policy: PoolPolicy,
}

impl HiddenPool {
    /// Clone the pool's shared handles into a `PoolWaitHandles`
    /// bundle for a housekeeping thread. Only the inner `Arc`s (and
    /// the `Copy` policy) are duplicated — never the pool itself.
    fn wait_handles(&self) -> PoolWaitHandles {
        let Self {
            state,
            available,
            dirty,
            dirty_pending,
            shutting_down,
            spawn_cfg,
            policy,
            ..
        } = self;
        PoolWaitHandles {
            state: Arc::clone(state),
            available: Arc::clone(available),
            dirty: dirty.clone(),
            dirty_pending: dirty_pending.clone(),
            shutting_down: Arc::clone(shutting_down),
            spawn_cfg: Arc::clone(spawn_cfg),
            policy: *policy,
        }
    }
}

/// Restorer thread body. Repeatedly takes one worker off the
/// dirty queue, sends RESTORE over its supervisor control socket
/// so the guest is reset to the snapshot, and then returns the
/// worker to the idle list. This is the steady-state recycle
/// path — it replaces the old "kill + respawn fresh" loop and
/// saves ~10 ms per cycle (no fork+exec+dyld+restore; just an
/// in-place restore).
///
/// Returns when `shutting_down` is observed, when the handles
/// carry no dirty queue, or when one of the mutexes is poisoned.
fn restorer_loop(h: PoolWaitHandles) {
    // Without both the queue and its condvar there is nothing to do.
    let (dirty_queue, dirty_signal) = match (&h.dirty, &h.dirty_pending) {
        (Some(queue), Some(signal)) => (queue, signal),
        _ => return,
    };
    let is_shutting_down = || h.shutting_down.load(Ordering::SeqCst);

    while !is_shutting_down() {
        // Block until a dirty worker shows up. Each wait is capped
        // at 100 ms so shutdown is noticed even if nobody notifies.
        let mut worker = {
            let Ok(mut queue) = dirty_queue.lock() else {
                return;
            };
            loop {
                if is_shutting_down() {
                    return;
                }
                if let Some(next) = queue.pop_front() {
                    break next;
                }
                let Ok((guard, _)) =
                    dirty_signal.wait_timeout(queue, Duration::from_millis(100))
                else {
                    return;
                };
                queue = guard;
            }
        };

        if worker.send_restore(&h.spawn_cfg.snapshot_path).is_ok() {
            // Clean again: publish to idle and wake any waiters.
            if let Ok(mut state) = h.state.lock() {
                state.idle.push(IdleEntry {
                    worker,
                    last_used: Instant::now(),
                });
                h.available.notify_all();
            }
        } else {
            // Restore protocol broke — retire this worker and drop
            // it from the `alive` count so the replenisher can
            // spawn a fresh replacement.
            worker.shutdown();
            if let Ok(mut state) = h.state.lock() {
                state.alive = state.alive.saturating_sub(1);
            }
            h.available.notify_all();
        }
    }
}

/// Replenisher: maintains `alive >= min`. Spawns workers when
/// below the floor (worker died, was evicted into a regrow gap,
/// etc.). Sleeps on `available` otherwise.
///
/// The "do we need more?" check, the wait, and the slot
/// reservation all happen under ONE acquisition of the state
/// lock. Checking in one critical section and waiting in another
/// leaves a window in which a `notify_all` (e.g. from a restorer
/// that just retired a broken worker) is lost, delaying the
/// replacement spawn by a full wait interval.
///
/// Returns when `shutting_down` is observed or the state mutex is
/// poisoned.
fn replenisher_loop(h: PoolWaitHandles) {
    // Upper bound on a single wait, so `shutting_down` is re-checked
    // regularly even when nobody notifies.
    const WAIT_SLICE: Duration = Duration::from_millis(100);
    // Pause after a failed spawn so a persistent failure doesn't
    // turn into a tight fork loop.
    const SPAWN_BACKOFF: Duration = Duration::from_millis(500);

    loop {
        if h.shutting_down.load(Ordering::SeqCst) {
            return;
        }
        {
            let Ok(mut s) = h.state.lock() else {
                return;
            };
            if s.alive >= h.policy.min {
                // At or above the floor: sleep until notified (or
                // the slice elapses), then re-evaluate. The guard
                // is released atomically by the wait, so no
                // notification can slip in between check and sleep.
                let _ = h.available.wait_timeout(s, WAIT_SLICE);
                continue;
            }
            // Reserve a slot before spawning so a concurrent
            // acquire's auto-grow doesn't double-spawn.
            s.alive += 1;
        }
        // Spawn outside the lock — it can take tens of milliseconds.
        match h.spawn_cfg.spawn_one() {
            Ok(w) => {
                if let Ok(mut s) = h.state.lock() {
                    s.idle.push(IdleEntry {
                        worker: w,
                        last_used: Instant::now(),
                    });
                    h.available.notify_all();
                }
            }
            Err(_) => {
                // Give the reserved slot back.
                if let Ok(mut s) = h.state.lock() {
                    s.alive = s.alive.saturating_sub(1);
                }
                // Back off in short slices so pool teardown isn't
                // held up by a long uninterruptible sleep.
                let mut remaining = SPAWN_BACKOFF;
                while remaining > Duration::ZERO
                    && !h.shutting_down.load(Ordering::SeqCst)
                {
                    let chunk = remaining.min(WAIT_SLICE);
                    std::thread::sleep(chunk);
                    remaining = remaining.saturating_sub(chunk);
                }
            }
        }
    }
}

/// Janitor: every `idle_timeout / 4` (but never more often than
/// every 100 ms), walk the idle queue and shut down workers above
/// `min` that have been idle longer than `idle_timeout`. Frees
/// host RAM during quiet periods.
///
/// Returns immediately when eviction is disabled
/// (`idle_timeout == Duration::MAX`), and otherwise when
/// `shutting_down` is observed or the state mutex is poisoned.
fn janitor_loop(h: PoolWaitHandles) {
    let timeout = h.policy.idle_timeout;
    let min = h.policy.min;
    if timeout == Duration::MAX {
        return;
    }
    // Cap each individual wait so we re-check shutting_down often
    // enough to drop within ~100 ms of pool teardown.
    let wait_unit = Duration::from_millis(100);
    let tick = (timeout / 4).max(wait_unit);
    loop {
        if h.shutting_down.load(Ordering::SeqCst) {
            return;
        }
        // Collect workers to evict under the lock; shut down
        // outside it so we don't hold up acquires.
        let to_evict: Vec<Worker> = {
            // A poisoned lock means another pool thread panicked;
            // bail out like the other housekeeping loops instead
            // of spinning on a lock that will never succeed.
            let Ok(mut s) = h.state.lock() else {
                return;
            };
            let now = Instant::now();
            // Entries are pushed with `last_used = now`, so the
            // stalest ones sit at the front. Count the stale
            // prefix, limited by how far we are above `min`, and
            // remove it in a single drain.
            let headroom = s.alive.saturating_sub(min);
            let stale = s
                .idle
                .iter()
                .take(headroom)
                .take_while(|entry| now.duration_since(entry.last_used) >= timeout)
                .count();
            s.alive -= stale;
            let evicted: Vec<Worker> =
                s.idle.drain(..stale).map(|entry| entry.worker).collect();
            evicted
        };
        for mut w in to_evict {
            w.shutdown();
        }
        // Sleep until a full `tick` of wall-clock time has passed,
        // in `wait_unit` slices so we observe shutting_down
        // promptly. Measuring elapsed time (rather than charging a
        // whole slice per wake-up) keeps the tick honest when the
        // condvar is notified frequently.
        let sleep_started = Instant::now();
        loop {
            if h.shutting_down.load(Ordering::SeqCst) {
                return;
            }
            let elapsed = sleep_started.elapsed();
            if elapsed >= tick {
                break;
            }
            let chunk = tick.saturating_sub(elapsed).min(wait_unit);
            let Ok(s) = h.state.lock() else {
                return;
            };
            let _ = h.available.wait_timeout(s, chunk);
        }
    }
}

/// A [`Vm`] checked out of a worker pool. `Deref`s to `Vm`, so
/// every method on `Vm` is callable. On `Drop` the worker is
/// handed back to the pool; with the default
/// `restore_on_release` policy the next acquire gets a freshly
/// snapshot-restored worker in ~5 ms.
///
/// Carries a lifetime (`PhantomData<&'a Image>`) that ties it to
/// the borrow it was acquired from, and holds its own
/// `Arc<HiddenPool>` so the pool stays alive while the VM is in
/// use. How many `PooledVm`s can be checked out at once is
/// governed by the pool's policy (see [`PoolBuilder::max`]).
pub struct PooledVm<'a> {
    /// The VM view handed to the user. `Some` until `Drop` runs.
    vm: Option<Vm>,
    /// Worker subprocess we checked out. On Drop it is passed to
    /// the pool's `release`, which decides how to recycle it.
    worker: Option<Worker>,
    /// Keeps the pool alive for the lifetime of this PooledVm.
    pool_arc: Arc<HiddenPool>,
    /// Zero-sized marker carrying the `'a` borrow; no data.
    _image: std::marker::PhantomData<&'a Image>,
}

impl<'a> std::ops::Deref for PooledVm<'a> {
    type Target = Vm;

    /// Borrow the wrapped [`Vm`].
    ///
    /// # Panics
    ///
    /// Panics if the inner `Vm` has already been taken, which only
    /// happens while `Drop` is running.
    fn deref(&self) -> &Vm {
        match &self.vm {
            Some(vm) => vm,
            None => panic!("PooledVm used after drop"),
        }
    }
}

impl<'a> std::ops::DerefMut for PooledVm<'a> {
    /// Mutably borrow the wrapped [`Vm`]. Panics under the same
    /// condition as `deref`: the inner `Vm` is only absent once
    /// `Drop` has started.
    fn deref_mut(&mut self) -> &mut Vm {
        match &mut self.vm {
            Some(vm) => vm,
            None => panic!("PooledVm used after drop"),
        }
    }
}

impl<'a> PooledVm<'a> {
    /// Capture a snapshot of this VM's current state and return
    /// a new [`Image`] pointing at it. Equivalent to
    /// [`Vm::snapshot`] but works on the subprocess-pool path
    /// — the entitled worker subprocess does the HVF capture
    /// over the supervisor RPC, so the calling binary doesn't
    /// need the `com.apple.security.hypervisor` entitlement.
    ///
    /// Borrows `self` rather than consuming, so you can
    /// continue using the VM after capturing — handy for the
    /// "warm up + snapshot + keep working" pattern. The guest
    /// is paused for the duration of the capture (typically
    /// 10s of ms; bounded by disk write time for large RAM).
    ///
    /// `dest_dir` is created if missing and receives
    /// `restore.snap` plus a `metadata.json` describing the
    /// image.
    ///
    /// # Errors
    ///
    /// Fails if the worker / VM / image metadata is missing, if
    /// `dest_dir` can't be created or written, or if a worker RPC
    /// fails. When both the snapshot RPC and the follow-up unpark
    /// fail, the snapshot error is the one reported, since it is
    /// the root cause.
    pub fn snapshot(&self, dest_dir: impl Into<PathBuf>) -> Result<Image, Error> {
        let dest_dir = dest_dir.into();
        let _span = tracing::info_span!(
            "supermachine.snapshot",
            dest_dir = %dest_dir.display(),
        )
        .entered();
        let worker = self
            .worker
            .as_ref()
            .ok_or_else(|| Error::vm_msg("PooledVm: no worker (already dropped?)".to_owned()))?;
        let vm = self
            .vm
            .as_ref()
            .ok_or_else(|| Error::vm_msg("PooledVm: no vm (already dropped?)".to_owned()))?;
        let meta = vm.image_meta.clone().ok_or_else(|| {
            Error::vm_msg(
                "PooledVm::snapshot: image metadata missing (acquire from a 0.3.8+ Image)"
                    .to_owned(),
            )
        })?;
        std::fs::create_dir_all(&dest_dir).map_err(Error::Io)?;
        let snap_path = dest_dir.join("restore.snap");
        // Multi-vCPU: park secondaries via smpark.ko before the
        // capture rendezvous. The agent ioctls /dev/smpark, which
        // broadcasts an IPI to all secondaries; each one drains
        // local LRs, masks IRQs, and spins in WFI. The captured
        // per-vCPU state is then byte-identical-trivial across
        // secondaries — HVF can round-trip THAT, fixing the
        // "restored guest RCU-stalls / NULL-derefs in interrupt
        // context" failure class. Best-effort: if smpark.ko isn't
        // loaded (older snapshots, single-vCPU bake), the agent
        // returns ok=false and we fall through to the existing
        // rendezvous-only capture path.
        let parked = if meta.vcpus > 1 {
            worker.send_smpark_park()?
        } else {
            false
        };
        // The diff-via-clone path with `last_restore_path` as
        // base would in principle win 200+ ms here, but the
        // runner needs base in memory and lazy-loading from
        // disk costs ~700 ms on first call. Without warm-cache
        // hits across pool cycles, that's net-negative for
        // single-shot snapshots. Use the plain streaming save
        // (parallel sparse pwrite, ~290 ms on 2 GiB) as the
        // default; opt-in to the diff path via env var for the
        // multi-shot scenarios where cache pays back.
        let snap_result = if std::env::var_os("SUPERMACHINE_DIFF_CYCLE_SNAPSHOT").is_some() {
            worker.send_snapshot_with_base(&snap_path, &worker.last_restore_path)
        } else {
            worker.send_snapshot(&snap_path)
        };
        // Always unpark, even on snapshot failure — otherwise
        // secondaries stay stuck in WFI and the next acquire of
        // this VM is a brick. If park didn't take effect in the
        // first place there is nothing to unpark. The unpark
        // outcome is only inspected AFTER the snapshot result so a
        // failed unpark can't mask the snapshot error.
        let unpark_result = parked.then(|| worker.send_smpark_unpark());
        let _stats = snap_result?;
        if let Some(unparked) = unpark_result {
            let _ = unparked?;
        }
        let metadata = serde_json::json!({
            "memory_mib": meta.memory_mib,
            "vcpus": meta.vcpus,
            "layers": meta
                .layers
                .iter()
                .map(|p| p.to_string_lossy().into_owned())
                .collect::<Vec<_>>(),
            "delta_squashfs": meta
                .delta_squashfs
                .as_ref()
                .map(|p| p.to_string_lossy().into_owned()),
            "snapshot_base": snap_path.to_string_lossy().into_owned(),
            "baked_at": chrono_rfc3339_now(),
            "source": "PooledVm::snapshot",
        });
        std::fs::write(
            dest_dir.join("metadata.json"),
            serde_json::to_string_pretty(&metadata)
                .map_err(|e| Error::vm_msg(format!("metadata serialize: {e}")))?,
        )
        .map_err(Error::Io)?;
        Image::from_snapshot(&dest_dir)
    }
}

impl<'a> Drop for PooledVm<'a> {
    /// Release the checked-out worker back to the pool.
    fn drop(&mut self) {
        // The Vm view goes first. It never owned the worker's
        // socket files (those belong to `Worker`), so nothing of
        // the worker's is meant to be cleaned up here.
        drop(self.vm.take());
        // Then return the worker, if we still hold one. The pool's
        // `release` decides how it gets recycled.
        let Some(worker) = self.worker.take() else {
            return;
        };
        self.pool_arc.release(worker);
        // `pool_arc` itself is dropped with the struct's fields
        // after this returns, decrementing the refcount.
    }
}

/// Builder for an explicitly configured worker pool. Started
/// with [`Image::pool`]; terminated with [`PoolBuilder::build`].
///
/// Defaults: `min=0`, `max=64`, `idle_timeout=60s`,
/// `acquire_timeout=60s` — i.e. lazy spawn, auto-evict, fail-
/// noisily on saturation. Override any of these with the
/// chainable setters below.
pub struct PoolBuilder<'a> {
    /// The image whose snapshot the pool's workers restore from.
    image: &'a Image,
    /// Pool sizing / timeout policy, edited by the setters.
    policy: PoolPolicy,
    /// Runtime overrides applied at pool-build time. Memory / vCPU
    /// changes here don't re-bake the snapshot — they override what
    /// the worker subprocess advertises to HVF at restore. Lazily
    /// committed (CoW page-fault), so a larger memory_mib than
    /// what was baked just raises the guest-visible ceiling without
    /// committing host pages.
    vm_config: VmConfig,
}

impl<'a> PoolBuilder<'a> {
    /// Always-warm baseline. The pool keeps at least this many
    /// workers alive — even if everyone drops their `PooledVm`,
    /// the next `acquire` finds these waiting in idle. Default 0
    /// (lazy-spawn on first acquire).
    pub fn min(mut self, n: usize) -> Self {
        self.policy.min = n;
        self
    }

    /// Hard concurrency cap. `acquire` blocks (with timeout) when
    /// `max` peers are checked out simultaneously. Default 64;
    /// raise for large multi-tenant fleets. Pass `usize::MAX` for
    /// effectively unbounded.
    pub fn max(mut self, n: usize) -> Self {
        self.policy.max = n.max(1);
        self
    }

    /// Idle workers above `min` that have been unused for longer
    /// than this get killed by the janitor — frees host RAM
    /// during quiet periods. Default 60 s. Pass `Duration::MAX`
    /// to disable eviction (fixed-size pool with no churn).
    pub fn idle_timeout(mut self, d: Duration) -> Self {
        self.policy.idle_timeout = d;
        self
    }

    /// Caller's `acquire` blocks at most this long when the pool
    /// is at `max` and no worker is idle. After that the call
    /// returns [`Error::PoolExhausted`]. Default 60 s. Pass `None`
    /// (via this method's twin if added) to block forever; pass
    /// `Duration::ZERO` to fail-fast.
    pub fn acquire_timeout(mut self, d: Duration) -> Self {
        self.policy.acquire_timeout = Some(d);
        self
    }

    /// Disable the acquire timeout — `acquire` blocks forever
    /// when at `max`. Useful for batch workloads where you'd
    /// rather wait than fail. Equivalent to passing `None`
    /// internally.
    pub fn no_acquire_timeout(mut self) -> Self {
        self.policy.acquire_timeout = None;
        self
    }

    /// Skip the per-cycle snapshot RESTORE on `PooledVm::drop`.
    /// Workers go straight back to the idle queue carrying
    /// whatever guest state the previous user left.
    ///
    /// Default `true`: every acquire starts from a clean
    /// snapshot-state guest. Pay ~3 ms restore per cycle (off
    /// the user's critical path with a buddy slot).
    ///
    /// `false`: opt into "warm worker reuse" — guest page cache
    /// stays warm across cycles. For workloads that re-read the
    /// same files every invocation (rustc + sysroot, python +
    /// stdlib, node + node_modules), this is a 3-6× speedup
    /// because the second compile/import doesn't re-fault
    /// pages from the squashfs layer.
    ///
    /// Safe when the workload always overwrites its own outputs
    /// (e.g. `rustc -o /tmp/m && /tmp/m` overwrites both source
    /// and binary on every cycle). Unsafe if the workload trusts
    /// `/tmp` to be clean or accumulates files unboundedly —
    /// the guest's RAM+fs accumulates state forever in this mode,
    /// so pair with periodic pool rebuild for long runs.
    pub fn restore_on_release(mut self, on: bool) -> Self {
        self.policy.restore_on_release = on;
        self
    }

    /// Per-acquire restore timeout. Forwarded to the inner
    /// [`VmConfig::with_restore_timeout`] used to spawn pool
    /// workers. Default 30 s (set at spawn time inside the pool).
    /// Bump for slow disks or large RAM snapshots; restore time
    /// scales roughly linearly with snapshot size.
    pub fn with_restore_timeout(mut self, timeout: Duration) -> Self {
        self.vm_config = std::mem::take(&mut self.vm_config).with_restore_timeout(timeout);
        self
    }

    /// Override the baked image's memory ceiling for this pool's
    /// workers. Pure runtime override — doesn't re-bake the
    /// snapshot. Pages are lazy-committed (CoW page-fault), so
    /// raising the ceiling beyond what was baked doesn't increase
    /// host commit unless the guest actually writes to the new
    /// pages.
    pub fn with_memory_mib(mut self, mib: u32) -> Self {
        self.vm_config = std::mem::take(&mut self.vm_config).with_memory_mib(mib);
        self
    }

    /// Override the baked image's vCPU count for this pool's
    /// workers. Pure runtime override.
    pub fn with_vcpus(mut self, vcpus: u32) -> Self {
        self.vm_config = std::mem::take(&mut self.vm_config).with_vcpus(vcpus);
        self
    }

    /// Build the pool against this image's snapshot. Spawns
    /// the initial `min` workers in parallel, starts the
    /// housekeeping threads (replenisher / restorer / janitor),
    /// returns a [`Pool`] handle.
    pub fn build(self) -> Result<Pool, Error> {
        let image = self.image;
        // Normalize: min must not exceed max — otherwise the
        // pre-spawn loop in build_hidden_pool would create more
        // workers than the auto-grow ceiling allows, and the
        // `alive ≤ max` invariant breaks.
        let mut policy = self.policy;
        if policy.min > policy.max {
            policy.min = policy.max;
        }
        let arc = image.build_pool_arc(&self.vm_config, policy)?;
        // Try to install this pool as the per-Image default so
        // subsequent `image.acquire()` / `image.acquire_with()`
        // calls go through the same configured pool. If the
        // OnceLock is already set (the user called acquire()
        // before pool().build()), we error rather than silently
        // returning a config-less default — that's the bug the
        // 0.4.1 docs accidentally created.
        match image.hidden_pool.set(Arc::clone(&arc)) {
            Ok(()) => Ok(Pool { inner: arc }),
            Err(_) => Err(Error::vm_msg(
                "image.pool().build() must be called before image.acquire() / \
                 image.acquire_with(). The Image already has a default pool \
                 from an earlier acquire — its policy can't be changed in \
                 place. Either call pool().build() first, or load the Image \
                 fresh: `Image::from_snapshot(path)?.pool()...build()?`."
                    .to_owned(),
            )),
        }
    }
}

/// Explicit, configured worker pool. Returned by
/// [`PoolBuilder::build`]. `Pool` is `Clone` (Arc-shared), so
/// clones all refer to the same underlying pool. The pool is torn
/// down once the last reference goes away — note that
/// `PoolBuilder::build` also parks a reference in the source
/// `Image`, and every outstanding `PooledVm` holds one too.
#[derive(Clone)]
pub struct Pool {
    /// Shared handle to the underlying pool.
    inner: Arc<HiddenPool>,
}

/// Snapshot of pool state at a point in time. Cheap to fetch —
/// one mutex acquire — so it's safe to graph at modest rates.
/// A plain `Copy` value: it does not update after it is returned.
#[derive(Debug, Clone, Copy)]
pub struct PoolStats {
    /// Total workers alive: idle + checked-out + in-flight spawn.
    pub alive: usize,
    /// Currently checked out (in use by a `PooledVm`).
    pub in_use: usize,
    /// Sitting in the idle queue waiting for next acquire.
    pub idle: usize,
    /// Acquire callers currently blocked waiting for a slot.
    pub waiting: usize,
    /// Configured maximum concurrency.
    pub max: usize,
    /// Configured baseline / always-warm count.
    pub min: usize,
}

impl Pool {
    /// Check a VM out of this pool.
    ///
    /// Shares its fast path with [`Image::acquire`]: on a hit the
    /// idle queue is popped (~µs). On a miss the pool policy
    /// applies — auto-grow up to `max`, or block (with timeout)
    /// once `max` is reached.
    ///
    /// # Errors
    ///
    /// Returns the error produced by the underlying pool acquire
    /// (converted via `?`).
    pub fn acquire(&self) -> Result<PooledVm<'_>, Error> {
        let _span = tracing::info_span!(
            "supermachine.pool.acquire",
            memory_mib = self.inner.spawn_cfg.memory_mib,
            vcpus = self.inner.spawn_cfg.vcpus,
        )
        .entered();

        let worker = self.inner.acquire()?;

        // An explicitly built pool has no handle on the source
        // `Image`, so the metadata attached to the `Vm` is
        // reconstructed from the pool's own spawn configuration.
        let spawn_cfg = &self.inner.spawn_cfg;
        let image_meta = Arc::new(ImageMeta {
            memory_mib: spawn_cfg.memory_mib,
            vcpus: spawn_cfg.vcpus,
            layers: spawn_cfg.layers.clone(),
            delta_squashfs: spawn_cfg.delta_squashfs.clone(),
        });

        // The `Vm` only borrows the worker's vsock endpoints: it has
        // no warm pool or socket directory of its own and is flagged
        // to skip cleanup.
        let vm = Vm {
            pool: None,
            vsock_mux_path: worker.vsock_mux_path.clone(),
            vsock_exec_path: worker.vsock_exec_path.clone(),
            own_vsock_mux_dir: None,
            skip_cleanup: true,
            image_meta: Some(image_meta),
        };

        Ok(PooledVm {
            vm: Some(vm),
            worker: Some(worker),
            pool_arc: Arc::clone(&self.inner),
            _image: std::marker::PhantomData,
        })
    }

    /// Snapshot of current pool state.
    ///
    /// Takes the pool's state lock once. If the lock cannot be
    /// taken, the live counters are reported as `0`; `max` / `min`
    /// come from the configured policy either way.
    pub fn stats(&self) -> PoolStats {
        let (alive, idle, waiting) = match self.inner.state.lock() {
            Ok(state) => (state.alive, state.idle.len(), state.waiting),
            Err(_) => (0, 0, 0),
        };
        let policy = &self.inner.policy;
        PoolStats {
            alive,
            // Everything alive that is not idle counts as in use.
            in_use: alive.saturating_sub(idle),
            idle,
            waiting,
            max: policy.max,
            min: policy.min,
        }
    }
}

/// Builder for baking an OCI image into a snapshot-backed [`Image`].
/// Start with [`OciImageBuilder::new`], chain `with_*` setters, and
/// finish with [`OciImageBuilder::build`].
///
/// Every setter that affects the workload's behavior (env, cmd,
/// memory, guest_port) is part of the bake's input fingerprint:
/// changing it forces a re-bake and produces a different snapshot.
/// Use distinct `with_name` values if you want side-by-side
/// snapshots for the same image ref with different configs.
pub struct OciImageBuilder {
    /// Image reference handed to the bake driver. Rewritten to
    /// `oci-archive:<path>` / `oci-layout:<path>` by
    /// `with_oci_archive` / `with_oci_layout`.
    image: String,
    /// Explicit snapshot name. When `None`, `build()` derives one
    /// from `image` via `bake::snapshot_name_for_image`.
    name: Option<String>,
    /// Cache + registry policy consulted by `build()`.
    pull_policy: PullPolicy,
    /// Bake-time memory in MiB; `build()` falls back to `256`.
    memory_mib: Option<u32>,
    /// vCPU count the snapshot is baked with; `build()` falls back
    /// to `1`.
    vcpus: Option<u32>,
    /// Guest port used as the readiness signal; `build()` falls
    /// back to `80`.
    guest_port: Option<u16>,
    /// Override for the image's `CMD`; JSON-encoded by `build()`
    /// before being handed to the bake driver.
    cmd: Option<Vec<String>>,
    /// Workload environment variables, forwarded in insertion order
    /// as `--env KEY=VAL` args.
    envs: Vec<(String, String)>,
    /// Directory snapshots are stored in; `build()` falls back to
    /// `default_snapshots_dir()`.
    snapshots_dir: Option<PathBuf>,
    /// Optional warmup closure. When set, `build()` runs the
    /// pipelined bake, invokes this closure against the still-live
    /// bake worker (through a synthetic `Vm` wrapping the worker's
    /// vsock paths), and captures a separate warm snapshot in a
    /// sibling `<name>__warm__<tag>` directory. Subsequent acquires
    /// land at warm state — guest page cache for the workload
    /// already populated, so e.g. `rustc` cold-start drops from
    /// ~370 ms to ~50–100 ms.
    warmup: Option<Box<dyn FnOnce(&Vm) -> Result<(), Error> + Send>>,
    /// Stable tag for the warmup. Used as the suffix of the warm
    /// snapshot directory (`<name>__warm__<tag>`) and passed to the
    /// bake driver as the warm tag, so changing it selects a
    /// different warm snapshot. If `None`, `build()` uses the
    /// literal tag `"default"`.
    warmup_tag: Option<String>,
    /// Extra files to stage into the bake's delta layer at
    /// (host_path, guest_path). Useful for bundling per-snapshot
    /// auxiliary binaries (e.g. the snapshot-park kernel module
    /// at `/supermachine-smpark.ko`). Plumbed through to the
    /// bake step as `--extra-file <host>:<guest>` args.
    extra_files: Vec<(PathBuf, String)>,
    /// virtio-fs DAX mounts: (host_path, guest_tag) pairs. Each
    /// is exposed as a virtio-fs device the guest mounts via
    /// `mount -t virtiofs <tag> <target>`. Plumbed through to the
    /// bake step as `--mount <host>:<tag>` args.
    mounts: Vec<(PathBuf, String)>,
    /// When `Some(true)`, the no-warmup `.build()` path waits for
    /// the workload's listener to come up (or the parked-PID-1
    /// fallback) before capturing — same as v0.4.22 behavior.
    /// When `None` or `Some(false)`, the default v0.4.23+ behavior
    /// applies: snapshot is captured at the pre-exec marker, BEFORE
    /// the workload runs. Each restore re-execs the workload fresh.
    ///
    /// Set to `true` for service-image bakes (nginx, redis, etc.)
    /// where you want the listener pre-bound at restore time but
    /// don't want the cost of routing through `with_warmup`. See
    /// [`OciImageBuilder::with_listener_required`].
    require_listener: Option<bool>,
}

impl OciImageBuilder {
    /// Create a builder for `image_ref` (e.g. `"nginx:1.27-alpine"`,
    /// `"ghcr.io/owner/image@sha256:..."`).
    ///
    /// Everything else starts unset / empty; the `with_*` setters
    /// fill it in and `build()` applies the documented defaults.
    pub fn new(image_ref: impl Into<String>) -> Self {
        let image = image_ref.into();
        let pull_policy = PullPolicy::default();
        Self {
            // What to bake and how to fetch it.
            image,
            name: None,
            pull_policy,
            // Guest sizing and readiness.
            memory_mib: None,
            vcpus: None,
            guest_port: None,
            // Workload overrides.
            cmd: None,
            envs: Vec::new(),
            snapshots_dir: None,
            // Warmup.
            warmup: None,
            warmup_tag: None,
            // Extra bake inputs.
            extra_files: Vec::new(),
            mounts: Vec::new(),
            require_listener: None,
        }
    }

    /// Stage the host file `host_path` into the snapshot's delta
    /// layer so it shows up at `guest_path` in the guest's root
    /// filesystem after restore. Repeatable. The file is folded
    /// into the bake's content hash, so changing the host file
    /// invalidates the cached snapshot.
    ///
    /// `build()` forwards each pair to the bake step as
    /// `--extra-file <host>:<guest>`.
    ///
    /// Used (e.g.) to ship `supermachine-smpark.ko` so init-oci
    /// can `finit_module` it on boot for multi-vCPU snapshot
    /// support.
    pub fn with_extra_file(
        mut self,
        host_path: impl Into<PathBuf>,
        guest_path: impl Into<String>,
    ) -> Self {
        let staged: (PathBuf, String) = (host_path.into(), guest_path.into());
        self.extra_files.push(staged);
        self
    }

    /// Source the image bytes from a local OCI archive instead of a
    /// registry. Without this call the constructor's `image_ref` is
    /// treated as a registry reference and pulled from Docker Hub
    /// or the registry encoded in the ref.
    ///
    /// The bake driver also accepts the prefixed forms
    /// `oci-archive:/path` and `oci-layout:/path` directly as
    /// `image_ref`; this method is the structured convenience. The
    /// stored ref is rewritten to `oci-archive:<path>`, while the
    /// ref you originally passed keeps acting as the logical
    /// identifier: unless a name is already set, the snapshot name
    /// is derived from it.
    ///
    /// ```no_run
    /// # use supermachine::Image;
    /// let img = Image::builder("shopify-test-sm:latest")
    ///     .with_oci_archive("/tmp/shopify.tar")
    ///     .build()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    pub fn with_oci_archive(mut self, archive_path: impl Into<PathBuf>) -> Self {
        let archive: PathBuf = archive_path.into();
        let prefixed_ref = format!("oci-archive:{}", archive.display());
        // Swap in the prefixed form and keep hold of the ref that
        // was there before, so it can still name the snapshot.
        let logical_ref = std::mem::replace(&mut self.image, prefixed_ref);
        self.name = self
            .name
            .take()
            .or_else(|| Some(crate::bake::snapshot_name_for_image(&logical_ref)));
        self
    }

    /// Counterpart of `with_oci_archive` for an OCI layout DIRECTORY
    /// (the un-tar'd form, with `index.json` + `oci-layout` +
    /// `blobs/sha256/...` at the top level).
    ///
    /// The stored ref becomes `oci-layout:<dir>`; unless a name is
    /// already set, the snapshot name is derived from the ref that
    /// was in place before this call.
    pub fn with_oci_layout(mut self, layout_dir: impl Into<PathBuf>) -> Self {
        let layout: PathBuf = layout_dir.into();
        let prefixed_ref = format!("oci-layout:{}", layout.display());
        let logical_ref = std::mem::replace(&mut self.image, prefixed_ref);
        self.name = self
            .name
            .take()
            .or_else(|| Some(crate::bake::snapshot_name_for_image(&logical_ref)));
        self
    }

    /// Share a host directory with the guest over virtio-fs (with
    /// DAX). Repeatable. Inside the guest, `init-oci` mounts it with
    /// `mount -t virtiofs <tag> <target>` — by convention each
    /// declared tag lands at `/mnt/<tag>` and is bind-mounted into
    /// the workload's filesystem if a per-image policy says so.
    ///
    /// Reads from the guest land in the host's page cache (DAX-mapped
    /// via `hv_vm_map`; validated by spike 22 to be zero-copy + shared
    /// across VMs that mount the same host path). The mount is added
    /// to the bake's input hash so changing `host_path` invalidates the
    /// cached snapshot.
    ///
    /// `build()` forwards each pair to the bake step as
    /// `--mount <host>:<tag>`.
    ///
    /// ```no_run
    /// # use supermachine::Image;
    /// let img = Image::builder("node:22-alpine")
    ///     .with_mount("/Users/me/myapp", "myapp")
    ///     .with_cmd(["node", "/mnt/myapp/index.js"])
    ///     .build()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    pub fn with_mount(
        mut self,
        host_path: impl Into<PathBuf>,
        guest_tag: impl Into<String>,
    ) -> Self {
        let mount: (PathBuf, String) = (host_path.into(), guest_tag.into());
        self.mounts.push(mount);
        self
    }

    /// Override the number of vCPUs the snapshot is baked with.
    /// Default `1`. Multi-vCPU is opt-in: it lifts sustained
    /// HTTP-serving throughput (single-vCPU is the c=32+
    /// bottleneck) at the cost of slightly higher cold boot and
    /// some snapshot/restore caveats. See
    /// docs/design/concurrency-floor-2026-05-04.md.
    pub fn with_vcpus(mut self, vcpus: u32) -> Self {
        self.vcpus = Some(vcpus);
        self
    }

    /// Snapshot name. Default: derived from the image ref via
    /// `bake::snapshot_name_for_image`. Use this when you want
    /// `nginx:1.27-alpine` baked twice with different configs.
    pub fn with_name(mut self, name: impl Into<String>) -> Self {
        self.name = Some(name.into());
        self
    }

    /// Cache + registry policy. See [`PullPolicy`].
    pub fn with_pull_policy(mut self, policy: PullPolicy) -> Self {
        self.pull_policy = policy;
        self
    }

    /// Override the bake-time memory budget (MiB). The runtime
    /// memory is set on [`VmConfig`]; this is the size the
    /// snapshot is captured at.
    pub fn with_memory_mib(mut self, mib: u32) -> Self {
        self.memory_mib = Some(mib);
        self
    }

    /// Override the guest service port the bake waits for as the
    /// readiness signal. Default `80`.
    pub fn with_guest_port(mut self, port: u16) -> Self {
        self.guest_port = Some(port);
        self
    }

    /// Override the image's `CMD`. Pass an argv array, same shape
    /// as Docker's `--entrypoint` + arguments combined.
    ///
    /// ```no_run
    /// # use supermachine::Image;
    /// let img = Image::builder("python:3.12-alpine")
    ///     .with_cmd(["python", "-m", "http.server", "8080"])
    ///     .with_guest_port(8080)
    ///     .build()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    pub fn with_cmd<I, S>(mut self, cmd: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        self.cmd = Some(cmd.into_iter().map(Into::into).collect());
        self
    }

    /// Append one environment variable for the workload, like
    /// `docker run -e KEY=VAL`. Repeatable; `build()` forwards
    /// every pair, in call order, as `--env KEY=VAL`.
    pub fn with_env(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
        let pair: (String, String) = (key.into(), value.into());
        self.envs.push(pair);
        self
    }

    /// Override the directory snapshots are stored in. Default
    /// is `~/.local/supermachine-snapshots`. Use this to keep
    /// per-project snapshot stores isolated from each other.
    pub fn with_snapshots_dir(mut self, dir: impl Into<PathBuf>) -> Self {
        self.snapshots_dir = Some(dir.into());
        self
    }

    /// Run a warmup closure once after the bake, then re-snapshot
    /// the post-warmup state. Future restores from this image
    /// land at the warm state — guest page cache for the
    /// workload already populated, so e.g. compiling a small
    /// Rust program drops from ~370 ms to ~50–100 ms.
    ///
    /// Cached: if the warm snapshot already exists with a
    /// matching warmup tag (see [`Self::with_warmup_tag`]), the
    /// warmup is skipped on subsequent builds. Without an
    /// explicit tag, the warmup re-runs whenever the snapshot
    /// is invalidated by other inputs (image_ref, memory, etc.)
    /// — set a tag if you change the closure body and want the
    /// cache to invalidate.
    ///
    /// ```no_run
    /// # use std::time::Duration;
    /// # use supermachine::Image;
    /// let image = Image::ensure_baked("rust_warm", "rust:1-slim", |b| b
    ///     .with_memory_mib(2048)
    ///     .with_warmup(|vm| {
    ///         vm.write_file("/tmp/probe.rs", b"fn main(){}")?;
    ///         vm.exec_builder()
    ///             .argv(["sh", "-c", "rustc -O /tmp/probe.rs -o /tmp/probe && /tmp/probe"])
    ///             .timeout(Duration::from_secs(60))
    ///             .output()?;
    ///         Ok(())
    ///     })
    ///     .with_warmup_tag("v1")
    /// )?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    pub fn with_warmup<F>(mut self, warmup: F) -> Self
    where
        F: FnOnce(&Vm) -> Result<(), Error> + Send + 'static,
    {
        self.warmup = Some(Box::new(warmup));
        self
    }

    /// Stable tag for the warmup closure (see [`Self::with_warmup`]).
    /// Bump when you change the warmup body and want the previously
    /// cached warm snapshot invalidated.
    pub fn with_warmup_tag(mut self, tag: impl Into<String>) -> Self {
        self.warmup_tag = Some(tag.into());
        self
    }

    /// For the no-warmup `.build()` path, wait for the workload's
    /// in-guest listener to come up before capturing the snapshot
    /// (the v0.4.22 behavior). Without this, v0.4.23+ defaults to
    /// the `pre-exec` trigger which captures BEFORE the workload
    /// runs — fast bake, but each restore re-execs the workload
    /// fresh (workload's own startup time is paid per acquire).
    ///
    /// **When to call this:**
    ///
    /// - **You're baking a service image** (nginx, redis, postgres,
    ///   anything that binds a listener and stays up) AND you want
    ///   the listener pre-bound at restore-time so first acquire's
    ///   port-traffic works immediately, but you don't want to pay
    ///   the cost of routing through `with_warmup` (which adds a
    ///   warm-snapshot round-trip).
    ///
    /// **When NOT to call this:**
    ///
    /// - **You're using `vm.exec(...)`** for arbitrary commands and
    ///   don't care about the workload's listener. Default (pre-exec
    ///   trigger) is faster and gives you the same agent behavior.
    ///
    /// - **The workload doesn't bind a listener and doesn't exit
    ///   quickly** (e.g. a long-running daemon that never serves a
    ///   port). With `require_listener=true` you'd time out at
    ///   `--snapshot-after-ms` (~7 s default). The pre-exec trigger
    ///   handles this case in ~150 ms regardless.
    ///
    /// No effect when `with_warmup` is also set — the warmup path
    /// always uses listener-ready (or the warmup callback would run
    /// against a not-yet-ready guest).
    pub fn with_listener_required(mut self) -> Self {
        self.require_listener = Some(true);
        self
    }

    /// Run the bake (or reuse a cached snapshot per
    /// `with_pull_policy`) and return the resulting [`Image`].
    pub fn build(self) -> Result<Image, Error> {
        let snapshots_dir = self
            .snapshots_dir
            .unwrap_or_else(default_snapshots_dir);
        let derived_name = self
            .name
            .clone()
            .unwrap_or_else(|| crate::bake::snapshot_name_for_image(&self.image));
        let snap_dir = snapshots_dir.join(&derived_name);

        // Cache fast-path: same as Image::from_oci_to_dir, but on
        // the builder we have to assume the cache might be stale
        // for a different config under the same name. We trust the
        // bake pipeline's input-hash check (`native_bake_key`) to
        // re-bake when the inputs changed; on cache hit it's a
        // no-op and we just load the existing snapshot.
        let cache_loadable = Image::from_snapshot(&snap_dir).is_ok();
        match self.pull_policy {
            PullPolicy::Never => {
                if cache_loadable {
                    return Image::from_snapshot(&snap_dir);
                }
                let restore_snap = snap_dir.join("restore.snap");
                if restore_snap.is_file() {
                    return Err(Error::cache_invalid(format!(
                        "snapshot present at {} but not loadable on this binary; \
                         rebake required (PullPolicy::Never won't auto-rebake)",
                        snap_dir.display()
                    )));
                }
                return Err(Error::cache_miss(format!(
                    "no cached snapshot for {} at {} (PullPolicy::Never)",
                    self.image,
                    snap_dir.display()
                )));
            }
            // Missing+invalid OR Always: fall through to bake. The
            // bake pipeline will short-circuit on input-hash match
            // even on Always policy.
            _ => {}
        }

        // Encode env / cmd into the form `bake::run_push` accepts.
        let mut extra_args: Vec<String> = Vec::new();
        for (k, v) in &self.envs {
            extra_args.push("--env".to_owned());
            extra_args.push(format!("{k}={v}"));
        }
        for (host, guest) in &self.extra_files {
            extra_args.push("--extra-file".to_owned());
            extra_args.push(format!("{}:{}", host.display(), guest));
        }
        for (host, tag) in &self.mounts {
            extra_args.push("--mount".to_owned());
            extra_args.push(format!("{}:{}", host.display(), tag));
        }
        let cmd_override = match &self.cmd {
            Some(argv) => Some(
                serde_json::to_string(argv)
                    .map_err(|e| Error::bake_msg(format!("encode cmd: {e}")))?,
            ),
            None => None,
        };

        let root = repo_root_for_bake()?;
        let request = crate::bake::BakeRequest {
            image: self.image.clone(),
            name: self.name.clone(),
            runtime: "supermachine".to_owned(),
            guest_port: self.guest_port.unwrap_or(80),
            memory_mib: self.memory_mib.unwrap_or(256),
            vcpus: self.vcpus.unwrap_or(1),
            pull_policy: self.pull_policy.as_bake_str().to_owned(),
            snapshots_dir: snapshots_dir.clone(),
            cmd_override,
            extra_args,
        };
        // No warmup → ALWAYS-PIPELINED-SKIP-WARM path.
        //
        // We route plain `.build()` through the pipelined-bake
        // driver with `skip_warm_snapshot=true` AND
        // `keep_alive=true`. The driver:
        //   1. Boots the worker, signals BAKE_READY.
        //   2. Issues SNAPSHOT_ASYNC for the user's snapshot —
        //      capture is fast (~5 ms), the disk write runs in a
        //      background thread on the worker.
        //   3. Skips the warm SNAPSHOT entirely (no second capture).
        //   4. Returns the live worker (BakedWorker) to us before
        //      the bg save necessarily completes. The first
        //      `Pool::acquire()` claims this worker as a pre-warm
        //      idle entry — saves ~50 ms spawn + ~5 ms restore
        //      versus a cold-from-disk worker.
        //
        // Synchronization: `save_compact_to_file` writes to
        // `<path>.partial` and atomic-renames to `<path>`, so file
        // existence ↔ save complete. `Pool::spawn_one` polls for
        // `snapshot_path.is_file()` before invoking the worker
        // (which would otherwise fail-fast on `--restore-from`
        // pointing at a missing file). The first acquire doesn't
        // need the file at all — it uses the warm worker's
        // in-memory state.
        //
        // Returned `Image` uses `from_snapshot_pending` because
        // metadata.json IS on disk by the time we reach here, but
        // restore.snap may still be in flight. The non-pending
        // `Image::from_snapshot` would reject that; the pending
        // variant is identical except for the file-existence check.
        if self.warmup.is_none() {
            let trace = std::env::var_os("SUPERMACHINE_BAKE_TRACE").is_some();
            let bake_t0 = std::time::Instant::now();
            // Empty warm_dir path: skip_warm_snapshot=true
            // suppresses both the warm SNAPSHOT round-trip AND
            // the warm metadata write, so warm_dir is unused.
            // Keep a sibling sentinel for diagnostic clarity in
            // case bake-trace logs reference it.
            // Default trigger: pre-exec (fast, ~150 ms bake on
            // slow-listener images, ~17× speedup on workloads that
            // would otherwise hit the 7-second wall-clock fallback).
            // Caller can opt out via `.with_listener_required()` to
            // get the v0.4.22 listener-ready capture instead — slower
            // bake, but guaranteed listener-up at restore time.
            let use_pre_exec = !self.require_listener.unwrap_or(false);
            let pipelined = crate::bake::PipelinedWarmup {
                warm_dir: snapshots_dir.join(format!("{}__warm__unused", derived_name)),
                warm_tag: "unused".to_owned(),
                keep_alive: true,
                skip_warm_snapshot: true,
                use_pre_exec_trigger: use_pre_exec,
                callback: Box::new(|_ctx| Ok(())),
            };
            match crate::bake::run_push_pipelined(&request, bake_t0, &root, pipelined) {
                Ok(warm_handoff) => {
                    if trace {
                        eprintln!(
                            "[bake-trace] always-pipelined (skip-warm) total: {:?} (bg save \
                             may still be in flight)",
                            bake_t0.elapsed()
                        );
                    }
                    let img = Image::from_snapshot_pending(&snap_dir)?;
                    if let Some(bw) = warm_handoff {
                        *img.warm_baked_worker.inner.lock().unwrap() = Some(bw);
                    }
                    return Ok(img);
                }
                Err(msg) => {
                    return Err(map_bake_error(&request.image, msg));
                }
            }
        }
        let warmup = self.warmup.unwrap();
        // Warmup path: derive a sibling directory keyed by the
        // warmup tag. Same name → cache hit (no warmup re-run);
        // different name → fresh warm bake.
        let tag = self.warmup_tag.as_deref().unwrap_or("default");
        let warm_dir = snapshots_dir.join(format!("{}__warm__{}", derived_name, tag));
        if let Ok(image) = Image::from_snapshot(&warm_dir) {
            return Ok(image);
        }

        // Pipelined bake. The bake worker boots, signals
        // BAKE_READY, captures the base async (background save
        // overlapping with warmup), runs the warmup closure
        // against the still-live guest, captures warm sync, then
        // QUITs. Cuts ~900 ms vs the sequential path on
        // rust:1-slim because the base save and warmup overlap,
        // and we save one boot+restore round-trip.
        let trace = std::env::var_os("SUPERMACHINE_BAKE_TRACE").is_some();
        let bake_t0 = std::time::Instant::now();

        // The user's warmup is `FnOnce(&Vm) -> Result<(), Error>`.
        // Inside the pipelined-bake driver we have only the
        // worker's vsock paths, so we synthesize a minimal `Vm`
        // around them and hand that to the user. The synthetic
        // Vm has `pool: None` and `image_meta: None` — neither
        // matters for warmup workloads (which call `vm.exec`,
        // `vm.write_file`, `vm.read_file`, etc., all of which
        // talk over vsock-exec).
        //
        // We capture any error in a Mutex so the bake driver can
        // surface it. The Box<FnOnce> escape-hatch dodges the
        // type system limitation that we can't move a non-Send
        // FnOnce through a closure without a wrapper.
        let warmup_err: std::sync::Arc<std::sync::Mutex<Option<Error>>> =
            std::sync::Arc::new(std::sync::Mutex::new(None));
        let warmup_err_inner = warmup_err.clone();
        let warmup_t0_capture: std::sync::Arc<std::sync::Mutex<Option<std::time::Instant>>> =
            std::sync::Arc::new(std::sync::Mutex::new(None));
        let warmup_t0_inner = warmup_t0_capture.clone();
        let pipelined = crate::bake::PipelinedWarmup {
            warm_dir: warm_dir.clone(),
            warm_tag: tag.to_owned(),
            // Opt in to warm-handoff: bake driver returns the live
            // worker; first Pool::acquire() claims it. Saves the
            // ~50 ms spawn + ~5 ms restore on the first cycle for
            // every with_warmup user. Falls back transparently if
            // the worker is dropped without anyone claiming it.
            keep_alive: true,
            // Take the warm SNAPSHOT — this is the with-warmup path,
            // user expects a separate warm artifact at warm_dir.
            skip_warm_snapshot: false,
            // Ignored when skip_warm_snapshot=false; warmup pipeline
            // always uses listener-ready (or the warmup callback
            // would run against a not-ready guest).
            use_pre_exec_trigger: false,
            callback: Box::new(move |ctx| {
                if let Ok(mut g) = warmup_t0_inner.lock() {
                    *g = Some(std::time::Instant::now());
                }
                let synth_vm = Vm {
                    pool: None,
                    vsock_mux_path: ctx.vsock_mux_path.clone(),
                    vsock_exec_path: ctx.vsock_exec_path.clone(),
                    own_vsock_mux_dir: None,
                    skip_cleanup: true,
                    image_meta: None,
                };
                let result = warmup(&synth_vm);
                // Drop the synth Vm without running cleanup
                // (skip_cleanup=true already; explicit drop for
                // clarity).
                drop(synth_vm);
                match result {
                    Ok(()) => Ok(()),
                    Err(e) => {
                        let msg = e.to_string();
                        if let Ok(mut g) = warmup_err_inner.lock() {
                            *g = Some(e);
                        }
                        Err(msg)
                    }
                }
            }),
        };

        match crate::bake::run_push_pipelined(&request, bake_t0, &root, pipelined) {
            Ok(warm_handoff) => {
                if trace {
                    eprintln!(
                        "[bake-trace] pipelined bake total: {:?}",
                        bake_t0.elapsed()
                    );
                    if let Some(t0) = warmup_t0_capture.lock().ok().and_then(|g| *g) {
                        eprintln!("[bake-trace] (warmup ran at +{:?})", t0 - bake_t0);
                    }
                }
                let img = Image::from_snapshot(&warm_dir)?;
                // Stash the bake-time warm worker so the first
                // Pool::acquire() can claim it instead of spawning
                // fresh + restoring from disk. See
                // `Image::warm_baked_worker` for the full lifecycle
                // contract (claim-or-drop, race semantics, fall-
                // through behavior on worker death).
                if let Some(bw) = warm_handoff {
                    *img.warm_baked_worker.inner.lock().unwrap() = Some(bw);
                }
                Ok(img)
            }
            Err(msg) => {
                // If the warmup callback was the failure source,
                // bubble the typed Error back instead of the
                // stringified bake message.
                if let Some(e) = warmup_err.lock().ok().and_then(|mut g| g.take()) {
                    return Err(e);
                }
                Err(map_bake_error(&request.image, msg))
            }
        }
    }
}

/// Default snapshots directory: `~/.local/supermachine-snapshots`,
/// matching the CLI's default. Customizable via
/// [`Image::from_oci_to_dir`] or `$SUPERMACHINE_SNAPSHOTS`.
/// Warn (once per process, per snapshot dir) if a snapshot was baked
/// under a different supermachine version than the current binary. The
/// bake driver writes its kernel path as `…/supermachine/v<VERSION>/kernel`;
/// if that <VERSION> doesn't equal the current `CARGO_PKG_VERSION`, the
/// snapshot's pinned init shim and kernel are also from the old build
/// and likely missing recent fixes (loopback bring-up, cgroup2 mount,
/// X11/dbus paths, …). The warning is one stderr line, easy to grep,
/// and silent for fresh-baked snapshots so cron-restarted services
/// don't print on every restore.
fn warn_if_snapshot_version_mismatch(
    meta: &serde_json::Value,
    snapshot_path: &Path,
) {
    // Prefer the immutable `baked_by_version` field — set at fresh
    // bake, never rewritten by reuse paths. This is the source of
    // truth for "which binary baked this snapshot".
    //
    // Fall back to the `kernel` path string for legacy snapshots
    // (baked before this field landed). The kernel path's `v0.4.X`
    // segment is unreliable for fresh-bakes that wrote the current
    // path, but it's a useful signal when the metadata file is
    // genuinely older.
    let current = env!("CARGO_PKG_VERSION");
    let baked_version = meta
        .get("baked_by_version")
        .and_then(|v| v.as_str())
        .map(|s| s.to_owned())
        .or_else(|| {
            meta.get("kernel")
                .and_then(|v| v.as_str())
                .and_then(parse_version_segment)
        });
    if let Some(baked) = baked_version {
        if baked == current {
            return;
        }
        // Single-line, plain stderr; embedders can filter via stderr
        // pipe if they want. We keep both versions and the snapshot
        // path so the fix is obvious from a single line of log.
        eprintln!(
            "supermachine: warning: snapshot at {} was baked under \
             v{baked}; current binaries are v{current}. Re-bake \
             (delete the snapshot dir and rerun `supermachine run`) \
             to pick up init/kernel fixes shipped since v{baked}.",
            snapshot_path.display()
        );
    }
}

/// Extract the `v0.4.27` slug from a path like
/// `/…/supermachine/v0.4.27/kernel` → `Some("0.4.27")`. Returns `None`
/// for paths that don't follow this layout (custom kernel paths,
/// SUPERMACHINE_KERNEL_PATH overrides, etc.) — in that case we can't
/// reason about version skew so we don't warn.
///
/// A path component qualifies when it is `v` followed by at least
/// three dot-separated segments, each a non-empty run of ASCII
/// digits. Components such as `v..` or `v1.2.` are therefore not
/// versions. The first qualifying component wins.
fn parse_version_segment(kernel_path: &str) -> Option<String> {
    Path::new(kernel_path)
        .components()
        .filter_map(|comp| comp.as_os_str().to_str())
        .filter_map(|name| name.strip_prefix('v'))
        .find(|rest| {
            // At least two dots, i.e. at least three segments.
            rest.matches('.').count() >= 2
                && rest.split('.').all(|seg| {
                    // `all` on an empty segment is vacuously true,
                    // so emptiness has to be rejected explicitly.
                    !seg.is_empty() && seg.chars().all(|c| c.is_ascii_digit())
                })
        })
        .map(str::to_owned)
}

/// Directory where baked snapshots live by default.
///
/// `$SUPERMACHINE_SNAPSHOTS` wins when set; otherwise the result is
/// `$HOME/.local/supermachine-snapshots`, with `.` standing in for
/// `$HOME` when that variable is unset.
fn default_snapshots_dir() -> PathBuf {
    match std::env::var_os("SUPERMACHINE_SNAPSHOTS") {
        Some(explicit) => PathBuf::from(explicit),
        None => {
            let base = std::env::var_os("HOME")
                .map_or_else(|| PathBuf::from("."), PathBuf::from);
            base.join(".local/supermachine-snapshots")
        }
    }
}

/// `bake::run_push` wants a "repo root" so it can locate the
/// supermachine-worker binary, the kernel image, the entitlements
/// plist, etc. The CLI walks up from its own exe to find it. From
/// a library context the same auto-discovery applies (an embedder
/// running their app from the dev tree finds the workspace; a
/// release-tarball install finds `<prefix>/share/supermachine`).
fn repo_root_for_bake() -> Result<PathBuf, Error> {
    if let Some(root) = std::env::var_os("SUPERMACHINE_ROOT") {
        return Ok(PathBuf::from(root));
    }
    let exe = std::env::current_exe()
        .map_err(|e| Error::bake_msg(format!("current_exe: {e}")))?;
    for ancestor in exe.ancestors() {
        if ancestor.join("tools/supermachine-push").is_file() {
            return Ok(ancestor.to_path_buf());
        }
        if ancestor.join("share/supermachine/kernel").is_file() {
            return Ok(ancestor.to_path_buf());
        }
    }
    std::env::current_dir().map_err(|e| Error::bake_msg(format!("current_dir: {e}")))
}

/// Classify a flat `bake::run_push` error string into the matching
/// [`Error`] variant by keyword, case-insensitively.
///
/// Checks run in this order and the first hit wins:
/// 1. image not found (HTTP 404, "manifest unknown", "name unknown",
///    or "not found" alongside registry/manifest/image),
/// 2. registry auth (HTTP 401/403, "unauthorized", "forbidden",
///    "auth challenge"),
/// 3. registry unreachable (DNS / connect / TLS / curl exit codes),
/// 4. generic network (mentions registry, manifest or "docker pull"),
/// 5. everything else is a bake error.
fn map_bake_error(image: &str, msg: String) -> Error {
    /// `true` when `hay` contains at least one of `needles`.
    fn contains_any(hay: &str, needles: &[&str]) -> bool {
        needles.iter().any(|needle| hay.contains(*needle))
    }

    // Transport-level failure markers, including the curl exit codes
    // for: (6) couldn't resolve host, (7) failed to connect,
    // (28) operation timeout, (35) SSL connect error, (56) recv failure.
    const UNREACHABLE_MARKERS: &[&str] = &[
        "could not resolve",
        "dns",
        "connection refused",
        "connection reset",
        "network is unreachable",
        "ssl_connect",
        "tls handshake",
        "curl: (6)",
        "curl: (7)",
        "curl: (28)",
        "curl: (35)",
        "curl: (56)",
    ];

    let lowered = msg.to_ascii_lowercase();
    // The bake emits shapes such as "... failed with HTTP 404".
    let http_status = |code: u16| lowered.contains(&format!("http {code}"));

    let is_not_found = http_status(404)
        || contains_any(&lowered, &["manifest unknown", "name unknown"])
        || (lowered.contains("not found")
            && contains_any(&lowered, &["registry", "manifest", "image"]));
    if is_not_found {
        return Error::image_not_found(image, msg);
    }

    let is_auth = http_status(401)
        || http_status(403)
        || contains_any(&lowered, &["unauthorized", "forbidden", "auth challenge"]);
    if is_auth {
        return Error::registry_auth(image, msg);
    }

    if contains_any(&lowered, UNREACHABLE_MARKERS) {
        return Error::registry_unreachable(msg);
    }

    // Registry-flavoured text without a recognised status falls back
    // to the catch-all network variant; snapshot timeouts,
    // listener-readiness failures and the rest are bake errors.
    if contains_any(&lowered, &["registry", "manifest", "docker pull"]) {
        Error::network_msg(msg)
    } else {
        Error::bake_msg(msg)
    }
}

/// Configuration for [`Vm::start`]. Built via the chainable
/// `VmConfig::with_*` methods or constructed directly:
///
/// ```
/// use supermachine::VmConfig;
/// let cfg = VmConfig::new()
///     .with_memory_mib(512)
///     .with_vcpus(2);
/// # let _ = cfg;
/// ```
///
/// Every field is optional; `None` means "use the default that
/// [`Vm::start`] picks for this setting".
#[derive(Debug, Clone, Default)]
pub struct VmConfig {
    /// Override the image's baked memory. `None` = use Image's value.
    memory_mib: Option<u32>,
    /// Override the image's baked vCPUs. `None` = use Image's value.
    vcpus: Option<u32>,
    /// Explicit asset paths. `None` = `AssetPaths::discover()`.
    assets: Option<AssetPaths>,
    /// Directory for the host-side vsock-mux unix socket.
    /// `None` = `std::env::temp_dir()`.
    vsock_mux_dir: Option<PathBuf>,
    /// How long to wait for snapshot restore. `None` = 10 seconds.
    restore_timeout: Option<Duration>,
}

impl VmConfig {
    /// An all-defaults configuration: memory and vCPUs come from the
    /// image, assets are auto-discovered, the vsock-mux socket goes
    /// in `$TMPDIR`, and restore waits up to 10 s.
    pub fn new() -> Self {
        Default::default()
    }

    /// Use `mib` MiB of guest memory instead of the image's baked value.
    pub fn with_memory_mib(self, mib: u32) -> Self {
        Self {
            memory_mib: Some(mib),
            ..self
        }
    }

    /// Use `vcpus` vCPUs instead of the image's baked count.
    pub fn with_vcpus(self, vcpus: u32) -> Self {
        Self {
            vcpus: Some(vcpus),
            ..self
        }
    }

    /// Supply asset paths explicitly instead of auto-discovering
    /// them. Handy for `.app` bundles that ship the kernel + init
    /// shim under `Contents/Resources/`.
    pub fn with_assets(self, assets: AssetPaths) -> Self {
        Self {
            assets: Some(assets),
            ..self
        }
    }

    /// Directory for the host-side vsock-mux unix socket (default:
    /// `$TMPDIR`). Set this when the socket must live inside an
    /// app-private dir for sandboxing reasons.
    pub fn with_vsock_mux_dir(self, dir: impl Into<PathBuf>) -> Self {
        Self {
            vsock_mux_dir: Some(dir.into()),
            ..self
        }
    }

    /// Maximum time to wait for the snapshot restore (default 10 s).
    pub fn with_restore_timeout(self, timeout: Duration) -> Self {
        Self {
            restore_timeout: Some(timeout),
            ..self
        }
    }
}

/// A running microVM. Holds an internal worker process and the
/// host-side vsock-mux unix socket through which you talk to the
/// guest.
///
/// Drop the value to stop the VM, or call [`Vm::stop`] for an
/// explicit shutdown report.
pub struct Vm {
    /// The pool driving this VM. Wrapped in `Option` so that
    /// [`Vm::stop`], [`Vm::snapshot`] and `Drop` can `take()` it and
    /// shut it down exactly once.
    pool: Option<WarmPool>,
    /// Host-side vsock-mux unix socket; returned by
    /// [`Vm::vsock_path`] and dialed by [`Vm::connect`].
    vsock_mux_path: PathBuf,
    /// `<vsock_mux>-exec` (the mux socket's file name with `-exec`
    /// appended) for the in-guest exec agent. Only
    /// useful once the agent crate ships in the initramfs (see
    /// `docs/design/exec-2026-05-03.md`); until then dialing this
    /// path will fail with "no listener" because the agent isn't
    /// running guest-side. The unix socket itself is created
    /// unconditionally so `Vm::exec` can wire to it once the agent
    /// lands.
    vsock_exec_path: PathBuf,
    /// Best-effort cleanup of the temp socket dir we created.
    /// `Some` only when [`Vm::start`] had to create the directory.
    own_vsock_mux_dir: Option<PathBuf>,
    /// `true` for [`PooledVm`]'s wrapped Vm — Drop must NOT
    /// shut down the pool or unlink sockets, since those are
    /// owned by [`Image`]'s [`HiddenPool`] and reused across
    /// [`Image::acquire`] calls.
    skip_cleanup: bool,
    /// Source image metadata for [`Vm::snapshot`] — needed to
    /// emit a `metadata.json` describing the new snapshot's
    /// layers / memory / vCPUs (the snapshot file alone isn't
    /// loadable as an Image without these).  `None` for VMs
    /// that didn't come from an Image (currently impossible via
    /// the public API; reserved for future use cases).
    image_meta: Option<Arc<ImageMeta>>,
}

/// Subset of [`Image`] fields snapshotted into [`Vm`] so
/// [`Vm::snapshot`] can write a self-contained `metadata.json`
/// next to the captured snapshot file.
#[derive(Clone, Debug)]
pub(crate) struct ImageMeta {
    /// Guest memory in MiB the VM was actually started with (after
    /// any `VmConfig` override).
    pub memory_mib: u32,
    /// vCPU count the VM was actually started with (after any
    /// `VmConfig` override).
    pub vcpus: u32,
    /// Layer image paths, in the order they were attached as block
    /// devices.
    pub layers: Vec<PathBuf>,
    /// Optional delta squashfs, attached after all `layers`.
    pub delta_squashfs: Option<PathBuf>,
}

impl Vm {
    /// Start a microVM from `image` with the supplied configuration.
    ///
    /// What this does, in order:
    ///
    /// 1. Resolves the kernel path. Preferences (first hit wins):
    ///    `image`'s bundled kernel (if the snapshot dir shipped one),
    ///    `config.assets.kernel` (if set explicitly), then
    ///    [`AssetPaths::discover`]. Fails with [`Error::Assets`] if
    ///    none is found.
    /// 2. Creates a unique unix socket path for vsock-mux under
    ///    the configured directory.
    /// 3. Spawns an in-process VM thread that restores from
    ///    `image.snapshot_path()`. (The library runs the VM
    ///    in-process via [`crate::internal::vmm::pool::WarmPool`].
    ///    The standalone `supermachine-worker` binary is only used
    ///    by the router daemon for SCM_RIGHTS process isolation.)
    /// 4. Waits up to [`VmConfig::with_restore_timeout`] for the
    ///    restore to complete.
    /// 5. Returns the [`Vm`] handle. The vsock-mux socket is
    ///    available immediately at [`Vm::vsock_path`].
    pub fn start(image: &Image, config: &VmConfig) -> Result<Vm, Error> {
        // Vm::start runs the VM thread in this process, so this
        // process itself calls hv_vm_create. Without the HVF
        // entitlement that fails with HV_DENIED (Hv(-85377017)),
        // which is cryptic. Surface a clear error up front instead.
        // Image::acquire callers don't pay this cost — the worker
        // subprocess handles HVF for them.
        #[cfg(target_os = "macos")]
        if let Err(msg) = crate::codesign::check_self_has_hvf_entitlement() {
            return Err(Error::vm_msg(msg));
        }
        // Explicit assets from the config win; otherwise auto-discover.
        let assets = match &config.assets {
            Some(a) => a.clone(),
            None => AssetPaths::discover(),
        };
        // Kernel preference: bundled (snapshot dir) > config.assets >
        // AssetPaths::discover. A bundled kernel makes the snapshot
        // self-contained so a `.app` ships everything it needs.
        let kernel: PathBuf = if let Some(k) = image.bundled_kernel.as_ref() {
            k.clone()
        } else if let Some(k) = assets.kernel.as_ref() {
            k.clone()
        } else {
            return Err(Error::assets_msg(
                "no kernel found: snapshot dir has no bundled kernel and AssetPaths::discover() came up empty; set VmConfig::with_assets() or $SUPERMACHINE_ASSETS_DIR".to_owned(),
            ));
        };
        let kernel = kernel.as_path();

        // Per-VM unix socket path under the chosen dir.
        let dir = match &config.vsock_mux_dir {
            Some(d) => d.clone(),
            None => std::env::temp_dir(),
        };
        // Remember the directory only if we are the ones creating it,
        // so cleanup never removes a directory that already existed.
        let mut own_dir = None;
        if !dir.is_dir() {
            std::fs::create_dir_all(&dir).map_err(Error::Io)?;
            own_dir = Some(dir.clone());
        }
        // Process id + unique suffix keep concurrent VMs in the same
        // directory from colliding on the socket name.
        let vsock_mux_path = dir.join(format!(
            "supermachine-vm-{}-{}.sock",
            std::process::id(),
            unique_suffix(),
        ));
        // `<vsock_mux>-exec` is the convention that worker.rs and
        // the design doc agree on. Same parent dir so unlinking the
        // mux on shutdown sweeps it too.
        let vsock_exec_path = {
            let mut p = vsock_mux_path.clone();
            // The unwrap cannot fire: the path was just built with
            // `dir.join(<file name>)`, so it always has a file name.
            let mut name = p.file_name().unwrap().to_owned();
            name.push("-exec");
            p.set_file_name(name);
            p
        };

        // Build VmResources for snapshot restore. Memory + vCPUs
        // come from the image's bake metadata unless the caller
        // explicitly overrode them.
        let memory_mib = config.memory_mib.unwrap_or(image.memory_mib);
        let vcpus = config.vcpus.unwrap_or(image.vcpus);
        let mut resources = VmResources::new()
            .with_kernel_path(kernel.to_string_lossy().to_string())
            .with_memory_mib(memory_mib as usize)
            .with_vcpus(vcpus)
            .with_cow_restore(true)
            .with_restore(image.snapshot_path.to_string_lossy().to_string())
            .with_vsock_mux(vsock_mux_path.to_string_lossy().to_string())
            .with_vsock_exec(vsock_exec_path.to_string_lossy().to_string());

        // Attach the OCI image's virtio-blk layers in bake order.
        // The guest's overlayfs union is built bottom-up over these.
        for layer in &image.layers {
            resources = resources.with_block_device(layer.to_string_lossy().to_string());
        }
        // The delta squashfs, when present, is attached after every layer.
        if let Some(delta) = &image.delta_squashfs {
            resources = resources.with_block_device(delta.to_string_lossy().to_string());
        }

        // Pool of size 1 — single worker, single VM.
        let options = RunOptions::default();
        let pool = WarmPool::start(resources, options).map_err(Error::from)?;

        // Restore from the snapshot. WarmPool's restore_timeout
        // dispatches the RESTORE command to the pre-spawned worker
        // and blocks until the guest is up.
        let timeout = config
            .restore_timeout
            .unwrap_or_else(|| Duration::from_secs(10));
        // Only success/failure matters here; the Ok payload is discarded.
        let _ = pool
            .restore_timeout(image.snapshot_path.to_string_lossy().to_string(), timeout)
            .map_err(Error::from)?;

        Ok(Vm {
            pool: Some(pool),
            vsock_mux_path,
            vsock_exec_path,
            own_vsock_mux_dir: own_dir,
            skip_cleanup: false,
            // Record the effective memory/vCPU values (after overrides)
            // so a later `Vm::snapshot` describes what actually ran.
            image_meta: Some(Arc::new(ImageMeta {
                memory_mib,
                vcpus,
                layers: image.layers.clone(),
                delta_squashfs: image.delta_squashfs.clone(),
            })),
        })
    }

    /// Path to the host-side unix socket that proxies bytes to /
    /// from the first TSI listener inside the guest. Connect to it
    /// with [`UnixStream::connect`] (or via [`Vm::connect`]).
    ///
    /// The returned reference borrows from this `Vm`; clone it into a
    /// `PathBuf` if it needs to outlive the handle.
    pub fn vsock_path(&self) -> &Path {
        &self.vsock_mux_path
    }

    /// Path to the host-side unix socket that bridges to the
    /// in-guest exec agent (native AF_VSOCK on the guest side).
    /// Reachable once the agent lands in the initramfs and is
    /// running guest-side; until then dialing it returns an
    /// immediate EOF.
    ///
    /// This is the socket that [`Vm::exec_builder`],
    /// [`Vm::write_file`], [`Vm::read_file`] and
    /// [`Vm::workload_signal`] talk to.
    pub fn exec_path(&self) -> &Path {
        &self.vsock_exec_path
    }

    /// Spawn a process inside the running guest. Equivalent to
    /// `docker exec`. Returns an [`crate::exec::ExecChild`] handle
    /// you can read stdout/stderr from, write stdin to, and
    /// `wait()` for an exit status.
    ///
    /// ```no_run
    /// # use std::io::Read;
    /// # use supermachine::{Image, Vm, VmConfig};
    /// let image = Image::from_snapshot("path/to/snapshot")?;
    /// let vm = Vm::start(&image, &VmConfig::new())?;
    /// let mut child = vm.exec(["sh", "-c", "echo hi"])?;
    /// let mut buf = String::new();
    /// child.stdout().unwrap().read_to_string(&mut buf)?;
    /// assert_eq!(buf, "hi\n");
    /// child.wait()?;
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn exec<I, S>(&self, argv: I) -> std::io::Result<crate::exec::ExecChild>
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        // Materialise the command line once so it can feed both the
        // tracing span and the builder.
        let command: Vec<String> = argv.into_iter().map(Into::into).collect();
        let program = command.first().map_or("", String::as_str);
        // The span stays entered until this function returns.
        let _span = tracing::info_span!(
            "supermachine.exec",
            argv0 = program,
            argc = command.len(),
        )
        .entered();
        self.exec_builder().argv(command).spawn()
    }

    /// Configurable exec — TTY, env vars, cwd, initial winsize,
    /// timeout, and the [`crate::exec::ExecBuilder::output`]
    /// convenience that drains stdio + collects exit status into
    /// one [`crate::exec::ExecOutcome`].
    ///
    /// The builder is handed its own copy of the exec-agent socket
    /// path ([`Vm::exec_path`]).
    pub fn exec_builder(&self) -> crate::exec::ExecBuilder {
        crate::exec::ExecBuilder::new(self.vsock_exec_path.clone())
    }

    /// Write `bytes` to `path` inside the guest, atomically.
    /// Native vsock RPC — no exec, no shell. Roughly ~100 µs per
    /// call regardless of file size (up to the 12 MiB raw limit
    /// imposed by the agent's frame cap).
    ///
    /// The guest agent stages to a sibling tmp file then renames
    /// for atomicity, so partial writes don't leave a half-baked
    /// file at `path`.
    ///
    /// ```no_run
    /// # use supermachine::{Image, VmConfig};
    /// let image = Image::from_snapshot("path/to/snapshot")?;
    /// let vm = image.start(&VmConfig::new())?;
    /// vm.write_file("/tmp/main.rs", b"fn main() { println!(\"hi\"); }")?;
    /// let out = vm.exec_builder()
    ///     .argv(["rustc", "/tmp/main.rs", "-o", "/tmp/main"])
    ///     .output()?;
    /// assert!(out.success());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_file(&self, path: &str, bytes: &[u8]) -> std::io::Result<()> {
        let body = serde_json::json!({
            "action": "write_file",
            "path": path,
            "data_b64": b64_encode(bytes),
        });
        crate::exec::send_control(&self.vsock_exec_path, &body)
    }

    /// Read `path` from inside the guest. Symmetric with
    /// [`Vm::write_file`]; native vsock RPC, ~100 µs per call.
    /// Cap is 4 MiB by default (raises an error if larger);
    /// stream large files via [`Vm::exec`] instead.
    pub fn read_file(&self, path: &str) -> std::io::Result<Vec<u8>> {
        let request = serde_json::json!({
            "action": "read_file",
            "path": path,
        });
        // 30 s is deliberately generous: in-VM file IO is fast, but
        // cold-cache reads of large files should still succeed.
        let wait = Some(std::time::Duration::from_secs(30));
        let reply = crate::exec::send_control_with_ack(&self.vsock_exec_path, &request, wait)?;

        // The file contents come back base64-encoded in `data_b64`;
        // a missing field or undecodable payload is `InvalidData`.
        match reply.get("data_b64").and_then(|v| v.as_str()) {
            Some(encoded) => b64_decode(encoded)
                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
            None => Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                "read_file: agent ack missing data_b64",
            )),
        }
    }

    /// Send a Unix signal to the guest's main workload process.
    /// Use this for `docker stop`-style graceful shutdown:
    ///
    /// ```no_run
    /// # use supermachine::{Image, Vm, VmConfig};
    /// # let image = Image::from_snapshot("path")?;
    /// # let vm = Vm::start(&image, &VmConfig::new())?;
    /// vm.workload_signal(libc::SIGTERM)?;
    /// // ...wait for the workload to clean up...
    /// vm.stop()?;
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// Implementation: dials the in-guest exec agent on a fresh
    /// connection with a CONTROL frame; the agent reads
    /// `/run/supermachine-workload.pid` (written by init-oci's
    /// PID-1 supervisor) and `kill(pid, signum)` it. Returns
    /// `Err(NotFound)` if the workload hasn't been spawned yet
    /// (only happens during the bake-time window).
    pub fn workload_signal(&self, signum: i32) -> std::io::Result<()> {
        // One-shot control message on the exec-agent socket.
        let request = serde_json::json!({
            "action": "signal",
            "signum": signum,
        });
        let agent_socket = &self.vsock_exec_path;
        crate::exec::send_control(agent_socket, &request)
    }

    /// Connect to the guest's first TSI listener. The returned
    /// `UnixStream` is byte-equivalent to a `TcpStream` to the
    /// guest's `:80` (or whatever port it bound).
    ///
    /// For HTTP, just write a request and read the response:
    /// supermachine's vsock-mux is a transparent proxy.
    ///
    /// Each call opens a new connection to [`Vm::vsock_path`]; any
    /// error from `UnixStream::connect` is returned unchanged.
    pub fn connect(&self) -> std::io::Result<UnixStream> {
        UnixStream::connect(&self.vsock_mux_path)
    }

    /// Bind a TCP listener on `127.0.0.1:host_port` that forwards
    /// each accepted connection to the guest's TSI listener (the
    /// same destination as [`Vm::connect`]). Returns a
    /// [`TcpForwarder`] that owns the accept-loop thread; drop it
    /// (or call [`TcpForwarder::stop`]) to stop accepting new
    /// connections. In-flight connections continue until they close
    /// naturally.
    ///
    /// `host_port = 0` lets the OS pick a free port; read the actual
    /// address back via [`TcpForwarder::local_addr`].
    ///
    /// `guest_port` is currently informational — supermachine's
    /// vsock-mux exposes the first TSI listener regardless. The
    /// parameter is in the signature so future versions can route
    /// to a specific guest port without breaking callers.
    ///
    /// Use this when you want the embedded VM to look like a normal
    /// localhost service (e.g. `http://127.0.0.1:9090/`) rather than
    /// having every caller go through `vm.connect()`.
    ///
    /// ```no_run
    /// # use supermachine::{Image, Vm, VmConfig};
    /// let image = Image::from_snapshot("path/to/snapshot")?;
    /// let vm = Vm::start(&image, &VmConfig::new())?;
    /// let fwd = vm.expose_tcp(9090, 80)?;
    /// println!("nginx is on {}", fwd.local_addr());
    /// // ... do work ...
    /// drop(fwd); // stop forwarding
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    pub fn expose_tcp(&self, host_port: u16, _guest_port: u16) -> std::io::Result<TcpForwarder> {
        let listener = TcpListener::bind(("127.0.0.1", host_port))?;
        let bound = listener.local_addr()?;
        // The accept loop relies on a *blocking* accept (no timeout,
        // no polling). Shutdown responsiveness comes from the `stop`
        // flag plus a self-connect that wakes the blocked accept —
        // see `TcpForwarder::shutdown`.
        listener.set_nonblocking(false)?;
        let stop = Arc::new(AtomicBool::new(false));
        let stop_thread = Arc::clone(&stop);
        let vsock_path = self.vsock_mux_path.clone();
        // Name the thread after the port actually bound, so a
        // `host_port = 0` request doesn't yield "supermachine-tcp-0".
        // `Builder::spawn` already returns `io::Result`, so the error
        // is propagated as-is, keeping its original `ErrorKind`.
        let handle = std::thread::Builder::new()
            .name(format!("supermachine-tcp-{}", bound.port()))
            .spawn(move || {
                accept_loop(listener, vsock_path, stop_thread);
            })?;
        Ok(TcpForwarder {
            stop,
            handle: Some(handle),
            bound,
        })
    }

    /// Stop the VM. Equivalent to dropping it, but returns errors
    /// rather than swallowing them.
    pub fn stop(mut self) -> Result<(), Error> {
        if let Some(pool) = self.pool.take() {
            let _ = pool.shutdown().map_err(Error::from)?;
        }
        self.cleanup_socket();
        Ok(())
    }

    /// Capture a snapshot of the running VM into `dest_dir`. The
    /// dir gets `restore.snap` (the captured VM state) and a
    /// `metadata.json` describing the layers/memory/vCPUs from
    /// the source [`Image`] — together they form a fresh
    /// snapshot loadable via [`Image::from_snapshot`].
    ///
    /// This is the **"rustc-warm snapshot" pattern** — boot a VM
    /// from a base image (e.g. `rust:1-slim`), populate
    /// expensive in-VM state (run `cargo build` to fill
    /// `target/` with cached deps), capture, then re-use the new
    /// snapshot via `Image::from_snapshot(...).acquire()` for
    /// fast subsequent iterations.
    ///
    /// ```no_run
    /// # use supermachine::{Image, VmConfig};
    /// let base = Image::from_snapshot("path/to/rust-slim")?;
    /// let vm = base.start(&VmConfig::new())?;
    /// // Pre-warm: populate target/ with cached deps.
    /// vm.exec_builder()
    ///     .argv(["sh", "-c", "cd /src && cargo build --release"])
    ///     .output()?;
    /// // Capture; vm is consumed (and stopped).
    /// let warm = vm.snapshot("/tmp/rust-warm")?;
    /// // Now `warm.acquire()` gets you a VM with target/
    /// // already populated — every subsequent compile re-uses
    /// // the cached deps.
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    ///
    /// **Only works on a Vm produced by [`Image::start`]**.
    /// Pooled VMs (from [`Image::acquire`]) live in worker
    /// subprocesses and don't have host-side access to the
    /// snapshot machinery; snapshot them by re-starting the
    /// source image and snapshotting that.
    pub fn snapshot(mut self, dest_dir: impl Into<PathBuf>) -> Result<Image, Error> {
        let dest_dir = dest_dir.into();
        // Without the source image's metadata we cannot describe the
        // new snapshot, so bail out before touching the filesystem.
        let meta = self.image_meta.clone().ok_or_else(|| {
            Error::vm_msg(
                "Vm::snapshot requires an in-process Vm (use image.start, not image.acquire)"
                    .to_owned(),
            )
        })?;
        let pool = self.pool.as_ref().ok_or_else(|| {
            Error::vm_msg("Vm::snapshot: no pool to drive the capture".to_owned())
        })?;
        std::fs::create_dir_all(&dest_dir).map_err(Error::Io)?;
        let snap_path = dest_dir.join("restore.snap");
        // Trigger the capture via the pool RPC. Generous
        // timeout — capture is fast (~10s of ms) but disk
        // saving for big VMs can take a moment.
        let _result = pool
            .snapshot_timeout(
                snap_path.to_string_lossy().to_string(),
                Duration::from_secs(60),
            )
            .map_err(|e| Error::Vm {
                msg: format!("snapshot capture failed: {e:?}"),
                source: None,
            })?;
        // Write a metadata.json that mirrors what the bake step
        // emits, so Image::from_snapshot can load this dir.
        let metadata = serde_json::json!({
            "memory_mib": meta.memory_mib,
            "vcpus": meta.vcpus,
            "layers": meta
                .layers
                .iter()
                .map(|p| p.to_string_lossy().to_string())
                .collect::<Vec<_>>(),
            "delta_squashfs": meta
                .delta_squashfs
                .as_ref()
                .map(|p| p.to_string_lossy().to_string()),
            "snapshot_base": snap_path.to_string_lossy().to_string(),
            "baked_at": chrono_rfc3339_now(),
            "source": "Vm::snapshot",
        });
        std::fs::write(
            dest_dir.join("metadata.json"),
            serde_json::to_string_pretty(&metadata)
                .map_err(|e| Error::vm_msg(format!("metadata serialize: {e}")))?,
        )
        .map_err(Error::Io)?;
        // Cleanly shut down our worker — the snapshot is on disk.
        // Shutdown errors are ignored here: the capture already
        // succeeded, so they must not fail the call.
        if let Some(pool) = self.pool.take() {
            let _ = pool.shutdown();
        }
        self.cleanup_socket();
        // Suppress the Drop, we already cleaned up.
        // (On the early-return paths above the flag is still false,
        // so Drop performs the shutdown and socket cleanup instead.)
        self.skip_cleanup = true;
        // Load the freshly-written dir as a new Image.
        Image::from_snapshot(&dest_dir)
    }

    /// Best-effort removal of both unix sockets and, if this `Vm`
    /// created it, the socket directory. Every error is ignored, so
    /// calling this more than once is harmless.
    fn cleanup_socket(&self) {
        for socket in [&self.vsock_mux_path, &self.vsock_exec_path] {
            let _ = std::fs::remove_file(socket);
        }
        // `remove_dir` only succeeds on an empty directory, so a dir
        // that still holds other files is left alone.
        if let Some(dir) = self.own_vsock_mux_dir.as_ref() {
            let _ = std::fs::remove_dir(dir);
        }
    }
}

// ---------- minimal base64 (RFC 4648, mirror of agent's) ----------
//
// Inlined to keep deps minimal. Round-trip-tested against the
// agent's implementation; identical alphabet + padding rules.

const B64_ALPHA: &[u8; 64] =
    b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/// Encode `bytes` as standard padded base64 (RFC 4648 alphabet,
/// `=` padding, no line breaks).
///
/// Public so the napi binding can build the agent's `write_file`
/// action JSON without re-implementing the encoder. The shape is
/// internal — embedders outside this workspace should stick to
/// [`crate::Vm::write_file`] / [`crate::Vm::read_file`], which call
/// this for you.
pub fn b64_encode(bytes: &[u8]) -> String {
    let mut encoded = String::with_capacity((bytes.len() + 2) / 3 * 4);
    for group in bytes.chunks(3) {
        // Pack up to three bytes into the top 24 bits of `word`;
        // bytes missing from a short final group stay zero.
        let mut word: u32 = 0;
        for (k, &byte) in group.iter().enumerate() {
            word |= (byte as u32) << (16 - 8 * k);
        }
        let symbol = |shift: u32| B64_ALPHA[((word >> shift) & 0x3f) as usize] as char;

        // The first two symbols always carry data; the last two are
        // padding when the group holds fewer than 2 / 3 bytes.
        encoded.push(symbol(18));
        encoded.push(symbol(12));
        encoded.push(if group.len() > 1 { symbol(6) } else { '=' });
        encoded.push(if group.len() > 2 { symbol(0) } else { '=' });
    }
    encoded
}

/// Public counterpart to [`b64_encode`]. Same shape, same caveat.
///
/// Decodes standard padded base64 (RFC 4648 alphabet). ASCII
/// whitespace anywhere in the input is ignored.
///
/// # Errors
///
/// Returns a human-readable message when, after whitespace removal:
/// * the length is not a multiple of 4,
/// * a character outside the alphabet appears, or
/// * `=` padding is malformed — more than two trailing `=`, or a
///   `=` anywhere other than the very end of the input.
pub fn b64_decode(s: &str) -> Result<Vec<u8>, String> {
    /// Map one alphabet character to its 6-bit value.
    fn sextet(b: u8) -> Result<u32, String> {
        let v = match b {
            b'A'..=b'Z' => b - b'A',
            b'a'..=b'z' => b - b'a' + 26,
            b'0'..=b'9' => b - b'0' + 52,
            b'+' => 62,
            b'/' => 63,
            b'=' => {
                return Err(
                    "base64 padding '=' is only allowed at the end of the input".to_owned(),
                )
            }
            _ => return Err(format!("invalid base64 character {:#x}", b)),
        };
        Ok(v as u32)
    }

    let bytes: Vec<u8> = s.bytes().filter(|b| !b.is_ascii_whitespace()).collect();
    if bytes.len() % 4 != 0 {
        return Err(format!("base64 length {} is not a multiple of 4", bytes.len()));
    }

    // Padding may only be the final one or two characters of the
    // whole input; any other `=` is rejected by `sextet` below.
    let pad = bytes.iter().rev().take_while(|&&b| b == b'=').count();
    if pad > 2 {
        return Err(format!(
            "base64 has {pad} trailing padding characters (at most 2 allowed)"
        ));
    }

    let quads = bytes.len() / 4;
    let mut out = Vec::with_capacity(quads * 3);
    for (idx, chunk) in bytes.chunks_exact(4).enumerate() {
        // Only the last quad may be short on data characters.
        let chunk_pad = if idx + 1 == quads { pad } else { 0 };
        let mut acc: u32 = 0;
        for &b in &chunk[..4 - chunk_pad] {
            acc = (acc << 6) | sextet(b)?;
        }
        // Left-align as if the padding positions were zero sextets.
        acc <<= 6 * chunk_pad as u32;

        out.push(((acc >> 16) & 0xff) as u8);
        if chunk_pad < 2 {
            out.push(((acc >> 8) & 0xff) as u8);
        }
        if chunk_pad < 1 {
            out.push((acc & 0xff) as u8);
        }
    }
    Ok(out)
}

impl Drop for Vm {
    /// Stop the VM and remove its sockets, unless cleanup has been
    /// disabled for this handle.
    fn drop(&mut self) {
        // `skip_cleanup` is set for PooledVm's inner Vm: the pool and
        // sockets belong to the HiddenPool, which keeps them alive
        // across acquires, so there is nothing for us to tear down.
        if !self.skip_cleanup {
            // Errors cannot be reported from Drop; shutdown is
            // best-effort here (use `Vm::stop` to observe failures).
            if let Some(pool) = self.pool.take() {
                let _ = pool.shutdown();
            }
            self.cleanup_socket();
        }
    }
}

/// Owns the accept-loop thread for a [`Vm::expose_tcp`] forwarder.
///
/// Drop this to stop accepting new connections. In-flight
/// connections continue until they close naturally — they're owned
/// by their own splice threads, not by the forwarder.
pub struct TcpForwarder {
    /// Shared with the accept loop; set to `true` to ask it to exit.
    stop: Arc<AtomicBool>,
    /// Join handle of the accept-loop thread; `None` once it has
    /// been joined.
    handle: Option<JoinHandle<()>>,
    /// Address the listener is bound to (see
    /// [`TcpForwarder::local_addr`]).
    bound: SocketAddr,
}

impl TcpForwarder {
    /// The address the forwarder is listening on. Useful when you
    /// asked for `host_port = 0` and want to know the OS-assigned
    /// port.
    pub fn local_addr(&self) -> SocketAddr {
        self.bound
    }

    /// Stop accepting new connections. Equivalent to dropping the
    /// forwarder, but returns when the accept thread has actually
    /// exited.
    pub fn stop(mut self) {
        self.shutdown();
    }

    /// Signal the accept loop to exit and join its thread.
    ///
    /// Idempotent: once the thread has been joined, later calls do
    /// nothing. This matters because `stop()` is followed by `Drop`,
    /// which calls `shutdown` a second time. Without the guard, that
    /// second call would connect to a port this forwarder no longer
    /// owns, which another listener may already have reused.
    fn shutdown(&mut self) {
        let Some(handle) = self.handle.take() else {
            return;
        };
        self.stop.store(true, Ordering::SeqCst);
        // Self-connect to unblock the listener's accept(). We don't
        // care about the result — the connection just exists to wake
        // the loop, which then sees `stop` set and exits.
        let _ = TcpStream::connect_timeout(&self.bound, Duration::from_millis(200));
        let _ = handle.join();
    }
}

impl Drop for TcpForwarder {
    /// Dropping the forwarder stops the accept loop and waits for its
    /// thread to exit, the same work [`TcpForwarder::stop`] does.
    fn drop(&mut self) {
        self.shutdown();
    }
}

/// Accept loop for `Vm::expose_tcp`. Spawns a per-connection splice
/// thread for each accepted TCP stream; the splice threads live
/// independently of the forwarder so in-flight requests survive
/// `TcpForwarder::drop`.
fn accept_loop(listener: TcpListener, vsock_path: PathBuf, stop: Arc<AtomicBool>) {
    for incoming in listener.incoming() {
        if stop.load(Ordering::SeqCst) {
            break;
        }
        let tcp = match incoming {
            Ok(s) => s,
            Err(_) => continue,
        };
        let vsock = vsock_path.clone();
        std::thread::Builder::new()
            .name("supermachine-tcp-conn".into())
            .spawn(move || {
                if let Err(e) = splice_tcp_to_unix(tcp, &vsock) {
                    // Log to stderr — this is best-effort; the
                    // embedder's preferred logging is out of scope.
                    eprintln!("supermachine: tcp forward: {e}");
                }
            })
            .ok();
    }
}

/// Bridge one accepted TCP connection to the vsock-mux unix socket
/// at `vsock_path`.
///
/// Connects to the unix socket, duplicates both stream handles and
/// runs two named pump threads — `supermachine-tcp-c2g` (TCP→Unix)
/// and `supermachine-tcp-g2c` (Unix→TCP). The calling thread blocks
/// until both pumps have finished.
///
/// # Errors
/// Returns the I/O error from the unix connect, from either
/// `try_clone`, or from spawning either pump thread. Errors raised
/// inside the pumps themselves are discarded.
fn splice_tcp_to_unix(tcp: TcpStream, vsock_path: &Path) -> std::io::Result<()> {
    let guest = UnixStream::connect(vsock_path)?;

    // Every direction needs a handle of its own: the original streams
    // act as the read ends, the clones as the write ends.
    let client_writer = tcp.try_clone()?;
    let guest_writer = guest.try_clone()?;

    let client_to_guest = std::thread::Builder::new()
        .name(String::from("supermachine-tcp-c2g"))
        .spawn(move || {
            let _ = pump(tcp, guest_writer);
        })?;

    let guest_to_client = std::thread::Builder::new()
        .name(String::from("supermachine-tcp-g2c"))
        .spawn(move || {
            let _ = pump(guest, client_writer);
        })?;

    // Wait for both directions; a panicked pump is ignored.
    let _ = client_to_guest.join();
    let _ = guest_to_client.join();
    Ok(())
}

/// Copy bytes from `r` to `w` until `r` reports EOF or either side
/// fails.
///
/// Generic over the concrete stream types (static dispatch, no trait
/// objects), so the same loop serves both `TcpStream` and
/// `UnixStream`.
///
/// The write half of `w` is shut down on *every* exit path, not only
/// on clean EOF. When another handle to the same socket is still open
/// (the `try_clone` used by the opposite direction of the bridge),
/// merely dropping `w` would not signal the peer; without the explicit
/// half-close a failed direction would leave the peer of `w` waiting
/// for data that can never arrive.
///
/// # Errors
/// Returns the first read error other than `Interrupted`, or the
/// first write error. A failure of the final `shutdown_write` is
/// ignored.
fn pump<R, W>(mut r: R, mut w: W) -> std::io::Result<()>
where
    R: Read,
    W: Write + Shutdownable,
{
    let mut buf = [0u8; 16 * 1024];
    let outcome = loop {
        match r.read(&mut buf) {
            // EOF on the source: this direction is complete.
            Ok(0) => break Ok(()),
            Ok(n) => {
                if let Err(e) = w.write_all(&buf[..n]) {
                    break Err(e);
                }
            }
            // A signal interrupted the read; retry it.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => break Err(e),
        }
    };
    // Best-effort half-close so the peer of `w` sees the FIN whether
    // we stopped because of EOF or because of an error.
    let _ = w.shutdown_write();
    outcome
}

/// Uniform "close my write half" hook so the generic `pump` can
/// half-close either a `TcpStream` or a `UnixStream` through static
/// dispatch.
trait Shutdownable {
    /// Shut down the writing direction of the stream.
    fn shutdown_write(&mut self) -> std::io::Result<()>;
}

impl Shutdownable for TcpStream {
    /// Forwards to the inherent `TcpStream::shutdown` with
    /// `Shutdown::Write`.
    fn shutdown_write(&mut self) -> std::io::Result<()> {
        use std::net::Shutdown;
        self.shutdown(Shutdown::Write)
    }
}

impl Shutdownable for UnixStream {
    /// Forwards to the inherent `UnixStream::shutdown` with
    /// `Shutdown::Write`.
    fn shutdown_write(&mut self) -> std::io::Result<()> {
        use std::net::Shutdown;
        self.shutdown(Shutdown::Write)
    }
}

/// Produce a numeric suffix from the wall clock plus a process-wide
/// counter.
///
/// The value is the number of nanoseconds since the Unix epoch
/// (truncated to `u64`, or `0` if the clock reads earlier than the
/// epoch) wrapping-added to a counter that is bumped on every call.
/// Collisions are therefore unlikely but not ruled out.
fn unique_suffix() -> u64 {
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    static NEXT_TICKET: AtomicU64 = AtomicU64::new(0);

    let epoch_nanos = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos() as u64,
        Err(_) => 0,
    };
    let ticket = NEXT_TICKET.fetch_add(1, Ordering::Relaxed);
    epoch_nanos.wrapping_add(ticket)
}

/// Minimal RFC 3339 formatter for "now" in UTC — used by
/// [`Vm::snapshot`]'s metadata so a single timestamp doesn't require
/// depending on `chrono`.
///
/// Output has the shape `YYYY-MM-DDThh:mm:ssZ` with whole-second
/// precision. A system clock that reads earlier than the Unix epoch
/// is treated as second 0.
fn chrono_rfc3339_now() -> String {
    // (year, month, day) for a day count relative to 1970-01-01, using
    // Howard Hinnant's civil-from-days algorithm. Years are counted
    // from March internally, so January/February get bumped at the end.
    fn civil_from_days(epoch_days: i64) -> (i64, i64, i64) {
        let shifted = epoch_days + 719_468;
        let era = (if shifted >= 0 { shifted } else { shifted - 146_096 }) / 146_097;
        let day_of_era = shifted - era * 146_097;
        let year_of_era =
            (day_of_era - day_of_era / 1460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
        let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);
        let shifted_month = (5 * day_of_year + 2) / 153;
        let day = day_of_year - (153 * shifted_month + 2) / 5 + 1;
        let month = if shifted_month < 10 {
            shifted_month + 3
        } else {
            shifted_month - 9
        };
        let march_based_year = year_of_era + era * 400;
        let year = if month <= 2 {
            march_based_year + 1
        } else {
            march_based_year
        };
        (year, month, day)
    }

    let unix_secs = match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    };

    // Split into whole days since the epoch and the second within the day.
    let (y, m, d) = civil_from_days(unix_secs.div_euclid(86_400));
    let second_of_day = unix_secs.rem_euclid(86_400);
    let (hh, leftover) = (second_of_day / 3600, second_of_day % 3600);
    let (mm, ss) = (leftover / 60, leftover % 60);

    format!("{y:04}-{m:02}-{d:02}T{hh:02}:{mm:02}:{ss:02}Z")
}