supermachine 0.3.3

//! High-level public API: [`Image`], [`Vm`], [`VmConfig`], [`Error`].
//!
//! These types wrap the lower-level [`crate::vmm`] primitives
//! (`WarmPool`, `VmResources`, …) into a small, stable surface
//! for embedders: load an image, start a VM, talk to its guest,
//! stop. The lower-level types remain available under
//! `#[doc(hidden)]` for the CLI / router / bench crates that
//! pre-date the narrowing.

use std::collections::VecDeque;
use std::io::{Read, Write};
use std::net::{SocketAddr, TcpListener, TcpStream};
use std::os::unix::net::UnixStream;
use std::path::{Path, PathBuf};
use std::process::{Child, Command, Stdio};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Condvar, Mutex};
use std::thread::JoinHandle;
use std::time::{Duration, Instant};

use crate::assets::AssetPaths;
use crate::vmm::pool::{PoolClientError, WarmPool, WarmPoolError};
use crate::vmm::resources::VmResources;
use crate::vmm::runner::RunOptions;

/// All errors the high-level API can return. Designed to be
/// `match`able: the variants name *what failed*, not which
/// internal type produced it.
///
/// Payload shapes differ per variant:
///
/// - `Image`, `Vm`, `Assets`, `Network` and `Bake` carry a
///   human-readable `msg` plus an optional typed
///   `source: Option<Box<dyn std::error::Error + Send + Sync>>`
///   so callers can downcast to the underlying error
///   (`io::Error`, `WarmPoolError`, `PoolClientError`, ...) when
///   they need typed handling.
/// - `CacheMiss` and `CacheInvalid` carry only `msg`.
/// - `Io` wraps the original `std::io::Error` directly.
///
/// `std::error::Error::source()` returns the stored `source` (or
/// the wrapped `io::Error` for `Io`), so `?` propagation preserves
/// the chain through downstream `Box<dyn Error>` conversions.
///
/// `#[non_exhaustive]` so future versions can add variants without
/// breaking exhaustive matches in consumer code.
#[non_exhaustive]
pub enum Error {
    /// The image / snapshot couldn't be loaded — bad path, bad
    /// magic bytes, or version mismatch.
    Image {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// VM start / restore failed. Includes `WarmPool` setup errors,
    /// HVF entitlement issues, missing assets, and pool-spawn
    /// failures. Downcast `source` to the typed cause where
    /// applicable. `WarmPoolError` and `PoolClientError` convert
    /// into this variant via `From`.
    Vm {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// The configured assets (kernel, init shim) couldn't be
    /// located. Set [`VmConfig::with_assets`] explicitly to
    /// override auto-discovery.
    Assets {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// I/O on a vsock socket / file. The original `io::Error` is
    /// the variant payload — match `Error::Io(e)` and inspect
    /// `e.kind()` for typed handling.
    Io(std::io::Error),
    /// Registry pull failed — image manifest fetch, layer download,
    /// or auth handshake. Surface message includes the registry
    /// HTTP status / response body where available.
    Network {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
    /// [`PullPolicy::Never`] was set but no usable cache exists.
    /// Switch to [`PullPolicy::Missing`] (the default) to allow a
    /// pull, or pre-bake via the `supermachine` CLI.
    CacheMiss {
        msg: String,
    },
    /// A cached snapshot was found but isn't loadable on this
    /// binary — runtime SHA mismatch, snapshot format version
    /// mismatch, or corrupt/missing layer files. The error message
    /// names the specific reason. With [`PullPolicy::Missing`] /
    /// [`PullPolicy::Always`] the library auto-rebakes; only
    /// [`PullPolicy::Never`] surfaces this.
    CacheInvalid {
        msg: String,
    },
    /// The bake step itself failed — snapshot capture timed out,
    /// the workload didn't bind a port within the readiness window,
    /// or the worker exited mid-bake. See `bake.log` in the
    /// snapshot dir for details.
    Bake {
        msg: String,
        source: Option<Box<dyn std::error::Error + Send + Sync>>,
    },
}

// Constructor helpers that callers in this crate use. Keeping
// the public surface field-style (struct variants) means new
// fields are non-breaking; the callers below all funnel through
// these so we can evolve the construction shape later without
// touching every call site.
//
// Public API is the variants themselves; these are pub(crate).
impl Error {
    pub(crate) fn image_msg(msg: impl Into<String>) -> Self {
        Error::Image { msg: msg.into(), source: None }
    }
    pub(crate) fn vm_msg(msg: impl Into<String>) -> Self {
        Error::Vm { msg: msg.into(), source: None }
    }
    pub(crate) fn assets_msg(msg: impl Into<String>) -> Self {
        Error::Assets { msg: msg.into(), source: None }
    }
    pub(crate) fn network_msg(msg: impl Into<String>) -> Self {
        Error::Network { msg: msg.into(), source: None }
    }
    pub(crate) fn bake_msg(msg: impl Into<String>) -> Self {
        Error::Bake { msg: msg.into(), source: None }
    }
    pub(crate) fn cache_miss(msg: impl Into<String>) -> Self {
        Error::CacheMiss { msg: msg.into() }
    }
    pub(crate) fn cache_invalid(msg: impl Into<String>) -> Self {
        Error::CacheInvalid { msg: msg.into() }
    }
}

impl std::fmt::Debug for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::Image { msg, source } => f
                .debug_struct("Image")
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::Vm { msg, source } => f
                .debug_struct("Vm")
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::Assets { msg, source } => f
                .debug_struct("Assets")
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::Io(e) => f.debug_tuple("Io").field(e).finish(),
            Error::Network { msg, source } => f
                .debug_struct("Network")
                .field("msg", msg)
                .field("source", source)
                .finish(),
            Error::CacheMiss { msg } => f.debug_struct("CacheMiss").field("msg", msg).finish(),
            Error::CacheInvalid { msg } => {
                f.debug_struct("CacheInvalid").field("msg", msg).finish()
            }
            Error::Bake { msg, source } => f
                .debug_struct("Bake")
                .field("msg", msg)
                .field("source", source)
                .finish(),
        }
    }
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::Image { msg, .. } => write!(f, "image: {msg}"),
            Error::Vm { msg, .. } => write!(f, "vm: {msg}"),
            Error::Assets { msg, .. } => write!(f, "assets: {msg}"),
            Error::Io(e) => write!(f, "io: {e}"),
            Error::Network { msg, .. } => write!(f, "network: {msg}"),
            Error::CacheMiss { msg } => write!(f, "cache miss: {msg}"),
            Error::CacheInvalid { msg } => write!(f, "cache invalid: {msg}"),
            Error::Bake { msg, .. } => write!(f, "bake: {msg}"),
        }
    }
}

impl std::error::Error for Error {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Error::Image { source, .. }
            | Error::Vm { source, .. }
            | Error::Assets { source, .. }
            | Error::Network { source, .. }
            | Error::Bake { source, .. } => {
                source.as_ref().map(|s| s.as_ref() as &(dyn std::error::Error + 'static))
            }
            Error::Io(e) => Some(e),
            Error::CacheMiss { .. } | Error::CacheInvalid { .. } => None,
        }
    }
}

impl From<std::io::Error> for Error {
    fn from(e: std::io::Error) -> Self {
        Error::Io(e)
    }
}

impl From<WarmPoolError> for Error {
    fn from(e: WarmPoolError) -> Self {
        Error::Vm {
            msg: e.to_string(),
            source: Some(Box::new(e)),
        }
    }
}

impl From<PoolClientError> for Error {
    fn from(e: PoolClientError) -> Self {
        Error::Vm {
            msg: e.to_string(),
            source: Some(Box::new(e)),
        }
    }
}

/// How [`Image::from_oci`] decides whether to talk to the registry
/// or use a locally-cached snapshot. Same semantics as Docker's
/// `--pull` flag.
///
/// **The default is [`PullPolicy::Missing`]** — use the cache if
/// it exists; pull only if absent. Right for pinned tags or digest
/// references. For `:latest`-style mutable tags use
/// [`PullPolicy::Always`].
///
/// `Default` is derived; the `#[default]` attribute on `Missing`
/// is what `PullPolicy::default()` returns.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
#[non_exhaustive]
pub enum PullPolicy {
    /// Pull manifest from the registry every time; rebake if the
    /// digest changed since the last bake. Right for `:latest`-style
    /// mutable tags.
    Always,
    /// Use the cached snapshot if it exists locally and is valid.
    /// Don't talk to the registry at all unless the cache is
    /// missing or invalid. **The default.**
    #[default]
    Missing,
    /// Use the cache or fail. Never pull. Right for offline /
    /// air-gapped environments.
    Never,
}

impl PullPolicy {
    /// String form the underlying `bake` pipeline accepts. Mirror
    /// of the CLI's `--pull` argument values.
    fn as_bake_str(self) -> &'static str {
        match self {
            Self::Always => "always",
            Self::Missing => "missing",
            Self::Never => "never",
        }
    }
}

/// A baked OCI image: its restore snapshot plus the metadata
/// describing which kernel + virtio-blk layers it needs. Cheap
/// to clone.
///
/// Two ways to construct one:
///
/// - [`Image::from_oci`] — pull (or reuse cache) from a registry,
///   bake into a snapshot, return the resulting image. The
///   high-level "I have an image reference" entry point.
/// - [`Image::from_snapshot`] — load an already-baked snapshot
///   directory directly. Useful when you want to keep snapshots
///   under your own management or share one across processes.
#[derive(Debug, Clone)]
pub struct Image {
    /// Path to the `restore.snap` file this image restores from.
    snapshot_path: PathBuf,
    /// Default memory; can be overridden via [`VmConfig::with_memory_mib`].
    pub(crate) memory_mib: u32,
    /// Default vCPUs; can be overridden via [`VmConfig::with_vcpus`].
    pub(crate) vcpus: u32,
    /// virtio-blk layer file paths in the order the bake step
    /// produced them. The microVM needs all of them attached at
    /// restore time (the OverlayFS in the guest is mounted on top).
    pub(crate) layers: Vec<PathBuf>,
    /// Optional per-image delta layer applied after `layers`.
    pub(crate) delta_squashfs: Option<PathBuf>,
    /// Bundled kernel path, if the snapshot dir ships one alongside.
    /// Lets a self-contained bundle (e.g. `MyApp.app/Contents/
    /// Resources/<image>/kernel`) start a VM without requiring the
    /// embedder's host to have supermachine assets installed
    /// system-wide. `None` means [`Vm::start`] falls back to
    /// [`AssetPaths::discover`].
    pub(crate) bundled_kernel: Option<PathBuf>,
    /// Hidden worker pool, lazy-initialized on first
    /// [`Image::acquire`]. See [`HiddenPool`]: it hands out idle
    /// `supermachine-worker` subprocesses that were spawned with
    /// `--restore-from` this image's snapshot, and kills a worker
    /// when it is released rather than reusing it.
    ///
    /// Wrapped in `Arc` so cloning an `Image` whose pool is already
    /// initialized shares the same pool instance — useful when
    /// multiple parts of an app hold `Image` references but should
    /// share one set of warm workers.
    ///
    /// NOTE(review): `OnceLock::clone` only copies a value that is
    /// already set, so a clone taken *before* the first `acquire`
    /// starts with an empty cell and would lazily create its own
    /// pool — confirm that is intended.
    pub(crate) hidden_pool: std::sync::OnceLock<Arc<HiddenPool>>,
}

/// Internal state for the hidden subprocess pool an [`Image`]
/// manages for [`Image::acquire`] users. Spawns N
/// `supermachine-worker` subprocesses up front, each pre-restored
/// from the snapshot — so `acquire` is just "pop an idle worker
/// off the queue" (~1 ms) and N concurrent acquires really run N
/// VMs in parallel (each in its own subprocess, each its own
/// `hv_vm_create` singleton).
///
/// On `Drop`, kills every *idle* worker, joins the replenisher
/// thread and removes the whole sockets directory.
#[doc(hidden)]
pub struct HiddenPool {
    /// Per-worker state: child process + vsock paths + idle flag.
    /// Mutex-protected so acquire/replenishment race cleanly.
    state: Mutex<PoolState>,
    /// Wakes acquire() callers blocked on an empty idle queue.
    /// Signalled by the replenisher thread when a fresh worker
    /// has its sockets up; `release` and `Drop` also notify it so
    /// waiters re-check the queue and the shutdown flag.
    available: Condvar,
    /// Where each worker's vsock mux/exec sockets live. Removed
    /// recursively (best-effort) at the end of Drop.
    socks_dir: PathBuf,
    /// True once Drop has started; replenisher exits next loop.
    /// `acquire` also checks it and errors out instead of blocking
    /// when the idle queue is empty.
    shutting_down: AtomicBool,
    /// Replenisher thread handle, joined in Drop.
    replenisher: Option<JoinHandle<()>>,
    /// Image-derived spawn config, copied to keep the pool
    /// self-contained (Image won't be borrowed across thread
    /// boundaries from inside the pool).
    spawn_cfg: Arc<SpawnConfig>,
}

/// Per-Pool internal state under one mutex. Held only briefly
/// during acquire/release; the long-running work happens outside.
struct PoolState {
    /// Idle workers ready for the next `acquire()`. Popped from
    /// the front by `acquire`; drained and killed on pool Drop.
    idle: VecDeque<Worker>,
    /// Total workers currently alive (idle + checked-out). The
    /// replenisher uses this to decide whether to spawn more.
    /// Decremented (saturating) whenever a worker is killed by
    /// `release` or by pool Drop.
    alive: usize,
    /// Target N. Set at construction; not mutated.
    target: usize,
}

/// One spawned `supermachine-worker` subprocess + its vsock
/// socket paths.
struct Worker {
    /// Handle to the worker subprocess; killed and reaped by `kill`.
    child: Child,
    /// Path passed to the worker as `--vsock-mux`. Its appearance
    /// on disk is what `spawn_one` waits for before handing the
    /// worker out.
    vsock_mux_path: PathBuf,
    /// Path passed to the worker as `--vsock-exec`.
    vsock_exec_path: PathBuf,
}

impl Worker {
    /// Best-effort teardown: terminate the subprocess, reap it,
    /// then unlink its socket files. Every failure is ignored —
    /// this runs on Drop paths and when discarding a "dirty"
    /// worker, where nothing useful can be done with an error.
    fn kill(&mut self) {
        let _ = self.child.kill();
        // Reap so the process doesn't linger as a zombie.
        let _ = self.child.wait();

        // The handoff socket, if any, is the mux path with its
        // extension swapped to `handoff`.
        let handoff_path = self.vsock_mux_path.with_extension("handoff");
        for stale in [&self.vsock_mux_path, &self.vsock_exec_path, &handoff_path] {
            let _ = std::fs::remove_file(stale);
        }
    }
}

/// Resolved + reusable spawn config for one Image's pool.
struct SpawnConfig {
    /// Executable launched for every worker.
    worker_bin: PathBuf,
    /// Snapshot file passed to the worker as `--restore-from`.
    snapshot_path: PathBuf,
    /// Layer files, each passed as its own `--virtio-blk` argument
    /// in this order.
    layers: Vec<PathBuf>,
    /// Optional delta layer, passed as one more `--virtio-blk`
    /// after all of `layers`.
    delta_squashfs: Option<PathBuf>,
    /// Passed to the worker as `--memory`.
    memory_mib: u32,
    /// Passed to the worker as `--vcpus`.
    vcpus: u32,
    /// Directory in which per-worker socket files are created.
    socks_dir: PathBuf,
    /// Identifier folded into the socket file names. Just for
    /// readability when looking at /tmp.
    name_prefix: String,
    /// Honored when waiting for a freshly spawned worker's vsock
    /// socket to appear.
    spawn_timeout: Duration,
}

impl SpawnConfig {
    /// Spawn ONE worker subprocess, mint unique socket paths,
    /// wait up to `spawn_timeout` for the vsock-mux socket to
    /// appear, return the live `Worker`.
    fn spawn_one(&self) -> Result<Worker, Error> {
        // Compact suffix: low 32 bits of the nanosecond
        // timestamp, hex'd. Uniqueness is bounded to one
        // process's pool; collisions are vanishingly unlikely
        // and would just collide socket files (caught by the
        // pre-spawn unlink anyway).
        let suffix = (unique_suffix() & 0xffff_ffff) as u32;
        let vsock_mux_path = self
            .socks_dir
            .join(format!("{}-{:08x}.sock", self.name_prefix, suffix));
        let vsock_exec_path = {
            let mut p = vsock_mux_path.clone();
            let mut name = p.file_name().unwrap().to_owned();
            name.push("-exec");
            p.set_file_name(name);
            p
        };
        let _ = std::fs::remove_file(&vsock_mux_path);
        let _ = std::fs::remove_file(&vsock_exec_path);

        let mut cmd = Command::new(&self.worker_bin);
        for layer in &self.layers {
            cmd.arg("--virtio-blk").arg(layer);
        }
        if let Some(delta) = &self.delta_squashfs {
            cmd.arg("--virtio-blk").arg(delta);
        }
        cmd.arg("--memory").arg(self.memory_mib.to_string());
        cmd.arg("--vcpus").arg(self.vcpus.to_string());
        cmd.arg("--restore-from").arg(&self.snapshot_path);
        cmd.arg("--cow-restore");
        cmd.arg("--vsock-mux").arg(&vsock_mux_path);
        cmd.arg("--vsock-exec").arg(&vsock_exec_path);
        // Quiet by default — embedders don't want VM kernel logs
        // on their stdout. SUPERMACHINE_WORKER_LOG=1 to opt in.
        let log_to_stdio = std::env::var("SUPERMACHINE_WORKER_LOG")
            .map(|v| v == "1" || v == "true")
            .unwrap_or(false);
        if !log_to_stdio {
            cmd.stdout(Stdio::null()).stderr(Stdio::null());
        }
        let child = cmd
            .spawn()
            .map_err(|e| Error::vm_msg(format!("spawn worker {}: {e}", self.worker_bin.display())))?;

        // Wait for the vsock-mux socket to appear (worker has
        // restored + listener is up).
        let deadline = Instant::now() + self.spawn_timeout;
        while !vsock_mux_path.exists() {
            if Instant::now() > deadline {
                let mut w = Worker {
                    child,
                    vsock_mux_path: vsock_mux_path.clone(),
                    vsock_exec_path: vsock_exec_path.clone(),
                };
                w.kill();
                return Err(Error::vm_msg(format!(
                    "worker spawn: vsock socket {} did not appear within {:?}",
                    vsock_mux_path.display(),
                    self.spawn_timeout
                )));
            }
            std::thread::sleep(Duration::from_millis(10));
        }
        Ok(Worker {
            child,
            vsock_mux_path,
            vsock_exec_path,
        })
    }
}

impl std::fmt::Debug for HiddenPool {
    /// Prints the sockets dir plus the current `alive` / `idle`
    /// counts. If the state mutex is poisoned both counts are
    /// reported as `usize::MAX`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (alive, idle) = match self.state.lock() {
            Ok(guard) => (guard.alive, guard.idle.len()),
            Err(_) => (usize::MAX, usize::MAX),
        };
        f.debug_struct("HiddenPool")
            .field("socks_dir", &self.socks_dir)
            .field("alive", &alive)
            .field("idle", &idle)
            .finish()
    }
}

impl Drop for HiddenPool {
    fn drop(&mut self) {
        self.shutting_down.store(true, Ordering::SeqCst);
        // Wake replenisher so it observes shutting_down and exits.
        self.available.notify_all();
        // Kill any idle workers + reap.
        if let Ok(mut s) = self.state.lock() {
            while let Some(mut w) = s.idle.pop_front() {
                w.kill();
                s.alive = s.alive.saturating_sub(1);
            }
        }
        if let Some(h) = self.replenisher.take() {
            let _ = h.join();
        }
        // Best-effort cleanup of the per-pool socks dir.
        let _ = std::fs::remove_dir_all(&self.socks_dir);
    }
}

impl HiddenPool {
    /// Block until an idle worker is available, return it.
    fn acquire(&self) -> Result<Worker, Error> {
        let mut state = self
            .state
            .lock()
            .map_err(|_| Error::vm_msg("pool mutex poisoned".to_owned()))?;
        loop {
            if let Some(w) = state.idle.pop_front() {
                return Ok(w);
            }
            if self.shutting_down.load(Ordering::SeqCst) {
                return Err(Error::vm_msg("pool is shutting down".to_owned()));
            }
            // Block until replenisher posts a fresh worker.
            state = self
                .available
                .wait(state)
                .map_err(|_| Error::vm_msg("pool condvar poisoned".to_owned()))?;
        }
    }

    /// PooledVm Drop calls this with the (now-dirty) worker.
    /// Kills it on a background thread + asks the replenisher to
    /// spawn a replacement so the pool stays at target size.
    fn release(&self, mut worker: Worker) {
        // Kill the dirty worker first (synchronous, fast — kill +
        // waitpid is a few ms). Then decrement alive and signal
        // replenisher to spawn a replacement.
        worker.kill();
        if let Ok(mut s) = self.state.lock() {
            s.alive = s.alive.saturating_sub(1);
        }
        // Wake replenisher to check + spawn.
        self.available.notify_all();
    }
}

impl Image {
    /// Load an image from the on-disk artifacts produced by
    /// `supermachine run IMAGE`. The argument can be either:
    ///
    /// - The directory containing `metadata.json` and `restore.snap`
    ///   (typical: `~/.local/supermachine-snapshots/<name>/`).
    /// - The `restore.snap` file itself; we read `metadata.json`
    ///   from its parent dir.
    ///
    /// ```sh
    /// supermachine run nginx:1.27-alpine --detach && supermachine run --stop
    /// # snapshot dir: ~/.local/supermachine-snapshots/nginx_1_27-alpine/
    /// ```
    ///
    /// On disk, that directory contains:
    ///
    /// ```text
    /// metadata.json    # layers, memory, vcpus, etc.
    /// restore.snap     # captured VM state (CoW-mappable)
    /// delta.squashfs   # writable overlay layer (optional)
    /// ```
    pub fn from_snapshot(path: impl Into<PathBuf>) -> Result<Self, Error> {
        let path = path.into();
        // Resolve to a (snapshot_path, metadata_path) pair.
        let (snapshot_path, metadata_path) = if path.is_dir() {
            (path.join("restore.snap"), path.join("metadata.json"))
        } else if path.is_file() {
            let parent = path.parent().ok_or_else(|| {
                Error::image_msg(format!("snapshot path has no parent dir: {}", path.display()))
            })?;
            (path.clone(), parent.join("metadata.json"))
        } else {
            return Err(Error::image_msg(format!(
                "snapshot path not found: {}",
                path.display()
            )));
        };

        if !snapshot_path.is_file() {
            return Err(Error::image_msg(format!(
                "snapshot file not found: {}",
                snapshot_path.display()
            )));
        }
        if !metadata_path.is_file() {
            return Err(Error::image_msg(format!(
                "metadata.json not found alongside snapshot at {}",
                metadata_path.display()
            )));
        }

        let meta_text = std::fs::read_to_string(&metadata_path)
            .map_err(|e| Error::image_msg(format!("read {}: {e}", metadata_path.display())))?;
        let meta: serde_json::Value = serde_json::from_str(&meta_text)
            .map_err(|e| Error::image_msg(format!("parse {}: {e}", metadata_path.display())))?;

        let memory_mib = meta
            .get("memory_mib")
            .and_then(|v| v.as_u64())
            .map(|v| v as u32)
            .unwrap_or(256);
        let vcpus = meta
            .get("vcpus")
            .and_then(|v| v.as_u64())
            .map(|v| v as u32)
            .unwrap_or(1);

        // metadata.json paths may be absolute (default for native
        // bakes that store paths under ~/.local/...) or relative
        // (used by `supermachine bundle --image NAME`, which writes
        // a self-contained dir with `./layers/<sha>.squashfs` style
        // entries). Resolve relative paths against the metadata
        // dir so a bundle works after `cp -r` to a different host.
        let metadata_dir = metadata_path
            .parent()
            .map(Path::to_path_buf)
            .unwrap_or_else(|| PathBuf::from("."));
        let resolve_path = |s: &str| -> PathBuf {
            let p = PathBuf::from(s);
            if p.is_absolute() {
                p
            } else {
                metadata_dir.join(p)
            }
        };

        let layers: Vec<PathBuf> = meta
            .get("layers")
            .and_then(|v| v.as_array())
            .map(|arr| {
                arr.iter()
                    .filter_map(|x| x.as_str().map(&resolve_path))
                    .collect()
            })
            .unwrap_or_default();
        let delta_squashfs = meta
            .get("delta_squashfs")
            .and_then(|v| v.as_str())
            .map(&resolve_path);

        // Bundled kernel discovery: a self-contained bundle puts
        // the kernel image next to the snapshot. Prefer that over
        // host-wide AssetPaths so a shipped `.app` doesn't depend
        // on the user having supermachine installed.
        let bundled_kernel = {
            let cand = metadata_dir.join("kernel");
            if cand.is_file() {
                Some(cand)
            } else {
                None
            }
        };

        Ok(Self {
            snapshot_path,
            memory_mib,
            vcpus,
            layers,
            delta_squashfs,
            bundled_kernel,
            hidden_pool: std::sync::OnceLock::new(),
        })
    }

    /// Pull and bake an image from a registry reference, returning
    /// the loadable [`Image`]. Equivalent to running
    /// `supermachine run <image_ref> --no-detach` from a Rust app,
    /// minus the daemon — you get the [`Image`] back, then call
    /// [`Vm::start`] yourself.
    ///
    /// Uses [`PullPolicy::Missing`] (cache-first) by default. For
    /// other policies, see [`Image::from_oci_with_policy`].
    ///
    /// ```no_run
    /// # use supermachine::{Image, Vm, VmConfig};
    /// let image = Image::from_oci("nginx:1.27-alpine")?;
    /// let vm = Vm::start(&image, &VmConfig::new())?;
    /// # let _ = vm; Ok::<(), supermachine::Error>(())
    /// ```
    pub fn from_oci(image_ref: &str) -> Result<Self, Error> {
        // Cache-first unless the caller picks a policy explicitly.
        let policy = PullPolicy::default();
        Self::from_oci_with_policy(image_ref, policy)
    }

    /// As [`Image::from_oci`] but with an explicit [`PullPolicy`].
    /// See [`PullPolicy`] for the cache + registry interaction
    /// table.
    pub fn from_oci_with_policy(
        image_ref: &str,
        policy: PullPolicy,
    ) -> Result<Self, Error> {
        let snapshots_dir = default_snapshots_dir();
        Self::from_oci_to_dir(image_ref, policy, &snapshots_dir, None)
    }

    /// Most explicit constructor: pull/bake into a specific
    /// snapshots directory, with an optional explicit name.
    /// Lets you keep multiple "supermachine snapshot stores"
    /// (e.g. per-project), or pin a snapshot under a name that
    /// differs from the image-derived default.
    ///
    /// The cached snapshot at `snapshots_dir/<name>` is probed
    /// exactly once, and the outcome is combined with `policy`:
    ///
    /// | policy    | cache loads     | cache fails to load               |
    /// |-----------|-----------------|-----------------------------------|
    /// | `Never`   | return it       | `CacheInvalid` if `restore.snap` exists, else `CacheMiss` |
    /// | `Missing` | return it       | bake, then load                   |
    /// | `Always`  | bake, then load | bake, then load                   |
    ///
    /// # Errors
    ///
    /// - [`Error::CacheMiss`] / [`Error::CacheInvalid`] under
    ///   [`PullPolicy::Never`], as per the table above.
    /// - Whatever `repo_root_for_bake` or the bake pipeline reports
    ///   (the latter mapped through `map_bake_error`).
    /// - [`Error::Image`] if the freshly baked snapshot cannot be
    ///   loaded.
    pub fn from_oci_to_dir(
        image_ref: &str,
        policy: PullPolicy,
        snapshots_dir: &Path,
        name: Option<&str>,
    ) -> Result<Self, Error> {
        // 1. Compute where the cached snapshot would live and
        //    short-circuit on hit (Missing) or miss (Never). The
        //    snapshot is loaded once and that result reused, so a
        //    hit is not parsed twice and cannot change between the
        //    probe and the return.
        let derived = name
            .map(str::to_owned)
            .unwrap_or_else(|| crate::bake::snapshot_name_for_image(image_ref));
        let snap_dir = snapshots_dir.join(&derived);
        let cached = Self::from_snapshot(&snap_dir);

        match (policy, cached) {
            (PullPolicy::Never | PullPolicy::Missing, Ok(image)) => return Ok(image),
            (PullPolicy::Never, Err(load_err)) => {
                if snap_dir.join("restore.snap").is_file() {
                    // Surface why the load failed so the caller
                    // can tell which part of the cache is broken.
                    return Err(Error::cache_invalid(format!(
                        "snapshot present at {} but not loadable on this binary ({load_err}); \
                         rebake required (PullPolicy::Never won't auto-rebake)",
                        snap_dir.display()
                    )));
                }
                return Err(Error::cache_miss(format!(
                    "no cached snapshot for {image_ref} at {} (PullPolicy::Never)",
                    snap_dir.display()
                )));
            }
            // Missing+invalid OR Always: fall through to bake.
            (PullPolicy::Missing, Err(_)) | (PullPolicy::Always, _) => {}
        }

        // 2. Bake. This shells out to the existing bake pipeline:
        //    registry pull (or reuse cached layers) → squashfs →
        //    boot worker once → capture snapshot.
        let root = repo_root_for_bake()?;
        let request = crate::bake::BakeRequest {
            image: image_ref.to_owned(),
            name: name.map(str::to_owned),
            runtime: "supermachine".to_owned(),
            guest_port: 80,
            memory_mib: 256,
            vcpus: 1,
            pull_policy: policy.as_bake_str().to_owned(),
            snapshots_dir: snapshots_dir.to_path_buf(),
            cmd_override: None,
            extra_args: Vec::new(),
        };
        let bake_t0 = std::time::Instant::now();
        crate::bake::run_push(&request, bake_t0, &root).map_err(map_bake_error)?;

        // 3. Load the freshly-baked snapshot.
        Self::from_snapshot(&snap_dir)
    }

    /// Builder for configurable bakes — env vars, cmd override,
    /// custom memory / port, custom snapshot name.
    ///
    /// ```no_run
    /// # use supermachine::Image;
    /// let image = Image::builder("nginx:1.27-alpine")
    ///     .with_name("nginx-prod")
    ///     .with_memory_mib(512)
    ///     .with_env("FOO", "bar")
    ///     .with_cmd(["nginx", "-g", "daemon off;"])
    ///     .build()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    ///
    /// The builder produces a different snapshot for each
    /// configuration — bake-time inputs are part of the snapshot
    /// fingerprint. Reuse a name across configurations and the
    /// previous snapshot is invalidated; pick distinct names if
    /// you need both side-by-side.
    pub fn builder(image_ref: impl Into<String>) -> OciImageBuilder {
        // Convert up front; the builder is seeded with the owned reference.
        let image_ref: String = image_ref.into();
        OciImageBuilder::new(image_ref)
    }

    /// Get an [`Image`] for `name`, baking it from `image_ref`
    /// only if a compatible snapshot doesn't already exist.
    ///
    /// This is the right call for app startup. The first run
    /// bakes (one-time cost: the registry pull + snapshot build,
    /// e.g. ~12 s for `rust:1-slim`); subsequent runs see the
    /// cached snapshot and return in microseconds. After a
    /// `cargo update` that bumped the supermachine version, the
    /// cached snapshot's bake-key no longer matches the current
    /// worker binary, and `ensure_baked` rebakes automatically —
    /// no shell scripts, no manual `rm -rf snapshots/`.
    ///
    /// `configure` is a builder closure: chain
    /// [`OciImageBuilder`] methods like `with_memory_mib`,
    /// `with_cmd`, `with_env` to customize the bake. Pass
    /// `|b| b` for defaults.
    ///
    /// ```no_run
    /// use std::time::Duration;
    /// use supermachine::{Image, VmConfig};
    ///
    /// // Bake once on first run, reuse forever after — including
    /// // across supermachine version upgrades.
    /// let image = Image::ensure_baked("rust_1_slim", "rust:1-slim", |b| {
    ///     b.with_memory_mib(2048)
    /// })?;
    /// // Pre-warm 5 workers so 5 threads can each grab one in parallel.
    /// let _ = image.acquire_with(&VmConfig::new().with_pool_warm(5))?;
    ///
    /// // Per-task path:
    /// let vm = image.acquire()?;
    /// vm.write_file("/tmp/main.rs", b"fn main() { println!(\"hi\"); }")?;
    /// let out = vm.exec_builder()
    ///     .argv(["sh", "-c", "rustc /tmp/main.rs -o /tmp/m && /tmp/m"])
    ///     .timeout(Duration::from_secs(30))
    ///     .output()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    pub fn ensure_baked<F>(
        name: impl Into<String>,
        image_ref: impl Into<String>,
        configure: F,
    ) -> Result<Image, Error>
    where
        F: FnOnce(OciImageBuilder) -> OciImageBuilder,
    {
        // Seed the builder with the image reference and snapshot
        // name, let the caller customize it, then build.
        let seeded = OciImageBuilder::new(image_ref).with_name(name);
        configure(seeded).build()
    }

    /// Path to the snapshot file backing this image.
    pub fn snapshot_path(&self) -> &Path {
        // Borrowed view of the stored path; the same value is
        // cloned into the pool's spawn config in `ensure_pool`.
        &self.snapshot_path
    }

    /// Memory the snapshot was baked with. [`Vm::start`] uses
    /// this if [`VmConfig::with_memory_mib`] isn't set.
    pub fn memory_mib(&self) -> u32 {
        // Also the fallback `ensure_pool` uses when the `VmConfig`
        // carries no memory override.
        self.memory_mib
    }

    /// vCPUs the snapshot was baked with.
    pub fn vcpus(&self) -> u32 {
        // Also the fallback `ensure_pool` uses when the `VmConfig`
        // carries no vCPU override.
        self.vcpus
    }

    /// Start a one-shot microVM from this image. Equivalent to
    /// [`Vm::start(self, config)`][Vm::start] but reads more
    /// naturally at the call site:
    ///
    /// ```no_run
    /// # use supermachine::{Image, VmConfig};
    /// let image = Image::from_snapshot("path/to/snapshot")?;
    /// let vm = image.start(&VmConfig::new())?;
    /// // ... use vm ...
    /// vm.stop()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    ///
    /// Use [`Image::acquire`] instead if you want a `PooledVm`
    /// that returns to a (hidden) pool on `Drop` for cheaper
    /// reuse — typical for evaluation harnesses, CI verifiers,
    /// or any code that runs many short-lived VMs of the same
    /// image back-to-back.
    pub fn start(&self, config: &VmConfig) -> Result<Vm, Error> {
        // Pure forwarder: nothing here touches the hidden pool
        // that backs `acquire` / `acquire_with`.
        Vm::start(self, config)
    }

    /// Acquire a microVM from this image's hidden pool. Returns
    /// a [`PooledVm`] which `Deref`s to [`Vm`] and returns to
    /// the pool on `Drop`. Use this for the common
    /// "spin up a VM, do one task, throw it away, do another"
    /// loop — the pool keeps re-restoring from the same snapshot
    /// behind the scenes so per-iteration cost stays at the
    /// snapshot-restore floor (~5 ms on Apple Silicon).
    ///
    /// ```no_run
    /// # use supermachine::{Image, VmConfig};
    /// # use std::time::Duration;
    /// let image = Image::from_snapshot("path/to/rust-slim")?;
    /// for src in ["fn main() {}", "fn main() { panic!() }"] {
    ///     let vm = image.acquire()?;
    ///     vm.write_file("/tmp/main.rs", src.as_bytes())?;
    ///     let out = vm.exec_builder()
    ///         .argv(["sh", "-c", "rustc /tmp/main.rs -o /tmp/m && /tmp/m"])
    ///         .timeout(Duration::from_secs(30))
    ///         .output()?;
    ///     println!("status={:?} out={:?}", out.status.code(), out.stdout);
    ///     // vm dropped here — returned to pool, restored from snapshot
    /// }
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    ///
    /// ## Pool sizing
    ///
    /// The hidden pool defaults to **one** warm worker, sized at
    /// the first acquire and reused thereafter. Per-acquire cost
    /// is the snapshot restore (~5 ms on Apple Silicon), not the
    /// full VM spawn (~50–100 ms).
    ///
    /// For **N concurrent acquires that don't block each other**,
    /// size the pool up-front via
    /// [`Image::acquire_with`] with [`VmConfig::with_pool_warm`]:
    ///
    /// ```no_run
    /// # use supermachine::{Image, VmConfig};
    /// let image = Image::from_snapshot("path/to/snapshot")?;
    /// // Prime 5 warm workers — first acquire blocks ~50 ms while
    /// // they spawn; subsequent acquires up to 5-wide are non-
    /// // blocking. Drop returns the worker to the pool.
    /// let _prime = image.acquire_with(&VmConfig::new().with_pool_warm(5))?;
    /// drop(_prime);
    /// // Now 5 threads can each call image.acquire() in parallel.
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    ///
    /// Or set `SUPERMACHINE_POOL_WARM=N` in the environment as a
    /// non-code knob.
    ///
    /// Once the pool is sized, `acquire()` blocks only when all N
    /// workers are checked out — typical pattern is one
    /// `acquire()` per concurrent task with N equal to the
    /// concurrency cap.
    pub fn acquire(&self) -> Result<PooledVm<'_>, Error> {
        // An all-defaults config: any pool sizing comes from the
        // environment or the built-in default inside `ensure_pool`.
        let defaults = VmConfig::new();
        self.acquire_with(&defaults)
    }

    /// Like [`Image::acquire`] but with an explicit
    /// [`VmConfig`] (overrides for memory, vCPUs, asset paths,
    /// **pool size**, etc.). The config is honored on **first**
    /// acquire — when the pool is built. Subsequent acquires
    /// reuse the existing pool regardless of `config`. This is
    /// fine for most use cases; create a fresh `Image` if you
    /// need a different config without restarting your app.
    pub fn acquire_with(&self, config: &VmConfig) -> Result<PooledVm<'_>, Error> {
        let shared_pool = self.ensure_pool(config)?;
        let worker = shared_pool.acquire()?;

        // Copy the socket paths out of the worker before it is
        // moved into the guard below.
        let vsock_mux_path = worker.vsock_mux_path.clone();
        let vsock_exec_path = worker.vsock_exec_path.clone();

        // The wrapped Vm owns nothing: no in-process pool, no temp
        // socket dir, and cleanup is skipped because the sockets
        // belong to the worker. It also carries no image_meta —
        // Vm::snapshot requires the in-process WarmPool path, so to
        // snapshot a pooled VM, restore its source Image via
        // image.start() and snapshot that instead.
        let vm = Vm {
            pool: None,
            vsock_mux_path,
            vsock_exec_path,
            own_vsock_mux_dir: None,
            skip_cleanup: true,
            image_meta: None,
        };

        let guard = PooledVm {
            vm: Some(vm),
            worker: Some(worker),
            pool_arc: Arc::clone(shared_pool),
            _image: std::marker::PhantomData,
        };
        Ok(guard)
    }

    /// Lazy-init the subprocess pool. Builds `SpawnConfig` from
    /// `config` on the first call, spawns the initial batch of
    /// workers, starts the replenishment thread. Subsequent
    /// calls return the existing pool regardless of `config`
    /// (already configured).
    fn ensure_pool(
        &self,
        config: &VmConfig,
    ) -> Result<&Arc<HiddenPool>, Error> {
        // Fast path: the pool already exists, so `config` is ignored.
        if let Some(p) = self.hidden_pool.get() {
            return Ok(p);
        }
        // Pool size: VmConfig override, then env, then default 1.
        // Concurrent acquires up to this number run in parallel.
        // An unparsable env value falls back to 1, and the trailing
        // `.max(1)` turns an explicit 0 into 1.
        let target = config.pool_warm.unwrap_or_else(|| {
            std::env::var("SUPERMACHINE_POOL_WARM")
                .ok()
                .and_then(|s| s.parse().ok())
                .unwrap_or(1usize)
        }).max(1);
        // Find supermachine-worker. On macOS this is delegated to
        // `codesign::locate_worker_bin` (per the error text: env
        // override, sibling of current_exe, target/release). On
        // other platforms only SUPERMACHINE_WORKER_BIN is consulted.
        #[cfg(target_os = "macos")]
        let worker_bin = crate::codesign::locate_worker_bin().ok_or_else(|| {
            Error::assets_msg(
                "supermachine-worker binary not found (looked for sibling of \
                 current_exe and target/release/supermachine-worker). Set \
                 SUPERMACHINE_WORKER_BIN if you have it elsewhere."
                    .to_owned(),
            )
        })?;
        #[cfg(not(target_os = "macos"))]
        let worker_bin: PathBuf = std::env::var_os("SUPERMACHINE_WORKER_BIN")
            .map(PathBuf::from)
            .ok_or_else(|| {
                Error::assets_msg(
                    "SUPERMACHINE_WORKER_BIN must be set on this platform".to_owned(),
                )
            })?;
        // Best-effort: ensure worker is HVF-entitled. The result is
        // deliberately discarded; the call is compiled on macOS only.
        #[cfg(target_os = "macos")]
        {
            let _ = crate::codesign::ensure_worker_signed(&worker_bin);
        }

        // Unix socket paths are capped at 104 bytes on macOS
        // (SUN_LEN). Default to /tmp instead of $TMPDIR, which on
        // macOS resolves to /var/folders/.../T/ and burns ~50
        // characters before we even start. /tmp/sm-pool-<pid>/
        // leaves room for a meaningful socket name underneath.
        // An explicit `config.vsock_mux_dir` is used verbatim.
        // NOTE(review): the default is keyed by pid only, so pools
        // of different Images in one process share this directory —
        // confirm `spawn_one` picks collision-free socket names.
        let socks_dir = match &config.vsock_mux_dir {
            Some(d) => d.clone(),
            None => PathBuf::from(format!(
                "/tmp/sm-pool-{}",
                std::process::id(),
            )),
        };
        std::fs::create_dir_all(&socks_dir).map_err(Error::Io)?;
        // Per-call overrides win over the values baked into the image.
        let memory_mib = config.memory_mib.unwrap_or(self.memory_mib);
        let vcpus = config.vcpus.unwrap_or(self.vcpus);
        // Falls back to 30 s here. NOTE(review): the VmConfig docs
        // advertise a 10 s default — confirm which is intended for
        // the pool path.
        let spawn_timeout = config
            .restore_timeout
            .unwrap_or_else(|| Duration::from_secs(30));
        // Same SUN_LEN concern: keep the per-worker name prefix as
        // short as possible. This is a fixed one-letter token, not
        // derived from the snapshot dir name.
        let name_prefix = "w".to_owned();
        let spawn_cfg = Arc::new(SpawnConfig {
            worker_bin,
            snapshot_path: self.snapshot_path.clone(),
            layers: self.layers.clone(),
            delta_squashfs: self.delta_squashfs.clone(),
            memory_mib,
            vcpus,
            socks_dir: socks_dir.clone(),
            name_prefix,
            spawn_timeout,
        });
        // Spawn the initial batch synchronously so the first
        // acquire() finds workers already idle. We could also
        // background-spawn here, but synchronous gives the
        // embedder predictable "init done = pool ready" semantics.
        // The first spawn failure aborts via `?`; workers already in
        // `idle` are dropped with it (presumably their Drop tears the
        // subprocess down — TODO confirm).
        let mut idle = VecDeque::with_capacity(target);
        for _ in 0..target {
            idle.push_back(spawn_cfg.spawn_one()?);
        }
        let pool = Arc::new(HiddenPool {
            state: Mutex::new(PoolState {
                idle,
                alive: target,
                target,
            }),
            available: Condvar::new(),
            socks_dir,
            shutting_down: AtomicBool::new(false),
            replenisher: None,
            spawn_cfg: Arc::clone(&spawn_cfg),
        });
        // Start the replenisher. It watches alive vs target and
        // spawns one worker at a time when behind. It only holds a
        // Weak reference, so it does not keep the pool alive between
        // iterations.
        let pool_weak = Arc::downgrade(&pool);
        let h = std::thread::Builder::new()
            .name("supermachine-pool-replenish".into())
            .spawn(move || replenisher_loop(pool_weak))
            .map_err(|e| Error::vm_msg(format!("spawn replenisher thread: {e}")))?;
        // The pool is already behind an Arc, so the JoinHandle can't
        // be written into its `replenisher` field (left as `None`
        // above). Park it in the REPLENISHER_HANDLES side-table
        // instead, keyed by the pool's Arc address, presumably so
        // pool teardown can look it up and join — confirm against
        // HiddenPool's Drop. `lock().unwrap()` panics if the table's
        // mutex is poisoned.
        REPLENISHER_HANDLES.lock().unwrap().insert(
            Arc::as_ptr(&pool) as usize,
            h,
        );
        // NOTE(review): the `set` result is ignored. If another
        // thread initialized `hidden_pool` between the `get` above
        // and this call, the pool built here is dropped and the
        // winner's pool is returned below — confirm that dropping
        // the loser cleans up its workers without disturbing the
        // shared socket directory.
        let _ = self.hidden_pool.set(pool);
        Ok(self
            .hidden_pool
            .get()
            .expect("hidden pool was just initialized"))
    }
}

/// Replenisher thread body: keeps the pool topped up to `target`.
///
/// Each iteration upgrades the weak handle and exits when the pool
/// is gone, when `shutting_down` is set, or when the state mutex is
/// poisoned. While `alive >= target` it parks on the `available`
/// condvar, re-checking at least every 200 ms. When the pool is
/// behind, it spawns one worker outside the lock, pushes it onto
/// `idle`, bumps `alive`, and wakes all waiters. A failed spawn is
/// retried after a 500 ms back-off.
fn replenisher_loop(pool: std::sync::Weak<HiddenPool>) {
    loop {
        // Upgrade the weak ref. If None, the pool is gone — exit.
        let Some(p) = pool.upgrade() else { return };
        if p.shutting_down.load(Ordering::SeqCst) {
            return;
        }
        {
            // A poisoned state mutex ends the thread.
            let Ok(state) = p.state.lock() else { return };
            if state.alive >= state.target {
                // Check and wait under the SAME guard: the condvar
                // releases the mutex atomically, so a notification
                // sent right after the check can't slip through
                // unobserved (re-locking between check and wait
                // would open that window). The timeout bounds how
                // long a set `shutting_down` flag or a dropped pool
                // goes unnoticed.
                let _ = p.available.wait_timeout(state, Duration::from_millis(200));
                continue;
            }
            // Guard dropped here — spawn_one is slow and must not
            // run under the lock.
        }
        let spawned = p.spawn_cfg.spawn_one();
        match spawned {
            Ok(w) => {
                // If the mutex got poisoned meanwhile, the worker is
                // simply dropped; the next iteration exits on the
                // poisoned lock.
                if let Ok(mut s) = p.state.lock() {
                    s.idle.push_back(w);
                    s.alive += 1;
                    p.available.notify_all();
                }
            }
            Err(_) => {
                // Spawn failed — back off briefly and try again.
                std::thread::sleep(Duration::from_millis(500));
            }
        }
    }
}

// Process-wide side-table of replenisher join handles, keyed by the
// address of the owning pool's Arc allocation. The handle lives here
// rather than inside HiddenPool because the replenisher thread needs
// a Weak<HiddenPool>: the pool must already be behind an Arc before
// the thread (and therefore its JoinHandle) exists.
static REPLENISHER_HANDLES: std::sync::LazyLock<
    Mutex<std::collections::HashMap<usize, JoinHandle<()>>>,
> = std::sync::LazyLock::new(|| Mutex::new(std::collections::HashMap::new()));

/// A [`Vm`] checked out of an [`Image`]'s hidden pool. `Deref`s
/// to `Vm`, so every method on `Vm` is callable. On `Drop` the
/// worker is handed back to the pool — the next [`Image::acquire`]
/// gets a freshly snapshot-restored worker in ~5 ms.
///
/// The `'a` lifetime ties the guard to the `Image` it was acquired
/// from; independently, `pool_arc` holds a strong reference to the
/// pool itself. With the default pool size of one, concurrent
/// acquires serialize: each blocks until the previous `PooledVm`
/// is dropped. Size the pool via [`VmConfig::with_pool_warm`] (or
/// `SUPERMACHINE_POOL_WARM`) to let several acquires proceed in
/// parallel.
pub struct PooledVm<'a> {
    /// The wrapped handle. `Some` until `Drop` takes it; `Deref`
    /// panics if it is ever observed as `None`.
    vm: Option<Vm>,
    /// Worker subprocess we checked out. On Drop, returned to
    /// the pool (which kills + replenishes).
    worker: Option<Worker>,
    /// Keeps the pool alive for the lifetime of this PooledVm.
    pool_arc: Arc<HiddenPool>,
    /// Zero-sized marker carrying the borrow of the source `Image`.
    _image: std::marker::PhantomData<&'a Image>,
}

impl<'a> std::ops::Deref for PooledVm<'a> {
    type Target = Vm;

    /// Borrows the wrapped [`Vm`]. Panics if the slot is empty,
    /// which only happens once `Drop` has taken the value.
    fn deref(&self) -> &Vm {
        match &self.vm {
            Some(inner) => inner,
            None => panic!("PooledVm used after drop"),
        }
    }
}

impl<'a> std::ops::DerefMut for PooledVm<'a> {
    /// Mutable counterpart of `deref`; same panic condition.
    fn deref_mut(&mut self) -> &mut Vm {
        match &mut self.vm {
            Some(inner) => inner,
            None => panic!("PooledVm used after drop"),
        }
    }
}

impl<'a> Drop for PooledVm<'a> {
    fn drop(&mut self) {
        // Release the inner Vm before the worker. Its own Drop
        // honours skip_cleanup and does nothing, so the worker's
        // socket files (owned by Worker) stay untouched.
        drop(self.vm.take());

        // Nothing to hand back if the worker slot is already empty.
        let Some(worker) = self.worker.take() else {
            return;
        };
        // Give the worker back to the pool, which kills it (its
        // snapshot state may be dirty) and triggers replenishment
        // so the pool stays at target N.
        self.pool_arc.release(worker);
        // pool_arc itself is dropped after this body returns,
        // decrementing the refcount.
    }
}

/// Configurable bake of an OCI image. Built via [`Image::builder`];
/// terminate with [`OciImageBuilder::build`] to produce an
/// [`Image`].
///
/// Every setter that affects the workload's behavior (env, cmd,
/// memory, guest_port) is part of the bake's input fingerprint:
/// changing it forces a re-bake and produces a different snapshot.
/// Use distinct `with_name` values if you want side-by-side
/// snapshots for the same image ref with different configs.
pub struct OciImageBuilder {
    /// OCI image reference to bake.
    image: String,
    /// Snapshot name override; `None` = derive it from `image`.
    name: Option<String>,
    /// Cache + registry policy consulted by `build`.
    pull_policy: PullPolicy,
    /// Bake-time memory in MiB; `build` falls back to 256.
    memory_mib: Option<u32>,
    /// Bake-time vCPU count; `build` falls back to 1.
    vcpus: Option<u32>,
    /// Guest port used as the readiness signal; `build` falls back to 80.
    guest_port: Option<u16>,
    /// Replacement argv for the image's `CMD`; `None` = no override.
    cmd: Option<Vec<String>>,
    /// Extra environment variables, in insertion order.
    envs: Vec<(String, String)>,
    /// Snapshot store override; `None` = `default_snapshots_dir()`.
    snapshots_dir: Option<PathBuf>,
}

impl OciImageBuilder {
    /// Start a new builder for `image_ref` (e.g. `"nginx:1.27-alpine"`,
    /// `"ghcr.io/owner/image@sha256:..."`).
    pub fn new(image_ref: impl Into<String>) -> Self {
        Self {
            image: image_ref.into(),
            name: None,
            pull_policy: PullPolicy::default(),
            memory_mib: None,
            vcpus: None,
            guest_port: None,
            cmd: None,
            envs: Vec::new(),
            snapshots_dir: None,
        }
    }

    /// Override the number of vCPUs the snapshot is baked with.
    /// Default `1`. Multi-vCPU is opt-in: it lifts sustained
    /// HTTP-serving throughput (single-vCPU is the c=32+
    /// bottleneck) at the cost of slightly higher cold boot and
    /// some snapshot/restore caveats. See
    /// docs/design/concurrency-floor-2026-05-04.md.
    pub fn with_vcpus(mut self, vcpus: u32) -> Self {
        self.vcpus = Some(vcpus);
        self
    }

    /// Snapshot name. Default: derived from the image ref via
    /// `bake::snapshot_name_for_image`. Use this when you want
    /// `nginx:1.27-alpine` baked twice with different configs.
    pub fn with_name(mut self, name: impl Into<String>) -> Self {
        self.name = Some(name.into());
        self
    }

    /// Cache + registry policy. See [`PullPolicy`].
    pub fn with_pull_policy(mut self, policy: PullPolicy) -> Self {
        self.pull_policy = policy;
        self
    }

    /// Override the bake-time memory budget (MiB). The runtime
    /// memory is set on [`VmConfig`]; this is the size the
    /// snapshot is captured at.
    pub fn with_memory_mib(mut self, mib: u32) -> Self {
        self.memory_mib = Some(mib);
        self
    }

    /// Override the guest service port the bake waits for as the
    /// readiness signal. Default `80`.
    pub fn with_guest_port(mut self, port: u16) -> Self {
        self.guest_port = Some(port);
        self
    }

    /// Override the image's `CMD`. Pass an argv array, same shape
    /// as Docker's `--entrypoint` + arguments combined.
    ///
    /// ```no_run
    /// # use supermachine::Image;
    /// let img = Image::builder("python:3.12-alpine")
    ///     .with_cmd(["python", "-m", "http.server", "8080"])
    ///     .with_guest_port(8080)
    ///     .build()?;
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    pub fn with_cmd<I, S>(mut self, cmd: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        self.cmd = Some(cmd.into_iter().map(Into::into).collect());
        self
    }

    /// Add an environment variable for the workload. Repeatable.
    /// Mirrors `docker run -e KEY=VAL`.
    pub fn with_env(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
        self.envs.push((key.into(), value.into()));
        self
    }

    /// Override the directory snapshots are stored in. Default
    /// is `~/.local/supermachine-snapshots`. Use this to keep
    /// per-project snapshot stores isolated from each other.
    pub fn with_snapshots_dir(mut self, dir: impl Into<PathBuf>) -> Self {
        self.snapshots_dir = Some(dir.into());
        self
    }

    /// Run the bake (or reuse a cached snapshot per
    /// `with_pull_policy`) and return the resulting [`Image`].
    pub fn build(self) -> Result<Image, Error> {
        let snapshots_dir = self
            .snapshots_dir
            .unwrap_or_else(default_snapshots_dir);
        let derived_name = self
            .name
            .clone()
            .unwrap_or_else(|| crate::bake::snapshot_name_for_image(&self.image));
        let snap_dir = snapshots_dir.join(&derived_name);

        // Cache fast-path: same as Image::from_oci_to_dir, but on
        // the builder we have to assume the cache might be stale
        // for a different config under the same name. We trust the
        // bake pipeline's input-hash check (`native_bake_key`) to
        // re-bake when the inputs changed; on cache hit it's a
        // no-op and we just load the existing snapshot.
        let cache_loadable = Image::from_snapshot(&snap_dir).is_ok();
        match self.pull_policy {
            PullPolicy::Never => {
                if cache_loadable {
                    return Image::from_snapshot(&snap_dir);
                }
                let restore_snap = snap_dir.join("restore.snap");
                if restore_snap.is_file() {
                    return Err(Error::cache_invalid(format!(
                        "snapshot present at {} but not loadable on this binary; \
                         rebake required (PullPolicy::Never won't auto-rebake)",
                        snap_dir.display()
                    )));
                }
                return Err(Error::cache_miss(format!(
                    "no cached snapshot for {} at {} (PullPolicy::Never)",
                    self.image,
                    snap_dir.display()
                )));
            }
            // Missing+invalid OR Always: fall through to bake. The
            // bake pipeline will short-circuit on input-hash match
            // even on Always policy.
            _ => {}
        }

        // Encode env / cmd into the form `bake::run_push` accepts.
        let mut extra_args: Vec<String> = Vec::new();
        for (k, v) in &self.envs {
            extra_args.push("--env".to_owned());
            extra_args.push(format!("{k}={v}"));
        }
        let cmd_override = match &self.cmd {
            Some(argv) => Some(
                serde_json::to_string(argv)
                    .map_err(|e| Error::bake_msg(format!("encode cmd: {e}")))?,
            ),
            None => None,
        };

        let root = repo_root_for_bake()?;
        let request = crate::bake::BakeRequest {
            image: self.image.clone(),
            name: self.name.clone(),
            runtime: "supermachine".to_owned(),
            guest_port: self.guest_port.unwrap_or(80),
            memory_mib: self.memory_mib.unwrap_or(256),
            vcpus: self.vcpus.unwrap_or(1),
            pull_policy: self.pull_policy.as_bake_str().to_owned(),
            snapshots_dir: snapshots_dir.clone(),
            cmd_override,
            extra_args,
        };
        let bake_t0 = std::time::Instant::now();
        crate::bake::run_push(&request, bake_t0, &root).map_err(map_bake_error)?;
        Image::from_snapshot(&snap_dir)
    }
}

/// Default snapshots directory: `~/.local/supermachine-snapshots`,
/// matching the CLI's default. Customizable via
/// [`Image::from_oci_to_dir`] or `$SUPERMACHINE_SNAPSHOTS`.
///
/// Resolution order:
/// 1. `$SUPERMACHINE_SNAPSHOTS`, used verbatim;
/// 2. `$HOME/.local/supermachine-snapshots`;
/// 3. `./.local/supermachine-snapshots` when `HOME` is unavailable.
///
/// A variable that is set but empty counts as unset: an empty
/// override would otherwise yield an empty path and silently place
/// snapshots relative to the current directory.
fn default_snapshots_dir() -> PathBuf {
    // Read a variable, discarding the set-but-empty case.
    let non_empty = |key: &str| std::env::var_os(key).filter(|value| !value.is_empty());

    if let Some(dir) = non_empty("SUPERMACHINE_SNAPSHOTS") {
        return PathBuf::from(dir);
    }
    non_empty("HOME")
        .map_or_else(|| PathBuf::from("."), PathBuf::from)
        .join(".local/supermachine-snapshots")
}

/// `bake::run_push` wants a "repo root" so it can locate the
/// supermachine-worker binary, the kernel image, the entitlements
/// plist, etc. The CLI walks up from its own exe to find it. From
/// a library context the same auto-discovery applies (an embedder
/// running their app from the dev tree finds the workspace; a
/// release-tarball install finds `<prefix>/share/supermachine`).
fn repo_root_for_bake() -> Result<PathBuf, Error> {
    if let Some(root) = std::env::var_os("SUPERMACHINE_ROOT") {
        return Ok(PathBuf::from(root));
    }
    let exe = std::env::current_exe()
        .map_err(|e| Error::bake_msg(format!("current_exe: {e}")))?;
    for ancestor in exe.ancestors() {
        if ancestor.join("tools/supermachine-push").is_file() {
            return Ok(ancestor.to_path_buf());
        }
        if ancestor.join("share/supermachine/kernel").is_file() {
            return Ok(ancestor.to_path_buf());
        }
    }
    std::env::current_dir().map_err(|e| Error::bake_msg(format!("current_dir: {e}")))
}

/// Map a `bake::run_push` error string into the right
/// [`Error`] variant. The bake pipeline returns flat strings, so
/// we pattern-match keywords (case-insensitively, as substrings).
///
/// Messages mentioning the registry, a manifest, `docker pull` or
/// auth are classified as network errors. Everything else —
/// including snapshot timeouts and listener-readiness failures —
/// is a bake error, so an unrecognized message is never
/// misclassified as something more specific.
fn map_bake_error(msg: String) -> Error {
    // Substrings that mark a failure as network-related.
    const NETWORK_HINTS: [&str; 4] = ["registry", "manifest", "docker pull", "auth"];

    let lc = msg.to_ascii_lowercase();
    if NETWORK_HINTS.iter().any(|hint| lc.contains(*hint)) {
        Error::network_msg(msg)
    } else {
        Error::bake_msg(msg)
    }
}

/// Configuration for [`Vm::start`] and [`Image::acquire_with`].
/// The fields are private; build one with [`VmConfig::new`] (or
/// `Default`) followed by the chainable `with_*` methods:
///
/// ```
/// use supermachine::VmConfig;
/// let cfg = VmConfig::new()
///     .with_memory_mib(512)
///     .with_vcpus(2);
/// # let _ = cfg;
/// ```
#[derive(Debug, Clone, Default)]
pub struct VmConfig {
    /// Override the image's baked memory. `None` = use Image's value.
    memory_mib: Option<u32>,
    /// Override the image's baked vCPUs. `None` = use Image's value.
    vcpus: Option<u32>,
    /// Explicit asset paths. `None` = [`Vm::start`] falls back to
    /// `AssetPaths::discover()`.
    assets: Option<AssetPaths>,
    /// Directory for the host-side sockets. `None` = default; the
    /// hidden pool behind [`Image::acquire`] then uses
    /// `/tmp/sm-pool-<pid>`.
    vsock_mux_dir: Option<PathBuf>,
    /// Restore timeout. `None` = default; the hidden pool behind
    /// [`Image::acquire`] then uses 30 s as its spawn timeout.
    restore_timeout: Option<Duration>,
    /// Number of pre-warmed worker subprocesses for the hidden
    /// pool that backs [`Image::acquire`]. Each pre-warmed
    /// worker is its own subprocess (separate `hv_vm_create`),
    /// so this also caps the maximum number of concurrent
    /// `acquire()` calls before they start blocking. Default
    /// `None` falls back to the `SUPERMACHINE_POOL_WARM` env
    /// var, then to `1`.
    pub(crate) pool_warm: Option<usize>,
}

impl VmConfig {
    /// All-defaults configuration: the image's baked memory and
    /// vCPUs, auto-discovered assets, vsock-mux socket in
    /// `$TMPDIR`, 10 s restore timeout.
    ///
    /// When the config is used for the hidden pool behind
    /// [`Image::acquire`], unset fields instead fall back to
    /// `/tmp/sm-pool-<pid>` for sockets and a 30 s spawn timeout.
    pub fn new() -> Self {
        Default::default()
    }

    /// Replace the image's baked memory size (MiB).
    pub fn with_memory_mib(self, mib: u32) -> Self {
        Self {
            memory_mib: Some(mib),
            ..self
        }
    }

    /// Replace the image's baked vCPU count.
    pub fn with_vcpus(self, vcpus: u32) -> Self {
        Self {
            vcpus: Some(vcpus),
            ..self
        }
    }

    /// Supply asset paths explicitly instead of relying on
    /// auto-discovery. Useful for `.app` bundles that ship the
    /// kernel + init shim under `Contents/Resources/`.
    pub fn with_assets(self, assets: AssetPaths) -> Self {
        Self {
            assets: Some(assets),
            ..self
        }
    }

    /// Directory for the host-side vsock-mux unix socket. Default
    /// is `$TMPDIR`. Use this if you need the socket inside an
    /// app-private dir for sandboxing reasons.
    pub fn with_vsock_mux_dir(self, dir: impl Into<PathBuf>) -> Self {
        Self {
            vsock_mux_dir: Some(dir.into()),
            ..self
        }
    }

    /// Upper bound on waiting for the snapshot to restore.
    /// Default 10 s.
    pub fn with_restore_timeout(self, timeout: Duration) -> Self {
        Self {
            restore_timeout: Some(timeout),
            ..self
        }
    }

    /// Number of pre-warmed worker subprocesses kept ready in the
    /// hidden pool that backs [`Image::acquire`]. Every worker is
    /// its own subprocess, so this is also the number of
    /// `acquire()` calls that can run in parallel before further
    /// ones block. Default 1; values below 1 are raised to 1.
    /// Honored at first acquire (when the pool is built);
    /// subsequent acquires reuse the existing pool.
    ///
    /// For an eval-harness running 5 tasks concurrently per
    /// iteration: `with_pool_warm(5)`.
    pub fn with_pool_warm(self, n: usize) -> Self {
        Self {
            pool_warm: Some(n.max(1)),
            ..self
        }
    }
}

/// A running microVM. Holds an internal worker process and the
/// host-side vsock-mux unix socket through which you talk to the
/// guest.
///
/// Drop the value to stop the VM, or call [`Vm::stop`] for an
/// explicit shutdown report.
pub struct Vm {
    /// Warm pool owned by this VM. `None` for the Vm wrapped by a
    /// [`PooledVm`], whose worker belongs to the image's hidden
    /// pool instead.
    pool: Option<WarmPool>,
    /// Path of the host-side vsock-mux unix socket.
    vsock_mux_path: PathBuf,
    /// `<vsock_mux>-exec.sock` for the in-guest exec agent. Only
    /// useful once the agent crate ships in the initramfs (see
    /// `docs/design/exec-2026-05-03.md`); until then dialing this
    /// path will fail with "no listener" because the agent isn't
    /// running guest-side. The unix socket itself is created
    /// unconditionally so `Vm::exec` can wire to it once the agent
    /// lands.
    vsock_exec_path: PathBuf,
    /// Best-effort cleanup of the temp socket dir we created.
    /// `None` for the Vm wrapped by a [`PooledVm`].
    own_vsock_mux_dir: Option<PathBuf>,
    /// `true` for [`PooledVm`]'s wrapped Vm — Drop must NOT
    /// shut down the pool or unlink sockets, since those are
    /// owned by [`Image`]'s [`HiddenPool`] and reused across
    /// [`Image::acquire`] calls.
    skip_cleanup: bool,
    /// Source image metadata for [`Vm::snapshot`] — needed to
    /// emit a `metadata.json` describing the new snapshot's
    /// layers / memory / vCPUs (the snapshot file alone isn't
    /// loadable as an Image without these).  `None` for the Vm
    /// wrapped by a [`PooledVm`]: `Image::acquire_with` never
    /// populates it, so such a VM can't be snapshotted directly.
    image_meta: Option<Arc<ImageMeta>>,
}

/// Subset of [`Image`] fields snapshotted into [`Vm`] so
/// [`Vm::snapshot`] can write a self-contained `metadata.json`
/// next to the captured snapshot file.
#[derive(Clone, Debug)]
pub(crate) struct ImageMeta {
    /// Memory size in MiB, copied from the source image.
    pub memory_mib: u32,
    /// vCPU count, copied from the source image.
    pub vcpus: u32,
    /// Layer paths of the source image.
    pub layers: Vec<PathBuf>,
    /// Optional delta squashfs path of the source image.
    pub delta_squashfs: Option<PathBuf>,
}

impl Vm {
    /// Start a microVM from `image` with the supplied configuration.
    ///
    /// What this does, in order:
    ///
    /// 1. Resolves the kernel path. Preferences (first hit wins):
    ///    `image`'s bundled kernel (if the snapshot dir shipped one),
    ///    `config.assets.kernel` (if set explicitly), then
    ///    [`AssetPaths::discover`]. Fails with [`Error::Assets`] if
    ///    none is found.
    /// 2. Creates a unique unix socket path for vsock-mux under
    ///    the configured directory.
    /// 3. Spawns an in-process VM thread that restores from
    ///    `image.snapshot_path()`. (The library runs the VM
    ///    in-process via [`crate::internal::vmm::pool::WarmPool`].
    ///    The standalone `supermachine-worker` binary is only used
    ///    by the router daemon for SCM_RIGHTS process isolation.)
    /// 4. Waits up to [`VmConfig::with_restore_timeout`] for the
    ///    restore to complete.
    /// 5. Returns the [`Vm`] handle. The vsock-mux socket is
    ///    available immediately at [`Vm::vsock_path`].
    pub fn start(image: &Image, config: &VmConfig) -> Result<Vm, Error> {
        // Vm::start runs the VM thread in this process, so this
        // process itself calls hv_vm_create. Without the HVF
        // entitlement that fails with HV_DENIED (Hv(-85377017)),
        // which is cryptic. Surface a clear error up front instead.
        // Image::acquire callers don't pay this cost — the worker
        // subprocess handles HVF for them.
        #[cfg(target_os = "macos")]
        if let Err(msg) = crate::codesign::check_self_has_hvf_entitlement() {
            return Err(Error::vm_msg(msg));
        }
        let assets = match &config.assets {
            Some(a) => a.clone(),
            None => AssetPaths::discover(),
        };
        // Kernel preference: bundled (snapshot dir) > config.assets >
        // AssetPaths::discover. A bundled kernel makes the snapshot
        // self-contained so a `.app` ships everything it needs.
        let kernel: PathBuf = if let Some(k) = image.bundled_kernel.as_ref() {
            k.clone()
        } else if let Some(k) = assets.kernel.as_ref() {
            k.clone()
        } else {
            return Err(Error::assets_msg(
                "no kernel found: snapshot dir has no bundled kernel and AssetPaths::discover() came up empty; set VmConfig::with_assets() or $SUPERMACHINE_ASSETS_DIR".to_owned(),
            ));
        };
        let kernel = kernel.as_path();

        // Per-VM unix socket path under the chosen dir.
        let dir = match &config.vsock_mux_dir {
            Some(d) => d.clone(),
            None => std::env::temp_dir(),
        };
        let mut own_dir = None;
        if !dir.is_dir() {
            std::fs::create_dir_all(&dir).map_err(Error::Io)?;
            own_dir = Some(dir.clone());
        }
        let vsock_mux_path = dir.join(format!(
            "supermachine-vm-{}-{}.sock",
            std::process::id(),
            unique_suffix(),
        ));
        // `<vsock_mux>-exec` is the convention that worker.rs and
        // the design doc agree on. Same parent dir so unlinking the
        // mux on shutdown sweeps it too.
        let vsock_exec_path = {
            let mut p = vsock_mux_path.clone();
            let mut name = p.file_name().unwrap().to_owned();
            name.push("-exec");
            p.set_file_name(name);
            p
        };

        // Build VmResources for snapshot restore. Memory + vCPUs
        // come from the image's bake metadata unless the caller
        // explicitly overrode them.
        let memory_mib = config.memory_mib.unwrap_or(image.memory_mib);
        let vcpus = config.vcpus.unwrap_or(image.vcpus);
        let mut resources = VmResources::new()
            .with_kernel_path(kernel.to_string_lossy().to_string())
            .with_memory_mib(memory_mib as usize)
            .with_vcpus(vcpus)
            .with_cow_restore(true)
            .with_restore(image.snapshot_path.to_string_lossy().to_string())
            .with_vsock_mux(vsock_mux_path.to_string_lossy().to_string())
            .with_vsock_exec(vsock_exec_path.to_string_lossy().to_string());

        // Attach the OCI image's virtio-blk layers in bake order.
        // The guest's overlayfs union is built bottom-up over these.
        for layer in &image.layers {
            resources = resources.with_block_device(layer.to_string_lossy().to_string());
        }
        if let Some(delta) = &image.delta_squashfs {
            resources = resources.with_block_device(delta.to_string_lossy().to_string());
        }

        // Pool of size 1 — single worker, single VM.
        let options = RunOptions::default();
        let pool = WarmPool::start(resources, options).map_err(Error::from)?;

        // Restore from the snapshot. WarmPool's restore_timeout
        // dispatches the RESTORE command to the pre-spawned worker
        // and blocks until the guest is up.
        let timeout = config
            .restore_timeout
            .unwrap_or_else(|| Duration::from_secs(10));
        let _ = pool
            .restore_timeout(image.snapshot_path.to_string_lossy().to_string(), timeout)
            .map_err(Error::from)?;

        Ok(Vm {
            pool: Some(pool),
            vsock_mux_path,
            vsock_exec_path,
            own_vsock_mux_dir: own_dir,
            skip_cleanup: false,
            image_meta: Some(Arc::new(ImageMeta {
                memory_mib,
                vcpus,
                layers: image.layers.clone(),
                delta_squashfs: image.delta_squashfs.clone(),
            })),
        })
    }

    /// Location of the host-side vsock-mux unix socket, which proxies
    /// bytes to and from the first TSI listener inside the guest.
    /// Dial it yourself with [`UnixStream::connect`], or let
    /// [`Vm::connect`] do it for you.
    pub fn vsock_path(&self) -> &Path {
        self.vsock_mux_path.as_path()
    }

    /// Location of the host-side unix socket bridging to the in-guest
    /// exec agent (native AF_VSOCK on the guest side). It only becomes
    /// useful once the agent is part of the initramfs and running in
    /// the guest; before that, dialing it returns an immediate EOF.
    pub fn exec_path(&self) -> &Path {
        self.vsock_exec_path.as_path()
    }

    /// Run a command inside the running guest, in the spirit of
    /// `docker exec`. The returned [`crate::exec::ExecChild`] lets you
    /// read stdout/stderr, write stdin, and `wait()` for the exit
    /// status. This is shorthand for a default
    /// [`crate::exec::ExecBuilder`] with only `argv` set.
    ///
    /// ```no_run
    /// # use std::io::Read;
    /// # use supermachine::{Image, Vm, VmConfig};
    /// let image = Image::from_snapshot("path/to/snapshot")?;
    /// let vm = Vm::start(&image, &VmConfig::new())?;
    /// let mut child = vm.exec(["sh", "-c", "echo hi"])?;
    /// let mut buf = String::new();
    /// child.stdout().unwrap().read_to_string(&mut buf)?;
    /// assert_eq!(buf, "hi\n");
    /// child.wait()?;
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn exec<I, S>(&self, argv: I) -> std::io::Result<crate::exec::ExecChild>
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        // Same builder `exec_builder()` hands out, constructed inline.
        crate::exec::ExecBuilder::new(self.vsock_exec_path.clone())
            .argv(argv)
            .spawn()
    }

    /// Entry point for a customised exec: TTY, env vars, cwd, initial
    /// winsize and timeout are all set on the returned builder, which
    /// also offers [`crate::exec::ExecBuilder::output`] to drain stdio
    /// and gather the exit status into a single
    /// [`crate::exec::ExecOutcome`].
    pub fn exec_builder(&self) -> crate::exec::ExecBuilder {
        // The builder owns its own copy of the exec socket path.
        let agent_socket = self.vsock_exec_path.clone();
        crate::exec::ExecBuilder::new(agent_socket)
    }

    /// Write `bytes` to `path` inside the guest, atomically.
    /// Native vsock RPC — no exec, no shell. Roughly ~100 µs per
    /// call regardless of file size (up to the 12 MiB raw limit
    /// imposed by the agent's frame cap).
    ///
    /// The guest agent stages to a sibling tmp file then renames
    /// for atomicity, so partial writes don't leave a half-baked
    /// file at `path`.
    ///
    /// ```no_run
    /// # use supermachine::{Image, VmConfig};
    /// let image = Image::from_snapshot("path/to/snapshot")?;
    /// let vm = image.start(&VmConfig::new())?;
    /// vm.write_file("/tmp/main.rs", b"fn main() { println!(\"hi\"); }")?;
    /// let out = vm.exec_builder()
    ///     .argv(["rustc", "/tmp/main.rs", "-o", "/tmp/main"])
    ///     .output()?;
    /// assert!(out.success());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_file(&self, path: &str, bytes: &[u8]) -> std::io::Result<()> {
        let body = serde_json::json!({
            "action": "write_file",
            "path": path,
            "data_b64": b64_encode(bytes),
        });
        crate::exec::send_control(&self.vsock_exec_path, &body)
    }

    /// Fetch the contents of `path` from inside the guest; the
    /// counterpart of [`Vm::write_file`] (native vsock RPC, ~100 µs
    /// per call). Reads are capped at 4 MiB by default and larger
    /// files produce an error; stream those via [`Vm::exec`] instead.
    ///
    /// # Errors
    ///
    /// Besides transport errors from the control channel, returns
    /// `InvalidData` when the agent's reply has no string `data_b64`
    /// field or when that field is not valid base64.
    pub fn read_file(&self, path: &str) -> std::io::Result<Vec<u8>> {
        let request = serde_json::json!({
            "action": "read_file",
            "path": path,
        });
        // Generous read timeout for large reads — file IO inside
        // the VM is fast, but we want to tolerate cold-cache cases.
        let ack_timeout = std::time::Duration::from_secs(30);
        let ack = crate::exec::send_control_with_ack(
            &self.vsock_exec_path,
            &request,
            Some(ack_timeout),
        )?;
        let encoded = match ack.get("data_b64").and_then(|v| v.as_str()) {
            Some(text) => text,
            None => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    "read_file: agent ack missing data_b64",
                ));
            }
        };
        match b64_decode(encoded) {
            Ok(contents) => Ok(contents),
            Err(reason) => Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                reason,
            )),
        }
    }

    /// Deliver Unix signal `signum` to the guest's main workload
    /// process. Handy for a `docker stop`-style graceful shutdown:
    ///
    /// ```no_run
    /// # use supermachine::{Image, Vm, VmConfig};
    /// # let image = Image::from_snapshot("path")?;
    /// # let vm = Vm::start(&image, &VmConfig::new())?;
    /// vm.workload_signal(libc::SIGTERM)?;
    /// // ...wait for the workload to clean up...
    /// vm.stop()?;
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// How it works: a fresh connection to the in-guest exec agent
    /// carries a CONTROL frame; the agent reads
    /// `/run/supermachine-workload.pid` (written by init-oci's PID-1
    /// supervisor) and calls `kill(pid, signum)`. Yields
    /// `Err(NotFound)` when the workload has not been spawned yet,
    /// which only happens during the bake-time window.
    pub fn workload_signal(&self, signum: i32) -> std::io::Result<()> {
        let agent_socket = &self.vsock_exec_path;
        let request = serde_json::json!({
            "action": "signal",
            "signum": signum,
        });
        crate::exec::send_control(agent_socket, &request)
    }

    /// Open a connection to the guest's first TSI listener. The
    /// resulting `UnixStream` is byte-equivalent to a `TcpStream` to
    /// the guest's `:80` (or whichever port it bound).
    ///
    /// supermachine's vsock-mux is a transparent proxy, so for HTTP
    /// you simply write the request and read the response.
    pub fn connect(&self) -> std::io::Result<UnixStream> {
        let mux_socket = self.vsock_mux_path.as_path();
        UnixStream::connect(mux_socket)
    }

    /// Bind a TCP listener on `127.0.0.1:host_port` that forwards
    /// each accepted connection to the guest's TSI listener (the
    /// same destination as [`Vm::connect`]). Returns a
    /// [`TcpForwarder`] that owns the accept-loop thread; drop it
    /// (or call [`TcpForwarder::stop`]) to stop accepting new
    /// connections. In-flight connections continue until they close
    /// naturally.
    ///
    /// `host_port = 0` lets the OS pick a free port; read the actual
    /// address back via [`TcpForwarder::local_addr`].
    ///
    /// `guest_port` is currently informational — supermachine's
    /// vsock-mux exposes the first TSI listener regardless. The
    /// parameter is in the signature so future versions can route
    /// to a specific guest port without breaking callers.
    ///
    /// Use this when you want the embedded VM to look like a normal
    /// localhost service (e.g. `http://127.0.0.1:9090/`) rather than
    /// having every caller go through `vm.connect()`.
    ///
    /// # Errors
    ///
    /// Returns the underlying `io::Error` if the port cannot be bound
    /// or the accept-loop thread cannot be spawned.
    ///
    /// ```no_run
    /// # use supermachine::{Image, Vm, VmConfig};
    /// let image = Image::from_snapshot("path/to/snapshot")?;
    /// let vm = Vm::start(&image, &VmConfig::new())?;
    /// let fwd = vm.expose_tcp(9090, 80)?;
    /// println!("nginx is on {}", fwd.local_addr());
    /// // ... do work ...
    /// drop(fwd); // stop forwarding
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    pub fn expose_tcp(&self, host_port: u16, _guest_port: u16) -> std::io::Result<TcpForwarder> {
        let listener = TcpListener::bind(("127.0.0.1", host_port))?;
        let bound = listener.local_addr()?;
        // accept() stays blocking: there is no timeout here. The
        // forwarder wakes the loop on shutdown by setting `stop` and
        // then connecting to `bound` itself.
        listener.set_nonblocking(false)?;
        let stop = Arc::new(AtomicBool::new(false));
        let stop_thread = Arc::clone(&stop);
        let vsock_path = self.vsock_mux_path.clone();
        let handle = std::thread::Builder::new()
            // Name the thread after the port actually bound, so an
            // OS-assigned port (`host_port == 0`) is still identifiable.
            .name(format!("supermachine-tcp-{}", bound.port()))
            // `spawn` already yields an `io::Error`; pass it through
            // unchanged instead of re-wrapping it.
            .spawn(move || accept_loop(listener, vsock_path, stop_thread))?;
        Ok(TcpForwarder {
            stop,
            handle: Some(handle),
            bound,
        })
    }

    /// Stop the VM. Equivalent to dropping it, but returns errors
    /// rather than swallowing them.
    pub fn stop(mut self) -> Result<(), Error> {
        if let Some(pool) = self.pool.take() {
            let _ = pool.shutdown().map_err(Error::from)?;
        }
        self.cleanup_socket();
        Ok(())
    }

    /// Capture a snapshot of the running VM into `dest_dir`. The
    /// dir gets `restore.snap` (the captured VM state) and a
    /// `metadata.json` describing the layers/memory/vCPUs from
    /// the source [`Image`] — together they form a fresh
    /// snapshot loadable via [`Image::from_snapshot`].
    ///
    /// This is the **"rustc-warm snapshot" pattern** — boot a VM
    /// from a base image (e.g. `rust:1-slim`), populate
    /// expensive in-VM state (run `cargo build` to fill
    /// `target/` with cached deps), capture, then re-use the new
    /// snapshot via `Image::from_snapshot(...).acquire()` for
    /// fast subsequent iterations.
    ///
    /// ```no_run
    /// # use supermachine::{Image, VmConfig};
    /// let base = Image::from_snapshot("path/to/rust-slim")?;
    /// let vm = base.start(&VmConfig::new())?;
    /// // Pre-warm: populate target/ with cached deps.
    /// vm.exec_builder()
    ///     .argv(["sh", "-c", "cd /src && cargo build --release"])
    ///     .output()?;
    /// // Capture; vm is consumed (and stopped).
    /// let warm = vm.snapshot("/tmp/rust-warm")?;
    /// // Now `warm.acquire()` gets you a VM with target/
    /// // already populated — every subsequent compile re-uses
    /// // the cached deps.
    /// # Ok::<(), supermachine::Error>(())
    /// ```
    ///
    /// **Only works on a Vm produced by [`Image::start`]**.
    /// Pooled VMs (from [`Image::acquire`]) live in worker
    /// subprocesses and don't have host-side access to the
    /// snapshot machinery; snapshot them by re-starting the
    /// source image and snapshotting that.
    pub fn snapshot(mut self, dest_dir: impl Into<PathBuf>) -> Result<Image, Error> {
        let dest_dir = dest_dir.into();
        let meta = self.image_meta.clone().ok_or_else(|| {
            Error::vm_msg(
                "Vm::snapshot requires an in-process Vm (use image.start, not image.acquire)"
                    .to_owned(),
            )
        })?;
        let pool = self.pool.as_ref().ok_or_else(|| {
            Error::vm_msg("Vm::snapshot: no pool to drive the capture".to_owned())
        })?;
        std::fs::create_dir_all(&dest_dir).map_err(Error::Io)?;
        let snap_path = dest_dir.join("restore.snap");
        // Trigger the capture via the pool RPC. Generous
        // timeout — capture is fast (~10s of ms) but disk
        // saving for big VMs can take a moment.
        let _result = pool
            .snapshot_timeout(
                snap_path.to_string_lossy().to_string(),
                Duration::from_secs(60),
            )
            .map_err(|e| Error::Vm {
                msg: format!("snapshot capture failed: {e:?}"),
                source: None,
            })?;
        // Write a metadata.json that mirrors what the bake step
        // emits, so Image::from_snapshot can load this dir.
        let metadata = serde_json::json!({
            "memory_mib": meta.memory_mib,
            "vcpus": meta.vcpus,
            "layers": meta
                .layers
                .iter()
                .map(|p| p.to_string_lossy().to_string())
                .collect::<Vec<_>>(),
            "delta_squashfs": meta
                .delta_squashfs
                .as_ref()
                .map(|p| p.to_string_lossy().to_string()),
            "snapshot_base": snap_path.to_string_lossy().to_string(),
            "baked_at": chrono_rfc3339_now(),
            "source": "Vm::snapshot",
        });
        std::fs::write(
            dest_dir.join("metadata.json"),
            serde_json::to_string_pretty(&metadata)
                .map_err(|e| Error::vm_msg(format!("metadata serialize: {e}")))?,
        )
        .map_err(Error::Io)?;
        // Cleanly shut down our worker — the snapshot is on disk.
        if let Some(pool) = self.pool.take() {
            let _ = pool.shutdown();
        }
        self.cleanup_socket();
        // Suppress the Drop, we already cleaned up.
        self.skip_cleanup = true;
        // Load the freshly-written dir as a new Image.
        Image::from_snapshot(&dest_dir)
    }

    /// Best-effort removal of both per-VM socket files and, when this
    /// `Vm` created it, the directory holding them. Every failure is
    /// ignored.
    fn cleanup_socket(&self) {
        for socket in [&self.vsock_mux_path, &self.vsock_exec_path] {
            let _ = std::fs::remove_file(socket);
        }
        // `remove_dir` refuses non-empty directories, so a directory
        // that still has other files in it is left alone.
        if let Some(dir) = self.own_vsock_mux_dir.as_deref() {
            let _ = std::fs::remove_dir(dir);
        }
    }
}

// ---------- minimal base64 (RFC 4648, mirror of agent's) ----------
//
// Inlined to keep deps minimal. Round-trip-tested against the
// agent's implementation; identical alphabet + padding rules.

/// Standard base64 alphabet (RFC 4648 §4), indexed by 6-bit value.
const B64_ALPHA: &[u8; 64] =
    b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/// Encode `bytes` as padded base64 using the standard alphabet.
/// Every group of up to three input bytes becomes four output
/// characters, with `=` filling the slots a short final group
/// cannot supply. Empty input yields an empty string.
pub(crate) fn b64_encode(bytes: &[u8]) -> String {
    let mut encoded = String::with_capacity((bytes.len() + 2) / 3 * 4);
    for group in bytes.chunks(3) {
        // Pack the group into the top of a 24-bit word; bytes missing
        // from a short final group stay zero.
        let word = group
            .iter()
            .enumerate()
            .fold(0u32, |acc, (idx, &byte)| acc | (u32::from(byte) << (16 - 8 * idx)));
        let symbol = |shift: u32| B64_ALPHA[((word >> shift) & 0x3f) as usize] as char;
        encoded.push(symbol(18));
        encoded.push(symbol(12));
        encoded.push(if group.len() > 1 { symbol(6) } else { '=' });
        encoded.push(if group.len() > 2 { symbol(0) } else { '=' });
    }
    encoded
}

/// Decode padded, standard-alphabet base64 (RFC 4648 §4).
///
/// ASCII whitespace anywhere in `s` is ignored. After stripping it,
/// the input must be a multiple of four characters long, and `=` may
/// appear only as the last one or two characters of the final
/// group. Non-zero trailing bits in a padded group are tolerated.
///
/// # Errors
///
/// Returns a human-readable message when the length is wrong, a
/// character is outside the alphabet (this includes a misplaced
/// `=`), or the padding is malformed.
pub(crate) fn b64_decode(s: &str) -> Result<Vec<u8>, String> {
    // 6-bit value of one alphabet character; `None` for anything else.
    fn sextet(b: u8) -> Option<u8> {
        match b {
            b'A'..=b'Z' => Some(b - b'A'),
            b'a'..=b'z' => Some(b - b'a' + 26),
            b'0'..=b'9' => Some(b - b'0' + 52),
            b'+' => Some(62),
            b'/' => Some(63),
            _ => None,
        }
    }

    let bytes: Vec<u8> = s.bytes().filter(|b| !b.is_ascii_whitespace()).collect();
    if bytes.len() % 4 != 0 {
        return Err(format!("base64 length {} is not a multiple of 4", bytes.len()));
    }
    let groups = bytes.len() / 4;
    let mut out = Vec::with_capacity(groups * 3);
    for (idx, chunk) in bytes.chunks_exact(4).enumerate() {
        // Only a trailing run of `=` counts as padding; a `=` anywhere
        // else falls through to the invalid-character check below.
        let pad = chunk.iter().rev().take_while(|&&b| b == b'=').count();
        if pad > 2 || (pad > 0 && idx + 1 != groups) {
            return Err(format!(
                "invalid base64 padding in group {} of {}",
                idx + 1,
                groups
            ));
        }
        let mut acc: u32 = 0;
        for &b in &chunk[..4 - pad] {
            let d = sextet(b).ok_or_else(|| format!("invalid base64 character {:#x}", b))?;
            acc = (acc << 6) | u32::from(d);
        }
        // Left-align the word so the byte extraction below is the same
        // for full and padded groups.
        acc <<= 6 * pad as u32;
        out.push(((acc >> 16) & 0xff) as u8);
        if pad < 2 {
            out.push(((acc >> 8) & 0xff) as u8);
        }
        if pad < 1 {
            out.push((acc & 0xff) as u8);
        }
    }
    Ok(out)
}

impl Drop for Vm {
    /// Shut the pool down and unlink the sockets, unless cleanup has
    /// been disabled for this value. All errors are ignored.
    fn drop(&mut self) {
        // `skip_cleanup` is set for a PooledVm's inner Vm, whose pool
        // and sockets belong to the shared HiddenPool and must
        // outlive this value.
        if !self.skip_cleanup {
            if let Some(pool) = self.pool.take() {
                let _ = pool.shutdown();
            }
            self.cleanup_socket();
        }
    }
}

/// Owns the accept-loop thread for a [`Vm::expose_tcp`] forwarder.
///
/// Drop this to stop accepting new connections. In-flight
/// connections continue until they close naturally — they're owned
/// by their own splice threads, not by the forwarder.
pub struct TcpForwarder {
    /// Set to `true` to ask the accept loop to exit.
    stop: Arc<AtomicBool>,
    /// Accept-loop thread; `None` once shutdown has joined it.
    handle: Option<JoinHandle<()>>,
    /// Address the listener is bound to.
    bound: SocketAddr,
}

impl TcpForwarder {
    /// The address the forwarder is listening on. Useful when you
    /// asked for `host_port = 0` and want to know the OS-assigned
    /// port.
    pub fn local_addr(&self) -> SocketAddr {
        self.bound
    }

    /// Stop accepting new connections. Equivalent to dropping the
    /// forwarder, but returns when the accept thread has actually
    /// exited.
    pub fn stop(mut self) {
        self.shutdown();
    }

    /// Signal the accept loop, wake it, and join it. Idempotent: once
    /// the thread has been joined, later calls (for example from
    /// `Drop` after `stop`) return immediately and do not dial the
    /// port again, since it may belong to another listener by then.
    fn shutdown(&mut self) {
        let handle = match self.handle.take() {
            Some(handle) => handle,
            None => return,
        };
        self.stop.store(true, Ordering::SeqCst);
        // Self-connect to unblock the listener's accept(). We don't
        // care about the result — the connection just exists to wake
        // the loop, which then sees `stop` set and exits.
        let _ = TcpStream::connect_timeout(&self.bound, Duration::from_millis(200));
        let _ = handle.join();
    }
}

impl Drop for TcpForwarder {
    fn drop(&mut self) {
        self.shutdown();
    }
}

/// Accept loop for `Vm::expose_tcp`. Spawns a per-connection splice
/// thread for each accepted TCP stream; the splice threads live
/// independently of the forwarder so in-flight requests survive
/// `TcpForwarder::drop`.
fn accept_loop(listener: TcpListener, vsock_path: PathBuf, stop: Arc<AtomicBool>) {
    for incoming in listener.incoming() {
        if stop.load(Ordering::SeqCst) {
            break;
        }
        let tcp = match incoming {
            Ok(s) => s,
            Err(_) => continue,
        };
        let vsock = vsock_path.clone();
        std::thread::Builder::new()
            .name("supermachine-tcp-conn".into())
            .spawn(move || {
                if let Err(e) = splice_tcp_to_unix(tcp, &vsock) {
                    // Log to stderr — this is best-effort; the
                    // embedder's preferred logging is out of scope.
                    eprintln!("supermachine: tcp forward: {e}");
                }
            })
            .ok();
    }
}

/// Bridge a single TCP connection to the vsock-mux unix socket.
/// Two threads per connection: one shovels TCP→Unix, the other
/// Unix→TCP. Either side closing tears the bridge down.
fn splice_tcp_to_unix(tcp: TcpStream, vsock_path: &Path) -> std::io::Result<()> {
    let unix = UnixStream::connect(vsock_path)?;
    // try_clone so each direction owns its own handle.
    let tcp_w = tcp.try_clone()?;
    let unix_w = unix.try_clone()?;
    let t1 = std::thread::Builder::new()
        .name("supermachine-tcp-c2g".into())
        .spawn(move || {
            let _ = pump(tcp, unix_w);
        })?;
    let t2 = std::thread::Builder::new()
        .name("supermachine-tcp-g2c".into())
        .spawn(move || {
            let _ = pump(unix, tcp_w);
        })?;
    let _ = t1.join();
    let _ = t2.join();
    Ok(())
}

/// Generic byte pump from `r` → `w` until EOF or error. Generic over
/// concrete types so it works for both `TcpStream` and `UnixStream`.
///
/// The write half of `w` is shut down on **every** exit path, not
/// just on EOF, so the peer of `w` sees the FIN whether `r` ended
/// cleanly or the copy failed. Otherwise a failed direction would
/// leave the peer waiting for data that will never arrive.
///
/// Returns `Ok(())` on EOF, or the first read/write error. Reads
/// interrupted by a signal are retried. A failure of the final
/// shutdown itself is ignored.
fn pump<R, W>(mut r: R, mut w: W) -> std::io::Result<()>
where
    R: Read,
    W: Write + Shutdownable,
{
    let mut buf = [0u8; 16 * 1024];
    let outcome = loop {
        match r.read(&mut buf) {
            Ok(0) => break Ok(()),
            Ok(n) => {
                if let Err(e) = w.write_all(&buf[..n]) {
                    break Err(e);
                }
            }
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => break Err(e),
        }
    };
    let _ = w.shutdown_write();
    outcome
}

/// Trait letting `pump` call `shutdown(Write)` on either a
/// `TcpStream` or a `UnixStream` without dynamic dispatch.
trait Shutdownable {
    /// Close the write half of the stream, leaving the read half open.
    fn shutdown_write(&mut self) -> std::io::Result<()>;
}

impl Shutdownable for TcpStream {
    fn shutdown_write(&mut self) -> std::io::Result<()> {
        TcpStream::shutdown(self, std::net::Shutdown::Write)
    }
}

impl Shutdownable for UnixStream {
    fn shutdown_write(&mut self) -> std::io::Result<()> {
        UnixStream::shutdown(self, std::net::Shutdown::Write)
    }
}

/// Produce a number for building per-VM socket file names: the
/// current wall-clock time in nanoseconds since the Unix epoch
/// (0 if the clock reads before the epoch) plus a process-wide
/// call counter, added with wrapping arithmetic.
fn unique_suffix() -> u64 {
    use std::sync::atomic::{AtomicU64, Ordering};
    static COUNTER: AtomicU64 = AtomicU64::new(0);

    let ticket = COUNTER.fetch_add(1, Ordering::Relaxed);
    let since_epoch = match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos() as u64,
        Err(_) => 0,
    };
    since_epoch.wrapping_add(ticket)
}

/// Format the current UTC time as an RFC 3339 timestamp with
/// one-second resolution (`YYYY-MM-DDTHH:MM:SSZ`). Used for the
/// `baked_at` field of [`Vm::snapshot`]'s metadata; hand-rolled so a
/// single timestamp doesn't pull in `chrono`. A clock that reads
/// before the Unix epoch is treated as the epoch itself.
fn chrono_rfc3339_now() -> String {
    let now = std::time::SystemTime::now();
    let unix_secs: i64 = match now.duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    };

    // Split into whole days since 1970-01-01 and the time of day.
    let day_number = unix_secs.div_euclid(86_400);
    let second_of_day = unix_secs.rem_euclid(86_400);
    let hour = second_of_day / 3600;
    let minute = (second_of_day % 3600) / 60;
    let second = second_of_day % 60;

    // Howard Hinnant's civil-from-days: shift the epoch to 0000-03-01
    // so leap days fall at the end of each (March-based) year.
    let shifted = day_number + 719_468;
    let era = (if shifted >= 0 { shifted } else { shifted - 146_096 }) / 146_097;
    let day_of_era = shifted - era * 146_097;
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);
    let month_index = (5 * day_of_year + 2) / 153;
    let day = day_of_year - (153 * month_index + 2) / 5 + 1;
    let month = if month_index < 10 {
        month_index + 3
    } else {
        month_index - 9
    };
    // January and February belong to the next civil year.
    let year = year_of_era + era * 400 + i64::from(month <= 2);

    format!("{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}Z")
}