supermachine 0.7.6

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
//! VM runner — the reusable library entry point behind the
//! `supermachine-worker` command-line harness. Owns VM boot/restore,
//! virtio device wiring, vCPU dispatch, snapshot triggers, and
//! pool-worker warm-restore loops.
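//!
//! # Example
//!
//! A minimal sketch of the zero-flag common case. How an embedder builds
//! [`VmResources`] is outside this module (the `supermachine-worker`
//! harness normally populates it), so it is assumed to arrive ready-made;
//! `run` itself is only compiled on macOS/aarch64.
//!
//! ```ignore
//! fn boot_once(resources: &VmResources) -> Result<RunReport, RunError> {
//!     // Defaults: no TLS endpoint, no pool socket, no experimental flags.
//!     let report = run(resources, RunOptions::default())?;
//!     eprintln!("warm restores performed: {}", report.warm_restores);
//!     Ok(report)
//! }
//! ```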

use std::fmt;
use std::os::unix::net::UnixStream;
use std::time::SystemTime;

use crate::vmm::pool::PoolWorker;
use crate::vmm::resources::{ResourceError, VmResources};
use crate::vmm::tls::TlsConfig;

#[derive(Default)]
pub struct RunOptions {
    pub tls: Option<TlsConfig>,
    pub pool_sock: Option<UnixStream>,
    pub pool_worker: Option<PoolWorker>,
    /// Experimental warm-pool shortcut used to prove whether full HVF GIC blob
    /// restore is required for a snapshot class.
    pub experimental_skip_warm_gic_restore: bool,
}

#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct RunReport {
    pub warm_restores: u64,
}

#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
struct WarmSnapshotCache {
    path: String,
    file_len: u64,
    modified: Option<SystemTime>,
    file: std::fs::File,
    snap: crate::vmm::snapshot::Snapshot,
    ram_offset: u64,
    memory_bytes: usize,
}

#[derive(Debug)]
pub enum RunError {
    Build(crate::vmm::builder::BuildError),
    Hvf(crate::hvf::Error),
    MmapCow {
        path: String,
        source: std::io::Error,
    },
    Pool(crate::vmm::pool::PoolError),
    Resource(ResourceError),
    SnapshotLoad {
        path: String,
        source: crate::vmm::snapshot::FileError,
    },
    ThreadSpawn {
        name: String,
        source: std::io::Error,
    },
    Tls(crate::vmm::tls::StartError),
    UnexpectedProofOfLifeExit {
        reason: crate::hvf::ExitReason,
        ec: u64,
    },
    VsockMux(crate::vmm::vsock_mux::StartError),
    Worker(crate::vmm::worker::WorkerError),
}

impl fmt::Display for RunError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            RunError::Build(e) => write!(f, "{e}"),
            RunError::Hvf(e) => write!(f, "HVF operation failed: {e:?}"),
            RunError::MmapCow { path, source } => {
                write!(f, "mmap CoW snapshot RAM {path}: {source}")
            }
            RunError::Pool(e) => write!(f, "{e}"),
            RunError::Resource(e) => write!(f, "{e}"),
            RunError::SnapshotLoad { path, source } => {
                write!(f, "load snapshot {path}: {source:?}")
            }
            RunError::ThreadSpawn { name, source } => {
                write!(f, "spawn thread {name}: {source}")
            }
            RunError::Tls(e) => write!(f, "{e}"),
            RunError::UnexpectedProofOfLifeExit { reason, ec } => {
                write!(
                    f,
                    "unexpected proof-of-life exit: {reason:?} ESR_EL2 EC={ec:#x}"
                )
            }
            RunError::VsockMux(e) => write!(f, "{e}"),
            RunError::Worker(e) => write!(f, "{e}"),
        }
    }
}

impl std::error::Error for RunError {}

impl From<ResourceError> for RunError {
    fn from(value: ResourceError) -> Self {
        Self::Resource(value)
    }
}

impl From<crate::vmm::builder::BuildError> for RunError {
    fn from(value: crate::vmm::builder::BuildError) -> Self {
        Self::Build(value)
    }
}

impl From<crate::hvf::Error> for RunError {
    fn from(value: crate::hvf::Error) -> Self {
        Self::Hvf(value)
    }
}

impl From<crate::vmm::pool::PoolError> for RunError {
    fn from(value: crate::vmm::pool::PoolError) -> Self {
        Self::Pool(value)
    }
}

impl From<crate::vmm::worker::WorkerError> for RunError {
    fn from(value: crate::vmm::worker::WorkerError) -> Self {
        Self::Worker(value)
    }
}

impl From<crate::vmm::vsock_mux::StartError> for RunError {
    fn from(value: crate::vmm::vsock_mux::StartError) -> Self {
        Self::VsockMux(value)
    }
}

impl From<crate::vmm::tls::StartError> for RunError {
    fn from(value: crate::vmm::tls::StartError) -> Self {
        Self::Tls(value)
    }
}

#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn run(resources: &VmResources, options: RunOptions) -> Result<RunReport, RunError> {
    // Bake-then-pool relaxes the "snapshot trigger requires out_path"
    // rule. The runner doesn't capture on its own trigger in that mode — it
    // signals BAKE_READY on the supervisor socket and lets the host drive
    // capture through the SNAPSHOT_ASYNC / SNAPSHOT RPCs instead.
    let bake_then_pool = (options.pool_sock.is_some() || options.pool_worker.is_some())
        && resources.restore_from.is_none()
        && resources.snapshot.out_path.is_none();
    if !bake_then_pool {
        resources.validate_for_run()?;
    } else if resources.memory_bytes() == 0 {
        return Err(RunError::Resource(
            crate::vmm::resources::ResourceError::ZeroMemory,
        ));
    } else if resources.vcpus == 0 {
        return Err(RunError::Resource(
            crate::vmm::resources::ResourceError::ZeroVcpus,
        ));
    } else if resources.kernel_path.is_none() {
        return Err(RunError::Resource(
            crate::vmm::resources::ResourceError::MissingKernel,
        ));
    }
    run_kernel(
        resources,
        options.tls,
        options.pool_sock,
        options.pool_worker,
        options.experimental_skip_warm_gic_restore,
    )
}

#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn run_proof_of_life() -> Result<(), RunError> {
    use crate::arch::aarch64::layout;
    use crate::vmm::vstate::{MicroVm, TEST_PROGRAM};

    eprintln!("supermachine: HVF init test");
    let vm = MicroVm::new(64 * 1024 * 1024)?;
    eprintln!(
        "  VM created, RAM mapped at GPA 0x{:x}, {} MiB",
        vm.ram_gpa,
        vm.ram_size / (1024 * 1024)
    );

    let entry = vm.ram_gpa + layout::KERNEL_LOAD_OFFSET;
    // SAFETY: TEST_PROGRAM is 8 bytes, well within RAM.
    unsafe {
        vm.write_ram(entry, &TEST_PROGRAM);
    }
    vm.set_boot_cpsr()?;
    vm.set_pc(entry)?;
    eprintln!("  PC set to 0x{entry:x}, CPSR=EL1h (DAIF masked)");

    eprintln!("  running vCPU…");
    let (reason, esr, _gpa, _va) = vm.run_once()?;
    let ec = (esr >> 26) & 0x3f;
    eprintln!("  exit: {reason:?}  ESR_EL2=0x{esr:x}  EC={ec:#x}");

    if reason == crate::hvf::ExitReason::Exception && ec == 0x16 {
        eprintln!("  PASS: HVF round-trip working — guest executed HVC #0");
        Ok(())
    } else {
        eprintln!("  UNEXPECTED exit; HVF probably misconfigured");
        Err(RunError::UnexpectedProofOfLifeExit { reason, ec })
    }
}

#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn run_kernel(
    resources: &VmResources,
    tls_cfg: Option<TlsConfig>,
    pool_sock: Option<UnixStream>,
    pool_worker: Option<PoolWorker>,
    option_skip_warm_gic_restore: bool,
) -> Result<RunReport, RunError> {
    use crate::vmm::builder;
    use crate::vmm::pool::{PoolControl, WarmRestoreTimings};
    use crate::vmm::snapshot;
    use crate::vmm::worker::{self, DispatchSnapshot};

    let kernel_path = resources.kernel_path.as_deref();
    let initrd_path = resources.initrd_path.as_deref();
    let cmdline = resources.cmdline.as_str();
    let mem_size = resources.memory_bytes();
    let blk_paths = &resources.block_devices;
    let n_vcpus = resources.vcpus;
    let snapshot_after_ms = resources.snapshot.after_ms;
    let snapshot_at = resources.snapshot.at_heartbeat;
    let snapshot_on_listener = resources.snapshot.on_listener;
    let snapshot_on_pre_exec = resources.snapshot.on_pre_exec;
    let quiesce_ms = resources.snapshot.quiesce_ms;
    let snapshot_out = resources.snapshot.out_path.as_deref();
    let balloon_target_pages = resources.balloon_target_pages;
    let restore_from = resources.restore_from.as_deref();
    let cow_restore = resources.cow_restore;
    let vsock_mux_path = resources.endpoints.vsock_mux.as_deref();
    let http_port_addr = resources.endpoints.http_port.as_deref();
    let vsock_mux_handoff_path = resources.endpoints.vsock_mux_handoff.as_deref();
    let vsock_exec_path = resources.endpoints.vsock_exec.as_deref();
    let vsock_exec_guest_port = resources
        .endpoints
        .vsock_exec_guest_port
        .unwrap_or(crate::vmm::resources::DEFAULT_EXEC_GUEST_PORT);
    let timings = std::env::var_os("SUPERMACHINE_TIMINGS").is_some();
    let skip_warm_gic_restore =
        option_skip_warm_gic_restore || std::env::var_os("SUPERMACHINE_SKIP_WARM_GIC_RESTORE").is_some();
    let fixed_warm_ram_remap = std::env::var_os("SUPERMACHINE_REMAP_FIXED").is_some();
    let run_t0 = std::time::Instant::now();

    let mut cow_ram: Option<(*mut u8, usize)> = None;
    let restore = match restore_from {
        Some(p) => Some(if cow_restore {
            eprintln!("supermachine: restoring from {p} (CoW mmap)");
            let t0 = std::time::Instant::now();
            let (snap, ram_offset, memory_bytes) =
                snapshot::load_meta(p).map_err(|source| RunError::SnapshotLoad {
                    path: p.to_string(),
                    source,
                })?;
            if timings {
                eprintln!(
                    "[timing] restore.load_meta={}us total={}us",
                    t0.elapsed().as_micros(),
                    run_t0.elapsed().as_micros()
                );
            }
            let t0 = std::time::Instant::now();
            let (ptr, len) =
                snapshot::mmap_ram_cow_at(p, ram_offset, memory_bytes).map_err(|source| {
                    RunError::MmapCow {
                        path: p.to_string(),
                        source,
                    }
                })?;
            if timings {
                eprintln!(
                    "[timing] restore.mmap_cow={}us total={}us",
                    t0.elapsed().as_micros(),
                    run_t0.elapsed().as_micros()
                );
            }
            cow_ram = Some((ptr, len));
            snap
        } else {
            eprintln!("supermachine: restoring from {p}");
            let t0 = std::time::Instant::now();
            snapshot::load_from_file(p)
                .map_err(|source| RunError::SnapshotLoad {
                    path: p.to_string(),
                    source,
                })
                .inspect(|_| {
                    if timings {
                        eprintln!(
                            "[timing] restore.load_full={}us total={}us",
                            t0.elapsed().as_micros(),
                            run_t0.elapsed().as_micros()
                        );
                    }
                })?
        }),
        None => None,
    };

    if restore.is_none() {
        eprintln!("supermachine: kernel boot");
        eprintln!("  kernel    : {}", kernel_path.unwrap_or(""));
        if let Some(p) = initrd_path {
            eprintln!("  initramfs : {p}");
        }
        eprintln!("  cmdline   : {cmdline}");
        eprintln!("  memory    : {} MiB", mem_size / (1024 * 1024));
        for p in blk_paths {
            eprintln!("  blk       : {p}");
        }
    } else if let Some(s) = restore.as_ref() {
        eprintln!(
            "  memory    : {} MiB (from snapshot)",
            s.memory.len() / (1024 * 1024)
        );
    }

    let restore_memory_len = restore.as_ref().map(|s| s.memory.len());
    let t0 = std::time::Instant::now();
    let mut vmm = builder::build_vmm(resources, cow_ram, restore_memory_len)?;
    if timings {
        eprintln!(
            "[timing] restore.build_vmm={}us total={}us",
            t0.elapsed().as_micros(),
            run_t0.elapsed().as_micros()
        );
    }

    // Bind host-facing endpoints BEFORE restore. The lib's
    // `wait_for_socket` (and the router's worker-readiness
    // probe) treats the existence of the unix socket as the
    // "spawn done" signal — binding earlier means clients stop
    // polling sooner and pay less spawn-roundtrip cost.
    //
    // Safety: each accept handler calls `wait_for_host_port`,
    // which polls the muxer's TSI listener registry until the
    // guest's listener registers (post-restore). So if a
    // client connects before restore completes, the kernel
    // queues the connection, accept fires, the handler blocks
    // in `wait_for_host_port` until the guest is up, then
    // proceeds. No data is misrouted.
    if let Some(c) = tls_cfg {
        crate::vmm::tls::start(c, vmm.vsock.clone())?;
    }
    if let Some(p) = vsock_mux_path {
        crate::vmm::vsock_mux::start(p, vmm.vsock.clone(), None)?;
    }
    if let Some(addr) = http_port_addr {
        crate::vmm::vsock_mux::start_tcp(addr, vmm.vsock.clone(), None)?;
    }
    if let Some(p) = vsock_mux_handoff_path {
        crate::vmm::vsock_mux::start_handoff(p, vmm.vsock.clone(), None)?;
    }
    if let Some(p) = vsock_exec_path {
        crate::vmm::vsock_mux::start_exec(p, vmm.vsock.clone(), vsock_exec_guest_port)?;
    }
    if timings {
        eprintln!(
            "[timing] restore.endpoints_ready={}us",
            run_t0.elapsed().as_micros()
        );
    }

    let mut first_restore_us: u128 = 0;
    if let Some(snap) = restore.as_ref() {
        let t0 = std::time::Instant::now();
        vmm.restore_snapshot(snap)?;
        first_restore_us = t0.elapsed().as_micros();
        if timings {
            eprintln!(
                "[timing] restore.state={}us total={}us",
                first_restore_us,
                run_t0.elapsed().as_micros()
            );
        }
        eprintln!(
            "  restored in {first_restore_us} us  (mmio={} listeners={})",
            snap.virtio.mmio.len(),
            snap.virtio.vsock_listeners.len()
        );

        // Hydrate per-mount PosixFs inode tables from the sidecar
        // (0.7.6+). Without this, every restore starts with an empty
        // FUSE `nodeid → host_path` table — the guest's pre-snapshot
        // dentry cache then references nodeids the daemon doesn't
        // know about, surfacing as MODULE_NOT_FOUND / EISDIR on any
        // path not walked during the warmup callback.
        if let Some(p) = restore_from {
            if !vmm.posix_fs.is_empty() {
                let t0 = std::time::Instant::now();
                let sidecar_path = snapshot::posix_fs_sidecar_path(p);
                match snapshot::read_posix_fs_sidecar(&sidecar_path) {
                    Ok(Some(blobs)) => {
                        if blobs.len() != vmm.posix_fs.len() {
                            eprintln!(
                                "supermachine: warning: posix-fs sidecar has {} entries but VM has {} mounts; \
                                 hydrating only the matching prefix (warm-restore dentry behaviour will be \
                                 degraded for any mount index past the prefix)",
                                blobs.len(),
                                vmm.posix_fs.len()
                            );
                        }
                        let n = blobs.len().min(vmm.posix_fs.len());
                        let mut ok = 0usize;
                        for i in 0..n {
                            if blobs[i].is_empty() {
                                continue;
                            }
                            match vmm.posix_fs[i].restore_state(&blobs[i]) {
                                Ok(()) => ok += 1,
                                Err(e) => {
                                    eprintln!(
                                        "supermachine: warning: posix-fs sidecar restore failed for \
                                         mount {i}: {e} — warm restore will fall back to lazy LOOKUP \
                                         (paths not walked during warmup may return ENOENT)"
                                    );
                                }
                            }
                        }
                        if timings {
                            eprintln!(
                                "[timing] restore.posix_fs_sidecar={}us mounts_restored={}/{}",
                                t0.elapsed().as_micros(),
                                ok,
                                vmm.posix_fs.len()
                            );
                        }
                    }
                    Ok(None) => {
                        // No sidecar — this is a pre-0.7.6 snapshot
                        // (or a CoW path that didn't write one). The
                        // existing warning emitted when bake version
                        // != runtime version already covers the
                        // user-facing diagnosis; stay quiet here so
                        // 0.7.5 → 0.7.6 upgrades don't spam.
                    }
                    Err(e) => {
                        eprintln!(
                            "supermachine: warning: posix-fs sidecar {sidecar_path} could not be \
                             read ({e:?}) — warm restore will fall back to lazy LOOKUP"
                        );
                    }
                }
            }
        }
    }

    // Drive the virtio-balloon device. The balloon-target is
    // bake-time-known (metadata.json `balloon_target_pages` is
    // typically 75% of memory_mib in 4 KiB pages). Once the
    // guest's `virtio_balloon` driver wakes (after restore + any
    // pending IRQs are processed), it inflates by N pages and
    // pushes their PFN list to the inflate queue, which our
    // device handler then `madvise(MADV_FREE)`s on the host's
    // CoW RAM mapping. macOS reclaims under pressure → idle
    // worker RSS drops from ~memory_mib to ~25% of memory_mib.
    //
    // Caller can override per-snapshot via env var
    // `SUPERMACHINE_BALLOON_TARGET_PAGES` (lib doesn't usually
    // do this; the bake-time metadata.json default suffices).
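    //
    // Worked example, assuming the typical 75% default mentioned above:
    // memory_mib = 1024 → target ≈ 0.75 × 1024 MiB / 4 KiB = 196_608 pages
    // inflated, so an idle worker settles around 256 MiB resident.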
    if let Some(pages) = balloon_target_pages {
        vmm.balloon.request_inflate(pages);
        eprintln!("  balloon: requested inflate of {pages} pages");
    }

    // Spawn N-1 secondary threads. On a fresh boot each
    // secondary parks until PSCI CPU_ON wakes it. On a restore,
    // each gets its captured `PerVcpuState` and skips PSCI —
    // the kernel "thinks" we're already up from the original
    // bake.
    //
    // `secondary_states[i]` is the snapshot's per_vcpu[i+1] if
    // the snapshot has N>1 entries, else None (boot-from-scratch
    // path). vcpu0's state is already loaded by
    // `restore_snapshot` above.
    let secondary_states: Vec<Option<crate::vmm::snapshot::PerVcpuState>> = restore
        .as_ref()
        .map(|s| {
            (1..n_vcpus as usize)
                .map(|i| s.per_vcpu.get(i).cloned())
                .collect()
        })
        .unwrap_or_else(|| (1..n_vcpus).map(|_| None).collect());

    for idx in 1..n_vcpus {
        let coord_c = vmm.coord.clone();
        let bus_c = vmm.bus.clone();
        let name = format!("vcpu-{idx}");
        let st = secondary_states
            .get((idx - 1) as usize)
            .cloned()
            .unwrap_or(None);
        std::thread::Builder::new()
            .name(name.clone())
            .spawn(move || worker::run_secondary(idx, coord_c, bus_c, st))
            .map_err(|source| RunError::ThreadSpawn { name, source })?;
    }

    eprintln!("  vCPU launched ({n_vcpus} total), dispatch loop running\n");
    if timings && restore.is_some() {
        eprintln!(
            "[timing] restore.vcpu_launched={}us",
            run_t0.elapsed().as_micros()
        );
    }

    let pool_mode = pool_sock.is_some() || pool_worker.is_some();
    // Bake-then-pool: pool ctl is connected AND the worker was
    // started without --restore-from AND without --snapshot-out.
    // The dispatch loop signals readiness via `BAKE_READY` and
    // hands control to the host for warmup + final SNAPSHOT.
    let bake_then_pool = pool_mode && restore_from.is_none() && snapshot_out.is_none();
    let mut bake_ready_signaled = false;
    let transport_idle = pool_mode.then(|| {
        let vsock = vmm.vsock.clone();
        std::sync::Arc::new(move || vsock.is_transport_idle())
            as std::sync::Arc<dyn Fn() -> bool + Send + Sync>
    });
    let mut pool = PoolControl::start(
        pool_sock.as_ref(),
        pool_worker,
        restore.is_some().then_some(first_restore_us),
        restore
            .is_some()
            .then(|| vmm.vsock.muxer().first_host_port())
            .flatten(),
        vmm.vm.vcpu.handle(),
        transport_idle,
    )?;
    let mut report = RunReport::default();
    let mut warm_snapshot_cache: Option<WarmSnapshotCache> = None;

    // In-flight async-save threads. Drained at QUIT (and on
    // every snapshot RPC entry, to keep the list bounded). Each
    // thread writes a CompactSnapshot to disk via `.partial`+
    // rename. Holding the join handles here lets QUIT block
    // until pending writes hit disk — without this, a `QUIT`
    // immediately after `SNAPSHOT_ASYNC` would race the worker
    // exit against the in-flight save and leave a `.partial`.
    //
    // Each entry also retains an `Arc<CompactSnapshot>` so a
    // follow-up diff-save (Sync with `base_path` matching this
    // entry's path) can reuse the in-memory base for clonefile-
    // based diff capture without re-reading from disk.
    struct InFlightAsyncSave {
        path: String,
        handle: std::thread::JoinHandle<()>,
        snapshot: std::sync::Arc<crate::vmm::snapshot::CompactSnapshot>,
    }
    let mut in_flight_async_saves: Vec<InFlightAsyncSave> = Vec::new();

    // Lazy-load cache for the diff-snapshot path. Keyed on path
    // (typically the worker's restore source — same across all
    // cycle-snapshot calls in a pool). Capped at one entry: the
    // base CompactSnapshot is ~100 MiB on rust:1-slim, holding
    // multiple is wasteful and the cycle-snapshot path always
    // hits the same path. Invalidated by file mtime change so
    // a re-baked base correctly forces a reload.
    struct DiffBaseCache {
        path: String,
        modified: Option<SystemTime>,
        snapshot: std::sync::Arc<crate::vmm::snapshot::CompactSnapshot>,
    }
    let mut diff_base_cache: Option<DiffBaseCache> = None;

    // Auto-skip-blob-on-cycle-restore state. The GIC blob restore is
    // ~1.4 ms — the single fattest chunk in the per-cycle restore
    // path. For typical OCI workloads (sh, python, node, nginx, etc.)
    // the kernel programs the GIC distributor at boot and never
    // touches it after — meaning the live blob at end-of-cycle is
    // byte-identical to the snapshot blob, and re-applying it is an
    // expensive idempotent no-op.
    //
    // Strategy: on the FIRST cycle-restore for this worker process,
    // capture the live GIC blob and compare it byte-for-byte to the
    // snapshot blob. If equal, mark this worker "drift-clean" and
    // skip the blob restore on every subsequent cycle. If different,
    // mark "drift-detected" and always restore for the rest of this
    // worker's lifetime.
    //
    // Cost: one capture (~1.4 ms) on first cycle, paid once per
    // worker process. Saves ~1.4 ms × every subsequent cycle for
    // typical workloads. Pays a 1.4 ms one-time cost with no
    // ongoing penalty for exotic workloads (custom kernels, runtime
    // IRQ-affinity changes, etc.).
    //
    // Embedders can force the always-restore path with the
    // SUPERMACHINE_FORCE_GIC_RESTORE=1 env (escape hatch for
    // workloads where the auto-detect's first-cycle sample misses
    // late-emerging drift). The existing
    // SUPERMACHINE_SKIP_WARM_GIC_RESTORE=1 env still forces skip
    // (test-only; bypasses the safety check).
    let force_gic_restore = std::env::var_os("SUPERMACHINE_FORCE_GIC_RESTORE").is_some();
    let mut gic_drift_check_done = false;
    let mut gic_drift_detected = false;

    // Outer pool-worker loop. Without pool-worker mode this runs once.
    loop {
        let dispatch_exit = worker::dispatch_vcpu(
            0,
            &vmm.vm.vcpu,
            &vmm.bus,
            &vmm.coord,
            &vmm.all_mmio,
            &vmm.posix_fs,
            &vmm.vsock,
            &vmm.vm,
            DispatchSnapshot {
                after_ms: snapshot_after_ms.or(if bake_then_pool { Some(10_000) } else { None }),
                at_heartbeat: snapshot_at,
                // bake_then_pool defaults to ALSO listening for the
                // listener-up trigger (preserves existing service-
                // image bake behavior). Pre-exec is opt-in via
                // resources flag — set by the bake driver only for
                // skip_warm_snapshot=true.
                on_listener: snapshot_on_listener || bake_then_pool,
                // Pre-exec is one-shot: after bake-ready fires, the
                // PRE_EXEC_READY atomic stays true forever (init-oci
                // already passed that point). Re-entering dispatch
                // with on_pre_exec=true would re-trigger the branch
                // and waste another quiesce window. Gate it on
                // !bake_ready_signaled so it only fires before the
                // first BakeReady return.
                on_pre_exec: snapshot_on_pre_exec && !bake_ready_signaled,
                quiesce_ms,
                out_path: snapshot_out,
                stop_requested: Some(pool.pause_flag()),
                bake_ready_signal: bake_then_pool && !bake_ready_signaled,
                vsock_exec_path,
            },
        )?;
        // BakeReady is the bake-then-pool init-done signal. Fire
        // BAKE_READY on the supervisor socket and re-enter dispatch
        // (the host now drives via SNAPSHOT_ASYNC / SNAPSHOT / QUIT).
        if dispatch_exit == worker::DispatchExit::BakeReady {
            bake_ready_signaled = true;
            pool.signal_bake_ready();
            continue;
        }
        if dispatch_exit != worker::DispatchExit::Stopped
            && dispatch_exit != worker::DispatchExit::Canceled
        {
            break;
        }
        if pool.should_quit() {
            break;
        }
        if !pool.pause_requested() {
            if pool_mode && dispatch_exit == worker::DispatchExit::Canceled {
                continue;
            }
            break;
        }
        pool.clear_pause();
        // Snapshot requests are handled in-place: capture, save,
        // post result, then loop back to dispatch (the workload
        // is still alive). The pool host typically drops the Vm
        // immediately after — pool.shutdown then breaks us out
        // of the outer loop.
        if let Some(snap_req) = pool.take_snapshot_request() {
            let cap_t0 = std::time::Instant::now();
            // Quiesce the guest to WFI before capture. Mirrors
            // the bake-time snapshot path so the runner's snap
            // is taken from a clean kernel-idle state rather
            // than mid-syscall.
            //
            // Multi-vCPU exception: skip quiesce when vcpus > 1.
            // The host-side snapshot driver (`PooledVm::snapshot`
            // and the pipelined-bake's warm-snapshot RPC) issues
            // `smpark_park` BEFORE sending SNAPSHOT, which already
            // drove vCPU 0 + secondaries through a kernel-side
            // ioctl that ends with secondaries in WFI and vCPU 0
            // back in the agent's accept loop (also kernel-idle).
            // Layering quiesce_to_wfi on top added a second 100 ms
            // canceller-driven step that empirically left the
            // virtio-vsock RX path desynced — subsequent CONTROL
            // RPCs (the post-capture `smpark_unpark`) timed out.
            // Single-vCPU still runs quiesce_to_wfi; smpark only
            // applies for vcpus > 1 by design.
            let n_vcpus_total = vmm.coord.secondary_handles_snapshot().len() + 1;
            if n_vcpus_total <= 1 {
                if let Err(e) = worker::quiesce_to_wfi(
                    &vmm.vm.vcpu,
                    &vmm.bus,
                    &vmm.coord,
                    100,
                ) {
                    pool.post_snapshot_result(Err(format!("quiesce: {e:?}")));
                    continue;
                }
            }
            // Multi-vCPU rendezvous: ask secondaries to exit
            // hv_vcpu_run, capture their own register state on
            // their owning threads, deposit, and wait. We
            // (vcpu0's thread) then read vcpu0 inline and splice
            // the secondaries' states into the snapshot. Without
            // this, multi-vCPU snapshots would have stale or
            // missing per-vcpu state for indices >0 and the
            // restored guest's secondaries would resume from
            // garbage.
            let secondary_handles = vmm.coord.secondary_handles_snapshot();
            if !secondary_handles.is_empty() {
                vmm.coord.request_snapshot_pause(&secondary_handles);
            }
            let virtio = snapshot::VirtioSnapshot {
                mmio: vmm.all_mmio.iter().map(|m| m.capture_state()).collect(),
                vsock_listeners: vmm.vsock.muxer().capture_tsi_listeners(),
            };
            let secondary_states = if !secondary_handles.is_empty() {
                vmm.coord.take_secondary_states()
            } else {
                Vec::new()
            };
            // Note: we DO NOT prune finished async-save entries
            // here. Pruning at this point dropped the
            // Arc<CompactSnapshot> for completed base saves,
            // which then forced the warm SNAPSHOT-with-base call
            // to fall through to the slow lazy-load-from-disk
            // path (~500 ms penalty). Retention is bounded in
            // practice (the bake-then-pool flow has at most
            // base async + warm sync = 2 entries); QUIT drains
            // them all. If a long-running pool worker eventually
            // accumulates entries we'll add a bounded LRU here.

            // Diff-snapshot path (Sync + base_path provided).
            // Two ways to get the base CompactSnapshot:
            //
            //   1. In-flight async save matches base_path — join
            //      the save thread, reuse its in-memory copy
            //      (zero extra I/O). Hot path for the bake's
            //      pipelined warm-after-base flow.
            //
            //   2. No matching in-flight save — load base from
            //      disk lazily via mmap-walk. Adds ~150 ms but
            //      enables the diff path on the cycle-snapshot
            //      flow (`pooled.snapshot()` from a worker that
            //      was restored via `--restore-from`, not via
            //      SNAPSHOT_ASYNC). Net win is still ~100 ms vs
            //      the plain streaming sync save.
            let diff_base_arc: Option<(String, std::sync::Arc<snapshot::CompactSnapshot>)> =
                if matches!(snap_req.mode, crate::vmm::pool::SnapshotMode::Sync) {
                    snap_req.base_path.as_deref().and_then(|bp| {
                        // Try in-flight first.
                        let idx = in_flight_async_saves.iter().position(|s| s.path == bp);
                        if let Some(i) = idx {
                            let entry = in_flight_async_saves.remove(i);
                            let _ = entry.handle.join();
                            return Some((entry.path, entry.snapshot));
                        }
                        // Cache hit on the lazy-loaded base?
                        // The cycle-snapshot path repeats the
                        // same `bp` for every call, so this hit
                        // rate is ~100% in practice.
                        let cur_modified = std::fs::metadata(bp)
                            .ok()
                            .and_then(|m| m.modified().ok());
                        if let Some(c) = diff_base_cache.as_ref() {
                            if c.path == bp && c.modified == cur_modified {
                                return Some((bp.to_owned(), std::sync::Arc::clone(&c.snapshot)));
                            }
                        }
                        // Lazy-load from disk and populate cache.
                        match snapshot::load_compact_from_file(bp) {
                            Ok(loaded) => {
                                let arc = std::sync::Arc::new(loaded);
                                diff_base_cache = Some(DiffBaseCache {
                                    path: bp.to_owned(),
                                    modified: cur_modified,
                                    snapshot: std::sync::Arc::clone(&arc),
                                });
                                Some((bp.to_owned(), arc))
                            }
                            Err(e) => {
                                eprintln!(
                                    "[snapshot] diff base lazy-load from {bp} failed: {e:?}; \
                                     falling back to plain save"
                                );
                                None
                            }
                        }
                    })
                } else {
                    None
                };

            match snap_req.mode {
                crate::vmm::pool::SnapshotMode::Sync => {
                    if let Some((base_path, base_arc)) = diff_base_arc {
                        // Capture compact (in pause window), resume
                        // guest, then save via clonefile + diff
                        // pwrite.
                        let cap_only_t0 = std::time::Instant::now();
                        let compact = match snapshot::capture_compact(
                            &vmm.vm,
                            virtio,
                            secondary_states,
                        ) {
                            Ok(c) => c,
                            Err(e) => {
                                if !secondary_handles.is_empty() {
                                    vmm.coord.release_after_snapshot();
                                }
                                pool.post_snapshot_result(Err(format!(
                                    "diff snapshot capture: {e}"
                                )));
                                continue;
                            }
                        };
                        if !secondary_handles.is_empty() {
                            vmm.coord.release_after_snapshot();
                        }
                        let capture_us = cap_only_t0.elapsed().as_micros();

                        // Try clone+diff. On any failure (clonefile
                        // EXDEV, meta overflow, etc.), fall through
                        // to plain compact save.
                        let save_t0 = std::time::Instant::now();
                        let save_result = snapshot::save_compact_to_file_via_clone(
                            &compact,
                            &base_arc,
                            &base_path,
                            &snap_req.out_path,
                        );
                        let save_stats = match save_result {
                            Ok(s) => s,
                            Err(diff_err) => {
                                eprintln!(
                                    "[snapshot] diff path failed ({diff_err:?}); \
                                     falling back to plain compact save"
                                );
                                match snapshot::save_compact_to_file(
                                    &compact,
                                    &snap_req.out_path,
                                ) {
                                    Ok(s) => s,
                                    Err(e) => {
                                        pool.post_snapshot_result(Err(format!(
                                            "snapshot: {e:?}"
                                        )));
                                        continue;
                                    }
                                }
                            }
                        };
                        let total_us = save_t0.elapsed().as_micros();
                        // 0.7.6+ posix-fs sidecar: persist the
                        // (nodeid → host_path) table alongside the
                        // snapshot so warm restores don't see ENOENT /
                        // EISDIR on paths not walked during warmup.
                        // Best-effort; logs on failure.
                        snapshot::capture_and_write_posix_fs_sidecar(
                            &snap_req.out_path,
                            &vmm.posix_fs,
                        );
                        pool.post_snapshot_result(Ok(crate::vmm::pool::SnapshotResult {
                            bytes_written: save_stats.ram_data_bytes,
                            capture_us,
                            save_us: total_us,
                        }));
                    } else {
                        // Plain streaming capture+save under guest
                        // pause.
                        let stream_t0 = std::time::Instant::now();
                        let save_stats = match snapshot::capture_and_save_streaming(
                            &vmm.vm,
                            &virtio,
                            &secondary_states,
                            &snap_req.out_path,
                        ) {
                            Ok(s) => s,
                            Err(e) => {
                                if !secondary_handles.is_empty() {
                                    vmm.coord.release_after_snapshot();
                                }
                                pool.post_snapshot_result(Err(format!("snapshot: {e}")));
                                continue;
                            }
                        };
                        if !secondary_handles.is_empty() {
                            vmm.coord.release_after_snapshot();
                        }
                        let total_us = stream_t0.elapsed().as_micros();
                        let _ = cap_t0;
                        // Posix-fs sidecar — see Sync+diff branch.
                        snapshot::capture_and_write_posix_fs_sidecar(
                            &snap_req.out_path,
                            &vmm.posix_fs,
                        );
                        pool.post_snapshot_result(Ok(crate::vmm::pool::SnapshotResult {
                            bytes_written: save_stats.ram_bytes + save_stats.ram_data_bytes,
                            capture_us: 0,
                            save_us: total_us,
                        }));
                    }
                }
                crate::vmm::pool::SnapshotMode::Async => {
                    // Pipelined: capture into a compact in-memory
                    // buffer (~50 ms for ~100 MiB of non-zero pages
                    // on M-series), resume guest immediately, write
                    // to disk in a background thread.
                    let cap_only_t0 = std::time::Instant::now();
                    let compact = match snapshot::capture_compact(
                        &vmm.vm,
                        virtio,
                        secondary_states,
                    ) {
                        Ok(c) => c,
                        Err(e) => {
                            if !secondary_handles.is_empty() {
                                vmm.coord.release_after_snapshot();
                            }
                            pool.post_snapshot_result(Err(format!("snapshot: {e}")));
                            continue;
                        }
                    };
                    if !secondary_handles.is_empty() {
                        vmm.coord.release_after_snapshot();
                    }
                    let capture_us = cap_only_t0.elapsed().as_micros();
                    let out_path = snap_req.out_path.clone();
                    let n_pages = compact.pages.len();
                    let ram_size = compact.ram_size as u64;
                    // Wrap in Arc so a follow-up SNAPSHOT-with-base
                    // call can borrow the in-memory base for the
                    // diff path.
                    let compact_arc = std::sync::Arc::new(compact);
                    let snap_for_thread = std::sync::Arc::clone(&compact_arc);
                    let out_path_for_save = out_path.clone();
                    // Capture posix-fs state synchronously HERE
                    // (while the guest is still paused for this
                    // snapshot capture's secondary rendezvous) so
                    // the sidecar reflects the same instant the RAM
                    // image does. The actual sidecar write happens
                    // on the background thread after the snap save
                    // completes — non-blocking from the runner's
                    // POV.
                    let posix_fs_blobs_for_thread: Vec<Option<Vec<u8>>> = vmm
                        .posix_fs
                        .iter()
                        .map(|pfs| Some(pfs.snapshot_state()))
                        .collect();
                    // Clone the supervisor writer Arc so the bg save
                    // thread can post `SAVE_DONE <path>` when its
                    // write completes. This is what enables a host-
                    // side observer (e.g. a fresh `Pool::spawn_one`
                    // wanting to restore from this snapshot file) to
                    // wait for the save to finish without polling
                    // the filesystem.
                    let supervisor_writer_for_save = pool.supervisor_writer_clone();
                    let join = match std::thread::Builder::new()
                        .name("supermachine-snapshot-async-save".into())
                        .spawn(move || {
                            let result = snapshot::save_compact_to_file(
                                &snap_for_thread,
                                &out_path_for_save,
                            );
                            if let Err(ref e) = result {
                                eprintln!(
                                    "[snapshot-async] save {out_path_for_save} failed: {e:?}"
                                );
                            }
                            // Write the posix-fs sidecar only after
                            // the main snapshot landed — if the save
                            // failed, a stale sidecar would be
                            // worse than no sidecar.
                            if result.is_ok() && !posix_fs_blobs_for_thread.is_empty() {
                                let sidecar = snapshot::posix_fs_sidecar_path(&out_path_for_save);
                                if let Err(e) = snapshot::write_posix_fs_sidecar(
                                    &sidecar,
                                    &posix_fs_blobs_for_thread,
                                ) {
                                    eprintln!(
                                        "[snapshot-async] posix-fs sidecar {sidecar} write failed: \
                                         {e:?} — warm restores will fall back to lazy LOOKUP"
                                    );
                                }
                            }
                            // Notify supervisor. Best-effort — if the
                            // host-side reader has gone away or the
                            // writer is dropped, the message is just
                            // lost (no panic, no deadlock).
                            if let Some(w) = supervisor_writer_for_save {
                                if let Ok(mut g) = w.lock() {
                                    if let Some(s) = g.as_mut() {
                                        use std::io::Write;
                                        let _ = match &result {
                                            Ok(_) => writeln!(s, "SAVE_DONE {out_path_for_save}"),
                                            Err(e) => writeln!(
                                                s,
                                                "SAVE_FAIL {out_path_for_save} {}",
                                                format!("{e:?}").replace('\n', " ")
                                            ),
                                        };
                                        let _ = s.flush();
                                    }
                                }
                            }
                        }) {
                        Ok(j) => j,
                        Err(e) => {
                            pool.post_snapshot_result(Err(format!(
                                "spawn async save thread: {e}"
                            )));
                            continue;
                        }
                    };
                    in_flight_async_saves.push(InFlightAsyncSave {
                        path: out_path,
                        handle: join,
                        snapshot: compact_arc,
                    });
                    // Report immediately. `bytes_written` is the
                    // captured working set (≈ what'll hit disk).
                    pool.post_snapshot_result(Ok(crate::vmm::pool::SnapshotResult {
                        bytes_written: ram_size + (n_pages as u64) * 4096,
                        capture_us,
                        save_us: 0,
                    }));
                }
            }
            // Resume dispatch — the workload is still alive.
            continue;
        }
        let Some(req) = pool.take_restore_request() else {
            break;
        };
        if let Some(p) = req.egress_policy {
            crate::vmm::egress_policy::set(&p);
        }
        let t0 = std::time::Instant::now();
        // Reset all per-dispatch state.
        let phase_t0 = std::time::Instant::now();
        vmm.reset_vsock_transport();
        let reset_vsock_us = phase_t0.elapsed().as_micros();
        let phase_t0 = std::time::Instant::now();
        let file_meta = std::fs::metadata(&req.path).map_err(|source| RunError::MmapCow {
            path: req.path.clone(),
            source,
        })?;
        let modified = file_meta.modified().ok();
        let cache_hit = warm_snapshot_cache.as_ref().is_some_and(|cached| {
            cached.path == req.path
                && cached.file_len == file_meta.len()
                && cached.modified == modified
        });
        if !cache_hit {
            let file = std::fs::File::open(&req.path).map_err(|source| RunError::MmapCow {
                path: req.path.clone(),
                source,
            })?;
            let (snap, ram_offset, memory_bytes) =
                snapshot::load_meta(&req.path).map_err(|source| RunError::SnapshotLoad {
                    path: req.path.clone(),
                    source,
                })?;
            warm_snapshot_cache = Some(WarmSnapshotCache {
                path: req.path.clone(),
                file_len: file_meta.len(),
                modified,
                file,
                snap,
                ram_offset,
                memory_bytes,
            });
        }
        let load_meta_us = phase_t0.elapsed().as_micros();
        let cached = warm_snapshot_cache
            .as_ref()
            .expect("warm snapshot cache populated");
        // Multi-vCPU cycle-restore — phase 1: park secondaries in
        // `maybe_apply_restore` so they're out of `hv_vcpu_run`
        // before we remap RAM and rewrite GIC + vcpu0 state. Each
        // secondary spins waiting for its target state to be
        // published; we publish AFTER GIC + vcpu0 are done, in
        // phase 2 below.
        //
        // Without this, secondaries kept whatever state they had
        // accumulated during the previous pool cycle — the freshly
        // restored vcpu0/GIC state mismatched their stale ICH LRs
        // and redist bits, and the next softirq/IPI panicked.
        let secondary_handles = vmm.coord.secondary_handles_snapshot();
        if !secondary_handles.is_empty() {
            vmm.coord.pause_secondaries_for_restore(&secondary_handles);
        }

        // Re-mmap RAM from the (potentially new) snapshot path.
        // SAFETY: vCPU 0 has just exited dispatch; secondaries are
        // now parked in `maybe_apply_restore` (phase 1 above) so
        // the RAM remap is safe across all vCPUs.
        let phase_t0 = std::time::Instant::now();
        unsafe {
            if fixed_warm_ram_remap {
                vmm.vm.remap_cow_from_file_fixed(
                    &cached.file,
                    cached.ram_offset,
                    cached.memory_bytes,
                )?;
            } else {
                vmm.vm
                    .remap_cow_from_file(&cached.file, cached.ram_offset, cached.memory_bytes)?;
            }
        }
        let remap_cow_us = phase_t0.elapsed().as_micros();
        // Auto-skip-blob-on-cycle-restore: on the first cycle for
        // this worker, capture the live blob and compare to the
        // snapshot blob. Equal → distributor didn't drift during
        // last workload run → safe to skip on every cycle for this
        // worker. Different → drift; always restore.
        //
        // Cost is paid in `gic_capture_us`; net win starts on
        // cycle 2 (saves ~1.4 ms/cycle for typical workloads).
        let mut gic_capture_us: u128 = 0;
        if !gic_drift_check_done && !force_gic_restore && !skip_warm_gic_restore {
            let cap_t0 = std::time::Instant::now();
            match crate::hvf::gic_state_capture() {
                Ok(live_blob) => {
                    if live_blob.as_slice() != cached.snap.gic_blob.as_slice() {
                        gic_drift_detected = true;
                        eprintln!(
                            "  [warm-restore] GIC drift detected on first cycle \
                             (live={} bytes, snap={} bytes); always-restore for \
                             this worker. Set SUPERMACHINE_FORCE_GIC_RESTORE=1 \
                             to disable the auto-skip optimization globally.",
                            live_blob.len(),
                            cached.snap.gic_blob.len(),
                        );
                    } else {
                        eprintln!(
                            "  [warm-restore] GIC blob stable across cycles; \
                             auto-skipping blob restore (saves ~1.4 ms/cycle)."
                        );
                    }
                }
                Err(e) => {
                    // Capture failed — be conservative, fall back to
                    // always-restore. Probably HVF in a state where
                    // capture isn't allowed; restore will likely fail
                    // too, but that's the existing behavior.
                    gic_drift_detected = true;
                    eprintln!(
                        "  [warm-restore] GIC capture failed ({e:?}); \
                         falling back to always-restore."
                    );
                }
            }
            gic_drift_check_done = true;
            gic_capture_us = cap_t0.elapsed().as_micros();
        }
        // Decide whether to skip on THIS cycle. `skip_warm_gic_restore`
        // is the test-only env override (force-skip, bypassing the
        // safety check). Otherwise we skip iff we've completed the
        // drift check AND it found no drift, AND the embedder hasn't
        // forced restore.
        let skip_blob_this_cycle = if skip_warm_gic_restore {
            true // explicit override
        } else if force_gic_restore {
            false // embedder escape hatch
        } else {
            gic_drift_check_done && !gic_drift_detected
        };
        // Restore vcpu0 + GIC blob from the already-loaded metadata.
        // The GIC blob restore inside this is GLOBAL (distributor +
        // redistributor) — must precede secondaries' per-vcpu state
        // application so their ICH LRs reference a coherent INTID
        // space.
        let phase_t0 = std::time::Instant::now();
        let restore_timings = vmm.restore_snapshot_timed_with_options(
            &cached.snap,
            snapshot::SnapshotRestoreOptions {
                skip_gic_blob: skip_blob_this_cycle,
            },
        )?;
        // Phase 2: publish per-secondary states; each secondary's
        // spin-wait in `maybe_apply_restore` resolves, applies its
        // state on its OWNING thread, and parks at the resume
        // rendezvous. We block here until all running secondaries
        // have applied.
        if !secondary_handles.is_empty() {
            let secondary_states: Vec<Option<crate::vmm::snapshot::PerVcpuState>> =
                (1..vmm.coord.n_vcpus as usize)
                    .map(|i| cached.snap.per_vcpu.get(i).cloned())
                    .collect();
            vmm.coord
                .publish_and_wait_secondary_restore(&secondary_states);
            // Bump the resume generation — secondaries return from
            // `maybe_apply_restore` and re-enter `hv_vcpu_run` with
            // the snapshot's coherent state.
            vmm.coord.release_after_restore();
        }
        let restore_snapshot_us = phase_t0.elapsed().as_micros();
        let us = t0.elapsed().as_micros();
        let timings = WarmRestoreTimings {
            reset_vsock_us,
            remap_cow_us,
            load_meta_us,
            restore_snapshot_us,
            ram_copy_us: restore_timings.ram_copy_us,
            gic_restore_us: restore_timings.gic_restore_us,
            vcpu_restore_us: restore_timings.vcpu_restore_us,
            vtimer_offset_us: restore_timings.vtimer_offset_us,
            mmio_restore_us: restore_timings.mmio_restore_us,
            listener_restore_us: restore_timings.listener_restore_us,
        };
        eprintln!(
            "  warm restore from {} in {us} us (reset={} remap={} load_meta={} gic_check={} restore={} ram={} gic={} vcpu={} vtimer={} mmio={} listener={}{})",
            req.path,
            timings.reset_vsock_us,
            timings.remap_cow_us,
            timings.load_meta_us,
            gic_capture_us,
            timings.restore_snapshot_us,
            timings.ram_copy_us,
            timings.gic_restore_us,
            timings.vcpu_restore_us,
            timings.vtimer_offset_us,
            timings.mmio_restore_us,
            timings.listener_restore_us,
            if skip_blob_this_cycle { " [gic-skip]" } else { "" },
        );
        report.warm_restores += 1;
        pool.complete_restore(us, vmm.vsock.muxer().first_host_port(), timings);
    }
    // Drain any in-flight async saves before returning. Without
    // this, the worker process exits, the OS reaps the save
    // thread mid-write, and we leave a `<path>.partial` instead
    // of the canonical file. Critical for the bake-then-pool
    // flow's base snapshot, where the save runs in parallel
    // with the warmup workload and the warm SNAPSHOT+QUIT might
    // arrive before the base save completes.
    for entry in in_flight_async_saves.drain(..) {
        let _ = entry.handle.join();
    }
    Ok(report)
}