// supermachine 0.4.12

//! VM runner — the reusable library entry point behind the
//! `supermachine-worker` command-line harness. Owns VM boot/restore,
//! virtio device wiring, vCPU dispatch, snapshot triggers, and
//! pool-worker warm-restore loops.

use std::fmt;
use std::os::unix::net::UnixStream;
use std::time::SystemTime;

use crate::vmm::pool::PoolWorker;
use crate::vmm::resources::{ResourceError, VmResources};
use crate::vmm::tls::TlsConfig;

/// Caller-supplied options for a run. The derived `Default` leaves every
/// optional field `None` and the experimental flag `false`.
#[derive(Default)]
pub struct RunOptions {
    /// TLS endpoint configuration. When present, `run_kernel` hands it to
    /// `crate::vmm::tls::start` while binding the host-facing endpoints,
    /// before snapshot state is restored.
    pub tls: Option<TlsConfig>,
    /// Pool control socket. Supplying this (or `pool_worker`) puts the runner
    /// in pool mode; a reference to it is passed to `PoolControl::start`.
    pub pool_sock: Option<UnixStream>,
    /// Pool worker handle, passed by value to `PoolControl::start`. Its
    /// presence also selects pool mode.
    pub pool_worker: Option<PoolWorker>,
    /// Experimental warm-pool shortcut used to prove whether full HVF GIC blob
    /// restore is required for a snapshot class.
    ///
    /// `run_kernel` ORs this flag with the presence of the
    /// `SUPERMACHINE_SKIP_WARM_GIC_RESTORE` environment variable.
    pub experimental_skip_warm_gic_restore: bool,
}

/// Summary handed back by a run that finished without error.
///
/// A default-constructed report has every counter at zero.
#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)]
pub struct RunReport {
    /// Number of warm restores recorded for the run (presumably those done by
    /// the pool-worker loop — confirm against the code that updates it).
    pub warm_restores: u64,
}

/// Cached state for one snapshot file. `run_kernel` keeps at most one of
/// these in an `Option<WarmSnapshotCache>` that starts out as `None`.
///
/// NOTE(review): the code that fills and consults this cache is not visible
/// from the declaration; the per-field notes below are inferred from names
/// and types and should be confirmed against the use sites.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
struct WarmSnapshotCache {
    /// Path of the cached snapshot file (presumably the cache key).
    path: String,
    /// File length — presumably recorded at cache time to detect a replaced
    /// file; TODO confirm.
    file_len: u64,
    /// File modification time, if one was obtained — presumably used for
    /// the same staleness check; TODO confirm.
    modified: Option<SystemTime>,
    /// Open handle to the snapshot file.
    file: std::fs::File,
    /// Snapshot value associated with `path`.
    snap: crate::vmm::snapshot::Snapshot,
    /// Looks like the offset of the RAM region inside the file (same name as
    /// the value `snapshot::load_meta` returns) — confirm.
    ram_offset: u64,
    /// Looks like the size in bytes of the RAM region (same name as the value
    /// `snapshot::load_meta` returns) — confirm.
    memory_bytes: usize,
}

/// Failure modes reported by the runner entry points in this module.
///
/// Most variants wrap a subsystem error unchanged; the struct-like variants
/// add the context (path, thread name, exit details) needed for a useful
/// message.
#[derive(Debug)]
pub enum RunError {
    /// Error from `crate::vmm::builder` (for example from `build_vmm`).
    Build(crate::vmm::builder::BuildError),
    /// Error from the HVF layer (`crate::hvf`).
    Hvf(crate::hvf::Error),
    /// `snapshot::mmap_ram_cow_at` failed while mapping snapshot RAM for a
    /// CoW restore.
    MmapCow {
        /// Snapshot file that was being mapped.
        path: String,
        /// I/O error returned by the mapping call.
        source: std::io::Error,
    },
    /// Error from the pool control layer (`crate::vmm::pool`).
    Pool(crate::vmm::pool::PoolError),
    /// `VmResources` failed validation (zero memory, zero vCPUs, missing
    /// kernel, or whatever else `validate_for_run` rejects).
    Resource(ResourceError),
    /// Reading a snapshot (`snapshot::load_meta` / `snapshot::load_from_file`)
    /// failed.
    SnapshotLoad {
        /// Snapshot file that was being loaded.
        path: String,
        /// Error returned by the snapshot loader.
        source: crate::vmm::snapshot::FileError,
    },
    /// A named OS thread (e.g. a secondary vCPU thread) could not be spawned.
    ThreadSpawn {
        /// Name given to the thread builder.
        name: String,
        /// I/O error returned by `std::thread::Builder::spawn`.
        source: std::io::Error,
    },
    /// The TLS endpoint failed to start (`crate::vmm::tls::start`).
    Tls(crate::vmm::tls::StartError),
    /// `run_proof_of_life` saw a vCPU exit other than the expected exception
    /// with EC 0x16.
    UnexpectedProofOfLifeExit {
        /// Exit reason reported by the single vCPU run.
        reason: crate::hvf::ExitReason,
        /// Exception class extracted from ESR_EL2 (bits 31:26).
        ec: u64,
    },
    /// A vsock-mux endpoint failed to start (`crate::vmm::vsock_mux`).
    VsockMux(crate::vmm::vsock_mux::StartError),
    /// Error from `crate::vmm::worker` (for example from the vCPU dispatch
    /// loop).
    Worker(crate::vmm::worker::WorkerError),
}

impl fmt::Display for RunError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            RunError::Build(e) => write!(f, "{e}"),
            RunError::Hvf(e) => write!(f, "HVF operation failed: {e:?}"),
            RunError::MmapCow { path, source } => {
                write!(f, "mmap CoW snapshot RAM {path}: {source}")
            }
            RunError::Pool(e) => write!(f, "{e}"),
            RunError::Resource(e) => write!(f, "{e}"),
            RunError::SnapshotLoad { path, source } => {
                write!(f, "load snapshot {path}: {source:?}")
            }
            RunError::ThreadSpawn { name, source } => {
                write!(f, "spawn thread {name}: {source}")
            }
            RunError::Tls(e) => write!(f, "{e}"),
            RunError::UnexpectedProofOfLifeExit { reason, ec } => {
                write!(
                    f,
                    "unexpected proof-of-life exit: {reason:?} ESR_EL2 EC={ec:#x}"
                )
            }
            RunError::VsockMux(e) => write!(f, "{e}"),
            RunError::Worker(e) => write!(f, "{e}"),
        }
    }
}

// Marker impl: `source()` keeps its default (`None`). The `Display` impl for
// `RunError` already writes the underlying error's text into the message.
impl std::error::Error for RunError {}

impl From<ResourceError> for RunError {
    fn from(value: ResourceError) -> Self {
        Self::Resource(value)
    }
}

impl From<crate::vmm::builder::BuildError> for RunError {
    fn from(value: crate::vmm::builder::BuildError) -> Self {
        Self::Build(value)
    }
}

impl From<crate::hvf::Error> for RunError {
    fn from(value: crate::hvf::Error) -> Self {
        Self::Hvf(value)
    }
}

impl From<crate::vmm::pool::PoolError> for RunError {
    fn from(value: crate::vmm::pool::PoolError) -> Self {
        Self::Pool(value)
    }
}

impl From<crate::vmm::worker::WorkerError> for RunError {
    fn from(value: crate::vmm::worker::WorkerError) -> Self {
        Self::Worker(value)
    }
}

impl From<crate::vmm::vsock_mux::StartError> for RunError {
    fn from(value: crate::vmm::vsock_mux::StartError) -> Self {
        Self::VsockMux(value)
    }
}

impl From<crate::vmm::tls::StartError> for RunError {
    fn from(value: crate::vmm::tls::StartError) -> Self {
        Self::Tls(value)
    }
}

/// Validates `resources`, then boots or restores the VM through `run_kernel`.
///
/// Bake-then-pool mode is selected when a pool socket or pool worker is
/// supplied and `resources` names neither a snapshot to restore from nor a
/// snapshot output path. That mode relaxes the "snapshot trigger requires
/// out_path" rule: the runner doesn't capture by itself — it signals
/// BAKE_READY on the supervisor socket and leaves the SNAPSHOT_ASYNC /
/// SNAPSHOT RPCs to the host. Only the basic memory / vCPU / kernel checks
/// are applied there; every other mode goes through `validate_for_run`.
///
/// # Errors
///
/// Returns `RunError::Resource` when validation fails, and otherwise whatever
/// `run_kernel` returns.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn run(resources: &VmResources, options: RunOptions) -> Result<RunReport, RunError> {
    let RunOptions {
        tls,
        pool_sock,
        pool_worker,
        experimental_skip_warm_gic_restore,
    } = options;

    let pooled = pool_sock.is_some() || pool_worker.is_some();
    let bake_then_pool =
        pooled && resources.restore_from.is_none() && resources.snapshot.out_path.is_none();

    if bake_then_pool {
        // Reduced validation, checked in a fixed order: memory, vCPUs, kernel.
        let problem = if resources.memory_bytes() == 0 {
            Some(ResourceError::ZeroMemory)
        } else if resources.vcpus == 0 {
            Some(ResourceError::ZeroVcpus)
        } else if resources.kernel_path.is_none() {
            Some(ResourceError::MissingKernel)
        } else {
            None
        };
        if let Some(err) = problem {
            return Err(RunError::Resource(err));
        }
    } else {
        resources.validate_for_run()?;
    }

    run_kernel(
        resources,
        tls,
        pool_sock,
        pool_worker,
        experimental_skip_warm_gic_restore,
    )
}

/// HVF smoke test: runs a tiny guest program once and checks how it exits.
///
/// Creates a 64 MiB `MicroVm`, copies `TEST_PROGRAM` to
/// `ram_gpa + KERNEL_LOAD_OFFSET`, sets the boot CPSR and the PC, and runs the
/// vCPU a single time. Progress is logged to stderr.
///
/// # Errors
///
/// Propagates any error from VM creation, register setup, or the vCPU run.
/// Returns `RunError::UnexpectedProofOfLifeExit` when the exit is not an
/// exception whose ESR_EL2 exception class is 0x16 (the guest's HVC).
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn run_proof_of_life() -> Result<(), RunError> {
    use crate::arch::aarch64::layout;
    use crate::vmm::vstate::{MicroVm, TEST_PROGRAM};

    // ESR_EL2 keeps the exception class in bits [31:26].
    const ESR_EC_SHIFT: u32 = 26;
    const ESR_EC_MASK: u64 = 0x3f;
    // Exception class reported when the guest executes HVC.
    const EC_HVC: u64 = 0x16;

    eprintln!("supermachine: HVF init test");
    let vm = MicroVm::new(64 * 1024 * 1024)?;
    let ram_mib = vm.ram_size / (1024 * 1024);
    eprintln!(
        "  VM created, RAM mapped at GPA 0x{:x}, {} MiB",
        vm.ram_gpa, ram_mib
    );

    // Place the test program where a kernel image would be loaded.
    let entry = vm.ram_gpa + layout::KERNEL_LOAD_OFFSET;
    // SAFETY: TEST_PROGRAM is 8 bytes, well within RAM.
    unsafe {
        vm.write_ram(entry, &TEST_PROGRAM);
    }
    vm.set_boot_cpsr()?;
    vm.set_pc(entry)?;
    eprintln!("  PC set to 0x{entry:x}, CPSR=EL1h (DAIF masked)");

    eprintln!("  running vCPU…");
    let (reason, esr, _gpa, _va) = vm.run_once()?;
    let ec = (esr >> ESR_EC_SHIFT) & ESR_EC_MASK;
    eprintln!("  exit: {reason:?}  ESR_EL2=0x{esr:x}  EC={ec:#x}");

    let guest_issued_hvc = reason == crate::hvf::ExitReason::Exception && ec == EC_HVC;
    if !guest_issued_hvc {
        eprintln!("  UNEXPECTED exit; HVF probably misconfigured");
        return Err(RunError::UnexpectedProofOfLifeExit { reason, ec });
    }

    eprintln!("  PASS: HVF round-trip working — guest executed HVC #0");
    Ok(())
}

#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn run_kernel(
    resources: &VmResources,
    tls_cfg: Option<TlsConfig>,
    pool_sock: Option<UnixStream>,
    pool_worker: Option<PoolWorker>,
    option_skip_warm_gic_restore: bool,
) -> Result<RunReport, RunError> {
    use crate::vmm::builder;
    use crate::vmm::pool::{PoolControl, WarmRestoreTimings};
    use crate::vmm::snapshot;
    use crate::vmm::worker::{self, DispatchSnapshot};

    let kernel_path = resources.kernel_path.as_deref();
    let initrd_path = resources.initrd_path.as_deref();
    let cmdline = resources.cmdline.as_str();
    let mem_size = resources.memory_bytes();
    let blk_paths = &resources.block_devices;
    let n_vcpus = resources.vcpus;
    let snapshot_after_ms = resources.snapshot.after_ms;
    let snapshot_at = resources.snapshot.at_heartbeat;
    let snapshot_on_listener = resources.snapshot.on_listener;
    let quiesce_ms = resources.snapshot.quiesce_ms;
    let snapshot_out = resources.snapshot.out_path.as_deref();
    let balloon_target_pages = resources.balloon_target_pages;
    let restore_from = resources.restore_from.as_deref();
    let cow_restore = resources.cow_restore;
    let vsock_mux_path = resources.endpoints.vsock_mux.as_deref();
    let http_port_addr = resources.endpoints.http_port.as_deref();
    let vsock_mux_handoff_path = resources.endpoints.vsock_mux_handoff.as_deref();
    let vsock_exec_path = resources.endpoints.vsock_exec.as_deref();
    let vsock_exec_guest_port = resources
        .endpoints
        .vsock_exec_guest_port
        .unwrap_or(crate::vmm::resources::DEFAULT_EXEC_GUEST_PORT);
    let timings = std::env::var_os("SUPERMACHINE_TIMINGS").is_some();
    let skip_warm_gic_restore =
        option_skip_warm_gic_restore || std::env::var_os("SUPERMACHINE_SKIP_WARM_GIC_RESTORE").is_some();
    let fixed_warm_ram_remap = std::env::var_os("SUPERMACHINE_REMAP_FIXED").is_some();
    let run_t0 = std::time::Instant::now();

    let mut cow_ram: Option<(*mut u8, usize)> = None;
    let restore = match restore_from {
        Some(p) => Some(if cow_restore {
            eprintln!("supermachine: restoring from {p} (CoW mmap)");
            let t0 = std::time::Instant::now();
            let (snap, ram_offset, memory_bytes) =
                snapshot::load_meta(p).map_err(|source| RunError::SnapshotLoad {
                    path: p.to_string(),
                    source,
                })?;
            if timings {
                eprintln!(
                    "[timing] restore.load_meta={}us total={}us",
                    t0.elapsed().as_micros(),
                    run_t0.elapsed().as_micros()
                );
            }
            let t0 = std::time::Instant::now();
            let (ptr, len) =
                snapshot::mmap_ram_cow_at(p, ram_offset, memory_bytes).map_err(|source| {
                    RunError::MmapCow {
                        path: p.to_string(),
                        source,
                    }
                })?;
            if timings {
                eprintln!(
                    "[timing] restore.mmap_cow={}us total={}us",
                    t0.elapsed().as_micros(),
                    run_t0.elapsed().as_micros()
                );
            }
            cow_ram = Some((ptr, len));
            snap
        } else {
            eprintln!("supermachine: restoring from {p}");
            let t0 = std::time::Instant::now();
            snapshot::load_from_file(p)
                .map_err(|source| RunError::SnapshotLoad {
                    path: p.to_string(),
                    source,
                })
                .inspect(|_| {
                    if timings {
                        eprintln!(
                            "[timing] restore.load_full={}us total={}us",
                            t0.elapsed().as_micros(),
                            run_t0.elapsed().as_micros()
                        );
                    }
                })?
        }),
        None => None,
    };

    if restore.is_none() {
        eprintln!("supermachine: kernel boot");
        eprintln!("  kernel    : {}", kernel_path.unwrap_or(""));
        if let Some(p) = initrd_path {
            eprintln!("  initramfs : {p}");
        }
        eprintln!("  cmdline   : {cmdline}");
        eprintln!("  memory    : {} MiB", mem_size / (1024 * 1024));
        for p in blk_paths {
            eprintln!("  blk       : {p}");
        }
    } else if let Some(s) = restore.as_ref() {
        eprintln!(
            "  memory    : {} MiB (from snapshot)",
            s.memory.len() / (1024 * 1024)
        );
    }

    let restore_memory_len = restore.as_ref().map(|s| s.memory.len());
    let t0 = std::time::Instant::now();
    let mut vmm = builder::build_vmm(resources, cow_ram, restore_memory_len)?;
    if timings {
        eprintln!(
            "[timing] restore.build_vmm={}us total={}us",
            t0.elapsed().as_micros(),
            run_t0.elapsed().as_micros()
        );
    }

    // Bind host-facing endpoints BEFORE restore. The lib's
    // `wait_for_socket` (and the router's worker-readiness
    // probe) detects the unix socket existing as the "spawn
    // done" signal — moving the bind earlier means clients
    // stop polling sooner and pay less spawn-roundtrip cost.
    //
    // Safety: each accept handler calls `wait_for_host_port`,
    // which polls the muxer's TSI listener registry until the
    // guest's listener registers (post-restore). So if a
    // client connects before restore completes, the kernel
    // queues the connection, accept fires, the handler blocks
    // in `wait_for_host_port` until the guest is up, then
    // proceeds. No data is misrouted.
    if let Some(c) = tls_cfg {
        crate::vmm::tls::start(c, vmm.vsock.clone())?;
    }
    if let Some(p) = vsock_mux_path {
        crate::vmm::vsock_mux::start(p, vmm.vsock.clone(), None)?;
    }
    if let Some(addr) = http_port_addr {
        crate::vmm::vsock_mux::start_tcp(addr, vmm.vsock.clone(), None)?;
    }
    if let Some(p) = vsock_mux_handoff_path {
        crate::vmm::vsock_mux::start_handoff(p, vmm.vsock.clone(), None)?;
    }
    if let Some(p) = vsock_exec_path {
        crate::vmm::vsock_mux::start_exec(p, vmm.vsock.clone(), vsock_exec_guest_port)?;
    }
    if timings {
        eprintln!(
            "[timing] restore.endpoints_ready={}us",
            run_t0.elapsed().as_micros()
        );
    }

    let mut first_restore_us: u128 = 0;
    if let Some(snap) = restore.as_ref() {
        let t0 = std::time::Instant::now();
        vmm.restore_snapshot(snap)?;
        first_restore_us = t0.elapsed().as_micros();
        if timings {
            eprintln!(
                "[timing] restore.state={}us total={}us",
                first_restore_us,
                run_t0.elapsed().as_micros()
            );
        }
        eprintln!(
            "  restored in {first_restore_us} us  (mmio={} listeners={})",
            snap.virtio.mmio.len(),
            snap.virtio.vsock_listeners.len()
        );
    }

    // Drive the virtio-balloon device. The balloon-target is
    // bake-time-known (metadata.json `balloon_target_pages` is
    // typically 75% of memory_mib in 4 KiB pages). Once the
    // guest's `virtio_balloon` driver wakes (after restore + any
    // pending IRQs are processed), it inflates by N pages and
    // pushes their PFN list to the inflate queue, which our
    // device handler then `madvise(MADV_FREE)`s on the host's
    // CoW RAM mapping. macOS reclaims under pressure → idle
    // worker RSS drops from ~memory_mib to ~25% of memory_mib.
    //
    // Caller can override per-snapshot via env var
    // `SUPERMACHINE_BALLOON_TARGET_PAGES` (lib doesn't usually
    // do this; the bake-time metadata.json default suffices).
    if let Some(pages) = balloon_target_pages {
        vmm.balloon.request_inflate(pages);
        eprintln!("  balloon: requested inflate of {pages} pages");
    }

    // Spawn N-1 secondary threads. On a fresh boot each
    // secondary parks until PSCI CPU_ON wakes it. On a restore,
    // each gets its captured `PerVcpuState` and skips PSCI —
    // the kernel "thinks" we're already up from the original
    // bake.
    //
    // `secondary_states[i]` is the snapshot's per_vcpu[i+1] if
    // the snapshot has N>1 entries, else None (boot-from-scratch
    // path). vcpu0's state is already loaded by
    // `restore_snapshot` above.
    let secondary_states: Vec<Option<crate::vmm::snapshot::PerVcpuState>> = restore
        .as_ref()
        .map(|s| {
            (1..n_vcpus as usize)
                .map(|i| s.per_vcpu.get(i).cloned())
                .collect()
        })
        .unwrap_or_else(|| (1..n_vcpus).map(|_| None).collect());

    for idx in 1..n_vcpus {
        let coord_c = vmm.coord.clone();
        let bus_c = vmm.bus.clone();
        let name = format!("vcpu-{idx}");
        let st = secondary_states
            .get((idx - 1) as usize)
            .cloned()
            .unwrap_or(None);
        std::thread::Builder::new()
            .name(name.clone())
            .spawn(move || worker::run_secondary(idx, coord_c, bus_c, st))
            .map_err(|source| RunError::ThreadSpawn { name, source })?;
    }

    eprintln!("  vCPU launched ({n_vcpus} total), dispatch loop running\n");
    if timings && restore.is_some() {
        eprintln!(
            "[timing] restore.vcpu_launched={}us",
            run_t0.elapsed().as_micros()
        );
    }

    let pool_mode = pool_sock.is_some() || pool_worker.is_some();
    // Bake-then-pool: pool ctl is connected AND the worker was
    // started without --restore-from AND without --snapshot-out.
    // The dispatch loop signals readiness via `BAKE_READY` and
    // hands control to the host for warmup + final SNAPSHOT.
    let bake_then_pool = pool_mode && restore_from.is_none() && snapshot_out.is_none();
    let mut bake_ready_signaled = false;
    let transport_idle = pool_mode.then(|| {
        let vsock = vmm.vsock.clone();
        std::sync::Arc::new(move || vsock.is_transport_idle())
            as std::sync::Arc<dyn Fn() -> bool + Send + Sync>
    });
    let mut pool = PoolControl::start(
        pool_sock.as_ref(),
        pool_worker,
        restore.is_some().then_some(first_restore_us),
        restore
            .is_some()
            .then(|| vmm.vsock.muxer().first_host_port())
            .flatten(),
        vmm.vm.vcpu.handle(),
        transport_idle,
    )?;
    let mut report = RunReport::default();
    let mut warm_snapshot_cache: Option<WarmSnapshotCache> = None;

    // In-flight async-save threads. Drained at QUIT (and on
    // every snapshot RPC entry, to keep the list bounded). Each
    // thread writes a CompactSnapshot to disk via `.partial`+
    // rename. Holding the join handles here lets QUIT block
    // until pending writes hit disk — without this, a `QUIT`
    // immediately after `SNAPSHOT_ASYNC` would race the worker
    // exit against the in-flight save and leave a `.partial`.
    //
    // Each entry also retains an `Arc<CompactSnapshot>` so a
    // follow-up diff-save (Sync with `base_path` matching this
    // entry's path) can reuse the in-memory base for clonefile-
    // based diff capture without re-reading from disk.
    struct InFlightAsyncSave {
        path: String,
        handle: std::thread::JoinHandle<()>,
        snapshot: std::sync::Arc<crate::vmm::snapshot::CompactSnapshot>,
    }
    let mut in_flight_async_saves: Vec<InFlightAsyncSave> = Vec::new();

    // Lazy-load cache for the diff-snapshot path. Keyed on path
    // (typically the worker's restore source — same across all
    // cycle-snapshot calls in a pool). Capped at one entry: the
    // base CompactSnapshot is ~100 MiB on rust:1-slim, holding
    // multiple is wasteful and the cycle-snapshot path always
    // hits the same path. Invalidated by file mtime change so
    // a re-baked base correctly forces a reload.
    struct DiffBaseCache {
        path: String,
        modified: Option<SystemTime>,
        snapshot: std::sync::Arc<crate::vmm::snapshot::CompactSnapshot>,
    }
    let mut diff_base_cache: Option<DiffBaseCache> = None;

    // Outer pool-worker loop. Without pool-worker mode this runs once.
    loop {
        let dispatch_exit = worker::dispatch_vcpu(
            0,
            &vmm.vm.vcpu,
            &vmm.bus,
            &vmm.coord,
            &vmm.all_mmio,
            &vmm.vsock,
            &vmm.vm,
            DispatchSnapshot {
                after_ms: snapshot_after_ms.or(if bake_then_pool { Some(10_000) } else { None }),
                at_heartbeat: snapshot_at,
                on_listener: snapshot_on_listener || bake_then_pool,
                quiesce_ms,
                out_path: snapshot_out,
                stop_requested: Some(pool.pause_flag()),
                bake_ready_signal: bake_then_pool && !bake_ready_signaled,
            },
        )?;
        // BakeReady is the bake-then-pool init-done signal. Fire
        // BAKE_READY on the supervisor socket and re-enter dispatch
        // (the host now drives via SNAPSHOT_ASYNC / SNAPSHOT / QUIT).
        if dispatch_exit == worker::DispatchExit::BakeReady {
            bake_ready_signaled = true;
            pool.signal_bake_ready();
            continue;
        }
        if dispatch_exit != worker::DispatchExit::Stopped
            && dispatch_exit != worker::DispatchExit::Canceled
        {
            break;
        }
        if pool.should_quit() {
            break;
        }
        if !pool.pause_requested() {
            if pool_mode && dispatch_exit == worker::DispatchExit::Canceled {
                continue;
            }
            break;
        }
        pool.clear_pause();
        // Snapshot requests are handled in-place: capture, save,
        // post result, then loop back to dispatch (the workload
        // is still alive). The pool host typically drops the Vm
        // immediately after — pool.shutdown then breaks us out
        // of the outer loop.
        if let Some(snap_req) = pool.take_snapshot_request() {
            let cap_t0 = std::time::Instant::now();
            // Quiesce the guest to WFI before capture. Mirrors
            // the bake-time snapshot path so the runner's snap
            // is taken from a clean kernel-idle state rather
            // than mid-syscall.
            if let Err(e) = worker::quiesce_to_wfi(
                &vmm.vm.vcpu,
                &vmm.bus,
                &vmm.coord,
                100,
            ) {
                pool.post_snapshot_result(Err(format!("quiesce: {e:?}")));
                continue;
            }
            // Multi-vCPU rendezvous: ask secondaries to exit
            // hv_vcpu_run, capture their own register state on
            // their owning threads, deposit, and wait. We
            // (vcpu0's thread) then read vcpu0 inline and splice
            // the secondaries' states into the snapshot. Without
            // this, multi-vCPU snapshots would have stale or
            // missing per-vcpu state for indices >0 and the
            // restored guest's secondaries would resume from
            // garbage.
            let secondary_handles = vmm.coord.secondary_handles_snapshot();
            if !secondary_handles.is_empty() {
                vmm.coord.request_snapshot_pause(&secondary_handles);
            }
            let virtio = snapshot::VirtioSnapshot {
                mmio: vmm.all_mmio.iter().map(|m| m.capture_state()).collect(),
                vsock_listeners: vmm.vsock.muxer().capture_tsi_listeners(),
            };
            let secondary_states = if !secondary_handles.is_empty() {
                vmm.coord.take_secondary_states()
            } else {
                Vec::new()
            };
            // Note: we DO NOT prune finished async-save entries
            // here. Pruning at this point dropped the
            // Arc<CompactSnapshot> for completed base saves,
            // which then forced the warm SNAPSHOT-with-base call
            // to fall through to the slow lazy-load-from-disk
            // path (~500 ms penalty). Retention is bounded in
            // practice (the bake-then-pool flow has at most
            // base async + warm sync = 2 entries); QUIT drains
            // them all. If a long-running pool worker eventually
            // accumulates entries we'll add a bounded LRU here.

            // Diff-snapshot path (Sync + base_path provided).
            // Two ways to get the base CompactSnapshot:
            //
            //   1. In-flight async save matches base_path — join
            //      the save thread, reuse its in-memory copy
            //      (zero extra I/O). Hot path for the bake's
            //      pipelined warm-after-base flow.
            //
            //   2. No matching in-flight save — load base from
            //      disk lazily via mmap-walk. Adds ~150 ms but
            //      enables the diff path on the cycle-snapshot
            //      flow (`pooled.snapshot()` from a worker that
            //      was restored via `--restore-from`, not via
            //      SNAPSHOT_ASYNC). Net win is still ~100 ms vs
            //      the plain streaming sync save.
            let diff_base_arc: Option<(String, std::sync::Arc<snapshot::CompactSnapshot>)> =
                if matches!(snap_req.mode, crate::vmm::pool::SnapshotMode::Sync) {
                    snap_req.base_path.as_deref().and_then(|bp| {
                        // Try in-flight first.
                        let idx = in_flight_async_saves.iter().position(|s| s.path == bp);
                        if let Some(i) = idx {
                            let entry = in_flight_async_saves.remove(i);
                            let _ = entry.handle.join();
                            return Some((entry.path, entry.snapshot));
                        }
                        // Cache hit on the lazy-loaded base?
                        // The cycle-snapshot path repeats the
                        // same `bp` for every call, so this hit
                        // rate is ~100% in practice.
                        let cur_modified = std::fs::metadata(bp)
                            .ok()
                            .and_then(|m| m.modified().ok());
                        if let Some(c) = diff_base_cache.as_ref() {
                            if c.path == bp && c.modified == cur_modified {
                                return Some((bp.to_owned(), std::sync::Arc::clone(&c.snapshot)));
                            }
                        }
                        // Lazy-load from disk and populate cache.
                        match snapshot::load_compact_from_file(bp) {
                            Ok(loaded) => {
                                let arc = std::sync::Arc::new(loaded);
                                diff_base_cache = Some(DiffBaseCache {
                                    path: bp.to_owned(),
                                    modified: cur_modified,
                                    snapshot: std::sync::Arc::clone(&arc),
                                });
                                Some((bp.to_owned(), arc))
                            }
                            Err(e) => {
                                eprintln!(
                                    "[snapshot] diff base lazy-load from {bp} failed: {e:?}; \
                                     falling back to plain save"
                                );
                                None
                            }
                        }
                    })
                } else {
                    None
                };

            match snap_req.mode {
                crate::vmm::pool::SnapshotMode::Sync => {
                    if let Some((base_path, base_arc)) = diff_base_arc {
                        // Capture compact (in pause window), resume
                        // guest, then save via clonefile + diff
                        // pwrite.
                        let cap_only_t0 = std::time::Instant::now();
                        let compact = match snapshot::capture_compact(
                            &vmm.vm,
                            virtio,
                            secondary_states,
                        ) {
                            Ok(c) => c,
                            Err(e) => {
                                if !secondary_handles.is_empty() {
                                    vmm.coord.release_after_snapshot();
                                }
                                pool.post_snapshot_result(Err(format!(
                                    "diff snapshot capture: {e}"
                                )));
                                continue;
                            }
                        };
                        if !secondary_handles.is_empty() {
                            vmm.coord.release_after_snapshot();
                        }
                        let capture_us = cap_only_t0.elapsed().as_micros();

                        // Try clone+diff. On any failure (clonefile
                        // EXDEV, meta overflow, etc.), fall through
                        // to plain compact save.
                        let save_t0 = std::time::Instant::now();
                        let save_result = snapshot::save_compact_to_file_via_clone(
                            &compact,
                            &base_arc,
                            &base_path,
                            &snap_req.out_path,
                        );
                        let save_stats = match save_result {
                            Ok(s) => s,
                            Err(diff_err) => {
                                eprintln!(
                                    "[snapshot] diff path failed ({diff_err:?}); \
                                     falling back to plain compact save"
                                );
                                match snapshot::save_compact_to_file(
                                    &compact,
                                    &snap_req.out_path,
                                ) {
                                    Ok(s) => s,
                                    Err(e) => {
                                        pool.post_snapshot_result(Err(format!(
                                            "snapshot: {e:?}"
                                        )));
                                        continue;
                                    }
                                }
                            }
                        };
                        let total_us = save_t0.elapsed().as_micros();
                        pool.post_snapshot_result(Ok(crate::vmm::pool::SnapshotResult {
                            bytes_written: save_stats.ram_data_bytes,
                            capture_us,
                            save_us: total_us,
                        }));
                    } else {
                        // Plain streaming capture+save under guest
                        // pause.
                        let stream_t0 = std::time::Instant::now();
                        let save_stats = match snapshot::capture_and_save_streaming(
                            &vmm.vm,
                            &virtio,
                            &secondary_states,
                            &snap_req.out_path,
                        ) {
                            Ok(s) => s,
                            Err(e) => {
                                if !secondary_handles.is_empty() {
                                    vmm.coord.release_after_snapshot();
                                }
                                pool.post_snapshot_result(Err(format!("snapshot: {e}")));
                                continue;
                            }
                        };
                        if !secondary_handles.is_empty() {
                            vmm.coord.release_after_snapshot();
                        }
                        let total_us = stream_t0.elapsed().as_micros();
                        let _ = cap_t0;
                        pool.post_snapshot_result(Ok(crate::vmm::pool::SnapshotResult {
                            bytes_written: save_stats.ram_bytes + save_stats.ram_data_bytes,
                            capture_us: 0,
                            save_us: total_us,
                        }));
                    }
                }
                crate::vmm::pool::SnapshotMode::Async => {
                    // Pipelined: capture into a compact in-memory
                    // buffer (~50 ms for ~100 MiB of non-zero pages
                    // on M-series), resume guest immediately, write
                    // to disk in a background thread.
                    let cap_only_t0 = std::time::Instant::now();
                    let compact = match snapshot::capture_compact(
                        &vmm.vm,
                        virtio,
                        secondary_states,
                    ) {
                        Ok(c) => c,
                        Err(e) => {
                            if !secondary_handles.is_empty() {
                                vmm.coord.release_after_snapshot();
                            }
                            pool.post_snapshot_result(Err(format!("snapshot: {e}")));
                            continue;
                        }
                    };
                    if !secondary_handles.is_empty() {
                        vmm.coord.release_after_snapshot();
                    }
                    let capture_us = cap_only_t0.elapsed().as_micros();
                    let out_path = snap_req.out_path.clone();
                    let n_pages = compact.pages.len();
                    let ram_size = compact.ram_size as u64;
                    // Wrap in Arc so a follow-up SNAPSHOT-with-base
                    // call can borrow the in-memory base for the
                    // diff path.
                    let compact_arc = std::sync::Arc::new(compact);
                    let snap_for_thread = std::sync::Arc::clone(&compact_arc);
                    let out_path_for_save = out_path.clone();
                    let join = match std::thread::Builder::new()
                        .name("supermachine-snapshot-async-save".into())
                        .spawn(move || {
                            if let Err(e) = snapshot::save_compact_to_file(
                                &snap_for_thread,
                                &out_path_for_save,
                            ) {
                                eprintln!(
                                    "[snapshot-async] save {out_path_for_save} failed: {e:?}"
                                );
                            }
                        }) {
                        Ok(j) => j,
                        Err(e) => {
                            pool.post_snapshot_result(Err(format!(
                                "spawn async save thread: {e}"
                            )));
                            continue;
                        }
                    };
                    in_flight_async_saves.push(InFlightAsyncSave {
                        path: out_path,
                        handle: join,
                        snapshot: compact_arc,
                    });
                    // Report immediately. `bytes_written` is the
                    // captured working set (≈ what'll hit disk).
                    pool.post_snapshot_result(Ok(crate::vmm::pool::SnapshotResult {
                        bytes_written: ram_size + (n_pages as u64) * 4096,
                        capture_us,
                        save_us: 0,
                    }));
                }
            }
            // Resume dispatch — the workload is still alive.
            continue;
        }
        let Some(req) = pool.take_restore_request() else {
            break;
        };
        if let Some(p) = req.egress_policy {
            crate::vmm::egress_policy::set(&p);
        }
        let t0 = std::time::Instant::now();
        // Reset all per-dispatch state.
        let phase_t0 = std::time::Instant::now();
        vmm.reset_vsock_transport();
        let reset_vsock_us = phase_t0.elapsed().as_micros();
        let phase_t0 = std::time::Instant::now();
        let file_meta = std::fs::metadata(&req.path).map_err(|source| RunError::MmapCow {
            path: req.path.clone(),
            source,
        })?;
        let modified = file_meta.modified().ok();
        let cache_hit = warm_snapshot_cache.as_ref().is_some_and(|cached| {
            cached.path == req.path
                && cached.file_len == file_meta.len()
                && cached.modified == modified
        });
        if !cache_hit {
            let file = std::fs::File::open(&req.path).map_err(|source| RunError::MmapCow {
                path: req.path.clone(),
                source,
            })?;
            let (snap, ram_offset, memory_bytes) =
                snapshot::load_meta(&req.path).map_err(|source| RunError::SnapshotLoad {
                    path: req.path.clone(),
                    source,
                })?;
            warm_snapshot_cache = Some(WarmSnapshotCache {
                path: req.path.clone(),
                file_len: file_meta.len(),
                modified,
                file,
                snap,
                ram_offset,
                memory_bytes,
            });
        }
        let load_meta_us = phase_t0.elapsed().as_micros();
        let cached = warm_snapshot_cache
            .as_ref()
            .expect("warm snapshot cache populated");
        // Re-mmap RAM from the (potentially new) snapshot path.
        // SAFETY: vCPU 0 has just exited dispatch; secondaries are
        // still running but only read RAM via hv_vcpu_run, and the
        // outer pool-worker loop is single-vCPU only for v1.
        let phase_t0 = std::time::Instant::now();
        unsafe {
            if fixed_warm_ram_remap {
                vmm.vm.remap_cow_from_file_fixed(
                    &cached.file,
                    cached.ram_offset,
                    cached.memory_bytes,
                )?;
            } else {
                vmm.vm
                    .remap_cow_from_file(&cached.file, cached.ram_offset, cached.memory_bytes)?;
            }
        }
        let remap_cow_us = phase_t0.elapsed().as_micros();
        // Restore state from the already-loaded metadata.
        let phase_t0 = std::time::Instant::now();
        let restore_timings = vmm.restore_snapshot_timed_with_options(
            &cached.snap,
            snapshot::SnapshotRestoreOptions {
                skip_gic_blob: skip_warm_gic_restore,
            },
        )?;
        let restore_snapshot_us = phase_t0.elapsed().as_micros();
        let us = t0.elapsed().as_micros();
        let timings = WarmRestoreTimings {
            reset_vsock_us,
            remap_cow_us,
            load_meta_us,
            restore_snapshot_us,
            ram_copy_us: restore_timings.ram_copy_us,
            gic_restore_us: restore_timings.gic_restore_us,
            vcpu_restore_us: restore_timings.vcpu_restore_us,
            vtimer_offset_us: restore_timings.vtimer_offset_us,
            mmio_restore_us: restore_timings.mmio_restore_us,
            listener_restore_us: restore_timings.listener_restore_us,
        };
        eprintln!(
            "  warm restore from {} in {us} us (reset={} remap={} load_meta={} restore={} ram={} gic={} vcpu={} vtimer={} mmio={} listener={})",
            req.path,
            timings.reset_vsock_us,
            timings.remap_cow_us,
            timings.load_meta_us,
            timings.restore_snapshot_us,
            timings.ram_copy_us,
            timings.gic_restore_us,
            timings.vcpu_restore_us,
            timings.vtimer_offset_us,
            timings.mmio_restore_us,
            timings.listener_restore_us
        );
        report.warm_restores += 1;
        pool.complete_restore(us, vmm.vsock.muxer().first_host_port(), timings);
    }
    // Drain any in-flight async saves before returning. Without
    // this, the worker process exits, the OS reaps the save
    // thread mid-write, and we leave a `<path>.partial` instead
    // of the canonical file. Critical for the bake-then-pool
    // flow's base snapshot, where the save runs in parallel
    // with the warmup workload and the warm SNAPSHOT+QUIT might
    // arrive before the base save completes.
    for entry in in_flight_async_saves.drain(..) {
        let _ = entry.handle.join();
    }
    Ok(report)
}