supermachine 0.4.13

// Snapshot/restore for the VMM. Captures the VM state needed to
// resume in a fresh process: per-vCPU registers (GP, SIMD, sysregs),
// per-vCPU ICC + redistributor state, the opaque GIC blob, vtimer
// offset, and full guest RAM.
//
// V2 (this revision): captures virtio mirror state — per-MMIO-device
// negotiated features, status, and per-queue ring addresses + cursors,
// plus the vsock muxer's TSI listener registry. In-flight TCP/UDP
// connections and pending handshakes are intentionally dropped (peer
// TCPs see RST; clients retry). Listeners are re-bound on restore
// with fresh ephemeral host ports — the guest only knows its vm_port,
// so the change is transparent.
//
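// Wire format ("SMSNAP\x08\x00", version 8, little-endian), as written
// by `encode_compact_meta` + `save_compact_to_file`:
//   header 72 B:
//     [0..8]   magic
//     [8..16]  version (u64)
//     [16..24] captured_mach_time (u64)
//     [24..32] captured_cntvct (u64)
//     [32..40] memory_bytes (u64)
//     [40..48] gic_blob_len (u64)
//     [48..52] n_vcpus (u32)
//     [52..56] reserved (u32)
//     [56..64] ram_gpa (u64)
//     [64..72] ram_offset (u64), file offset of the RAM region
//   gic_blob: gic_blob_len raw bytes
//   per_vcpu × n_vcpus:
//     vtimer_offset (u64),
//     gp_n simd_n sys_n icc_n redist_n  (u32×5)
//     gp_regs (reg_id u32, val u64)        × gp_n
//     simd_regs (reg_id u32, val u128)     × simd_n
//     sys_regs (reg_id u32, val u64)       × sys_n
//     icc_regs (reg_id u32, val u64)       × icc_n
//     redist (off u32, val u64)            × redist_n
//   virtio: n_mmio (u32), then per MMIO device: driver_features[0],
//     driver_features[1], status, interrupt_status, n_queues (u32), and
//     per queue: size, ready (u8), pad (u8), desc_table, avail_ring,
//     used_ring, last_avail_idx, next_used_idx
//   vsock listeners: n_listeners (u32), then per listener: cid,
//     peer_port, vm_port, family, socktype
//   zero padding up to the next RAM_PAGE_ALIGN (16 KiB) boundary
//   then RAM at ram_offset: memory_bytes bytes (all-zero pages may be
//     left as sparse holes)
//
// Memory and metadata live in the same file for v1 simplicity.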

#![cfg(all(target_os = "macos", target_arch = "aarch64"))]

use std::io::{Read, Write};

use applevisor_sys as av;

use crate::devices::virtio::mmio::{MmioSnapshot, QueueSnapshot};
use crate::devices::virtio::vsock::muxer::TsiListenerSnapshot;
use crate::hvf::{self, Vcpu};
use crate::vmm::vstate::MicroVm;

/// Eight-byte magic written at the very start of a snapshot file. The
/// byte following "SMSNAP" currently equals `SNAPSHOT_VERSION`.
const SNAPSHOT_MAGIC: [u8; 8] = *b"SMSNAP\x08\x00";
/// Format version, written as a little-endian `u64` right after the magic.
const SNAPSHOT_VERSION: u64 = 8;
/// Page size to align the RAM region within the snapshot file. macOS
/// on Apple Silicon uses 16 KiB pages; aligning here lets `--cow-restore`
/// mmap the region with `MAP_PRIVATE` directly out of the file.
const RAM_PAGE_ALIGN: u64 = 16384;
/// NOTE(review): not referenced in this part of the file; presumably the
/// chunk granularity of the streaming sparse-RAM writer. Confirm at the
/// use site.
const SPARSE_RAM_CHUNK: usize = 64 * 1024;

/// Register state captured from a single vCPU by `capture_vcpu_state`.
///
/// Each list stores `(id, value)` pairs. The id is the corresponding
/// register enum cast to `u32`, except for `redist_regs` where it is
/// the redistributor register offset.
#[derive(Default, Clone)]
pub struct PerVcpuState {
    pub gp_regs: Vec<(u32, u64)>,     // hv_reg_t as u32
    pub simd_regs: Vec<(u32, u128)>,  // hv_simd_fp_reg_t as u32
    pub sys_regs: Vec<(u32, u64)>,    // hv_sys_reg_t as u32
    pub icc_regs: Vec<(u32, u64)>,    // hv_gic_icc_reg_t as u32
    pub redist_regs: Vec<(u32, u64)>, // GICR offset
    /// Value returned by `Vcpu::get_vtimer_offset` at capture time.
    pub vtimer_offset: u64,
}

/// Per-VM virtio device state. Length of `mmio` matches the bus
/// device order in the file (vsock first, blk after).
#[derive(Default)]
pub struct VirtioSnapshot {
    /// One entry per virtio-MMIO device (features, status, queues).
    pub mmio: Vec<MmioSnapshot>,
    /// TSI listener entries taken from the vsock muxer; serialized after
    /// the MMIO device list.
    pub vsock_listeners: Vec<TsiListenerSnapshot>,
}

/// Full in-memory snapshot of a VM, including a complete copy of guest
/// RAM. Produced by `capture_snapshot` and consumed by the
/// `restore_snapshot*` functions.
pub struct Snapshot {
    /// `mach_absolute_time()` sampled during capture.
    pub captured_mach_time: u64,
    /// `captured_mach_time` minus the boot vCPU's vtimer offset
    /// (wrapping). Restore uses it to compute a fresh vtimer offset.
    pub captured_cntvct: u64,
    /// Copied from `MicroVm::ram_gpa` at capture time.
    pub ram_gpa: u64,
    /// Byte-for-byte copy of guest RAM. Left empty for CoW-restored
    /// snapshots, in which case restore skips the RAM copy.
    pub memory: Vec<u8>,
    /// Opaque blob returned by `hvf::gic_state_capture`.
    pub gic_blob: Vec<u8>,
    /// Per-vCPU register state; the first entry is the one restore
    /// applies to `vm.vcpu`.
    pub per_vcpu: Vec<PerVcpuState>,
    /// Virtio device state supplied by the caller of `capture_snapshot`.
    pub virtio: VirtioSnapshot,
}

/// Byte counters reported by the snapshot writers.
#[derive(Default, Clone, Copy)]
pub struct SnapshotWriteStats {
    /// Total size of the guest RAM region.
    pub ram_bytes: u64,
    /// Bytes of RAM actually written to the file (non-zero pages).
    pub ram_data_bytes: u64,
    /// `ram_bytes` minus `ram_data_bytes`, i.e. RAM left unwritten.
    pub ram_zero_bytes: u64,
}

/// Wall-clock duration, in microseconds, of each restore phase. A phase
/// that is skipped keeps its default of zero.
#[derive(Default, Clone, Copy, Debug, PartialEq, Eq)]
pub struct SnapshotRestoreTimings {
    /// Copying `Snapshot::memory` into guest RAM.
    pub ram_copy_us: u128,
    /// `hvf::gic_state_restore` on the GIC blob.
    pub gic_restore_us: u128,
    /// `restore_vcpu_state` for the boot vCPU.
    pub vcpu_restore_us: u128,
    /// Programming the recomputed vtimer offset.
    pub vtimer_offset_us: u128,
}

/// Knobs for `restore_snapshot_timed_with_options`. The default value
/// restores everything.
#[derive(Default, Clone, Copy, Debug, PartialEq, Eq)]
pub struct SnapshotRestoreOptions {
    /// When true, the `hvf::gic_state_restore` step is skipped.
    pub skip_gic_blob: bool,
}

// ----- Register enumerations -----

fn gp_reg_enum() -> Vec<av::hv_reg_t> {
    let mut out = Vec::with_capacity(37);
    let x0 = av::hv_reg_t::X0 as u32;
    for i in 0..=30u32 {
        // SAFETY: X0..X30 are 31 contiguous variants starting at X0.
        out.push(unsafe { std::mem::transmute::<u32, av::hv_reg_t>(x0 + i) });
    }
    out.push(av::hv_reg_t::FP);
    out.push(av::hv_reg_t::LR);
    out.push(av::hv_reg_t::PC);
    out.push(av::hv_reg_t::CPSR);
    out.push(av::hv_reg_t::FPCR);
    out.push(av::hv_reg_t::FPSR);
    out
}

/// SIMD/FP registers saved per vCPU: Q0 through Q31, in order.
fn simd_reg_enum() -> Vec<av::hv_simd_fp_reg_t> {
    let base = av::hv_simd_fp_reg_t::Q0 as u32;
    let mut regs = Vec::with_capacity(32);
    for n in 0..32u32 {
        // SAFETY: Q0..Q31 are 32 contiguous variants starting at Q0.
        regs.push(unsafe { std::mem::transmute::<u32, av::hv_simd_fp_reg_t>(base + n) });
    }
    regs
}

fn sys_reg_enum() -> Vec<av::hv_sys_reg_t> {
    use av::hv_sys_reg_t::*;
    vec![
        MPIDR_EL1,
        SCTLR_EL1,
        CPACR_EL1,
        TCR_EL1,
        TTBR0_EL1,
        TTBR1_EL1,
        MAIR_EL1,
        AMAIR_EL1,
        VBAR_EL1,
        CONTEXTIDR_EL1,
        TPIDR_EL1,
        SPSR_EL1,
        ELR_EL1,
        SP_EL0,
        SP_EL1,
        ESR_EL1,
        FAR_EL1,
        PAR_EL1,
        TPIDR_EL0,
        TPIDRRO_EL0,
        CNTKCTL_EL1,
        CSSELR_EL1,
        MDSCR_EL1,
        // Pointer-auth keys (CONFIG_ARM64_PTR_AUTH guests OOPS without these).
        APIAKEYLO_EL1,
        APIAKEYHI_EL1,
        APIBKEYLO_EL1,
        APIBKEYHI_EL1,
        APDAKEYLO_EL1,
        APDAKEYHI_EL1,
        APDBKEYLO_EL1,
        APDBKEYHI_EL1,
        APGAKEYLO_EL1,
        APGAKEYHI_EL1,
        // Vtimer CTL + deadline (HVF DOES accept these despite earlier rumours).
        CNTV_CTL_EL0,
        CNTV_CVAL_EL0,
        CNTP_CTL_EL0,
        CNTP_CVAL_EL0,
        // CNTVOFF_EL2 is captured via hv_vcpu_get_vtimer_offset, not as a sysreg.
    ]
}

fn icc_reg_enum() -> Vec<av::hv_gic_icc_reg_t> {
    use av::hv_gic_icc_reg_t::*;
    vec![
        PMR_EL1,
        BPR0_EL1,
        BPR1_EL1,
        AP0R0_EL1,
        AP1R0_EL1,
        RPR_EL1,
        CTLR_EL1,
        SRE_EL1,
        IGRPEN0_EL1,
        IGRPEN1_EL1,
    ]
}

/// Per-vCPU redistributor offsets QEMU v11 captures explicitly because
/// the opaque blob doesn't cover them. From hw/intc/arm_gicv3_hvf.c.
///
/// Returns 13 offsets: the five fixed registers below followed by
/// GICR_IPRIORITYR0..7. The order is the order in which they are
/// captured and serialized.
fn redist_reg_offsets() -> Vec<u32> {
    const FIXED: [u32; 5] = [
        0x10080, // GICR_IGROUPR0
        0x10100, // GICR_ISENABLER0
        0x10C04, // GICR_ICFGR1
        0x10200, // GICR_ISPENDR0  (vtimer PPI 27 lives here)
        0x10300, // GICR_ISACTIVER0
    ];
    const N_PRIORITY_REGS: u32 = 8;

    // Size the Vec from the data so the hint cannot drift from the
    // number of entries actually pushed.
    let mut offsets = Vec::with_capacity(FIXED.len() + N_PRIORITY_REGS as usize);
    offsets.extend_from_slice(&FIXED);
    // GICR_IPRIORITYR0..7, one 32-bit register every 4 bytes.
    offsets.extend((0..N_PRIORITY_REGS).map(|n| 0x10400 + 4 * n));
    offsets
}

// Host monotonic tick counter from the macOS System framework. This
// module uses it as the time base when computing `captured_cntvct` at
// capture and the new vtimer offset at restore.
#[link(name = "System", kind = "framework")]
extern "C" {
    fn mach_absolute_time() -> u64;
}

// ----- Capture -----

/// Read every register class this module tracks from `vcpu`.
///
/// GP registers, SIMD registers and the vtimer offset are mandatory: a
/// read failure aborts the capture. System, ICC and redistributor
/// registers are best-effort, so any that cannot be read are omitted.
pub fn capture_vcpu_state(vcpu: &Vcpu) -> hvf::Result<PerVcpuState> {
    let gp_regs: Vec<(u32, u64)> = gp_reg_enum()
        .into_iter()
        .map(|reg| vcpu.get_reg(reg).map(|val| (reg as u32, val)))
        .collect::<hvf::Result<Vec<_>>>()?;

    let simd_regs: Vec<(u32, u128)> = simd_reg_enum()
        .into_iter()
        .map(|reg| vcpu.get_simd_fp_reg(reg).map(|val| (reg as u32, val)))
        .collect::<hvf::Result<Vec<_>>>()?;

    // Best-effort: some sysregs may be unreadable in certain states;
    // drop those instead of failing the whole capture.
    let sys_regs: Vec<(u32, u64)> = sys_reg_enum()
        .into_iter()
        .filter_map(|reg| vcpu.get_sys_reg(reg).ok().map(|val| (reg as u32, val)))
        .collect();

    let icc_regs: Vec<(u32, u64)> = icc_reg_enum()
        .into_iter()
        .filter_map(|reg| vcpu.get_icc_reg(reg).ok().map(|val| (reg as u32, val)))
        .collect();

    let redist_regs: Vec<(u32, u64)> = redist_reg_offsets()
        .into_iter()
        .filter_map(|off| {
            // SAFETY: offsets are valid GICR register enum variants.
            let reg: av::hv_gic_redistributor_reg_t = unsafe { std::mem::transmute(off) };
            vcpu.get_redist_reg(reg).ok().map(|val| (off, val))
        })
        .collect();

    let vtimer_offset = vcpu.get_vtimer_offset()?;

    Ok(PerVcpuState {
        gp_regs,
        simd_regs,
        sys_regs,
        icc_regs,
        redist_regs,
        vtimer_offset,
    })
}

/// Capture a full snapshot of `vm`: boot-vCPU registers, the GIC blob
/// and a complete copy of guest RAM. `virtio` is stored as given.
///
/// Only `vm.vcpu` is captured, so `per_vcpu` has exactly one entry.
pub fn capture_snapshot(vm: &MicroVm, virtio: VirtioSnapshot) -> hvf::Result<Snapshot> {
    let boot = capture_vcpu_state(&vm.vcpu)?;
    let gic_blob = hvf::gic_state_capture()?;

    let ram_len = vm.ram_size;
    let mut memory = vec![0u8; ram_len];
    // SAFETY: ram_host is vm.ram_size bytes, and `memory` was just
    // allocated with the same length.
    unsafe {
        std::ptr::copy_nonoverlapping(vm.ram_host, memory.as_mut_ptr(), ram_len);
    }

    // SAFETY: mach_absolute_time takes no arguments and only reads the
    // host tick counter.
    let captured_mach_time = unsafe { mach_absolute_time() };

    Ok(Snapshot {
        captured_mach_time,
        captured_cntvct: captured_mach_time.wrapping_sub(boot.vtimer_offset),
        ram_gpa: vm.ram_gpa,
        memory,
        gic_blob,
        per_vcpu: vec![boot],
        virtio,
    })
}

// ----- Restore -----

/// Restore per-vCPU state. Vtimer offset is the CALLER's
/// responsibility (one coherent value across all vCPUs).
///
/// Write order: sysregs, ICC registers, SIMD/FP registers,
/// redistributor registers, vtimer mask, optional vtimer force-fire,
/// and finally the GP registers.
///
/// # Errors
/// Returns the underlying error when an MMU-critical sysreg, any
/// SIMD/FP register, any redistributor register, the force-fire
/// `CNTV_CVAL_EL0` write, or any GP register cannot be written.
/// Failures on other sysregs, on ICC registers and on the vtimer mask
/// are ignored.
pub fn restore_vcpu_state(vcpu: &Vcpu, st: &PerVcpuState) -> hvf::Result<()> {
    // 1. sysregs first (MMU, exception state, pointer-auth, vtimer).
    //    Some may be RO; only fail loudly for MMU-critical writes.
    use av::hv_sys_reg_t as S;
    let critical = |id: u32| {
        // SAFETY: only called with ids taken from st.sys_regs, which
        // are expected to originate from sys_reg_enum() variants.
        let r: S = unsafe { std::mem::transmute(id) };
        matches!(
            r,
            S::SCTLR_EL1 | S::TCR_EL1 | S::TTBR0_EL1 | S::TTBR1_EL1 | S::MAIR_EL1 | S::VBAR_EL1
        )
    };
    for (id, v) in &st.sys_regs {
        // SAFETY: id originated from sys_reg_enum() variants.
        let r: S = unsafe { std::mem::transmute(*id) };
        if let Err(e) = vcpu.set_sys_reg(r, *v) {
            if critical(*id) {
                return Err(e);
            }
        }
    }

    // 2. ICC: SRE_EL1 first (gates subsequent ICC writes), then
    //    everything except IGRPEN0/1, then IGRPEN0/1 last (unmask
    //    delivery only after PMR/BPR/AP state is in place).
    //    Every ICC write is best-effort: errors are discarded.
    use av::hv_gic_icc_reg_t as I;
    // Looks up the captured value of one ICC register, if present.
    let icc_find = |want: I| -> Option<u64> {
        st.icc_regs.iter().find_map(|(id, v)| {
            // SAFETY: id from icc_reg_enum.
            let r: I = unsafe { std::mem::transmute(*id) };
            (r == want).then_some(*v)
        })
    };
    if let Some(v) = icc_find(I::SRE_EL1) {
        let _ = vcpu.set_icc_reg(I::SRE_EL1, v);
    }
    for (id, v) in &st.icc_regs {
        // SAFETY: id from icc_reg_enum.
        let r: I = unsafe { std::mem::transmute(*id) };
        match r {
            // Handled before / after this loop.
            I::SRE_EL1 | I::IGRPEN0_EL1 | I::IGRPEN1_EL1 => continue,
            _ => {
                let _ = vcpu.set_icc_reg(r, *v);
            }
        }
    }
    if let Some(v) = icc_find(I::IGRPEN0_EL1) {
        let _ = vcpu.set_icc_reg(I::IGRPEN0_EL1, v);
    }
    if let Some(v) = icc_find(I::IGRPEN1_EL1) {
        let _ = vcpu.set_icc_reg(I::IGRPEN1_EL1, v);
    }

    // 3. SIMD/FP regs.
    use av::hv_simd_fp_reg_t as Q;
    for (id, v) in &st.simd_regs {
        // SAFETY: id from simd_reg_enum.
        let r: Q = unsafe { std::mem::transmute(*id) };
        vcpu.set_simd_fp_reg(r, *v)?;
    }

    // 4. Per-vCPU redistributor regs. Order from QEMU v11
    //    arm_gicv3_hvf.c: group/config/priority first, then CLEAR each
    //    enable/pending/active mask before SET (otherwise restore is
    //    OR of default + captured bits).
    // Captured value for a redistributor offset; an offset that was not
    // captured (capture is best-effort) is restored as 0.
    let find_off = |off: u32| -> u64 {
        st.redist_regs
            .iter()
            .find_map(|(o, v)| (*o == off).then_some(*v))
            .unwrap_or(0)
    };
    let write_off = |off: u32, val: u64| -> hvf::Result<()> {
        // SAFETY: off comes from our own enumeration; transmute matches repr.
        let r: av::hv_gic_redistributor_reg_t = unsafe { std::mem::transmute(off) };
        vcpu.set_redist_reg(r, val)
    };
    write_off(0x10080, find_off(0x10080))?; // IGROUPR0
    write_off(0x10C04, find_off(0x10C04))?; // ICFGR1
    for n in 0..8u32 {
        write_off(0x10400 + 4 * n, find_off(0x10400 + 4 * n))?; // IPRIORITYR0..7
    }
    write_off(0x10180, 0xFFFF_FFFF)?; // ICENABLER0 clear
    write_off(0x10100, find_off(0x10100))?; // ISENABLER0 set
    write_off(0x10280, 0xFFFF_FFFF)?; // ICPENDR0 clear
    write_off(0x10200, find_off(0x10200))?; // ISPENDR0 set
    write_off(0x10380, 0xFFFF_FFFF)?; // ICACTIVER0 clear
    write_off(0x10300, find_off(0x10300))?; // ISACTIVER0 set

    // 5. Force vtimer mask off so HVF re-evaluates on next run.
    let _ = vcpu.set_vtimer_mask(false);

    // 6. Vtimer force-fire: if the captured vtimer was enabled and
    //    unmasked, set CVAL=0 and force-pend the PPI bit so the guest
    //    wakes immediately rather than waiting on a stale deadline.
    //    A missing CNTV_CTL_EL0 entry is treated as 0 (timer disabled).
    let cntv_ctl = st
        .sys_regs
        .iter()
        .find_map(|(id, v)| {
            // SAFETY: id originated from sys_reg_enum() variants.
            let r: S = unsafe { std::mem::transmute(*id) };
            (r == S::CNTV_CTL_EL0).then_some(*v)
        })
        .unwrap_or(0);
    let enable = cntv_ctl & 1 != 0; // bit 0
    let imask = cntv_ctl & 2 != 0; // bit 1
    if enable && !imask {
        vcpu.set_sys_reg(S::CNTV_CVAL_EL0, 0)?;
        // GICR_ISPENDR0 bit 27 = vtimer PPI.
        write_off(0x10200, 1u64 << 27)?;
    }

    // 7. GP regs LAST (PC/CPSR finalize the vCPU).
    use av::hv_reg_t as R;
    for (id, v) in &st.gp_regs {
        // SAFETY: id from gp_reg_enum.
        let r: R = unsafe { std::mem::transmute(*id) };
        vcpu.set_reg(r, *v)?;
    }
    Ok(())
}

/// Apply `snap` to `vm` with default options, discarding the timing
/// report. A non-empty `snap.memory` is copied into guest RAM.
///
/// The caller must have created `vm` with the same RAM size as the
/// snapshot; on a mismatch the copy is silently capped at the smaller
/// of the two sizes.
pub fn restore_snapshot(vm: &MicroVm, snap: &Snapshot) -> hvf::Result<()> {
    let _timings = restore_snapshot_timed(vm, snap)?;
    Ok(())
}

/// Same as `restore_snapshot_timed_with_options` with
/// `SnapshotRestoreOptions::default()`; returns per-phase timings.
pub fn restore_snapshot_timed(
    vm: &MicroVm,
    snap: &Snapshot,
) -> hvf::Result<SnapshotRestoreTimings> {
    let options = SnapshotRestoreOptions::default();
    restore_snapshot_timed_with_options(vm, snap, options)
}

pub fn restore_snapshot_timed_with_options(
    vm: &MicroVm,
    snap: &Snapshot,
    options: SnapshotRestoreOptions,
) -> hvf::Result<SnapshotRestoreTimings> {
    let mut timings = SnapshotRestoreTimings::default();
    // 1. RAM. CoW-restored snapshots leave `memory` empty (pages are
    //    already mapped via mmap(MAP_PRIVATE) before MicroVm::new).
    if !snap.memory.is_empty() {
        let t0 = std::time::Instant::now();
        // SAFETY: ram_host is vm.ram_size bytes; we cap copy length.
        unsafe {
            std::ptr::copy_nonoverlapping(
                snap.memory.as_ptr(),
                vm.ram_host,
                vm.ram_size.min(snap.memory.len()),
            );
        }
        timings.ram_copy_us = t0.elapsed().as_micros();
    }
    // 2. GIC blob (covers distributor + per-PE pending/active).
    if !options.skip_gic_blob {
        let t0 = std::time::Instant::now();
        hvf::gic_state_restore(&snap.gic_blob)?;
        timings.gic_restore_us = t0.elapsed().as_micros();
    }
    // 3. Per-vCPU state.
    let boot_vcpu = snap.per_vcpu.first().ok_or(hvf::Error::Hv(-1))?;
    let t0 = std::time::Instant::now();
    restore_vcpu_state(&vm.vcpu, boot_vcpu)?;
    timings.vcpu_restore_us = t0.elapsed().as_micros();
    // 4. Coherent vtimer offset for all vCPUs (single value).
    let now = unsafe { mach_absolute_time() };
    let new_offset = now.wrapping_sub(snap.captured_cntvct);
    let t0 = std::time::Instant::now();
    vm.vcpu.set_vtimer_offset(new_offset)?;
    timings.vtimer_offset_us = t0.elapsed().as_micros();
    Ok(timings)
}

// ----- File serialization -----

/// Errors produced while reading or writing snapshot files.
#[derive(Debug)]
pub enum FileError {
    /// Underlying I/O failure (see the `From<std::io::Error>` impl).
    Io(std::io::Error),
    /// Presumably: the file does not start with `SNAPSHOT_MAGIC`.
    /// The loader is outside this excerpt, so confirm there.
    BadMagic,
    /// Presumably: the header carries a version other than
    /// `SNAPSHOT_VERSION`; the payload is the version found. Confirm in
    /// the loader.
    BadVersion(u64),
    /// Structurally invalid content; the payload is a static description.
    Malformed(&'static str),
    /// A fixed-width field did not have the expected number of bytes
    /// (returned by the `le_*` decoding helpers).
    Truncated,
}
impl From<std::io::Error> for FileError {
    fn from(e: std::io::Error) -> Self {
        Self::Io(e)
    }
}

/// Decode a little-endian `u16` from a slice of exactly 2 bytes;
/// any other length yields `FileError::Truncated`.
fn le_u16(bytes: &[u8]) -> Result<u16, FileError> {
    let raw: [u8; 2] = match bytes.try_into() {
        Ok(raw) => raw,
        Err(_) => return Err(FileError::Truncated),
    };
    Ok(u16::from_le_bytes(raw))
}

/// Decode a little-endian `u32` from a slice of exactly 4 bytes;
/// any other length yields `FileError::Truncated`.
fn le_u32(bytes: &[u8]) -> Result<u32, FileError> {
    let raw: [u8; 4] = match bytes.try_into() {
        Ok(raw) => raw,
        Err(_) => return Err(FileError::Truncated),
    };
    Ok(u32::from_le_bytes(raw))
}

/// Decode a little-endian `u64` from a slice of exactly 8 bytes;
/// any other length yields `FileError::Truncated`.
fn le_u64(bytes: &[u8]) -> Result<u64, FileError> {
    let raw: [u8; 8] = match bytes.try_into() {
        Ok(raw) => raw,
        Err(_) => return Err(FileError::Truncated),
    };
    Ok(u64::from_le_bytes(raw))
}

/// Decode a little-endian `u128` from a slice of exactly 16 bytes;
/// any other length yields `FileError::Truncated`.
fn le_u128(bytes: &[u8]) -> Result<u128, FileError> {
    let raw: [u8; 16] = match bytes.try_into() {
        Ok(raw) => raw,
        Err(_) => return Err(FileError::Truncated),
    };
    Ok(u128::from_le_bytes(raw))
}

/// Write `snap` to `path`, discarding the write statistics that
/// `save_to_file_with_stats` reports.
pub fn save_to_file(path: &str, snap: &Snapshot) -> Result<(), FileError> {
    let _stats = save_to_file_with_stats(path, snap)?;
    Ok(())
}

/// Compact in-memory snapshot used by the pipelined `SNAPSHOT_ASYNC`
/// path. Holds all the small fixed-size state by value plus the
/// guest's RAM as a list of (offset, page) pairs — only non-zero
/// pages, so memory cost is the actual working-set size (~100 MiB
/// for rust:1-slim post-init) instead of the full RAM allocation
/// (2 GiB).
///
/// Two-phase save: `capture_compact` runs while the guest is paused
/// (~50 ms for 100 MiB of non-zero pages on M-series), then
/// `save_compact_to_file` runs in a background thread while the
/// guest is unpaused for the warmup workload. Disk write writes
/// the same on-disk sparse format as the streaming path.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub struct CompactSnapshot {
    /// `mach_absolute_time()` sampled during capture.
    pub captured_mach_time: u64,
    /// `captured_mach_time` minus the boot vCPU's vtimer offset (wrapping).
    pub captured_cntvct: u64,
    /// Copied from `MicroVm::ram_gpa` at capture time.
    pub ram_gpa: u64,
    /// Total guest RAM size in bytes (not just the captured pages).
    pub ram_size: usize,
    /// Opaque blob returned by `hvf::gic_state_capture`.
    pub gic_blob: Vec<u8>,
    /// Boot vCPU state first, followed by the secondary vCPU states the
    /// caller handed to `capture_compact`.
    pub per_vcpu: Vec<PerVcpuState>,
    /// Virtio device state supplied by the caller.
    pub virtio: VirtioSnapshot,
    /// Non-zero pages only. Each entry: (byte offset within the
    /// guest RAM, owned 4 KiB page). Sorted by offset.
    pub pages: Vec<(usize, Box<[u8; 4096]>)>,
}

/// Granularity, in bytes, at which guest RAM is scanned for non-zero
/// pages. Must stay equal to the `[u8; 4096]` page buffers used by
/// `CompactSnapshot::pages` and the page-walk helpers.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
const COMPACT_PAGE_SIZE: usize = 4096;

/// Single-threaded page walk for `capture_compact`. Chosen when
/// `snapshot_write_threads()` reports one thread or fewer.
///
/// Scans the first `n_pages` pages of `memory` and returns an owned
/// copy of every page containing at least one non-zero byte, paired
/// with its byte offset, in ascending offset order.
///
/// Panics if `memory` is shorter than `n_pages * COMPACT_PAGE_SIZE`.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn capture_compact_pages_serial(
    memory: &[u8],
    n_pages: usize,
) -> Vec<(usize, Box<[u8; 4096]>)> {
    // Initial guess: roughly one page in twenty is non-zero.
    let mut kept: Vec<(usize, Box<[u8; 4096]>)> = Vec::with_capacity(n_pages / 20);
    kept.extend((0..n_pages).filter_map(|idx| {
        let start = idx * COMPACT_PAGE_SIZE;
        let src = &memory[start..start + COMPACT_PAGE_SIZE];
        if src.iter().any(|&byte| byte != 0) {
            let mut owned = Box::new([0u8; 4096]);
            owned.copy_from_slice(src);
            Some((start, owned))
        } else {
            None
        }
    }));
    kept
}

/// Parallel page walk for `capture_compact`. Splits guest RAM
/// into N contiguous slabs, scans each on a worker thread.
/// Output preserves source-offset order so the on-disk layout is
/// byte-identical to the serial path.
///
/// `n_threads` of 0 is treated as 1. Slabs that would be empty (more
/// threads than pages) get no worker.
///
/// CALLER MUST guarantee the guest is paused for the duration —
/// the threads borrow `memory` (a `&[u8]` slice over
/// `vm.ram_host`) for read-only access.
///
/// # Panics
/// Panics if `memory` is shorter than `n_pages * COMPACT_PAGE_SIZE`,
/// exactly like the serial walker. A panic inside a worker is re-raised
/// on the calling thread: returning a partial page list would silently
/// drop guest RAM from the snapshot.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn capture_compact_pages_parallel(
    memory: &[u8],
    n_pages: usize,
    n_threads: usize,
) -> Vec<(usize, Box<[u8; 4096]>)> {
    let n = n_threads.max(1);
    let pages_per_slab = (n_pages + n - 1) / n;

    // Each slab's walker returns its own ordered Vec of (offset,
    // page) entries. We concatenate them in slab order — slab K
    // covers pages [K*pages_per_slab, (K+1)*pages_per_slab),
    // strictly after slab K-1, so concatenation preserves the
    // global offset order.
    let slab_results: Vec<Vec<(usize, Box<[u8; 4096]>)>> = std::thread::scope(|s| {
        let mut handles = Vec::with_capacity(n);
        for slab_idx in 0..n {
            let slab_start = slab_idx * pages_per_slab;
            let slab_end = ((slab_idx + 1) * pages_per_slab).min(n_pages);
            if slab_start >= slab_end {
                continue;
            }
            // `memory` outlives the scope, so the workers can borrow the
            // shared slice directly; no raw-pointer round trip needed.
            handles.push(s.spawn(move || {
                let mut local: Vec<(usize, Box<[u8; 4096]>)> =
                    Vec::with_capacity((slab_end - slab_start) / 20);
                for page_idx in slab_start..slab_end {
                    let off = page_idx * COMPACT_PAGE_SIZE;
                    let chunk = &memory[off..off + COMPACT_PAGE_SIZE];
                    if chunk.iter().any(|&b| b != 0) {
                        let mut page = Box::new([0u8; 4096]);
                        page.copy_from_slice(chunk);
                        local.push((off, page));
                    }
                }
                local
            }));
        }
        handles
            .into_iter()
            // Re-raise a worker panic instead of substituting an empty
            // slab, which would lose that slab's pages without a trace.
            .map(|h| {
                h.join()
                    .unwrap_or_else(|payload| std::panic::resume_unwind(payload))
            })
            .collect()
    });

    let total: usize = slab_results.iter().map(Vec::len).sum();
    let mut pages = Vec::with_capacity(total);
    for slab in slab_results {
        pages.extend(slab);
    }
    pages
}

/// Pause-window half of the pipelined snapshot.
///
/// Captures the boot vCPU registers and GIC blob, prepends the boot
/// vCPU state to `secondary_states`, and copies every non-zero 4 KiB
/// page of guest RAM into the returned `CompactSnapshot`. Once this
/// returns the caller may resume the guest and hand the result to
/// `save_compact_to_file` on a background thread.
///
/// CALLER MUST guarantee the guest is paused — secondary vCPUs
/// rendezvous-stopped, vCPU 0 exited from `hv_vcpu_run`.
///
/// # Errors
/// HVF failures from the register or GIC capture are wrapped in
/// `SnapshotStreamError::Hvf`.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn capture_compact(
    vm: &MicroVm,
    virtio: VirtioSnapshot,
    secondary_states: Vec<PerVcpuState>,
) -> Result<CompactSnapshot, SnapshotStreamError> {
    let boot = capture_vcpu_state(&vm.vcpu).map_err(SnapshotStreamError::Hvf)?;
    let gic_blob = hvf::gic_state_capture().map_err(SnapshotStreamError::Hvf)?;
    // SAFETY: mach_absolute_time takes no arguments and only reads the
    // host tick counter.
    let captured_mach_time = unsafe { mach_absolute_time() };
    let captured_cntvct = captured_mach_time.wrapping_sub(boot.vtimer_offset);
    let per_vcpu: Vec<PerVcpuState> = std::iter::once(boot).chain(secondary_states).collect();

    // Walk vm.ram_host page by page and keep only pages with content.
    // All-zero pages are skipped (the kernel hasn't touched them, and
    // restore leaves them zero in the CoW-mapped target).
    //
    // The walk is the SECOND-largest pause-window cost in the
    // bake-then-pool flow (after the file write), so with more than one
    // writer thread configured RAM is split into contiguous slabs that
    // are scanned concurrently (~500 ms serial vs ~125 ms parallel for
    // 2 GiB on M-series). Both walkers return pages in ascending offset
    // order, so the result is identical either way.
    let ram_size = vm.ram_size;
    // SAFETY: ram_host points at ram_size bytes of guest RAM, and the
    // caller guarantees the guest is paused while the slice is alive.
    let memory: &[u8] = unsafe { std::slice::from_raw_parts(vm.ram_host, ram_size) };
    let n_pages = ram_size / COMPACT_PAGE_SIZE;
    let pages = match snapshot_write_threads() {
        0 | 1 => capture_compact_pages_serial(memory, n_pages),
        workers => capture_compact_pages_parallel(memory, n_pages, workers),
    };

    Ok(CompactSnapshot {
        captured_mach_time,
        captured_cntvct,
        ram_gpa: vm.ram_gpa,
        ram_size,
        gic_blob,
        per_vcpu,
        virtio,
        pages,
    })
}

/// Encode the meta section of a compact snapshot (header + GIC
/// blob + per-vCPU state + virtio metadata) into a flat byte
/// buffer. Same byte layout as what `save_compact_to_file` writes
/// from offset 0 to the start of the RAM padding. Used by the
/// diff-via-clone path (`save_compact_to_file_via_clone`) to
/// rewrite the meta section of an APFS-cloned base file in place.
///
/// `ram_offset_for_header` is the value to encode into the
/// header's `ram_offset` field (bytes 64..72). For fresh saves the
/// caller fills it in after computing pad alignment; for the diff
/// path the caller passes the base file's existing ram_offset (so
/// warm's page coordinates map onto the same on-disk RAM region).
///
/// All integers are little-endian. Collection lengths are written as
/// `u32`.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn encode_compact_meta(snap: &CompactSnapshot, ram_offset_for_header: u64) -> Vec<u8> {
    /// Append `(id u32, value u64)` pairs in order.
    fn put_regs(buf: &mut Vec<u8>, regs: &[(u32, u64)]) {
        for (id, val) in regs {
            buf.extend_from_slice(&id.to_le_bytes());
            buf.extend_from_slice(&val.to_le_bytes());
        }
    }

    // Exact size of the per-vCPU section: vtimer offset + five u32
    // counts, then 12 bytes per u64 register and 20 per SIMD register.
    let per_vcpu_bytes: usize = snap
        .per_vcpu
        .iter()
        .map(|st| {
            let u64_regs =
                st.gp_regs.len() + st.sys_regs.len() + st.icc_regs.len() + st.redist_regs.len();
            8 + 5 * 4 + u64_regs * (4 + 8) + st.simd_regs.len() * (4 + 16)
        })
        .sum();
    // The virtio terms are rough estimates; this is only a capacity hint.
    let mut buf = Vec::with_capacity(
        72 + snap.gic_blob.len()
            + per_vcpu_bytes
            + snap.virtio.mmio.len() * 256
            + snap.virtio.vsock_listeners.len() * 24,
    );

    // Header (72 bytes).
    buf.extend_from_slice(&SNAPSHOT_MAGIC);
    buf.extend_from_slice(&SNAPSHOT_VERSION.to_le_bytes());
    buf.extend_from_slice(&snap.captured_mach_time.to_le_bytes());
    buf.extend_from_slice(&snap.captured_cntvct.to_le_bytes());
    buf.extend_from_slice(&(snap.ram_size as u64).to_le_bytes());
    buf.extend_from_slice(&(snap.gic_blob.len() as u64).to_le_bytes());
    buf.extend_from_slice(&(snap.per_vcpu.len() as u32).to_le_bytes());
    buf.extend_from_slice(&0u32.to_le_bytes()); // reserved
    buf.extend_from_slice(&snap.ram_gpa.to_le_bytes());
    buf.extend_from_slice(&ram_offset_for_header.to_le_bytes());

    // Opaque GIC blob.
    buf.extend_from_slice(&snap.gic_blob);

    // Per-vCPU register state.
    for st in &snap.per_vcpu {
        buf.extend_from_slice(&st.vtimer_offset.to_le_bytes());
        buf.extend_from_slice(&(st.gp_regs.len() as u32).to_le_bytes());
        buf.extend_from_slice(&(st.simd_regs.len() as u32).to_le_bytes());
        buf.extend_from_slice(&(st.sys_regs.len() as u32).to_le_bytes());
        buf.extend_from_slice(&(st.icc_regs.len() as u32).to_le_bytes());
        buf.extend_from_slice(&(st.redist_regs.len() as u32).to_le_bytes());
        put_regs(&mut buf, &st.gp_regs);
        for (id, v) in &st.simd_regs {
            buf.extend_from_slice(&id.to_le_bytes());
            buf.extend_from_slice(&v.to_le_bytes());
        }
        put_regs(&mut buf, &st.sys_regs);
        put_regs(&mut buf, &st.icc_regs);
        put_regs(&mut buf, &st.redist_regs);
    }

    // Virtio MMIO devices and their queues.
    buf.extend_from_slice(&(snap.virtio.mmio.len() as u32).to_le_bytes());
    for m in &snap.virtio.mmio {
        buf.extend_from_slice(&m.driver_features[0].to_le_bytes());
        buf.extend_from_slice(&m.driver_features[1].to_le_bytes());
        buf.extend_from_slice(&m.status.to_le_bytes());
        buf.extend_from_slice(&m.interrupt_status.to_le_bytes());
        buf.extend_from_slice(&(m.queues.len() as u32).to_le_bytes());
        for q in &m.queues {
            buf.extend_from_slice(&q.size.to_le_bytes());
            buf.push(if q.ready { 1 } else { 0 });
            buf.push(0); // pad byte after the `ready` flag
            buf.extend_from_slice(&q.desc_table.to_le_bytes());
            buf.extend_from_slice(&q.avail_ring.to_le_bytes());
            buf.extend_from_slice(&q.used_ring.to_le_bytes());
            buf.extend_from_slice(&q.last_avail_idx.to_le_bytes());
            buf.extend_from_slice(&q.next_used_idx.to_le_bytes());
        }
    }

    // vsock TSI listeners.
    buf.extend_from_slice(&(snap.virtio.vsock_listeners.len() as u32).to_le_bytes());
    for l in &snap.virtio.vsock_listeners {
        buf.extend_from_slice(&l.cid.to_le_bytes());
        buf.extend_from_slice(&l.peer_port.to_le_bytes());
        buf.extend_from_slice(&l.vm_port.to_le_bytes());
        buf.extend_from_slice(&l.family.to_le_bytes());
        buf.extend_from_slice(&l.socktype.to_le_bytes());
    }
    buf
}

/// Background-thread step of the pipelined snapshot. Writes the
/// captured pages to a `.partial` sibling file, then atomically
/// renames to `path` on success. The output is the same on-disk
/// sparse format that `save_to_file_with_stats` and
/// `capture_and_save_streaming` produce — `Image::from_snapshot`
/// reads any of the three interchangeably.
///
/// Crash safety: only the `.partial` file exists mid-write. A
/// crash leaves no `restore.snap` (which is what callers/cache
/// fingerprint check for), so the next bake invocation cleanly
/// re-runs. An I/O error likewise leaves the `.partial` file behind;
/// the next save truncates it.
///
/// # Errors
/// Any I/O failure while creating, writing or renaming the file is
/// returned as `FileError::Io`.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn save_compact_to_file(
    snap: &CompactSnapshot,
    path: &str,
) -> Result<SnapshotWriteStats, FileError> {
    use std::io::Write;
    use std::os::unix::fs::FileExt;

    /// Byte range of the header's `ram_offset` field.
    const RAM_OFFSET_FIELD: std::ops::Range<usize> = 64..72;

    let partial = format!("{path}.partial");
    let mut f = std::fs::File::create(&partial)?;

    // The RAM region starts at the first RAM_PAGE_ALIGN boundary at or
    // after the meta section. Encode the meta with a placeholder, then
    // patch the real offset into the buffer before anything reaches
    // disk, so no seek-back is needed and the file never carries a
    // bogus header.
    let mut meta = encode_compact_meta(snap, 0);
    let meta_len = meta.len() as u64;
    let pad = (RAM_PAGE_ALIGN - (meta_len % RAM_PAGE_ALIGN)) % RAM_PAGE_ALIGN;
    let ram_offset = meta_len + pad;
    meta[RAM_OFFSET_FIELD].copy_from_slice(&ram_offset.to_le_bytes());
    f.write_all(&meta)?;

    // Extend the file to its final size. The alignment padding and
    // every page that is never written below read back as zeros.
    let ram_bytes = snap.ram_size as u64;
    f.set_len(ram_offset + ram_bytes)?;

    // Sparse-write the captured pages. Each page lands at
    // (ram_offset + page_off) via positional writes, so writers
    // targeting disjoint offsets can't conflict on file position.
    // With more than one writer thread configured the page list is
    // split across `snapshot_write_threads()` workers so the write
    // phase scales with disk bandwidth.
    let n_threads = snapshot_write_threads();
    let data_bytes = if n_threads <= 1 || snap.pages.len() < 64 {
        // Serial fallback: tiny page lists aren't worth thread
        // overhead.
        let mut written = 0u64;
        for (page_off, page) in &snap.pages {
            f.write_all_at(&page[..], ram_offset + *page_off as u64)?;
            written += COMPACT_PAGE_SIZE as u64;
        }
        written
    } else {
        save_compact_pages_parallel(&f, &snap.pages, ram_offset, n_threads)?
    };
    drop(f);

    // Atomic rename: only `path` (the canonical name) exists when
    // we return. Mid-write crashes leave `path.partial` instead,
    // which `Image::from_snapshot` ignores.
    std::fs::rename(&partial, path)?;

    Ok(SnapshotWriteStats {
        ram_bytes,
        ram_data_bytes: data_bytes,
        // Saturate so a hand-built snapshot whose pages exceed
        // `ram_size` cannot underflow the counter.
        ram_zero_bytes: ram_bytes.saturating_sub(data_bytes),
    })
}

/// Write the captured pages in parallel via `pwrite`. Pages list
/// is sliced into `n_threads` evenly-sized chunks. Used by
/// [`save_compact_to_file`] for the bake-time async base save.
///
/// Each page is written at `ram_offset + page_off`. Returns the number
/// of bytes written (`COMPACT_PAGE_SIZE` per page); an empty page list
/// writes nothing and returns 0. `n_threads` of 0 is treated as 1.
///
/// # Errors
/// The first failing write is returned as `FileError::Io`. A panicking
/// worker is reported as an I/O error as well.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn save_compact_pages_parallel(
    f: &std::fs::File,
    pages: &[(usize, Box<[u8; 4096]>)],
    ram_offset: u64,
    n_threads: usize,
) -> Result<u64, FileError> {
    use std::os::unix::fs::FileExt;

    // `chunks(0)` panics, so an empty list must not reach the split.
    if pages.is_empty() {
        return Ok(0);
    }
    let n = n_threads.max(1);
    // At least 1 because `pages` is non-empty.
    let per_thread = (pages.len() + n - 1) / n;

    let total_data: u64 = std::thread::scope(|s| -> std::io::Result<u64> {
        let mut handles = Vec::with_capacity(n);
        for chunk in pages.chunks(per_thread) {
            // Positional writes take `&File`, so every worker can share
            // the caller's handle; no per-thread descriptor is needed.
            handles.push(s.spawn(move || -> std::io::Result<u64> {
                let mut bytes = 0u64;
                for (page_off, page) in chunk {
                    f.write_all_at(&page[..], ram_offset + *page_off as u64)?;
                    bytes += COMPACT_PAGE_SIZE as u64;
                }
                Ok(bytes)
            }));
        }
        let mut total = 0u64;
        for h in handles {
            total += h
                .join()
                .map_err(|_| std::io::Error::other("snapshot write thread panicked"))??;
        }
        Ok(total)
    })?;

    Ok(total_data)
}

/// Minimal safe shim over the macOS `clonefile(2)` syscall.
///
/// Asks APFS for a copy-on-write clone of `src` at `dst`; the two
/// files share data blocks until one of them is written. A path
/// containing an interior NUL byte is rejected with `InvalidInput`
/// before the syscall is made. Any syscall failure (ENOENT, EXDEV,
/// ...) comes back as the corresponding `io::Error` so callers can
/// fall through to the plain save path.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn clonefile_via_libc(src: &str, dst: &str) -> std::io::Result<()> {
    use std::ffi::CString;
    use std::io::{Error, ErrorKind};

    let src_c = CString::new(src)
        .map_err(|_| Error::new(ErrorKind::InvalidInput, "src path contains NUL byte"))?;
    let dst_c = CString::new(dst)
        .map_err(|_| Error::new(ErrorKind::InvalidInput, "dst path contains NUL byte"))?;

    // SAFETY: both pointers refer to NUL-terminated buffers owned by
    // the `CString`s above, which outlive the call.
    match unsafe { libc::clonefile(src_c.as_ptr(), dst_c.as_ptr(), 0) } {
        0 => Ok(()),
        _ => Err(Error::last_os_error()),
    }
}

/// Load a snapshot file into an in-memory `CompactSnapshot` —
/// only the non-zero pages are kept (same as the in-memory
/// representation that `capture_compact` produces). Used by the
/// diff-via-clone path when the runner needs base in memory but
/// it's not in the in-flight async-save list (e.g. for cycle-
/// snapshots from a worker that was restored via `--restore-from`,
/// not via a recent `SNAPSHOT_ASYNC`).
///
/// Memory cost: ~size of non-zero pages (~100 MiB on
/// rust:1-slim). Wall time: ~150 ms for a 2 GiB sparse snapshot
/// on M-series (real disk reads scale with non-zero size; sparse
/// holes return zero from the page cache instantly).
///
/// # Errors
///
/// * Whatever `load_meta` reports for a bad header / metadata.
/// * `FileError::Truncated` if the file is shorter than
///   `ram_offset + memory_bytes` (the full-load path reports the
///   same condition the same way).
/// * `FileError::Io` if the file cannot be opened or mapped.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn load_compact_from_file(path: &str) -> Result<CompactSnapshot, FileError> {
    use std::os::fd::AsRawFd;

    // Reuse load_meta for the header / GIC / per_vcpu / virtio.
    let (snap, ram_offset, memory_bytes) = load_meta(path)?;

    let f = std::fs::File::open(path)?;

    // Every mapped page is read below. Touching a mapped page that
    // lies wholly past end-of-file raises SIGBUS, which would kill
    // the process instead of surfacing an error — so verify the RAM
    // region is actually backed by the file before mapping it.
    let ram_end = ram_offset
        .checked_add(memory_bytes as u64)
        .ok_or(FileError::Truncated)?;
    if f.metadata()?.len() < ram_end {
        return Err(FileError::Truncated);
    }

    // mmap the RAM section read-only and walk it. Avoids the 2
    // GiB Vec<u8> allocation that `load_from_file` would do.
    let fd = f.as_raw_fd();
    // SAFETY: `fd` is a valid open descriptor for the duration of
    // the call; the kernel picks the address.
    let ptr = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            memory_bytes,
            libc::PROT_READ,
            libc::MAP_PRIVATE,
            fd,
            ram_offset as libc::off_t,
        )
    };
    if ptr == libc::MAP_FAILED {
        return Err(FileError::Io(std::io::Error::last_os_error()));
    }
    // Hint kernel for sequential read-ahead. Best-effort: a failed
    // hint only affects speed, so the result is ignored.
    // SAFETY: `ptr`/`memory_bytes` describe the mapping created above.
    unsafe {
        let _ = libc::madvise(ptr, memory_bytes, libc::MADV_SEQUENTIAL);
    }

    // SAFETY: the mapping is `memory_bytes` long, readable, and stays
    // mapped until the `munmap` below; the slice is not used after it.
    let memory: &[u8] = unsafe { std::slice::from_raw_parts(ptr as *const u8, memory_bytes) };
    let n_pages = memory_bytes / COMPACT_PAGE_SIZE;
    // Capacity guess: roughly 1 page in 20 is expected to be non-zero.
    let mut pages: Vec<(usize, Box<[u8; 4096]>)> = Vec::with_capacity(n_pages / 20);
    // `chunks_exact` skips a trailing partial page, matching the
    // `memory_bytes / COMPACT_PAGE_SIZE` page count.
    for (page_idx, chunk) in memory.chunks_exact(COMPACT_PAGE_SIZE).enumerate() {
        if chunk.iter().any(|&b| b != 0) {
            let mut page = Box::new([0u8; 4096]);
            page.copy_from_slice(chunk);
            pages.push((page_idx * COMPACT_PAGE_SIZE, page));
        }
    }

    // SAFETY: ptr was returned by mmap with this size and we own it.
    unsafe {
        libc::munmap(ptr, memory_bytes);
    }

    Ok(CompactSnapshot {
        captured_mach_time: snap.captured_mach_time,
        captured_cntvct: snap.captured_cntvct,
        ram_gpa: snap.ram_gpa,
        ram_size: memory_bytes,
        gic_blob: snap.gic_blob,
        per_vcpu: snap.per_vcpu,
        virtio: snap.virtio,
        pages,
    })
}

/// Differential compact-snapshot save via APFS clonefile + diff
/// pwrite. Same on-disk format as [`save_compact_to_file`] —
/// restore is byte-identical for byte-identical guest state, so
/// no special restore path is needed.
///
/// Algorithm:
///   1. `clonefile(base_path → out_path.partial)` — APFS CoW
///      clone, ~no I/O. Data blocks shared with base.
///   2. Read base's `ram_offset` and `ram_size` from the cloned
///      header; a `ram_size` mismatch with `snap` is an error.
///   3. Encode `snap`'s meta section (header + GIC + per-vCPU +
///      virtio); pad to base's `ram_offset`. If meta overflows
///      `ram_offset` (e.g. warmup added many vsock listeners),
///      bail out and let the caller fall through to a plain
///      save. We DON'T retry inside this function — the caller
///      chose the diff path; if the layout doesn't match we
///      return Err so they can decide whether to plain-save or
///      surface the error.
///   4. `pwrite_at` warm meta over [0, ram_offset).
///   5. Compute the diff page set: pages in `snap` whose bytes
///      differ from the same offset in `base`, plus pages in
///      `base` that are absent from `snap` (zeroed by warmup).
///   6. pwrite the changed pages (in parallel for large sets).
///   7. Punch holes over the zeroed pages, falling back to an
///      explicit zero pwrite where hole-punching fails.
///   8. Atomic rename `out_path.partial → out_path`.
///
/// On ANY failure after the clone was created, `out_path.partial`
/// is removed (best-effort) before the error is returned.
///
/// The returned stats report `ram_data_bytes` as the bytes pwritten
/// for changed pages plus the bytes covered by successful hole
/// punches; `ram_zero_bytes` is `ram_size` minus that.
///
/// On `clonefile` EXDEV / ENOENT the caller should fall through
/// to [`save_compact_to_file`].
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn save_compact_to_file_via_clone(
    snap: &CompactSnapshot,
    base: &CompactSnapshot,
    base_path: &str,
    out_path: &str,
) -> Result<SnapshotWriteStats, FileError> {
    let partial = format!("{out_path}.partial");
    // A leftover from an earlier crash would make clonefile fail
    // because the destination already exists.
    let _ = std::fs::remove_file(&partial);

    // 1. Clone base file.
    clonefile_via_libc(base_path, &partial).map_err(FileError::Io)?;

    // 2-7. Patch the clone in place, then 8. publish it atomically.
    let result = patch_cloned_snapshot(snap, base, &partial).and_then(|stats| {
        std::fs::rename(&partial, out_path)
            .map(|()| stats)
            .map_err(FileError::Io)
    });

    // Single cleanup point: no error path may leave a half-patched
    // clone on disk.
    if result.is_err() {
        let _ = std::fs::remove_file(&partial);
    }
    result
}

/// Steps 2-7 of [`save_compact_to_file_via_clone`]: rewrite the meta
/// section of the clone at `partial` and apply the RAM page diff
/// between `base` and `snap`. Never removes or renames `partial`;
/// the caller owns cleanup and publication.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn patch_cloned_snapshot(
    snap: &CompactSnapshot,
    base: &CompactSnapshot,
    partial: &str,
) -> Result<SnapshotWriteStats, FileError> {
    use std::collections::{HashMap, HashSet};
    use std::os::unix::fs::FileExt;
    use std::os::unix::io::AsRawFd;

    // 2. Open partial RW.
    let f = std::fs::OpenOptions::new()
        .read(true)
        .write(true)
        .open(partial)?;

    // Read base's ram_offset from header (offset 64..72).
    let mut ram_off_bytes = [0u8; 8];
    f.read_exact_at(&mut ram_off_bytes, 64)?;
    let ram_offset = u64::from_le_bytes(ram_off_bytes);

    // Sanity: base's ram_size should match warm's. If not, the
    // VMs are configured differently and clone-based diff makes
    // no sense.
    let mut base_ram_size_bytes = [0u8; 8];
    f.read_exact_at(&mut base_ram_size_bytes, 32)?;
    let base_ram_size = u64::from_le_bytes(base_ram_size_bytes);
    if base_ram_size != snap.ram_size as u64 {
        return Err(FileError::Io(std::io::Error::other(format!(
            "diff snapshot: base ram_size {base_ram_size} != warm ram_size {}",
            snap.ram_size
        ))));
    }

    // 3. Encode warm's meta and verify it fits before ram_offset.
    let meta = encode_compact_meta(snap, ram_offset);
    if (meta.len() as u64) > ram_offset {
        return Err(FileError::Io(std::io::Error::other(format!(
            "diff snapshot: warm meta {} bytes overflows base ram_offset {ram_offset}",
            meta.len()
        ))));
    }
    // 4. Pad to ram_offset with zeros so [0, ram_offset) is fully
    // overwritten and we don't leave any base-meta bytes behind.
    let mut padded = meta;
    padded.resize(ram_offset as usize, 0);
    f.write_all_at(&padded, 0)?;

    // 5. Compute diff page set, split into:
    //    * `data_pages`: pages with bytes != base's bytes (or
    //      pages new in warm). Need pwrite of warm content.
    //    * `zero_offsets`: offsets where base had non-zero bytes
    //      but warm has zero. Punching a hole at these offsets
    //      deallocates the cloned base's data block — no on-disk
    //      space cost, and restore reads zero (correct).
    //
    //  Why two paths: APFS clonefile shares blocks with base.
    //  pwriting zeros forces APFS to allocate a NEW block (4 KiB
    //  of physical disk per zeroed page), defeating the clone's
    //  size win. F_PUNCHHOLE deallocates the cloned block
    //  outright — restore-time reads return zero from the hole,
    //  same observable result as a zero pwrite, ~3x physical
    //  disk savings on a typical warmup that zeros buffer cache.
    let base_lookup: HashMap<usize, &[u8; 4096]> = base
        .pages
        .iter()
        .map(|(o, p)| (*o, p.as_ref()))
        .collect();
    let warm_offsets: HashSet<usize> = snap.pages.iter().map(|(o, _)| *o).collect();

    let mut data_pages: Vec<(usize, &[u8; 4096])> = Vec::new();
    for (off, warm_p) in &snap.pages {
        let warm_bytes: &[u8; 4096] = warm_p.as_ref();
        match base_lookup.get(off) {
            None => data_pages.push((*off, warm_bytes)),
            Some(b) if *b != warm_bytes => data_pages.push((*off, warm_bytes)),
            _ => {}
        }
    }
    let mut zero_offsets: Vec<usize> = Vec::new();
    for (off, _) in &base.pages {
        if !warm_offsets.contains(off) {
            zero_offsets.push(*off);
        }
    }
    // Sorted so contiguous runs can be coalesced below: each
    // F_PUNCHHOLE call then covers many adjacent pages — fewer
    // syscalls, and APFS deallocates the run as a single extent.
    zero_offsets.sort_unstable();

    // 6. pwrite the warm-data diff pages (parallel for >= 64).
    let n_threads = snapshot_write_threads();
    let data_bytes_written = if n_threads <= 1 || data_pages.len() < 64 {
        let mut bytes = 0u64;
        for (off, page) in &data_pages {
            f.write_all_at(*page, ram_offset + *off as u64)?;
            bytes += COMPACT_PAGE_SIZE as u64;
        }
        bytes
    } else {
        save_diff_pages_parallel(&f, &data_pages, ram_offset, n_threads)?
    };

    // 7. Punch holes for zeroed pages (coalesced runs).
    let mut hole_bytes = 0u64;
    if !zero_offsets.is_empty() {
        let fd = f.as_raw_fd();
        let mut i = 0;
        while i < zero_offsets.len() {
            // Grow [run_start, run_end) while the next offset is
            // exactly adjacent to the current run.
            let run_start = zero_offsets[i];
            let mut run_end = run_start + COMPACT_PAGE_SIZE;
            i += 1;
            while i < zero_offsets.len() && zero_offsets[i] == run_end {
                run_end += COMPACT_PAGE_SIZE;
                i += 1;
            }
            let span = (run_end - run_start) as i64;
            let punch = libc::fpunchhole_t {
                fp_flags: 0,
                reserved: 0,
                fp_offset: (ram_offset + run_start as u64) as libc::off_t,
                fp_length: span,
            };
            // SAFETY: fd is valid for the file we just wrote;
            // fpunchhole_t pointer is owned by us.
            let r = unsafe {
                libc::fcntl(
                    fd,
                    libc::F_PUNCHHOLE,
                    &punch as *const libc::fpunchhole_t,
                )
            };
            if r == 0 {
                hole_bytes += span as u64;
            } else {
                // Filesystem doesn't support hole-punching (e.g.
                // some non-APFS network mounts). Fall back to
                // explicit zero pwrite for this run — same
                // semantic, just doesn't save physical disk.
                let zeros = vec![0u8; span as usize];
                f.write_all_at(&zeros, ram_offset + run_start as u64)?;
            }
        }
    }
    let data_bytes = data_bytes_written + hole_bytes;

    Ok(SnapshotWriteStats {
        ram_bytes: snap.ram_size as u64,
        ram_data_bytes: data_bytes,
        ram_zero_bytes: snap.ram_size as u64 - data_bytes,
    })
}

/// Parallel pwrite of a (offset, &[u8; 4096])-list. Sibling of
/// [`save_compact_pages_parallel`] but works over borrowed page
/// references, so the caller can pass pages borrowed from any
/// owner — the scoped threads inherit the page borrows.
///
/// Each page is written at `ram_offset + offset`. The list is
/// split into at most `n_threads` contiguous chunks, one scoped
/// thread per chunk. Returns the number of bytes written
/// (`COMPACT_PAGE_SIZE` per page); an empty list writes nothing and
/// returns `Ok(0)`.
///
/// # Errors
///
/// Returns the first I/O error from duplicating the file handle or
/// from a worker's write; a panicking worker is reported as an
/// `io::Error` as well.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
fn save_diff_pages_parallel(
    f: &std::fs::File,
    pages: &[(usize, &[u8; 4096])],
    ram_offset: u64,
    n_threads: usize,
) -> Result<u64, FileError> {
    use std::os::unix::fs::FileExt;
    let n = n_threads.max(1);
    // `slice::chunks` panics on a chunk size of 0, which is exactly
    // what the ceiling division yields for an empty page list.
    let per_thread = pages.len().div_ceil(n).max(1);

    let total: u64 = std::thread::scope(|s| -> Result<u64, std::io::Error> {
        let mut handles = Vec::with_capacity(n);
        for chunk in pages.chunks(per_thread) {
            // Every worker gets its own handle; positional writes do
            // not touch the shared file cursor.
            let f_clone = f.try_clone()?;
            let h = s.spawn(move || -> std::io::Result<u64> {
                let mut bytes = 0u64;
                for (off, page) in chunk {
                    f_clone.write_all_at(*page, ram_offset + *off as u64)?;
                    bytes += COMPACT_PAGE_SIZE as u64;
                }
                Ok(bytes)
            });
            handles.push(h);
        }
        let mut total = 0u64;
        for h in handles {
            total += h
                .join()
                .map_err(|_| std::io::Error::other("snapshot diff write thread panicked"))??;
        }
        Ok(total)
    })?;

    Ok(total)
}

/// One-pass capture + save: grabs vCPU 0 and GIC state, then
/// writes the snapshot file with guest RAM borrowed straight from
/// `vm.ram_host` instead of first copying it into the 2 GiB
/// `Vec<u8>` that `capture_snapshot` allocates. That copy is ~1.5 s
/// of memcpy on M-series for a 2 GiB-baked image — the bulk of
/// `PooledVm::snapshot` cost on a typical warmup-then-bake flow.
///
/// `secondary_states` are appended after vCPU 0's state, and
/// `virtio` is written as given.
///
/// CALLER MUST keep the guest paused for the whole call (vCPU
/// exited from `hv_vcpu_run`, secondaries rendezvous-stopped): the
/// RAM is read in place while the file is being written. The
/// runner's snapshot RPC handler already does this; bake-time
/// auto-capture goes through the original `capture_snapshot` +
/// `save_to_file_with_stats` pair, which captures and saves inside
/// the same pause window anyway.
///
/// Capture failures are returned as `SnapshotStreamError::Hvf`,
/// file-write failures as `SnapshotStreamError::Io`.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn capture_and_save_streaming(
    vm: &MicroVm,
    virtio: &VirtioSnapshot,
    secondary_states: &[PerVcpuState],
    path: &str,
) -> Result<SnapshotWriteStats, SnapshotStreamError> {
    // Capture order: primary vCPU, GIC, then the host timestamp.
    let primary = capture_vcpu_state(&vm.vcpu).map_err(SnapshotStreamError::Hvf)?;
    let gic_blob = hvf::gic_state_capture().map_err(SnapshotStreamError::Hvf)?;
    let captured_mach_time = unsafe { mach_absolute_time() };
    // Guest counter = host time minus vCPU 0's vtimer offset.
    let captured_cntvct = captured_mach_time.wrapping_sub(primary.vtimer_offset);

    // vCPU 0 first, followed by the secondaries in the given order.
    let per_vcpu: Vec<PerVcpuState> = std::iter::once(primary)
        .chain(secondary_states.iter().cloned())
        .collect();

    // SAFETY: vm.ram_host is vm.ram_size bytes; the guest is paused
    // for the duration of this call (caller's contract), so nothing
    // mutates the bytes while they are borrowed.
    let guest_ram: &[u8] = unsafe { std::slice::from_raw_parts(vm.ram_host, vm.ram_size) };

    let written = write_snapshot_file(
        path,
        captured_mach_time,
        captured_cntvct,
        vm.ram_gpa,
        guest_ram,
        &gic_blob,
        &per_vcpu,
        virtio,
    );
    written.map_err(SnapshotStreamError::Io)
}

/// Error from `capture_and_save_streaming`.
///
/// Distinguishes the two stages of that call: capturing state from
/// the hypervisor and writing the snapshot file.
#[derive(Debug)]
pub enum SnapshotStreamError {
    /// Capturing vCPU or GIC state failed.
    Hvf(hvf::Error),
    /// Writing the snapshot file failed.
    Io(FileError),
}

/// Human-readable rendering: a stage prefix followed by the
/// `Debug` form of the underlying error.
impl std::fmt::Display for SnapshotStreamError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Hvf(cause) => f.write_fmt(format_args!("snapshot capture: {cause:?}")),
            Self::Io(cause) => f.write_fmt(format_args!("snapshot save: {cause:?}")),
        }
    }
}

pub fn save_to_file_with_stats(
    path: &str,
    snap: &Snapshot,
) -> Result<SnapshotWriteStats, FileError> {
    write_snapshot_file(
        path,
        snap.captured_mach_time,
        snap.captured_cntvct,
        snap.ram_gpa,
        &snap.memory,
        &snap.gic_blob,
        &snap.per_vcpu,
        &snap.virtio,
    )
}

#[allow(clippy::too_many_arguments)]
fn write_snapshot_file(
    path: &str,
    captured_mach_time: u64,
    captured_cntvct: u64,
    ram_gpa: u64,
    memory: &[u8],
    gic_blob: &[u8],
    per_vcpu: &[PerVcpuState],
    virtio: &VirtioSnapshot,
) -> Result<SnapshotWriteStats, FileError> {
    use std::io::{Seek, SeekFrom};
    // Crash-atomicity: write to `<path>.partial`, atomically rename
    // on success. Mirrors `save_compact_to_file`. A crash mid-write
    // leaves only the `.partial`, which `Image::from_snapshot`
    // ignores, so the next invocation cleanly re-bakes.
    let partial = format!("{path}.partial");
    let mut f = std::fs::File::create(&partial)?;
    // Header (72 bytes). ram_offset is filled in after we know where
    // RAM ends up — see the seek-and-rewrite at the bottom.
    let mut hdr = Vec::with_capacity(72);
    hdr.extend_from_slice(&SNAPSHOT_MAGIC);
    hdr.extend_from_slice(&SNAPSHOT_VERSION.to_le_bytes());
    hdr.extend_from_slice(&captured_mach_time.to_le_bytes());
    hdr.extend_from_slice(&captured_cntvct.to_le_bytes());
    hdr.extend_from_slice(&(memory.len() as u64).to_le_bytes());
    hdr.extend_from_slice(&(gic_blob.len() as u64).to_le_bytes());
    hdr.extend_from_slice(&(per_vcpu.len() as u32).to_le_bytes());
    hdr.extend_from_slice(&0u32.to_le_bytes()); // reserved
    hdr.extend_from_slice(&ram_gpa.to_le_bytes());
    hdr.extend_from_slice(&0u64.to_le_bytes()); // ram_offset placeholder
    f.write_all(&hdr)?;
    f.write_all(gic_blob)?;
    for st in per_vcpu {
        let mut p = Vec::with_capacity(64 + st.gp_regs.len() * 12 + st.simd_regs.len() * 20);
        p.extend_from_slice(&st.vtimer_offset.to_le_bytes());
        p.extend_from_slice(&(st.gp_regs.len() as u32).to_le_bytes());
        p.extend_from_slice(&(st.simd_regs.len() as u32).to_le_bytes());
        p.extend_from_slice(&(st.sys_regs.len() as u32).to_le_bytes());
        p.extend_from_slice(&(st.icc_regs.len() as u32).to_le_bytes());
        p.extend_from_slice(&(st.redist_regs.len() as u32).to_le_bytes());
        for (id, v) in &st.gp_regs {
            p.extend_from_slice(&id.to_le_bytes());
            p.extend_from_slice(&v.to_le_bytes());
        }
        for (id, v) in &st.simd_regs {
            p.extend_from_slice(&id.to_le_bytes());
            p.extend_from_slice(&v.to_le_bytes());
        }
        for (id, v) in &st.sys_regs {
            p.extend_from_slice(&id.to_le_bytes());
            p.extend_from_slice(&v.to_le_bytes());
        }
        for (id, v) in &st.icc_regs {
            p.extend_from_slice(&id.to_le_bytes());
            p.extend_from_slice(&v.to_le_bytes());
        }
        for (off, v) in &st.redist_regs {
            p.extend_from_slice(&off.to_le_bytes());
            p.extend_from_slice(&v.to_le_bytes());
        }
        f.write_all(&p)?;
    }

    // Virtio section: u32 n_mmio, then per device:
    //   u32 driver_features[2], u32 status, u32 n_queues
    //   per queue: u16 size, u8 ready, _pad, u64 desc, u64 avail, u64 used,
    //              u16 last_avail_idx, u16 next_used_idx
    // then u32 n_listeners, per listener: u64 cid, u32 peer_port,
    //              u32 vm_port, u16 family, u16 socktype.
    let mut v = Vec::with_capacity(
        64 + virtio.mmio.len() * 256 + virtio.vsock_listeners.len() * 24,
    );
    v.extend_from_slice(&(virtio.mmio.len() as u32).to_le_bytes());
    for m in &virtio.mmio {
        v.extend_from_slice(&m.driver_features[0].to_le_bytes());
        v.extend_from_slice(&m.driver_features[1].to_le_bytes());
        v.extend_from_slice(&m.status.to_le_bytes());
        v.extend_from_slice(&m.interrupt_status.to_le_bytes());
        v.extend_from_slice(&(m.queues.len() as u32).to_le_bytes());
        for q in &m.queues {
            v.extend_from_slice(&q.size.to_le_bytes());
            v.push(if q.ready { 1 } else { 0 });
            v.push(0); // pad
            v.extend_from_slice(&q.desc_table.to_le_bytes());
            v.extend_from_slice(&q.avail_ring.to_le_bytes());
            v.extend_from_slice(&q.used_ring.to_le_bytes());
            v.extend_from_slice(&q.last_avail_idx.to_le_bytes());
            v.extend_from_slice(&q.next_used_idx.to_le_bytes());
        }
    }
    v.extend_from_slice(&(virtio.vsock_listeners.len() as u32).to_le_bytes());
    for l in &virtio.vsock_listeners {
        v.extend_from_slice(&l.cid.to_le_bytes());
        v.extend_from_slice(&l.peer_port.to_le_bytes());
        v.extend_from_slice(&l.vm_port.to_le_bytes());
        v.extend_from_slice(&l.family.to_le_bytes());
        v.extend_from_slice(&l.socktype.to_le_bytes());
    }
    f.write_all(&v)?;

    // Pad to RAM_PAGE_ALIGN, then write RAM. Stash the offset back
    // into the header so loaders (especially --cow-restore) know
    // where to mmap from.
    let cur = f.stream_position()? as u64;
    let pad = (RAM_PAGE_ALIGN - (cur % RAM_PAGE_ALIGN)) % RAM_PAGE_ALIGN;
    if pad > 0 {
        f.write_all(&vec![0u8; pad as usize])?;
    }
    let ram_offset = f.stream_position()? as u64;
    let stats = write_sparse_ram(&mut f, memory, ram_offset)?;
    f.seek(SeekFrom::Start(64))?;
    f.write_all(&ram_offset.to_le_bytes())?;
    drop(f);
    // Atomic rename: make `path` appear only after the full write
    // succeeded. Crash mid-write leaves `path.partial`.
    std::fs::rename(&partial, path)?;
    Ok(stats)
}

/// Number of writer threads for the sparse RAM scan/write.
///
/// Defaults to 4 — saturates APFS sequential write bandwidth on
/// M-series without thrashing the page cache. Set
/// `SUPERMACHINE_SNAPSHOT_WRITE_THREADS=N` to override; `1` selects
/// the legacy single-threaded path. A value of `0` is raised to 1,
/// and an unset or unparsable value falls back to the default.
fn snapshot_write_threads() -> usize {
    const DEFAULT_THREADS: usize = 4;

    std::env::var("SUPERMACHINE_SNAPSHOT_WRITE_THREADS")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .map_or(DEFAULT_THREADS, |requested| requested.max(1))
}

/// Write guest RAM into the snapshot file sparsely.
///
/// `memory` is scanned in `SPARSE_RAM_CHUNK`-sized pieces that are
/// grouped into "all-zero" and "non-zero" runs. Non-zero runs are
/// written at `ram_offset + run_start`; zero runs are never written
/// and stay as holes in the file. Returns a `SnapshotWriteStats`
/// with the data / zero byte totals.
///
/// The file is first sized to `ram_offset + memory.len()` so that
/// positional writes never have to extend it (APFS would serialize
/// on metadata otherwise), and the cursor is left at that end
/// position on return.
///
/// Dispatch is by `snapshot_write_threads()`:
///   * more than one thread — `memory` is cut into slabs aligned to
///     `SPARSE_RAM_CHUNK`, each walked by its own worker using
///     `write_all_at` on a clone of the file handle. Workers cover
///     distinct byte ranges, so there's no offset contention.
///   * one thread — the legacy / debug serial walk. Same on-disk
///     output.
fn write_sparse_ram(
    f: &mut std::fs::File,
    memory: &[u8],
    ram_offset: u64,
) -> Result<SnapshotWriteStats, FileError> {
    use std::io::{Seek, SeekFrom};

    let workers = snapshot_write_threads();
    let end_of_ram = ram_offset + memory.len() as u64;
    f.set_len(end_of_ram)?;

    let stats = match workers {
        0 | 1 => write_sparse_ram_serial(f, memory, ram_offset)?,
        _ => write_sparse_ram_parallel(f, memory, ram_offset, workers)?,
    };

    // Positional writes don't move the cursor; park it after RAM.
    f.seek(SeekFrom::Start(end_of_ram))?;
    Ok(stats)
}

fn write_sparse_ram_serial(
    f: &std::fs::File,
    memory: &[u8],
    ram_offset: u64,
) -> Result<SnapshotWriteStats, FileError> {
    use std::os::unix::fs::FileExt;
    let mut pos = 0usize;
    let mut data_bytes = 0u64;
    let mut zero_bytes = 0u64;

    while pos < memory.len() {
        let run_is_zero =
            is_zero_chunk(&memory[pos..(pos + (memory.len() - pos).min(SPARSE_RAM_CHUNK))]);
        let run_start = pos;
        pos += (memory.len() - pos).min(SPARSE_RAM_CHUNK);
        while pos < memory.len() {
            let next_len = (memory.len() - pos).min(SPARSE_RAM_CHUNK);
            let next_is_zero = is_zero_chunk(&memory[pos..pos + next_len]);
            if next_is_zero != run_is_zero {
                break;
            }
            pos += next_len;
        }

        let run_len = pos - run_start;
        if run_is_zero {
            zero_bytes += run_len as u64;
        } else {
            f.write_all_at(&memory[run_start..pos], ram_offset + run_start as u64)?;
            data_bytes += run_len as u64;
        }
    }
    Ok(SnapshotWriteStats {
        ram_bytes: memory.len() as u64,
        ram_data_bytes: data_bytes,
        ram_zero_bytes: zero_bytes,
    })
}

/// Multi-threaded sparse RAM write.
///
/// `memory` is cut into slabs of equal size (the last one may be
/// shorter), one scoped worker thread per slab. Each worker runs
/// `write_sparse_ram_slab` on its slab with a clone of the file
/// handle and a file position of `ram_offset + slab_start`.
///
/// The slab size is rounded up to a multiple of `SPARSE_RAM_CHUNK`
/// so every worker scans on the same chunk boundaries the serial
/// path uses.
///
/// `n_threads == 0` is treated as 1. Returns the summed data / zero
/// byte counts; the first worker I/O error (or a worker panic,
/// reported as an `io::Error`) is returned as `FileError`.
fn write_sparse_ram_parallel(
    f: &std::fs::File,
    memory: &[u8],
    ram_offset: u64,
    n_threads: usize,
) -> Result<SnapshotWriteStats, FileError> {
    let total = memory.len();
    let workers = n_threads.max(1);
    // Per-thread share, rounded up to whole SPARSE_RAM_CHUNKs.
    let slab_len = total.div_ceil(workers).div_ceil(SPARSE_RAM_CHUNK) * SPARSE_RAM_CHUNK;

    let (data_bytes, zero_bytes) =
        std::thread::scope(|s| -> Result<(u64, u64), std::io::Error> {
            let mut handles = Vec::with_capacity(workers);
            let mut slab_start = 0usize;
            // `total > 0` implies `slab_len >= SPARSE_RAM_CHUNK`, so
            // the loop always advances; empty input spawns nothing.
            while slab_start < total {
                let slab_end = (slab_start + slab_len).min(total);
                // Scoped threads may borrow from the caller's stack,
                // so the sub-slice is handed over directly — no raw
                // pointer round-trip needed.
                let slab = &memory[slab_start..slab_end];
                let file_pos = ram_offset + slab_start as u64;
                let f_clone = f.try_clone()?;
                handles.push(s.spawn(move || write_sparse_ram_slab(&f_clone, slab, file_pos)));
                slab_start = slab_end;
            }

            let mut data_bytes = 0u64;
            let mut zero_bytes = 0u64;
            for h in handles {
                let (d, z) = h
                    .join()
                    .map_err(|_| std::io::Error::other("snapshot write thread panicked"))??;
                data_bytes += d;
                zero_bytes += z;
            }
            Ok((data_bytes, zero_bytes))
        })?;

    Ok(SnapshotWriteStats {
        ram_bytes: total as u64,
        ram_data_bytes: data_bytes,
        ram_zero_bytes: zero_bytes,
    })
}

/// Sparse-write one contiguous slab of guest RAM.
///
/// `memory` is consumed in `SPARSE_RAM_CHUNK`-sized chunks (the last
/// may be shorter). Neighbouring chunks of the same kind — all-zero
/// or not — are merged into a run. Each non-zero run is written with
/// a single positional write at `base_offset + run_start`; zero runs
/// are only counted.
///
/// Returns `(data_bytes, zero_bytes)`.
fn write_sparse_ram_slab(
    f: &std::fs::File,
    memory: &[u8],
    base_offset: u64,
) -> std::io::Result<(u64, u64)> {
    use std::os::unix::fs::FileExt;

    let mut written = 0u64;
    let mut skipped = 0u64;
    let mut run_start = 0usize;
    let mut chunks = memory.chunks(SPARSE_RAM_CHUNK).peekable();

    while let Some(head) = chunks.next() {
        // The first chunk fixes the kind of the run; keep absorbing
        // chunks for as long as they are of that same kind.
        let zero_run = is_zero_chunk(head);
        let mut run_end = run_start + head.len();
        while let Some(more) = chunks.next_if(|c| is_zero_chunk(c) == zero_run) {
            run_end += more.len();
        }

        let run_len = (run_end - run_start) as u64;
        if zero_run {
            skipped += run_len;
        } else {
            f.write_all_at(&memory[run_start..run_end], base_offset + run_start as u64)?;
            written += run_len;
        }
        run_start = run_end;
    }
    Ok((written, skipped))
}

/// `true` when `chunk` contains no non-zero byte (an empty chunk
/// therefore counts as zero).
fn is_zero_chunk(chunk: &[u8]) -> bool {
    !chunk.iter().any(|&byte| byte != 0)
}

/// Load a complete snapshot — metadata and guest RAM — from `path`.
pub fn load_from_file(path: &str) -> Result<Snapshot, FileError> {
    let (snap, _ram_offset, _memory_bytes) = load_from_file_inner(path, false)?;
    Ok(snap)
}

/// Metadata-only variant of `load_from_file`: the RAM bytes are not
/// read and the returned snapshot's `memory` is an empty `Vec`.
///
/// Returns `(Snapshot, ram_offset, memory_bytes)` so that callers
/// can mmap the RAM region directly for CoW restore.
pub fn load_meta(path: &str) -> Result<(Snapshot, u64, usize), FileError> {
    let skip_ram = true;
    load_from_file_inner(path, skip_ram)
}

/// Parse a snapshot file.
///
/// Reads, in order: the 72-byte header, the GIC blob, one record
/// per vCPU, and the virtio section. When `skip_ram` is false the
/// RAM region (`memory_bytes` bytes at `ram_offset`) is read too;
/// otherwise `memory` is left as an empty `Vec`.
///
/// Returns `(snapshot, ram_offset, memory_bytes)`.
///
/// The file is treated as untrusted input: lengths taken from it
/// are checked against the real file size before anything is
/// allocated from them, and element counts only bound a capped
/// preallocation. A corrupt count therefore ends in
/// `FileError::Truncated` rather than an allocation failure.
///
/// # Errors
///
/// * `FileError::Io` — the file cannot be opened or stat'ed.
/// * `FileError::Truncated` — the file ends before the data that
///   the header and counts promise.
/// * `FileError::BadMagic` / `FileError::BadVersion` — the header
///   does not identify a snapshot of this format version.
/// * `FileError::Malformed` — the header declares zero vCPUs.
fn load_from_file_inner(path: &str, skip_ram: bool) -> Result<(Snapshot, u64, usize), FileError> {
    use std::io::{BufReader, Seek, SeekFrom};

    /// Upper bound on elements preallocated from an on-disk count.
    /// Larger lists still load; their `Vec` grows as entries are
    /// actually read.
    const PREALLOC_CAP: usize = 4096;

    /// Read `n` 12-byte `(u32, u64)` entries — the shared encoding
    /// of the gp, sys, icc and redistributor register lists.
    fn read_reg64_list<R: Read>(r: &mut R, n: usize) -> Result<Vec<(u32, u64)>, FileError> {
        let mut regs = Vec::with_capacity(n.min(PREALLOC_CAP));
        for _ in 0..n {
            let mut e = [0u8; 12];
            r.read_exact(&mut e).map_err(|_| FileError::Truncated)?;
            regs.push((le_u32(&e[0..4])?, le_u64(&e[4..12])?));
        }
        Ok(regs)
    }

    let file = std::fs::File::open(path)?;
    let file_len = file.metadata()?.len();
    // The metadata section is many tiny records; buffer the reads so
    // each one is not its own syscall.
    let mut r = BufReader::new(file);

    // --- Header -------------------------------------------------
    let mut hdr = [0u8; 72];
    r.read_exact(&mut hdr).map_err(|_| FileError::Truncated)?;
    if hdr[0..8] != SNAPSHOT_MAGIC {
        return Err(FileError::BadMagic);
    }
    let version = le_u64(&hdr[8..16])?;
    if version != SNAPSHOT_VERSION {
        return Err(FileError::BadVersion(version));
    }
    let captured_mach_time = le_u64(&hdr[16..24])?;
    let captured_cntvct = le_u64(&hdr[24..32])?;
    let memory_bytes = le_u64(&hdr[32..40])? as usize;
    let gic_blob_len = le_u64(&hdr[40..48])? as usize;
    let n_vcpus = le_u32(&hdr[48..52])? as usize;
    if n_vcpus == 0 {
        return Err(FileError::Malformed("snapshot contains no vCPU state"));
    }
    let ram_gpa = le_u64(&hdr[56..64])?;
    let ram_offset = le_u64(&hdr[64..72])?;

    // --- GIC blob -----------------------------------------------
    // The blob cannot be longer than the file that contains it;
    // reject before allocating a buffer of that size.
    if gic_blob_len as u64 > file_len {
        return Err(FileError::Truncated);
    }
    let mut gic_blob = vec![0u8; gic_blob_len];
    r.read_exact(&mut gic_blob)
        .map_err(|_| FileError::Truncated)?;

    // --- Per-vCPU records ---------------------------------------
    let mut per_vcpu = Vec::with_capacity(n_vcpus.min(PREALLOC_CAP));
    for _ in 0..n_vcpus {
        // Fixed part: vtimer_offset (u64) + five u32 list lengths.
        let mut hh = [0u8; 28];
        r.read_exact(&mut hh).map_err(|_| FileError::Truncated)?;
        let vtimer_offset = le_u64(&hh[0..8])?;
        let gp_n = le_u32(&hh[8..12])? as usize;
        let simd_n = le_u32(&hh[12..16])? as usize;
        let sys_n = le_u32(&hh[16..20])? as usize;
        let icc_n = le_u32(&hh[20..24])? as usize;
        let redist_n = le_u32(&hh[24..28])? as usize;

        let gp_regs = read_reg64_list(&mut r, gp_n)?;
        // SIMD entries are 20 bytes: u32 id + u128 value.
        let mut simd_regs = Vec::with_capacity(simd_n.min(PREALLOC_CAP));
        for _ in 0..simd_n {
            let mut e = [0u8; 20];
            r.read_exact(&mut e).map_err(|_| FileError::Truncated)?;
            simd_regs.push((le_u32(&e[0..4])?, le_u128(&e[4..20])?));
        }
        let sys_regs = read_reg64_list(&mut r, sys_n)?;
        let icc_regs = read_reg64_list(&mut r, icc_n)?;
        let redist_regs = read_reg64_list(&mut r, redist_n)?;

        per_vcpu.push(PerVcpuState {
            gp_regs,
            simd_regs,
            sys_regs,
            icc_regs,
            redist_regs,
            vtimer_offset,
        });
    }

    // --- Virtio section -----------------------------------------
    let mut buf4 = [0u8; 4];
    r.read_exact(&mut buf4).map_err(|_| FileError::Truncated)?;
    let n_mmio = u32::from_le_bytes(buf4) as usize;
    let mut mmio = Vec::with_capacity(n_mmio.min(PREALLOC_CAP));
    for _ in 0..n_mmio {
        // Device header: driver_features[2], status,
        // interrupt_status, n_queues — five u32s.
        let mut h = [0u8; 20];
        r.read_exact(&mut h).map_err(|_| FileError::Truncated)?;
        let driver_features = [le_u32(&h[0..4])?, le_u32(&h[4..8])?];
        let status = le_u32(&h[8..12])?;
        let interrupt_status = le_u32(&h[12..16])?;
        let n_q = le_u32(&h[16..20])? as usize;
        let mut queues = Vec::with_capacity(n_q.min(PREALLOC_CAP));
        for _ in 0..n_q {
            // 32-byte queue record; byte 3 is padding.
            let mut q = [0u8; 32];
            r.read_exact(&mut q).map_err(|_| FileError::Truncated)?;
            queues.push(QueueSnapshot {
                size: le_u16(&q[0..2])?,
                ready: q[2] != 0,
                desc_table: le_u64(&q[4..12])?,
                avail_ring: le_u64(&q[12..20])?,
                used_ring: le_u64(&q[20..28])?,
                last_avail_idx: le_u16(&q[28..30])?,
                next_used_idx: le_u16(&q[30..32])?,
            });
        }
        mmio.push(MmioSnapshot {
            driver_features,
            status,
            interrupt_status,
            queues,
        });
    }
    r.read_exact(&mut buf4).map_err(|_| FileError::Truncated)?;
    let n_lis = u32::from_le_bytes(buf4) as usize;
    let mut vsock_listeners = Vec::with_capacity(n_lis.min(PREALLOC_CAP));
    for _ in 0..n_lis {
        let mut e = [0u8; 20];
        r.read_exact(&mut e).map_err(|_| FileError::Truncated)?;
        vsock_listeners.push(TsiListenerSnapshot {
            cid: le_u64(&e[0..8])?,
            peer_port: le_u32(&e[8..12])?,
            vm_port: le_u32(&e[12..16])?,
            family: le_u16(&e[16..18])?,
            socktype: le_u16(&e[18..20])?,
        });
    }

    // --- RAM ----------------------------------------------------
    let memory = if skip_ram {
        Vec::new()
    } else {
        // Make sure the RAM region really exists before allocating
        // `memory_bytes` for it.
        let ram_end = ram_offset
            .checked_add(memory_bytes as u64)
            .ok_or(FileError::Truncated)?;
        if ram_end > file_len {
            return Err(FileError::Truncated);
        }
        // One large read: go back to the raw file. Whatever the
        // buffer still holds is irrelevant after the absolute seek.
        let mut f = r.into_inner();
        f.seek(SeekFrom::Start(ram_offset))
            .map_err(|_| FileError::Truncated)?;
        let mut m = vec![0u8; memory_bytes];
        f.read_exact(&mut m).map_err(|_| FileError::Truncated)?;
        m
    };

    Ok((
        Snapshot {
            captured_mach_time,
            captured_cntvct,
            ram_gpa,
            memory,
            gic_blob,
            per_vcpu,
            virtio: VirtioSnapshot {
                mmio,
                vsock_listeners,
            },
        },
        ram_offset,
        memory_bytes,
    ))
}

/// Map the RAM region of the snapshot at `path` as a private,
/// writable mapping.
///
/// The file's metadata is read through `load_meta` to learn where the
/// RAM region starts and how long it is; the mapping itself is then
/// delegated to [`mmap_ram_cow_at`]. The returned host pointer and
/// length are suitable for handing to `MicroVm::new_with_ram` and
/// `Vm::map`. Because the mapping is `MAP_PRIVATE`, guest writes land
/// in anonymous copy-on-write pages — the snapshot file on disk is
/// never written.
///
/// # Errors
///
/// A `load_meta` failure is surfaced as an `io::Error` whose message
/// is `load_meta: ` followed by the debug form of that failure.
/// Errors from [`mmap_ram_cow_at`] are passed through unchanged.
pub fn mmap_ram_cow(path: &str) -> std::io::Result<(*mut u8, usize)> {
    match load_meta(path) {
        // Only the RAM location is needed; the parsed metadata itself
        // is not used here.
        Ok((_, ram_offset, memory_bytes)) => mmap_ram_cow_at(path, ram_offset, memory_bytes),
        Err(err) => Err(std::io::Error::other(format!("load_meta: {err:?}"))),
    }
}

/// Map `memory_bytes` bytes of the file at `path`, starting at file
/// offset `ram_offset`, as a private read/write mapping.
///
/// Returns the host pointer to the start of the mapping together with
/// its length (`memory_bytes`, echoed back). The mapping is
/// `MAP_PRIVATE`, so stores through the pointer are not written back
/// to the file. This function never unmaps the region: the mapping
/// remains valid after the file handle opened here is closed, and
/// releasing it is left to the caller.
///
/// # Errors
///
/// Returns the underlying error if the file cannot be opened,
/// `ErrorKind::InvalidInput` if `ram_offset` does not fit in
/// `libc::off_t`, and the OS error if `mmap` itself fails.
pub fn mmap_ram_cow_at(
    path: &str,
    ram_offset: u64,
    memory_bytes: usize,
) -> std::io::Result<(*mut u8, usize)> {
    let f = std::fs::File::open(path)?;

    // `mmap` takes a signed `off_t`. Convert with a range check rather
    // than an `as` cast, which would silently wrap an oversized offset
    // (e.g. from a corrupt header) into a negative one.
    let offset = <libc::off_t as std::convert::TryFrom<u64>>::try_from(ram_offset)
        .map_err(|_| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!("snapshot RAM offset {ram_offset:#x} does not fit in off_t"),
            )
        })?;

    let fd = std::os::fd::AsRawFd::as_raw_fd(&f);
    // SAFETY: the hint address is null and `MAP_FIXED` is not set, so
    // the kernel picks the placement and no existing mapping is
    // replaced. `fd` belongs to `f`, which stays open for the duration
    // of the call, and the result is checked against `MAP_FAILED`
    // before it is used.
    let ptr = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            memory_bytes,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE,
            fd,
            offset,
        )
    };
    if ptr == libc::MAP_FAILED {
        return Err(std::io::Error::last_os_error());
    }
    // NOTE: an earlier version called
    // `madvise(ptr, memory_bytes, MADV_WILLNEED)` here, claiming a
    // 1–3 ms cold-cache saving. Empirically, on macOS that call is
    // SYNCHRONOUS — `MADV_WILLNEED` blocks until the kernel has
    // populated the page cache for the entire range, even sparse
    // holes. For a 2 GiB snapshot file this turned a ~1 ms mmap call
    // into a 600 ms walk on a cold cache. The earlier "saves ~1–3 ms"
    // claim was either measured wrong or measured against a tiny
    // snapshot. Removed; pages now fault in lazily on the guest's
    // first access, and pages that are never accessed never pay the
    // I/O cost.
    Ok((ptr.cast::<u8>(), memory_bytes))
}

#[cfg(test)]
mod tests {
    //! Negative-path tests for `load_from_file`: each test writes a
    //! small, deliberately broken file and checks which `FileError`
    //! comes back.

    use super::*;
    use std::path::{Path, PathBuf};

    /// Build a scratch file path in the system temp directory. The
    /// name embeds `name`, the process id and the current thread id so
    /// that tests running in parallel do not share a file.
    fn temp_snapshot_path(name: &str) -> PathBuf {
        std::env::temp_dir().join(format!(
            "snapshot-{name}-{}-{:?}.snap",
            std::process::id(),
            std::thread::current().id()
        ))
    }

    /// Create (or truncate) the file at `path` and fill it with `bytes`.
    fn write_bytes(path: &Path, bytes: &[u8]) -> std::io::Result<()> {
        std::fs::write(path, bytes)
    }

    /// Build a 72-byte header carrying the current magic and version,
    /// `n_vcpus` at bytes 48..52 and the value 72 at bytes 64..72.
    /// Every other byte is zero.
    fn minimal_header(n_vcpus: u32) -> [u8; 72] {
        let mut hdr = [0u8; 72];
        hdr[0..8].copy_from_slice(&SNAPSHOT_MAGIC);
        hdr[8..16].copy_from_slice(&SNAPSHOT_VERSION.to_le_bytes());
        // Explicit zeros for the two length fields (memory_bytes and
        // gic_blob_len in the wire-format comment at the top of the
        // file), so nothing is expected to follow the header.
        hdr[32..40].copy_from_slice(&0u64.to_le_bytes());
        hdr[40..48].copy_from_slice(&0u64.to_le_bytes());
        hdr[48..52].copy_from_slice(&n_vcpus.to_le_bytes());
        // NOTE(review): presumably the RAM offset, set to the header
        // length — confirm against the loader.
        hdr[64..72].copy_from_slice(&72u64.to_le_bytes());
        hdr
    }

    /// A file shorter than the header must be reported as truncated.
    #[test]
    fn load_rejects_truncated_snapshot() -> std::io::Result<()> {
        let path = temp_snapshot_path("truncated");
        write_bytes(&path, b"SMS")?;

        let result = load_from_file(path.to_str().unwrap_or_default());
        // Clean up before asserting so a failing test leaves no file.
        let _ = std::fs::remove_file(path);

        assert!(matches!(result, Err(FileError::Truncated)));
        Ok(())
    }

    /// A full-size header of zeros must be rejected for its magic.
    #[test]
    fn load_rejects_bad_magic() -> std::io::Result<()> {
        let path = temp_snapshot_path("bad-magic");
        write_bytes(&path, &[0u8; 72])?;

        let result = load_from_file(path.to_str().unwrap_or_default());
        // Clean up before asserting so a failing test leaves no file.
        let _ = std::fs::remove_file(path);

        assert!(matches!(result, Err(FileError::BadMagic)));
        Ok(())
    }

    /// A header declaring zero vCPUs must be rejected as malformed.
    #[test]
    fn load_rejects_missing_vcpu_state() -> std::io::Result<()> {
        let path = temp_snapshot_path("no-vcpu");
        write_bytes(&path, &minimal_header(0))?;

        let result = load_from_file(path.to_str().unwrap_or_default());
        // Clean up before asserting so a failing test leaves no file.
        let _ = std::fs::remove_file(path);

        assert!(matches!(
            result,
            Err(FileError::Malformed("snapshot contains no vCPU state"))
        ));
        Ok(())
    }
}