supermachine 0.7.80

//! VM construction helpers.
//!
//! This module is the reusable builder boundary behind the VMM runner. It
//! owns virtio MMIO planning and device registration, and it also drives
//! guest memory setup and boot image loading (see `build_vm`) as well as
//! whole-VM assembly and snapshot restore (`build_vmm`, `Vmm`).
//!
//! VM-execution module — gated to a platform with a hypervisor backend
//! (macOS/HVF today; broaden to include KVM when the Linux backend lands).
#![cfg(all(target_os = "macos", target_arch = "aarch64"))]

use std::fmt;
use std::sync::Arc;

use crate::arch::aarch64::fdt;
use crate::arch::aarch64::fdt::VirtioMmioEntry;
use crate::arch::aarch64::layout;
use crate::devices::mmio_bus::MmioBus;
use crate::devices::serial::{SerialPl011, SerialState};
use crate::devices::virtio::balloon::{VirtioBalloon, VirtioBalloonWithRam};
use crate::devices::virtio::blk::VirtioBlk;
use crate::devices::virtio::mmio::MmioVirtio;
use crate::devices::virtio::queue::GuestMem;
use crate::devices::virtio::rng::VirtioRng;
use crate::devices::virtio::vsock::device::Vsock as VirtioVsock;
use crate::hypervisor::HypervisorVm;
use crate::kernel::loader;
use crate::vmm::coord::VcpuCoordinator;
use crate::vmm::resources::VmResources;
use crate::vmm::snapshot;
use crate::vmm::vstate::{boot_linux, MicroVm};

/// MMIO base address and interrupt assignment for every virtio-mmio device
/// of one VM, as computed by `virtio_mmio_plan_with_fs`.
#[derive(Clone)]
pub struct VirtioMmioPlan {
    /// Every planned device in slot order: vsock (slot 0), the block
    /// devices, rng, balloon, then one entry per virtio-fs mount. This is
    /// the list `build_vm` passes to `fdt::generate`.
    pub entries: Vec<VirtioMmioEntry>,
    /// MMIO base of the virtio-rng device (first slot after the block
    /// devices).
    pub rng_base: u64,
    /// Interrupt number assigned to the virtio-rng device.
    pub rng_irq: u32,
    /// MMIO base of the virtio-balloon device (the slot right after rng).
    pub balloon_base: u64,
    /// Interrupt number assigned to the virtio-balloon device.
    pub balloon_irq: u32,
    /// One (base, irq) pair per virtio-fs mount. Empty when no
    /// mounts are configured.
    pub fs_entries: Vec<(u64, u32)>,
}

/// Everything `build_device_set` creates for one VM: the MMIO bus plus the
/// device handles the caller has to keep around. `build_vmm` moves these
/// fields into the matching fields of `Vmm`.
pub struct DeviceSet {
    /// MMIO bus holding the PL011 serial device and every virtio-mmio
    /// transport registered by `build_device_set`.
    pub bus: MmioBus,
    /// The virtio-mmio transports in the order they were created, starting
    /// with vsock, rng and balloon, followed by the block devices and
    /// volumes. Snapshot restore applies device records to this list by
    /// index, so the order is significant.
    pub all_mmio: Vec<Arc<MmioVirtio>>,
    /// The virtio-vsock device (created with guest CID 3).
    pub vsock: Arc<VirtioVsock>,
    /// The virtio-balloon device; the same `Arc` is wrapped by the
    /// `VirtioBalloonWithRam` that is registered on the bus.
    pub balloon: Arc<VirtioBalloon>,
    /// Per-mount PosixFs handles, in the same order as
    /// `resources.mounts`. Kept Arc-cloned so the snapshot pipeline
    /// can call `snapshot_state` on each at capture time and
    /// `restore_state` at hydrate time — the FUSE backend's
    /// `(nodeid → host_path)` table doesn't survive snapshot/restore
    /// without explicit serialisation, and an empty post-restore
    /// table is what surfaces the "MODULE_NOT_FOUND on paths not
    /// walked during warmup" symptom.
    pub posix_fs: Vec<Arc<crate::fuse::PosixFs>>,
    /// Per-mount DaxSession handles, in the same order as
    /// `resources.mounts` (and `posix_fs`). Kept Arc-cloned so the
    /// snapshot pipeline can `snapshot_state` / `restore_state`
    /// each session's slot table, AND so the vCPU worker can route
    /// stage-2 faults into `handle_stage2_fault` to lazily re-bind
    /// restored slots on first guest access. Without this, a
    /// cycle-restored VM's first touch of a DAX-mapped page that
    /// wasn't paged-in pre-snapshot SIGBUSes (the host-side mmap +
    /// HVF stage-2 mapping are lost across restore).
    pub dax_sessions: Vec<Arc<crate::fuse::DaxSession>>,
    /// Per-mount VirtioFs device handles, same order as `posix_fs`.
    /// Held so the runner can call `reset_for_restore` on each between
    /// pool cycle-restores (the `notif_pool` of guest-offered
    /// notification chains isn't in the snapshot and must be cleared).
    pub fs_devices: Vec<Arc<crate::devices::virtio::fs::VirtioFs>>,
    /// Per-VM PL011 host-side state. Shared (via `Arc`) with the
    /// `SerialPl011` device registered on `bus`. The runner / pool /
    /// bake driver read the boot-line markers and push RX bytes through
    /// this handle — see `Vmm::serial`.
    pub serial: Arc<SerialState>,
}

/// A fully assembled VM as returned by `build_vmm`: the `MicroVm` plus the
/// handles to its devices. Dropping a `Vmm` first stops the virtio-fs I/O
/// workers and the vsock muxer thread (see `impl Drop for Vmm`).
pub struct Vmm {
    /// The underlying `MicroVm`. Declared first, so it is the first field
    /// dropped once `Drop::drop` has returned; `Drop for Vmm` quiesces the
    /// device threads before that happens.
    pub vm: MicroVm,
    /// MMIO bus with all registered devices (wrapped in `Arc` by
    /// `build_vmm`).
    pub bus: Arc<MmioBus>,
    /// virtio-mmio transports. Snapshot restore applies the snapshot's
    /// device records to this list by index.
    pub all_mmio: Vec<Arc<MmioVirtio>>,
    /// virtio-vsock device; snapshot restore and `reset_vsock_transport`
    /// reach its muxer through this handle.
    pub vsock: Arc<VirtioVsock>,
    /// vCPU coordinator. Snapshot restore publishes the applied vtimer
    /// offset into `coord.vtimer_offset`.
    pub coord: Arc<VcpuCoordinator>,
    /// Direct handle to the balloon device for the runner to
    /// drive `request_inflate` (see `--balloon-target-pages`).
    pub balloon: Arc<VirtioBalloon>,
    /// Per-mount PosixFs handles. See `DeviceSet::posix_fs`.
    pub posix_fs: Vec<Arc<crate::fuse::PosixFs>>,
    /// Per-mount DaxSession handles. See `DeviceSet::dax_sessions`.
    pub dax_sessions: Vec<Arc<crate::fuse::DaxSession>>,
    /// Per-mount VirtioFs device handles. See `DeviceSet::fs_devices`.
    /// The runner calls `reset_for_restore` on each between cycles.
    pub fs_devices: Vec<Arc<crate::devices::virtio::fs::VirtioFs>>,
    /// Per-VM PL011 host-side state. The bake driver reads its
    /// boot-line markers (`pre_exec_ready`, `workload_parked`,
    /// `heartbeat_count`, `smpark_state_gpa`, `kcache_ready`,
    /// `pre_exec_sync_ready`, …) and the restore/resume path pushes RX
    /// bytes through `push_rx_byte`. Shared (`Arc`) with the
    /// `SerialPl011` MMIO device, so the device's boot-line scanner and
    /// these readers observe the same per-VM state. See
    /// `devices::serial` module header for the rationale.
    pub serial: Arc<SerialState>,
}

/// Per-phase durations of one snapshot restore, in microseconds, as returned
/// by `Vmm::restore_snapshot_timed` and
/// `Vmm::restore_snapshot_timed_with_options`.
///
/// The first four fields are copied verbatim from the result of
/// `snapshot::restore_snapshot_timed_with_options`; the last two are measured
/// by `Vmm` itself.
#[derive(Default, Clone, Copy, Debug, PartialEq, Eq)]
pub struct VmmRestoreTimings {
    /// Core restore's `ram_copy_us` (presumably the guest RAM copy).
    pub ram_copy_us: u128,
    /// Core restore's `gic_restore_us` (presumably interrupt-controller state).
    pub gic_restore_us: u128,
    /// Core restore's `vcpu_restore_us` (presumably vCPU register state).
    pub vcpu_restore_us: u128,
    /// Core restore's `vtimer_offset_us` (presumably applying the vtimer offset).
    pub vtimer_offset_us: u128,
    /// Time spent applying the snapshot's device records to `Vmm::all_mmio`.
    pub mmio_restore_us: u128,
    /// Time spent in the vsock muxer's `restore_tsi_listeners`.
    pub listener_restore_us: u128,
}

/// Result of `build_vm`: the created `MicroVm` together with the virtio MMIO
/// plan computed for it.
pub struct BuiltVm {
    /// The created VM. On a cold boot, `boot_linux` has already been run on
    /// it with the kernel, optional initramfs and generated FDT; on a
    /// restore that step is skipped.
    pub vm: MicroVm,
    /// Slot/IRQ plan whose `entries` were used for the FDT and that
    /// `build_device_set` consumes when registering devices.
    pub virtio_plan: VirtioMmioPlan,
}

/// Failures that can occur while building a VM or its device set.
#[derive(Debug)]
pub enum BuildError {
    /// A cold boot was requested but `VmResources::kernel_path` is unset.
    MissingKernel,
    /// `loader::read_image` failed for the kernel image at `path`.
    KernelImage {
        path: String,
        source: std::io::Error,
    },
    /// `loader::read_initramfs` failed for the initramfs at `path`.
    Initramfs {
        path: String,
        source: std::io::Error,
    },
    /// `fdt::generate` failed.
    Fdt(std::io::Error),
    /// A hypervisor operation failed. Produced through the
    /// `From<ActiveError>` impl, so `?` converts automatically.
    Hvf(crate::hypervisor::ActiveError),
    /// Opening the backing file of a read-only block device or a writable
    /// volume at `path` failed.
    BlockDevice {
        path: String,
        source: std::io::Error,
    },
    /// Setting up the virtio-fs backend for the mount at `host_path` failed.
    Mount {
        host_path: String,
        source: std::io::Error,
    },
    /// Starting the vsock muxer failed. Produced through the
    /// `From<StartError>` impl, so `?` converts automatically.
    VsockMuxer(crate::devices::virtio::vsock::muxer_thread::StartError),
}

impl fmt::Display for BuildError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            BuildError::MissingKernel => write!(f, "kernel path is required for cold boot"),
            BuildError::KernelImage { path, source } => {
                write!(f, "read kernel image {path}: {source}")
            }
            BuildError::Initramfs { path, source } => {
                write!(f, "read initramfs {path}: {source}")
            }
            BuildError::Fdt(e) => write!(f, "generate FDT: {e}"),
            BuildError::Hvf(e) => write!(f, "HVF operation failed: {e:?}"),
            BuildError::BlockDevice { path, source } => {
                write!(f, "open block device {path}: {source}")
            }
            BuildError::Mount { host_path, source } => {
                write!(f, "open virtio-fs mount {host_path}: {source}")
            }
            BuildError::VsockMuxer(e) => write!(f, "{e}"),
        }
    }
}

// Relies entirely on the trait's default methods: `source()` is not
// overridden and therefore returns `None`. The underlying error of each
// variant is rendered inline by the `Display` impl instead.
impl std::error::Error for BuildError {}

impl From<crate::hypervisor::ActiveError> for BuildError {
    fn from(value: crate::hypervisor::ActiveError) -> Self {
        Self::Hvf(value)
    }
}

impl From<crate::devices::virtio::vsock::muxer_thread::StartError> for BuildError {
    fn from(value: crate::devices::virtio::vsock::muxer_thread::StartError) -> Self {
        Self::VsockMuxer(value)
    }
}

impl Drop for Vmm {
    /// Quiesce every non-vCPU thread that writes guest RAM before the
    /// fields (and with them the guest RAM mapping) are torn down.
    fn drop(&mut self) {
        // virtio-fs first: each I/O worker writes FUSE reply bytes and
        // used-ring updates through the `GuestMem` pointer captured by its
        // device, so it has to be stopped and joined BEFORE any field frees
        // guest RAM — notably `vm: MicroVm`, which is declared first and is
        // therefore the first field dropped. Shutting down also sets the
        // device's stop gate, closing the PosixFs kqueue-watcher's
        // `push_notification` path (the same non-vCPU guest-RAM writer
        // hazard, pre-existing). `VirtioFs::drop` cannot be relied on for
        // this: the Arc is held in several places (fs_devices + each
        // MmioVirtio), so when it runs relative to `vm` is not
        // deterministic.
        self.fs_devices.iter().for_each(|fs_dev| {
            fs_dev.shutdown_io();
        });
        // Then the vsock muxer's I/O thread, for the same reason: it drains
        // inbound packets into the guest's RX descriptors
        // (kick → try_drain_rx) via the device's captured `GuestMem` pointer,
        // which the `HOST_RAM_PTR` null-guard does NOT cover. Left running,
        // it would write into freed memory after `MicroVm::drop`'s munmap —
        // the same use-after-free class that was fixed on the KVM backend's
        // `LinuxVm::drop`. Pool reuse keeps the muxer alive through
        // `reset_vsock_transport` (no drop), so this only happens at final
        // VM teardown.
        self.vsock.shutdown();
    }
}

impl Vmm {
    /// Restore `snap` into this VM and discard the per-phase timings.
    ///
    /// Thin wrapper around `restore_snapshot_timed`.
    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
    pub fn restore_snapshot(
        &self,
        snap: &snapshot::Snapshot,
    ) -> crate::hypervisor::ActiveResult<()> {
        let _timings = self.restore_snapshot_timed(snap)?;
        Ok(())
    }

    /// Restore `snap` with `SnapshotRestoreOptions::default()` and report
    /// how long each phase took.
    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
    pub fn restore_snapshot_timed(
        &self,
        snap: &snapshot::Snapshot,
    ) -> crate::hypervisor::ActiveResult<VmmRestoreTimings> {
        let default_options = snapshot::SnapshotRestoreOptions::default();
        self.restore_snapshot_timed_with_options(snap, default_options)
    }

    /// Restore `snap` into this VM using `options`.
    ///
    /// Steps, in order: park the virtio-fs I/O workers, run the core
    /// snapshot restore on `self.vm`, publish the applied vtimer offset to
    /// the vCPU coordinator, apply the snapshot's device records to
    /// `all_mmio` by index, and restore the vsock TSI listeners. Returns the
    /// timings of the individual phases in microseconds; an error from the
    /// core restore is propagated unchanged.
    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
    pub fn restore_snapshot_timed_with_options(
        &self,
        snap: &snapshot::Snapshot,
        options: snapshot::SnapshotRestoreOptions,
    ) -> crate::hypervisor::ActiveResult<VmmRestoreTimings> {
        use std::sync::atomic::Ordering;

        // Keep the virtio-fs I/O workers parked for the whole restore. They
        // write FUSE replies into guest RAM from non-vCPU threads, so
        // "vCPUs parked" alone no longer quiesces guest RAM while RAM and
        // device state are rewritten. The mmio restore further down
        // re-activates each fs device and thereby leaves a kick pending;
        // that kick is serviced when this guard drops, i.e. requests that
        // were captured pending in the snapshot's avail ring get answered
        // only after the restored RAM is coherent.
        let _fs_io_parked = crate::devices::virtio::fs::FsIoPauseGuard::new(&self.fs_devices);
        let restored = snapshot::restore_snapshot_timed_with_options(&self.vm, snap, options)?;

        // The boot vCPU now has its CNTVOFF_EL2 freshly applied; hand the
        // same value to the coordinator so that secondaries (currently
        // spawning, or hitting cycle-restore later) can align their own
        // CNTVOFF. If this is skipped, only vcpu0 sees a CNTVCT consistent
        // with the snapshot's kernel cycle_last while secondaries see the
        // host's raw CNTPCT, and Linux thread migration between cores then
        // breaks CLOCK_MONOTONIC monotonicity in userspace (libuv asserts
        // in Node ≥24; older builds silently wrap).
        //
        // There is a second failure mode: in-kernel per-CPU virtual-timer
        // ticks programmed in CNTV_CVAL_EL0 use CNTVCT_EL0 as their compare
        // basis. With a secondary left at the HVF default CNTVOFF (0) and
        // the boot vCPU at the snapshot-aligned offset, the secondary's
        // post-restore CNTV_CVAL lies in the distant past relative to raw
        // CNTPCT. The timer then fires on every iteration of hv_vcpu_run
        // (tight IRQ storm), the kernel cannot make forward progress on
        // that vCPU, RCU stalls, and userspace work scheduled there never
        // runs (e.g. the postgres start script SIGKILL'd at 30s with no
        // /tmp/pg.log because the shell was scheduled on a stalled
        // secondary).
        //
        // The value is taken straight from the restore result instead of
        // being read back through `get_vtimer_offset`. That getter can fail
        // (HVF returned Err); the earlier code swallowed the error in an
        // `if let Ok(applied)`, secondaries then read `vtimer_offset = 0`
        // (the sentinel for "no offset published") and SKIPPED applying
        // any offset. Intermittent 4/30 (~13%) workload-timeout failures
        // were traced to that path.
        let vtimer_offset = restored.applied_vtimer_offset;
        self.coord
            .vtimer_offset
            .store(vtimer_offset, Ordering::Release);
        if vtimer_offset == 0 {
            // An offset of literally 0 is indistinguishable from the
            // "not set" sentinel. That is astronomically unlikely (host
            // mach_absolute_time would have to equal the snapshot's
            // captured CNTVCT exactly), but be loud if it ever happens:
            // secondaries would silently skip applying the offset and
            // run into exactly the stall described above.
            eprintln!(
                "supermachine: warning: restored vtimer offset is exactly 0 \
                 (mach_now == snap.captured_clock_ref?); secondaries will keep \
                 HVF default CNTVOFF and may stall on per-CPU timer ticks."
            );
        }

        // Positional device restore: record `i` goes to the device that was
        // reconstructed at bus index `i`. Kind/backing come from the Image
        // on HVF and are therefore ignored here (see VirtioSnapshot).
        // Records without a device at the same index are left unapplied.
        let mmio_started = std::time::Instant::now();
        for (rec, dev) in snap.virtio.devices.iter().zip(self.all_mmio.iter()) {
            dev.restore_state(&rec.mmio);
        }
        let mmio_restore_us = mmio_started.elapsed().as_micros();

        let listeners_started = std::time::Instant::now();
        let muxer = self.vsock.muxer();
        muxer.restore_tsi_listeners(&snap.virtio.vsock_listeners);
        let listener_restore_us = listeners_started.elapsed().as_micros();

        Ok(VmmRestoreTimings {
            ram_copy_us: restored.ram_copy_us,
            gic_restore_us: restored.gic_restore_us,
            vcpu_restore_us: restored.vcpu_restore_us,
            vtimer_offset_us: restored.vtimer_offset_us,
            mmio_restore_us,
            listener_restore_us,
        })
    }

    /// Reset the vsock transport in place: first the muxer, then the
    /// device's pending RX state. Neither object is dropped or recreated.
    pub fn reset_vsock_transport(&self) {
        let vsock = &self.vsock;
        vsock.muxer().reset();
        vsock.reset_pending_rx();
    }
}

/// Build a complete [`Vmm`] from `resources`.
///
/// Creates the VM via `build_vm`, registers its devices via
/// `build_device_set`, creates the vCPU coordinator for `resources.vcpus`
/// vCPUs and marks vCPU 0 as running. `cow_ram` and `restore_memory_len`
/// are forwarded to `build_vm` unchanged.
///
/// # Errors
///
/// Returns the `BuildError` of whichever of the two build steps failed.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn build_vmm(
    resources: &VmResources,
    cow_ram: Option<(*mut u8, usize)>,
    restore_memory_len: Option<usize>,
) -> Result<Vmm, BuildError> {
    use std::sync::atomic::Ordering;

    let BuiltVm { vm, virtio_plan } = build_vm(resources, cow_ram, restore_memory_len)?;
    let devices = build_device_set(
        &vm,
        &resources.block_devices,
        &resources.volumes,
        &resources.mounts,
        &virtio_plan,
        resources.tsi_token,
    )?;

    let coord = VcpuCoordinator::new(resources.vcpus);
    // The boot CPU starts executing right away, so its slot is flagged as
    // running from the start.
    let boot_slot = &coord.slots[0];
    boot_slot.on.store(true, Ordering::SeqCst);

    Ok(Vmm {
        vm,
        bus: Arc::new(devices.bus),
        all_mmio: devices.all_mmio,
        vsock: devices.vsock,
        coord,
        balloon: devices.balloon,
        posix_fs: devices.posix_fs,
        dax_sessions: devices.dax_sessions,
        fs_devices: devices.fs_devices,
        serial: devices.serial,
    })
}

/// Create the `MicroVm` for `resources` and compute its virtio MMIO plan.
///
/// * Cold boot (`!resources.is_restore()`): the kernel image and the
///   optional initramfs are read from disk, an FDT is generated from the
///   plan, and `boot_linux` loads all of it into the VM.
/// * Restore: no image is read and no FDT is generated; only the VM and
///   the plan are created.
///
/// Guest RAM comes from `cow_ram` when it is given (`MicroVm::new_with_ram`);
/// otherwise a fresh VM of `restore_memory_len` bytes is created, falling
/// back to `resources.memory_bytes()`.
///
/// # Errors
///
/// `MissingKernel`, `KernelImage`, `Initramfs`, `Fdt`, or whatever the
/// hypervisor calls convert into through `?`.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn build_vm(
    resources: &VmResources,
    cow_ram: Option<(*mut u8, usize)>,
    restore_memory_len: Option<usize>,
) -> Result<BuiltVm, BuildError> {
    let mem_size = resources.memory_bytes();
    let restoring = resources.is_restore();

    // Boot images are only needed for a cold boot.
    let kernel = if restoring {
        Vec::new()
    } else {
        let kernel_path = match resources.kernel_path.as_deref() {
            Some(p) => p,
            None => return Err(BuildError::MissingKernel),
        };
        let image = loader::read_image(kernel_path).map_err(|source| BuildError::KernelImage {
            path: kernel_path.to_string(),
            source,
        })?;
        eprintln!("  kernel    {} bytes loaded, magic OK", image.len());
        image
    };
    let initrd = match resources.initrd_path.as_deref() {
        Some(path) if !restoring => {
            let image = loader::read_initramfs(path).map_err(|source| BuildError::Initramfs {
                path: path.to_string(),
                source,
            })?;
            eprintln!("  initramfs {} bytes loaded", image.len());
            Some(image)
        }
        _ => None,
    };

    // Every `--volume` occupies one more virtio-blk slot. The plan reserves
    // room for *all* block devices (RO layers + RW volumes) ahead of rng and
    // balloon, so the FDT entries line up with build_device_set's numbering.
    let blk_slot_count = resources.block_devices.len() + resources.volumes.len();
    let virtio_plan = virtio_mmio_plan_with_fs(blk_slot_count, resources.mounts.len());

    let vm = match cow_ram {
        Some((ptr, len)) => MicroVm::new_with_ram(ptr, len, true)?,
        None => MicroVm::new(restore_memory_len.unwrap_or(mem_size))?,
    };

    if !restoring {
        let initrd_region = initrd.as_ref().map(|image| {
            let image_len = image.len() as u64;
            let gpa = crate::vmm::vstate::initrd_gpa(
                layout::DRAM_MEM_START_KERNEL,
                mem_size as u64,
                kernel.len() as u64,
                image_len,
            );
            (gpa, image_len)
        });
        let fdt_blob = fdt::generate(
            resources.vcpus as usize,
            mem_size as u64,
            &resources.cmdline,
            initrd_region,
            &virtio_plan.entries,
        )
        .map_err(BuildError::Fdt)?;
        eprintln!("  FDT       {} bytes generated", fdt_blob.len());
        boot_linux(&vm, &kernel, initrd.as_deref(), &fdt_blob)?;
    }

    Ok(BuiltVm { vm, virtio_plan })
}

/// Plan the virtio MMIO slots for a VM with `block_device_count` block
/// devices and no virtio-fs mounts.
///
/// Shorthand for `virtio_mmio_plan_with_fs(block_device_count, 0)`.
pub fn virtio_mmio_plan(block_device_count: usize) -> VirtioMmioPlan {
    const NO_FS_MOUNTS: usize = 0;
    virtio_mmio_plan_with_fs(block_device_count, NO_FS_MOUNTS)
}

/// Like `virtio_mmio_plan`, but additionally reserves `fs_count` MMIO slots
/// after the balloon device for virtio-fs mounts.
///
/// Slot layout: vsock in slot 0, block devices in slots
/// `1..=block_device_count`, then rng, then balloon, then the virtio-fs
/// mounts. Slot `n` is based at `VIRTIO_MMIO_BASE + n * VIRTIO_MMIO_STRIDE`.
/// vsock uses `IRQ_BASE`; every later slot `n` uses `IRQ_BASE + n + 1`.
///
/// # Panics
///
/// Panics when any assigned IRQ would exceed `layout::IRQ_MAX`.
pub fn virtio_mmio_plan_with_fs(block_device_count: usize, fs_count: usize) -> VirtioMmioPlan {
    // Each virtio device gets exactly one SPI in [IRQ_BASE, IRQ_MAX]. The
    // guard is applied to the WHOLE enumeration (block devices, rng,
    // balloon AND fs) so that an over-large device count fails loudly at
    // build time rather than silently handing the guest an out-of-range
    // IRQ, which the GIC would reject and the device would never init.
    // An earlier version checked only the fs loop, so ~125+ block devices
    // pushed the rng/balloon/block IRQs past IRQ_MAX without detection.
    let ensure_irq_in_range = |irq: u32, what: &str| {
        if irq > layout::IRQ_MAX {
            panic!(
                "virtio_mmio_plan: {what} IRQ {irq} exceeds IRQ_MAX={} \
                 ({block_device_count} block devices + {fs_count} fs mounts is too many)",
                layout::IRQ_MAX
            );
        }
    };
    let slot_base = |slot: u64| layout::VIRTIO_MMIO_BASE + slot * layout::VIRTIO_MMIO_STRIDE;
    // The `+ 1` steps over the PL011: SERIAL_IRQ = 33 = SPI 1 would collide
    // with a plain enumeration starting at IRQ_BASE+1, and skipping it is
    // what makes the kernel accept our `interrupts` assignment.
    let slot_irq = |slot: u64| layout::IRQ_BASE + slot as u32 + 1;

    // Slot 0 is always vsock, on IRQ_BASE itself.
    let mut entries = vec![VirtioMmioEntry {
        base: layout::VIRTIO_MMIO_BASE,
        irq: layout::IRQ_BASE,
    }];

    let rng_slot = (1 + block_device_count) as u64;
    let rng_base = slot_base(rng_slot);
    let rng_irq = slot_irq(rng_slot);
    let balloon_slot = rng_slot + 1;
    let balloon_base = slot_base(balloon_slot);
    let balloon_irq = slot_irq(balloon_slot);

    for slot in (0..block_device_count).map(|i| (i as u64) + 1) {
        let irq = slot_irq(slot);
        ensure_irq_in_range(irq, "block device");
        entries.push(VirtioMmioEntry {
            base: slot_base(slot),
            irq,
        });
    }

    ensure_irq_in_range(rng_irq, "rng");
    ensure_irq_in_range(balloon_irq, "balloon");
    entries.push(VirtioMmioEntry {
        base: rng_base,
        irq: rng_irq,
    });
    entries.push(VirtioMmioEntry {
        base: balloon_base,
        irq: balloon_irq,
    });

    let first_fs_slot = balloon_slot + 1;
    let mut fs_entries = Vec::with_capacity(fs_count);
    for slot in (0..fs_count).map(|j| first_fs_slot + j as u64) {
        let base = slot_base(slot);
        let irq = slot_irq(slot);
        ensure_irq_in_range(irq, "virtio-fs");
        fs_entries.push((base, irq));
        entries.push(VirtioMmioEntry { base, irq });
    }

    VirtioMmioPlan {
        entries,
        rng_base,
        rng_irq,
        balloon_base,
        balloon_irq,
        fs_entries,
    }
}

#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub fn build_device_set(
    vm: &MicroVm,
    block_paths: &[String],
    volumes: &[crate::vmm::resources::VolumeSpec],
    mounts: &[crate::vmm::resources::MountSpec],
    plan: &VirtioMmioPlan,
    tsi_token: Option<[u8; 32]>,
) -> Result<DeviceSet, BuildError> {
    let bus = MmioBus::new();
    // 0.7.56+: per-VM PL011 state, shared (via `Arc`) between the MMIO
    // device registered here and the orchestration code that reads its
    // boot-line markers / pushes RX bytes (returned in
    // `DeviceSet::serial` → `Vmm::serial`). Before 0.7.56 this lived in
    // process-global statics and leaked console bytes + bake markers
    // across pooled clones in one host process.
    let serial = Arc::new(SerialState::new());
    bus.register(
        layout::SERIAL_MMIO_BASE,
        Arc::new(SerialPl011::new(serial.clone())),
    );
    // 0.7.43+: wire up this VM's PL011 RX IRQ. SERIAL_IRQ is GIC INTID 33 (SPI 1);
    // raising it on a `push_rx_byte` delivers a UART RX interrupt to the
    // guest. HVF auto-deasserts on the kernel's EOI (edge-triggered
    // semantics under the hood even though FDT declares LEVEL_HI — see
    // serial.rs comment). Set once per `SerialState` (OnceLock).
    // Backend-agnostic IRQ-raise handle (HVF GIC / KVM irqchip) cloned into each
    // device's notify closure; see `HypervisorVm::irq_line`.
    let irq_line = vm.vm.irq_line();
    let serial_irq = irq_line.clone();
    serial.set_irq_raiser(move |level| {
        serial_irq(layout::SERIAL_IRQ, level);
    });
    let mut all_mmio: Vec<Arc<MmioVirtio>> = Vec::new();
    let mut posix_fs_list: Vec<Arc<crate::fuse::PosixFs>> = Vec::new();
    let mut dax_session_list: Vec<Arc<crate::fuse::DaxSession>> = Vec::new();
    let mut fs_device_list: Vec<Arc<crate::devices::virtio::fs::VirtioFs>> = Vec::new();

    let mem = GuestMem::new(vm.ram_host, vm.ram_gpa, vm.ram_size);
    let vsock = Arc::new(VirtioVsock::with_tsi_token(
        3, /* guest CID */
        tsi_token,
    )?);
    let vsock_irq = irq_line.clone();
    let raw_spi: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
        vsock_irq(layout::IRQ_BASE, true);
    });
    let vsock_mmio = Arc::new(MmioVirtio::new(vsock.clone(), mem.clone(), raw_spi));
    let device_irq = vsock_mmio.make_used_buffer_irq();
    vsock.set_irq_raise(device_irq);
    let vsock_for_kick = vsock.clone();
    let kick: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
        vsock_for_kick.kick();
    });
    vsock.muxer().set_kick(kick);
    bus.register(layout::VIRTIO_MMIO_BASE, vsock_mmio.clone());
    all_mmio.push(vsock_mmio);
    eprintln!("  vsock@{:x} CID=3", layout::VIRTIO_MMIO_BASE);

    let rng = Arc::new(VirtioRng::new());
    let rng_irq = plan.rng_irq;
    let rng_il = irq_line.clone();
    let rng_raw_spi: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
        rng_il(rng_irq, true);
    });
    let rng_mmio = Arc::new(MmioVirtio::new(rng.clone(), mem.clone(), rng_raw_spi));
    rng.set_irq_raise(rng_mmio.make_used_buffer_irq());
    bus.register(plan.rng_base, rng_mmio.clone());
    all_mmio.push(rng_mmio);
    eprintln!("  rng@{:x}", plan.rng_base);

    let balloon = Arc::new(VirtioBalloon::new());
    let balloon_dev = Arc::new(VirtioBalloonWithRam {
        inner: balloon.clone(),
        ram_host: vm.ram_host,
        ram_size: vm.ram_size,
        ram_gpa: vm.ram_gpa,
    });
    let balloon_irq = plan.balloon_irq;
    let balloon_il = irq_line.clone();
    let balloon_raw_spi: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
        balloon_il(balloon_irq, true);
    });
    let balloon_mmio = Arc::new(MmioVirtio::new(balloon_dev, mem.clone(), balloon_raw_spi));
    balloon.set_irq_raise(balloon_mmio.make_used_buffer_irq());
    balloon.set_config_irq_raise(balloon_mmio.make_config_change_irq());
    bus.register(plan.balloon_base, balloon_mmio.clone());
    all_mmio.push(balloon_mmio);
    eprintln!("  balloon@{:x}", plan.balloon_base);

    for (i, path) in block_paths.iter().enumerate() {
        let n = (i as u64) + 1;
        let blk = Arc::new(
            VirtioBlk::open_ro(&format!("blk{i}"), path).map_err(|source| {
                BuildError::BlockDevice {
                    path: path.clone(),
                    source,
                }
            })?,
        );
        let blk_irq_intid = layout::IRQ_BASE + n as u32 + 1;
        let blk_il = irq_line.clone();
        let blk_raw_spi: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
            blk_il(blk_irq_intid, true);
        });
        let blk_mmio = Arc::new(MmioVirtio::new(blk.clone(), mem.clone(), blk_raw_spi));
        let blk_dev_irq = blk_mmio.make_used_buffer_irq();
        blk.set_irq_raise(blk_dev_irq);
        let blk_base = layout::VIRTIO_MMIO_BASE + n * layout::VIRTIO_MMIO_STRIDE;
        bus.register(blk_base, blk_mmio.clone());
        all_mmio.push(blk_mmio);
        eprintln!("  blk{i}@{blk_base:x}");
    }

    // Writable volumes (`--volume HOST:GUEST`). MMIO slots continue
    // numbering after the read-only block devices so the guest sees
    // them as additional /dev/vd* entries past the layers.
    let ro_count = block_paths.len();
    for (j, vol) in volumes.iter().enumerate() {
        let i = ro_count + j;
        let n = (i as u64) + 1;
        let name = format!("vol{j}");
        let blk = Arc::new(
            VirtioBlk::open_rw(&name, &vol.host_path, vol.size_bytes).map_err(|source| {
                BuildError::BlockDevice {
                    path: vol.host_path.clone(),
                    source,
                }
            })?,
        );
        let blk_irq_intid = layout::IRQ_BASE + n as u32 + 1;
        let blk_il = irq_line.clone();
        let blk_raw_spi: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
            blk_il(blk_irq_intid, true);
        });
        let blk_mmio = Arc::new(MmioVirtio::new(blk.clone(), mem.clone(), blk_raw_spi));
        let blk_dev_irq = blk_mmio.make_used_buffer_irq();
        blk.set_irq_raise(blk_dev_irq);
        let blk_base = layout::VIRTIO_MMIO_BASE + n * layout::VIRTIO_MMIO_STRIDE;
        bus.register(blk_base, blk_mmio.clone());
        all_mmio.push(blk_mmio);
        eprintln!("  {name}@{blk_base:x} (rw, mount {})", vol.guest_path);
    }

    // virtio-fs mounts. Each mount gets its own virtio-fs device,
    // its own MMIO slot, and its own FuseServer + PosixFs backend.
    //
    // Each mount also gets a NON-OVERLAPPING slice of the DAX window.
    // Pre-0.5.2 all mounts declared the SAME `(gpa, len)` for shm
    // region 0, which made the guest's `ioremap_dax` for the second
    // device collide with the first (`__request_region` returns
    // -EBUSY) — the second device failed to probe and userspace
    // reported "wrong fs type" when trying to mount its tag.
    if !mounts.is_empty() {
        assert_eq!(
            mounts.len(),
            plan.fs_entries.len(),
            "fs plan entries must match mounts: plan={} mounts={}",
            plan.fs_entries.len(),
            mounts.len()
        );
        // Per-mount DAX slice size. Three constraints:
        //
        //   1. **2 MiB alignment.** Linux's virtiofs driver requires
        //      the DAX window length AND base GPA to be a multiple of
        //      2 MiB (the transparent-hugepage / PMD granule). A
        //      misaligned probe fails with `virtiofs virtioN: probe
        //      with driver virtiofs failed with error -22` and the
        //      device never mounts.
        //   2. **Power-of-two-ish bases.** With N slices laid out
        //      back-to-back from `VIRTIOFS_DAX_BASE`, base[i] =
        //      base + slice_len * i. For base[i] to land on a 2 MiB
        //      boundary, slice_len itself must be 2 MiB aligned.
        //   3. **Fits in `VIRTIOFS_DAX_LEN` (8 GiB).** With N=3 mounts
        //      and 2 MiB alignment, 8 GiB / 3 = 2730⅔ MiB → rounded
        //      *down* to 2 MiB → 2730 MiB per slice → 3 × 2730 MiB =
        //      8190 MiB, fits comfortably.
        //
        // Pre-0.7.18 the rounding was 16 KiB (host page granule). For
        // N=2 (single user mount + Rosetta) the result was 4 GiB,
        // 2 MiB aligned by happy coincidence. For N=3+ (any 2+ user
        // mounts on `linux/amd64`, since Rosetta auto-appends a third
        // virtio-fs) the result was 2730⅔ MiB *rounded to 16 KiB* =
        // 2863296512 bytes, which is NOT 2 MiB aligned (offset 0x40_0000
        // off). The base of the second and third mounts inherited the
        // misalignment, all three probes returned -EINVAL, none of the
        // mounts came up, and `mount("rosetta", ..., "virtiofs", ...)`
        // in init-oci failed silently. Without the Rosetta share,
        // binfmt_misc never gets registered, and the very next
        // `execve` of an amd64 ELF (= `/bin/sh` for an amd64 image)
        // returns 127. The integrator hit this as
        // `Image.build({platform: 'linux/amd64', mounts: [a, b]})`
        // produces `warmup failed (exit 127)` with empty
        // stdout/stderr — the shell genuinely never ran.
        const DAX_ALIGN: u64 = 2 * 1024 * 1024; // 2 MiB / PMD
        let slice_len = (layout::VIRTIOFS_DAX_LEN / mounts.len() as u64) & !(DAX_ALIGN - 1);
        // DAX kill-switch. With DAX on, the guest's mmap of a virtio-fs file
        // issues FUSE_SETUPMAPPING/REMOVEMAPPING which call hv_vm_map/hv_vm_unmap
        // on a vCPU thread while sibling vCPUs are inside hv_vcpu_run — racing the
        // stage-2 tables (the very hazard `MicroVm::remap_cow` guards by parking
        // all vCPUs). Setting SUPERMACHINE_VIRTIOFS_DAX=0 advertises no DAX window,
        // so the guest serves mmap via FUSE_READ/WRITE instead — no runtime stage-2
        // remapping, at the cost of mmap zero-copy. Used to A/B-confirm the DAX race
        // and as an immediate stability mitigation for concurrent pooled-VM load.
        let dax_enabled = !matches!(
            std::env::var("SUPERMACHINE_VIRTIOFS_DAX").ok().as_deref(),
            Some("0") | Some("off") | Some("false") | Some("no")
        );
        if !dax_enabled {
            eprintln!("[virtio-fs] DAX disabled (SUPERMACHINE_VIRTIOFS_DAX); mmap served via FUSE_READ/WRITE");
        }
        assert!(
            slice_len >= DAX_ALIGN,
            "per-mount DAX slice {slice_len} < 2 MiB: too many mounts \
             ({}) for the {}-byte DAX window",
            mounts.len(),
            layout::VIRTIOFS_DAX_LEN
        );
        for (mi, mount) in mounts.iter().enumerate() {
            use crate::devices::virtio::fs::{VirtioFs, VirtioFsConfig};
            use crate::fuse::{DaxSession, FsBackend, PosixFs};
            use std::sync::Arc as StdArc;

            // Convention: tag `"rosetta"` means this mount is Apple's
            // Rosetta-in-VM runtime share (`/Library/Apple/usr/libexec/
            // oah/RosettaLinux/`). Auto-enable the FUSE_IOCTL handler
            // that answers `rosettad`'s startup cache-settings query —
            // without it `rosettad` exits with "Failed to query the
            // cache settings" and the binfmt_misc-triggered exec of
            // an amd64 ELF segfaults. See
            // `docs/design/rosetta-in-vm-2026-05-16.md`. Matches
            // Apple's `containerization` convention (their
            // `Vminitd+Rosetta.swift` mounts at the same tag).
            let mut fs =
                PosixFs::new_with_symlinks(&mount.host_path, mount.symlinks).map_err(|source| {
                    BuildError::Mount {
                        host_path: mount.host_path.clone(),
                        source,
                    }
                })?;
            if mount.guest_tag == "rosetta" {
                fs = fs.with_rosetta();
            }
            let posix_fs = StdArc::new(fs);
            let backend: StdArc<dyn FsBackend> = posix_fs.clone();
            let dax_gpa = layout::VIRTIOFS_DAX_BASE + slice_len * (mi as u64);
            let fs_cfg = VirtioFsConfig {
                tag: mount.guest_tag.clone(),
                num_request_queues: 1,
                dax_window_gpa: dax_gpa,
                // 0 ⇒ shm_regions advertises no DAX window ⇒ guest mounts
                // virtio-fs without a DAX cache (no SETUPMAPPING ever issued).
                dax_window_len: if dax_enabled { slice_len } else { 0 },
            };
            let fs_dev = StdArc::new(VirtioFs::with_backend(fs_cfg, backend.clone()));
            let (fs_base, fs_irq) = plan.fs_entries[mi];
            let fs_il = irq_line.clone();
            let fs_raw_spi: Arc<dyn Fn() + Send + Sync> = Arc::new(move || {
                fs_il(fs_irq, true);
            });
            let fs_mmio = Arc::new(MmioVirtio::new(
                fs_dev.clone() as Arc<dyn crate::devices::virtio::VirtioDevice>,
                mem.clone(),
                fs_raw_spi,
            ));
            fs_dev.set_irq_raise(fs_mmio.make_used_buffer_irq());

            // Install the DAX session on the device's FUSE server so
            // SETUPMAPPING / REMOVEMAPPING dispatch through real HVF.
            // Session uses the SAME per-mount slice as the device's
            // shm region — guest sees one window per device, host
            // serves SETUPMAPPING requests bounded to it. Cap matches
            // the slice length so a single mount can't starve others.
            //
            // Only wire the DAX session when DAX is enabled. With it off,
            // the device advertises no DAX window (above), the guest never
            // issues SETUPMAPPING, and there are no stage-2 slots to
            // rehydrate — so the session would be dead weight, and
            // `dax_session_list` must stay index-aligned with the mounts
            // that actually have one.
            if dax_enabled {
                let mapper = vm.vm.dax_mapper();
                let session = StdArc::new(DaxSession::new(dax_gpa, slice_len, backend, mapper));
                // Keep a strong ref so the vCPU worker can route stage-2
                // faults into `session.handle_stage2_fault` for lazy re-
                // bind of restored slots, and the snapshot pipeline can
                // `snapshot_state` / `restore_state` the slot table.
                // Order matches `mounts`; the sidecar reads/writes blobs
                // by index, same as `posix_fs_list` above.
                dax_session_list.push(session.clone());
                fs_dev.fuse_server().lock().unwrap().set_dax(session);
            }

            // Hook the PosixFs kqueue watcher up to the VirtioFs's
            // hipriority notification channel. Host changes to
            // open files now trigger FUSE_NOTIFY_INVAL_INODE so
            // the guest re-reads on next access.
            let notifier: StdArc<dyn crate::fuse::Notifier> = fs_dev.clone();
            if let Err(e) = posix_fs.set_notifier(notifier) {
                eprintln!(
                    "[virtio-fs] kqueue watcher unavailable for {}: {e}",
                    mount.host_path
                );
            }

            bus.register(fs_base, fs_mmio.clone());
            all_mmio.push(fs_mmio);
            // Keep a strong ref to the PosixFs so the snapshot pipeline
            // can call `snapshot_state` / `restore_state` on it. Order
            // matches `mounts`; the sidecar reads/writes blobs by index.
            posix_fs_list.push(posix_fs);
            // And to the VirtioFs device, so the runner can clear its
            // (un-snapshotted) notif_pool between pool cycle-restores.
            fs_device_list.push(fs_dev);
            eprintln!(
                "  virtio-fs@{fs_base:x} tag={:?} → {}",
                mount.guest_tag, mount.host_path
            );
        }
    }

    Ok(DeviceSet {
        bus,
        all_mmio,
        vsock,
        balloon,
        posix_fs: posix_fs_list,
        dax_sessions: dax_session_list,
        fs_devices: fs_device_list,
        serial,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Largest block-device count for which the balloon IRQ still fits.
    ///
    /// The boundary test below pins `balloon_irq == IRQ_MAX` for this
    /// count, i.e. the balloon IRQ sits at `IRQ_BASE + block_count + 3`.
    fn boundary_block_count() -> usize {
        (layout::IRQ_MAX - layout::IRQ_BASE - 3) as usize
    }

    /// With two block devices the plan has five entries: the first three
    /// are STRIDE-spaced from `VIRTIO_MMIO_BASE`, followed by rng and
    /// then balloon.
    #[test]
    fn virtio_plan_preserves_expected_order() {
        let plan = virtio_mmio_plan(2);
        assert_eq!(plan.entries.len(), 5);

        // Slots 0..3 sit at BASE, BASE + STRIDE, BASE + 2 * STRIDE.
        for (slot, entry) in plan.entries.iter().take(3).enumerate() {
            let expected = layout::VIRTIO_MMIO_BASE + (slot as u64) * layout::VIRTIO_MMIO_STRIDE;
            assert_eq!(entry.base, expected);
        }

        // The last two slots mirror the dedicated rng / balloon fields.
        assert_eq!(plan.entries[3].base, plan.rng_base);
        assert_eq!(plan.entries[4].base, plan.balloon_base);
    }

    /// Structural invariants for a representative layout (3 block
    /// devices + 2 fs mounts): IRQs are strictly ascending and inside
    /// `[IRQ_BASE, IRQ_MAX]`; MMIO bases are STRIDE-aligned and unique;
    /// the rng/balloon fields agree with their entries.
    #[test]
    fn virtio_plan_irqs_and_bases_are_well_formed() {
        let plan = virtio_mmio_plan_with_fs(3, 2);
        // entry0 + blk + rng/balloon + fs
        assert_eq!(plan.entries.len(), 1 + 3 + 2 + 2);

        // --- IRQs ---------------------------------------------------
        let irqs: Vec<u32> = plan.entries.iter().map(|e| e.irq).collect();
        for (prev, next) in irqs.iter().zip(irqs.iter().skip(1)) {
            assert!(prev < next, "IRQs must be strictly ascending: {irqs:?}");
        }
        let valid_irqs = layout::IRQ_BASE..=layout::IRQ_MAX;
        for &irq in &irqs {
            assert!(valid_irqs.contains(&irq), "IRQ {irq} out of range");
        }

        // --- MMIO bases ---------------------------------------------
        let bases: Vec<u64> = plan.entries.iter().map(|e| e.base).collect();
        for (i, &base) in bases.iter().enumerate() {
            let offset = base - layout::VIRTIO_MMIO_BASE;
            assert_eq!(
                offset % layout::VIRTIO_MMIO_STRIDE,
                0,
                "base[{i}] not STRIDE-aligned"
            );
        }
        // Sorting then deduplicating drops repeats, so a shorter result
        // means at least one base was assigned twice.
        let unique_count = {
            let mut scratch = bases.clone();
            scratch.sort_unstable();
            scratch.dedup();
            scratch.len()
        };
        assert_eq!(unique_count, bases.len(), "MMIO bases must be distinct");

        // --- rng / balloon mirrors ----------------------------------
        let has_entry =
            |base: u64, irq: u32| plan.entries.iter().any(|e| e.base == base && e.irq == irq);
        assert!(has_entry(plan.rng_base, plan.rng_irq));
        assert!(has_entry(plan.balloon_base, plan.balloon_irq));
    }

    /// The largest accepted block count puts the balloon IRQ exactly on
    /// `IRQ_MAX` and must not panic.
    #[test]
    fn virtio_plan_at_irq_capacity_boundary_is_accepted() {
        let plan = virtio_mmio_plan(boundary_block_count());
        assert_eq!(
            plan.balloon_irq,
            layout::IRQ_MAX,
            "the boundary config lands the balloon IRQ exactly on IRQ_MAX"
        );
    }

    /// One block device past the boundary would push the balloon IRQ
    /// beyond `IRQ_MAX`; the planner is expected to panic rather than
    /// hand back an out-of-range IRQ.
    #[test]
    #[should_panic(expected = "exceeds IRQ_MAX")]
    fn virtio_plan_block_count_over_capacity_panics() {
        let over_capacity = boundary_block_count() + 1;
        let _ = virtio_mmio_plan(over_capacity);
    }

    /// A configuration with many fs mounts on top of many block devices
    /// must also trip the `IRQ_MAX` guard.
    #[test]
    #[should_panic(expected = "exceeds IRQ_MAX")]
    fn virtio_plan_fs_count_over_capacity_panics() {
        let (block_count, fs_count) = (120, 64);
        let _ = virtio_mmio_plan_with_fs(block_count, fs_count);
    }
}