supermachine 0.7.70

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
//! HVF implementation of the portable [`crate::hypervisor`] backend
//! contract. This is the macOS/Apple-Silicon binding; the KVM binding
//! will be a sibling implementing the same traits for its own types.
//!
//! Each method is a thin map from a portable identifier to the native
//! `applevisor_sys` register/flag and a delegation to the inherent
//! `Vm`/`Vcpu` wrappers in `super`. No logic lives here — it is purely
//! the translation layer.

use applevisor_sys as av;

use super::{Error, ExitReason, Vcpu, Vm};
use crate::hypervisor::{CoreReg, HypervisorVcpu, HypervisorVm, SysReg, VcpuExit, VcpuHandle};

/// HVF cross-thread vCPU force-exit token: a newtype over the raw
/// `hv_vcpu_t`.
///
/// The struct is `#[repr(transparent)]`, so it has the same layout as
/// `hv_vcpu_t` and a `&[HvfVcpuHandle]` can be reinterpreted directly as
/// the `*const hv_vcpu_t` array that `hv_vcpus_exit` expects.
#[repr(transparent)]
#[derive(Clone, Copy)]
pub struct HvfVcpuHandle(pub av::hv_vcpu_t);

impl VcpuHandle for HvfVcpuHandle {
    /// Ask HVF to force every vCPU named in `handles` out of guest execution.
    ///
    /// Best-effort: the status returned by `hv_vcpus_exit` is deliberately
    /// discarded. An empty slice results in no hypervisor call at all.
    fn force_exit(handles: &[Self]) {
        // `hv_vcpus_exit` takes a `u32` count, so hand the slice over in
        // batches of at most `u32::MAX` handles instead of truncating the
        // length with a bare cast. `chunks` yields nothing for an empty
        // slice, which preserves the "no handles, no call" behaviour.
        for batch in handles.chunks(u32::MAX as usize) {
            // SAFETY: HvfVcpuHandle is repr(transparent) over hv_vcpu_t, so
            // `batch` is a contiguous array of `batch.len()` hv_vcpu_t
            // values; `batch.len() <= u32::MAX`, so the cast is lossless.
            unsafe {
                let _ = av::hv_vcpus_exit(
                    batch.as_ptr() as *const av::hv_vcpu_t,
                    batch.len() as u32,
                );
            }
        }
    }
}

/// Map a portable [`CoreReg`] to its `hv_reg_t`. `X(n)` for n>30 has no
/// register; callers route those through `get_x`/`set_x`, which already
/// treat out-of-range indices as no-ops, so we never reach here for them.
fn core_to_av(reg: CoreReg) -> av::hv_reg_t {
    match reg {
        // X(n) is handled by the get/set fns directly; map X(0) as a
        // harmless default so the function stays total.
        CoreReg::X(_) => av::hv_reg_t::X0,
        CoreReg::Fp => av::hv_reg_t::FP,
        CoreReg::Lr => av::hv_reg_t::LR,
        CoreReg::Pc => av::hv_reg_t::PC,
        CoreReg::Cpsr => av::hv_reg_t::CPSR,
        CoreReg::Fpsr => av::hv_reg_t::FPSR,
        CoreReg::Fpcr => av::hv_reg_t::FPCR,
    }
}

impl HypervisorVcpu for Vcpu {
    type Error = Error;
    type Handle = HvfVcpuHandle;
    type SnapshotState = crate::hvf::vcpu_snapshot::PerVcpuState;

    /// Wrap this vCPU's raw handle in the token used for cross-thread
    /// force-exit.
    fn exit_token(&self) -> HvfVcpuHandle {
        HvfVcpuHandle(self.handle())
    }

    /// Capture this vCPU's state by delegating to
    /// `vcpu_snapshot::capture_vcpu_state`.
    fn capture_snapshot(&self) -> Result<Self::SnapshotState, Error> {
        crate::hvf::vcpu_snapshot::capture_vcpu_state(self)
    }

    /// Apply a previously captured state by delegating to
    /// `vcpu_snapshot::restore_vcpu_state`.
    fn restore_snapshot(&self, state: &Self::SnapshotState) -> Result<(), Error> {
        crate::hvf::vcpu_snapshot::restore_vcpu_state(self, state)
    }

    /// Serialize one per-vCPU snapshot record to `w` (all fields
    /// little-endian).
    ///
    /// # Errors
    ///
    /// Returns `InvalidInput` — before any byte is written — if a register
    /// group holds more entries than the `u32` count field can express, and
    /// otherwise propagates any error from `w`.
    fn write_snapshot_state(
        state: &Self::SnapshotState,
        w: &mut dyn std::io::Write,
    ) -> std::io::Result<()> {
        /// Convert a group length to its on-disk `u32` count, refusing to
        /// truncate silently.
        fn checked_count(len: usize) -> std::io::Result<u32> {
            u32::try_from(len).map_err(|_| {
                std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "per-vCPU snapshot register group exceeds u32::MAX entries",
                )
            })
        }

        // Byte-for-byte identical to the per-vCPU record consumed by
        // `vmm::snapshot` (v9): a fixed 32-byte sub-header — vtimer offset
        // (u64) followed by the six group counts gp/simd/sys/icc/redist/ich
        // (u32 each) — then the six (id, value) entry lists in that order.
        // Holding this invariant lets the snapshot pipeline route every
        // per-vCPU write/read through this seam without changing a single
        // on-disk byte, so existing snapshots stay valid.
        //
        // All counts are validated up front so an oversized group cannot
        // leave a half-written record behind.
        let counts = [
            checked_count(state.gp_regs.len())?,
            checked_count(state.simd_regs.len())?,
            checked_count(state.sys_regs.len())?,
            checked_count(state.icc_regs.len())?,
            checked_count(state.redist_regs.len())?,
            checked_count(state.ich_regs.len())?,
        ];

        w.write_all(&state.vtimer_offset.to_le_bytes())?;
        for count in &counts {
            w.write_all(&count.to_le_bytes())?;
        }

        for (id, val) in &state.gp_regs {
            w.write_all(&id.to_le_bytes())?;
            w.write_all(&val.to_le_bytes())?;
        }
        for (id, val) in &state.simd_regs {
            w.write_all(&id.to_le_bytes())?;
            w.write_all(&val.to_le_bytes())?;
        }
        for (id, val) in &state.sys_regs {
            w.write_all(&id.to_le_bytes())?;
            w.write_all(&val.to_le_bytes())?;
        }
        for (id, val) in &state.icc_regs {
            w.write_all(&id.to_le_bytes())?;
            w.write_all(&val.to_le_bytes())?;
        }
        for (off, val) in &state.redist_regs {
            w.write_all(&off.to_le_bytes())?;
            w.write_all(&val.to_le_bytes())?;
        }
        for (id, val) in &state.ich_regs {
            w.write_all(&id.to_le_bytes())?;
            w.write_all(&val.to_le_bytes())?;
        }
        Ok(())
    }

    /// Deserialize one per-vCPU snapshot record from `r`; the inverse of
    /// `write_snapshot_state`.
    ///
    /// # Errors
    ///
    /// Propagates any read error, including the `UnexpectedEof` that
    /// `read_exact` reports when the record is truncated or a count is bogus.
    fn read_snapshot_state(r: &mut dyn std::io::Read) -> std::io::Result<Self::SnapshotState> {
        fn ru32(r: &mut dyn std::io::Read) -> std::io::Result<u32> {
            let mut b = [0u8; 4];
            r.read_exact(&mut b)?;
            Ok(u32::from_le_bytes(b))
        }
        fn ru64(r: &mut dyn std::io::Read) -> std::io::Result<u64> {
            let mut b = [0u8; 8];
            r.read_exact(&mut b)?;
            Ok(u64::from_le_bytes(b))
        }
        fn ru128(r: &mut dyn std::io::Read) -> std::io::Result<u128> {
            let mut b = [0u8; 16];
            r.read_exact(&mut b)?;
            Ok(u128::from_le_bytes(b))
        }
        /// Read `count` `(u32 id, value)` entries, decoding each value with
        /// `read_value`.
        fn read_entries<V>(
            r: &mut dyn std::io::Read,
            count: usize,
            read_value: fn(&mut dyn std::io::Read) -> std::io::Result<V>,
        ) -> std::io::Result<Vec<(u32, V)>> {
            // Cap the pre-allocation so a corrupt/huge count can't OOM; a
            // bogus count then fails fast on the first truncated read.
            let mut entries = Vec::with_capacity(count.min(4096));
            for _ in 0..count {
                let id = ru32(r)?;
                let value = read_value(r)?;
                entries.push((id, value));
            }
            Ok(entries)
        }

        // 32-byte sub-header, matching the writer above and vmm::snapshot v9.
        let vtimer_offset = ru64(r)?;
        let gp_n = ru32(r)? as usize;
        let simd_n = ru32(r)? as usize;
        let sys_n = ru32(r)? as usize;
        let icc_n = ru32(r)? as usize;
        let redist_n = ru32(r)? as usize;
        let ich_n = ru32(r)? as usize;

        // Entry lists follow in the same order as their counts; only the
        // SIMD group carries 128-bit values.
        let gp_regs = read_entries(r, gp_n, ru64)?;
        let simd_regs = read_entries(r, simd_n, ru128)?;
        let sys_regs = read_entries(r, sys_n, ru64)?;
        let icc_regs = read_entries(r, icc_n, ru64)?;
        let redist_regs = read_entries(r, redist_n, ru64)?;
        let ich_regs = read_entries(r, ich_n, ru64)?;

        Ok(crate::hvf::vcpu_snapshot::PerVcpuState {
            gp_regs,
            simd_regs,
            sys_regs,
            icc_regs,
            redist_regs,
            vtimer_offset,
            ich_regs,
        })
    }

    /// Compute the clock reference to persist with a snapshot:
    /// `host_now - state.vtimer_offset`, with wrapping arithmetic.
    fn capture_clock_ref(state: &Self::SnapshotState, host_now: u64) -> u64 {
        // The persisted reference is the guest virtual counter at capture:
        // mach_absolute_time() minus the live CNTVOFF (== state.vtimer_offset).
        // `restore_clock` re-derives CNTVOFF from a fresh host reading so the
        // guest counter resumes from exactly this value.
        host_now.wrapping_sub(state.vtimer_offset)
    }

    /// Re-derive and install the vtimer offset from a persisted reference,
    /// returning the offset that was set.
    fn restore_clock(&self, captured_ref: u64, host_now: u64) -> Result<u64, Error> {
        // New CNTVOFF = host_now - captured_guest_counter, so the guest virtual
        // counter reads `captured_ref` right now and advances with host time.
        let new_offset = host_now.wrapping_sub(captured_ref);
        self.set_vtimer_offset(new_offset)?;
        Ok(new_offset)
    }

    /// Read a core register; `X(n)` goes through `get_x`, everything else
    /// through `get_reg` after translation by `core_to_av`.
    fn get_core(&self, reg: CoreReg) -> Result<u64, Error> {
        match reg {
            CoreReg::X(n) => self.get_x(n as u32),
            other => self.get_reg(core_to_av(other)),
        }
    }

    /// Write a core register; `X(n)` goes through `set_x`, everything else
    /// through `set_reg` after translation by `core_to_av`.
    fn set_core(&self, reg: CoreReg, value: u64) -> Result<(), Error> {
        match reg {
            CoreReg::X(n) => self.set_x(n as u32, value),
            other => self.set_reg(core_to_av(other), value),
        }
    }

    /// Read a portable system register via its `hv_sys_reg_t` equivalent.
    fn get_sys(&self, reg: SysReg) -> Result<u64, Error> {
        match reg {
            SysReg::MpidrEl1 => self.get_sys_reg(av::hv_sys_reg_t::MPIDR_EL1),
        }
    }

    /// Write a portable system register via its `hv_sys_reg_t` equivalent.
    fn set_sys(&self, reg: SysReg, value: u64) -> Result<(), Error> {
        match reg {
            SysReg::MpidrEl1 => self.set_sys_reg(av::hv_sys_reg_t::MPIDR_EL1, value),
        }
    }

    /// Run the vCPU until its next exit and translate the native exit
    /// reason into the portable [`VcpuExit`].
    fn step(&self) -> Result<VcpuExit, Error> {
        let exit = self.run()?;
        Ok(match ExitReason::from(exit.reason as u32) {
            ExitReason::Canceled => VcpuExit::Canceled,
            ExitReason::Exception => VcpuExit::Exception {
                syndrome: exit.exception.syndrome,
                phys_addr: exit.exception.physical_address,
                virt_addr: exit.exception.virtual_address,
            },
            ExitReason::VTimerActivated => VcpuExit::VTimerActivated,
            ExitReason::Unknown(v) => VcpuExit::Unknown(v),
        })
    }
}

impl HypervisorVm for Vm {
    type Error = Error;
    type Vcpu = Vcpu;

    fn create() -> Result<Self, Error> {
        Vm::new()
    }

    unsafe fn map_ram(
        &self,
        host_ptr: *mut u8,
        gpa: u64,
        len: usize,
        prot: u64,
    ) -> Result<(), Error> {
        // The portable `prot` bit layout (READ=1, WRITE=2, EXEC=4) is the
        // same as HVF's `hv_vm_map` flags, so this passes through; the
        // KVM backend will translate to its own mapping flags.
        unsafe { self.map(host_ptr, gpa, len, prot) }
    }

    unsafe fn unmap_ram(&self, gpa: u64, len: usize) -> Result<(), Error> {
        unsafe { self.unmap(gpa, len) }
    }

    fn create_vcpu(&self) -> Result<Vcpu, Error> {
        Vcpu::new()
    }

    fn set_irq(&self, intid: u32, level: bool) -> Result<(), Error> {
        // HVF's GICv3 is a process-global object (one per VM process), so the
        // SPI drive is a free function rather than a per-`Vm` method.
        crate::hvf::gic_set_spi(intid, level)
    }

    fn irq_line(&self) -> std::sync::Arc<dyn Fn(u32, bool) + Send + Sync> {
        // The GIC is process-global, so the handle captures nothing.
        std::sync::Arc::new(|intid, level| {
            let _ = crate::hvf::gic_set_spi(intid, level);
        })
    }

    fn capture_intc(&self) -> Result<Vec<u8>, Error> {
        // The GICv3 distributor state is process-global on HVF.
        crate::hvf::gic_state_capture()
    }

    fn restore_intc(&self, blob: &[u8]) -> Result<(), Error> {
        crate::hvf::gic_state_restore(blob)
    }

    fn dax_mapper(self: &std::sync::Arc<Self>) -> std::sync::Arc<dyn crate::fuse::HvfMapper> {
        // HVF maps via the process-global `hv_vm_map`, so the mapper is a unit
        // type that ignores the `Vm` handle.
        std::sync::Arc::new(crate::hvf::HvfDaxMapper::new())
    }

    fn host_monotonic_ticks() -> u64 {
        #[link(name = "System", kind = "framework")]
        extern "C" {
            fn mach_absolute_time() -> u64;
        }
        // SAFETY: mach_absolute_time has no preconditions; reads the host
        // monotonic timebase counter.
        unsafe { mach_absolute_time() }
    }

    fn boot_linux(
        &self,
        vcpu: &Vcpu,
        mem: &mut [u8],
        cfg: &crate::hypervisor::LinuxBootConfig,
    ) -> Result<(), Error> {
        // aarch64 Linux boot protocol (Documentation/arm64/booting.rst):
        //   kernel at RAM_START + KERNEL_LOAD_OFFSET; FDT in RAM (X0); X1..X3=0;
        //   PC = kernel entry; PSTATE = EL1h, DAIF masked, MMU off (0x3c5).
        // The caller (vmm::builder) prebuilds the FDT (with the virtio-mmio nodes
        // + cmdline in chosen/bootargs), so `cfg.cmdline` is unused here.
        use crate::arch::aarch64::layout;
        let fdt = cfg.fdt.expect("aarch64 Linux boot requires a prebuilt FDT");
        let kernel_off = layout::KERNEL_LOAD_OFFSET as usize;
        mem[kernel_off..kernel_off + cfg.kernel.len()].copy_from_slice(cfg.kernel);
        let fdt_off = cfg.ram_size - layout::FDT_MAX_SIZE;
        mem[fdt_off..fdt_off + fdt.len()].copy_from_slice(fdt);
        let fdt_gpa = cfg.ram_gpa + fdt_off as u64;
        if let Some(initrd) = cfg.initrd {
            let initrd_gpa = crate::vmm::vstate::initrd_gpa(
                cfg.ram_gpa,
                cfg.ram_size as u64,
                cfg.kernel.len() as u64,
                initrd.len() as u64,
            );
            let off = (initrd_gpa - cfg.ram_gpa) as usize;
            mem[off..off + initrd.len()].copy_from_slice(initrd);
        }
        let kernel_gpa = cfg.ram_gpa + layout::KERNEL_LOAD_OFFSET;
        vcpu.set_core(CoreReg::Cpsr, 0x3c5)?;
        vcpu.set_core(CoreReg::X(0), fdt_gpa)?;
        vcpu.set_core(CoreReg::X(1), 0)?;
        vcpu.set_core(CoreReg::X(2), 0)?;
        vcpu.set_core(CoreReg::X(3), 0)?;
        vcpu.set_core(CoreReg::Pc, kernel_gpa)?;
        Ok(())
    }
}