supermachine 0.4.13

Run any OCI/Docker image as a hardware-isolated microVM on macOS via Hypervisor.framework (HVF); Linux KVM and Windows WHP backends are in progress. Single library API, zero flags for the common case, sub-100 ms cold restore from snapshot.
// PORT TARGET: vmm/src/macos/vstate.rs
// Status: minimal — VM create + map RAM + 1 vCPU + run an
// embedded test instruction sequence + decode exit. Enough to
// prove HVF works under our control. Kernel boot, FDT, GIC reg
// init, multi-vCPU all come in subsequent commits.

#![cfg(all(target_os = "macos", target_arch = "aarch64"))]

use applevisor_sys as av;

use crate::hvf;
use crate::hvf::{prot, ExitReason, Vcpu, Vm};

use super::super::arch::aarch64::layout;

/// Allocate `size` bytes of host memory as page-aligned read-write
/// backing for guest RAM. `mmap(MAP_ANON | MAP_PRIVATE)`.
///
/// # Panics
/// Panics (with the OS error attached) if the mmap fails — guest RAM
/// allocation failure is unrecoverable for the caller.
pub fn alloc_guest_ram(size: usize) -> *mut u8 {
    // SAFETY: standard anonymous mmap; we own the resulting region for
    // the VM's lifetime (torn down in MicroVm::drop).
    let p = unsafe {
        libc::mmap(
            std::ptr::null_mut(),
            size,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_ANON | libc::MAP_PRIVATE,
            -1,
            0,
        )
    };
    // mmap reports failure as MAP_FAILED (-1), never NULL; keep the
    // null check as belt-and-braces. Include errno so an allocation
    // failure (e.g. ENOMEM, vm.max_map_count-style limits) is
    // diagnosable from the panic message alone.
    assert!(
        !p.is_null() && p != libc::MAP_FAILED,
        "mmap of {} bytes for guest RAM failed: {}",
        size,
        std::io::Error::last_os_error()
    );
    p as *mut u8
}

/// A live VM with a single vCPU, RAM mapped at
/// `layout::DRAM_MEM_START_KERNEL`. Drop tears everything down.
pub struct MicroVm {
    /// Underlying HVF VM handle.
    pub vm: Vm,
    /// The single (boot) vCPU.
    pub vcpu: Vcpu,
    /// Host virtual address of the guest-RAM backing region.
    pub ram_host: *mut u8,
    /// Length of the RAM mapping in bytes (host and guest side agree).
    pub ram_size: usize,
    /// Guest physical address at which RAM is mapped.
    pub ram_gpa: u64,
    /// True when `ram_host` is a `mmap(MAP_PRIVATE)` region (e.g.
    /// from a CoW restore). Drop uses `munmap` either way; the flag
    /// just signals to debug/log paths that we're CoW-backed.
    pub ram_is_cow: bool,
}

impl MicroVm {
    /// Create the VM, allocate `ram_size` bytes of anonymous RAM, map
    /// it at `layout::DRAM_MEM_START_KERNEL`, create one vCPU.
    pub fn new(ram_size: usize) -> hvf::Result<Self> {
        let ram_host = alloc_guest_ram(ram_size);
        Self::new_with_ram(ram_host, ram_size, false)
    }

    /// Create the VM but use an already-allocated host RAM pointer
    /// (e.g. an `mmap(MAP_PRIVATE)` of a snapshot file). `ram_host`
    /// must be valid for `ram_size` bytes for the VM's lifetime.
    pub fn new_with_ram(ram_host: *mut u8, ram_size: usize, is_cow: bool) -> hvf::Result<Self> {
        // Opt-in per-stage timing prints for cold-start profiling.
        let timings = std::env::var_os("SUPERMACHINE_TIMINGS").is_some();
        let t0 = std::time::Instant::now();
        let vm = Vm::new()?;
        if timings {
            eprintln!(
                "[timing] microvm.vm_create_gic={}us",
                t0.elapsed().as_micros()
            );
        }
        let ram_gpa = layout::DRAM_MEM_START_KERNEL;
        // SAFETY: caller owns ram_host for the VM lifetime.
        unsafe {
            vm.map(ram_host, ram_gpa, ram_size, prot::RWX)?;
        }
        if timings {
            eprintln!("[timing] microvm.map_ram={}us", t0.elapsed().as_micros());
        }
        let vcpu = Vcpu::new()?;
        if timings {
            eprintln!(
                "[timing] microvm.vcpu_create={}us",
                t0.elapsed().as_micros()
            );
        }
        // Boot CPU gets affinity 0 so the guest sees itself as CPU 0.
        vcpu.set_sys_reg(applevisor_sys::hv_sys_reg_t::MPIDR_EL1, 0)?;
        if timings {
            eprintln!("[timing] microvm.ready={}us", t0.elapsed().as_micros());
        }
        Ok(Self {
            vm,
            vcpu,
            ram_host,
            ram_size,
            ram_gpa,
            ram_is_cow: is_cow,
        })
    }

    /// Re-mmap RAM from the snapshot file at `path` (warm RESTORE).
    /// Used by pool-worker mode between dispatches: the previous CoW
    /// pages are unmapped, a fresh `MAP_PRIVATE` is faulted in, and
    /// the new pointer replaces `ram_host`.
    ///
    /// Apple Silicon caveat: macOS's MADV_DONTNEED on MAP_PRIVATE
    /// file-backed mappings does NOT re-fault from the file (it
    /// behaves like Linux's MADV_FREE — pages get zeroed instead).
    /// So we do the portable unmap+remap dance: hv_vm_unmap +
    /// munmap + mmap + hv_vm_map. Costs ~100 µs total on M-series.
    ///
    /// SAFETY: caller must guarantee no vCPU is currently in
    /// hv_vcpu_run (would race with hv_vm_unmap).
    pub unsafe fn remap_cow(&mut self, path: &str) -> hvf::Result<()> {
        // Snapshot meta tuple: (.., ram_offset, memory_bytes).
        let snap_meta =
            crate::vmm::snapshot::load_meta(path).map_err(|_| crate::hvf::Error::Hv(-1))?;
        // SAFETY: caller upholds the no-running-vCPU contract for this method.
        unsafe { self.remap_cow_at(path, snap_meta.1, snap_meta.2) }
    }

    /// Same as `remap_cow`, but the caller has already parsed snapshot
    /// metadata and can provide the RAM region directly.
    ///
    /// SAFETY: caller must guarantee no vCPU is currently in hv_vcpu_run.
    pub unsafe fn remap_cow_at(
        &mut self,
        path: &str,
        ram_offset: u64,
        memory_bytes: usize,
    ) -> hvf::Result<()> {
        let f = std::fs::File::open(path).map_err(|_| crate::hvf::Error::Hv(-1))?;
        // SAFETY: caller upholds the no-running-vCPU contract for this method.
        unsafe { self.remap_cow_from_file(&f, ram_offset, memory_bytes) }
    }

    /// Same as `remap_cow_at`, but reuses an already-open snapshot file.
    ///
    /// SAFETY: caller must guarantee no vCPU is currently in hv_vcpu_run.
    pub unsafe fn remap_cow_from_file(
        &mut self,
        file: &std::fs::File,
        ram_offset: u64,
        memory_bytes: usize,
    ) -> hvf::Result<()> {
        use std::os::fd::AsRawFd;

        // The snapshot's RAM region must match our mapping exactly; we
        // never grow/shrink guest RAM on a warm restore.
        if memory_bytes != self.ram_size {
            return Err(crate::hvf::Error::Hv(-1));
        }
        // SAFETY: HVF API; we drop the old mapping next. Order matters:
        // stage-2 (hv_vm_unmap) before host VA (munmap).
        unsafe {
            let _ = applevisor_sys::hv_vm_unmap(self.ram_gpa, self.ram_size);
            let _ = libc::munmap(self.ram_host as *mut _, self.ram_size);
        }
        let new_ptr = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                self.ram_size,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE,
                file.as_raw_fd(),
                ram_offset as libc::off_t,
            )
        };
        if new_ptr == libc::MAP_FAILED {
            // The old mapping is already gone (munmap above). Null out
            // ram_host so Drop's munmap becomes a harmless failed call
            // instead of unmapping a dangling VA that the allocator may
            // have reused for something else in the meantime.
            self.ram_host = std::ptr::null_mut();
            return Err(crate::hvf::Error::Hv(-1));
        }
        // Note: do NOT MADV_WILLNEED on warm-remap. We tried that
        // briefly and it fought virtio-balloon: balloon's
        // MADV_FREE_REUSABLE on idle pages got synchronously
        // un-done by the kernel's read-ahead on the next remap,
        // re-faulting hundreds of MiB on every cycle and pushing
        // skip=false cycle latency from 5 ms to 250 ms. The
        // cold-restore path (`mmap_ram_cow_at`) still uses
        // WILLNEED — there's no balloon state to fight on first
        // restore. On warm cycles we let pages fault lazily; the
        // page cache from the previous cycle keeps the working
        // set warm anyway.
        self.ram_host = new_ptr as *mut u8;
        self.ram_is_cow = true;
        // SAFETY: ram_host now points to the freshly mapped region.
        unsafe {
            self.vm
                .map(self.ram_host, self.ram_gpa, self.ram_size, prot::RWX)?;
        }
        Ok(())
    }

    /// Same as `remap_cow_from_file`, but asks mmap to replace the RAM
    /// mapping at the current host virtual address. This mirrors the
    /// faster swap path: avoid VA churn and the separate munmap,
    /// while still doing the HVF GPA unmap/map boundary required for a
    /// fresh stage-2 mapping.
    ///
    /// SAFETY: caller must guarantee no vCPU is currently in hv_vcpu_run.
    pub unsafe fn remap_cow_from_file_fixed(
        &mut self,
        file: &std::fs::File,
        ram_offset: u64,
        memory_bytes: usize,
    ) -> hvf::Result<()> {
        use std::os::fd::AsRawFd;

        if memory_bytes != self.ram_size {
            return Err(crate::hvf::Error::Hv(-1));
        }
        let current_host = self.ram_host;
        // SAFETY: HVF API; mmap(MAP_FIXED) replaces the host VA mapping next.
        unsafe {
            let _ = applevisor_sys::hv_vm_unmap(self.ram_gpa, self.ram_size);
        }
        let new_ptr = unsafe {
            libc::mmap(
                current_host as *mut _,
                self.ram_size,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_FIXED,
                file.as_raw_fd(),
                ram_offset as libc::off_t,
            )
        };
        if new_ptr == libc::MAP_FAILED {
            // A failed MAP_FIXED leaves the previous host mapping
            // intact, so ram_host is still valid and Drop stays sound;
            // only the stage-2 mapping is gone at this point.
            return Err(crate::hvf::Error::Hv(-1));
        }
        debug_assert_eq!(new_ptr as *mut u8, current_host);
        // No WILLNEED on warm-remap (balloon conflict — see
        // `remap_cow_from_file` for the rationale).
        self.ram_host = new_ptr as *mut u8;
        self.ram_is_cow = true;
        // SAFETY: ram_host now points to the freshly replaced CoW region.
        unsafe {
            self.vm
                .map(self.ram_host, self.ram_gpa, self.ram_size, prot::RWX)?;
        }
        Ok(())
    }

    /// Write bytes into guest RAM at `gpa`. Caller must ensure the
    /// range fits within the mapped RAM region.
    ///
    /// SAFETY: `gpa..gpa+bytes.len()` must be within
    /// `[ram_gpa, ram_gpa + ram_size)`.
    pub unsafe fn write_ram(&self, gpa: u64, bytes: &[u8]) {
        // In debug builds an out-of-range gpa trips either the
        // subtraction-underflow check or this assert; release builds
        // rely entirely on the caller contract.
        let off = (gpa - self.ram_gpa) as usize;
        debug_assert!(off + bytes.len() <= self.ram_size);
        unsafe {
            std::ptr::copy_nonoverlapping(bytes.as_ptr(), self.ram_host.add(off), bytes.len());
        }
    }

    /// Set the program counter for the boot vCPU.
    pub fn set_pc(&self, pc: u64) -> hvf::Result<()> {
        self.vcpu.set_reg(av::hv_reg_t::PC, pc)
    }

    /// Linux aarch64 boot protocol expects the boot CPU to enter at
    /// EL1h with DAIF masked and MMU off. PSTATE: D=1 A=1 I=1 F=1 +
    /// EL1h (M[3:0]=0b0101). Bits 9..6 = DAIF, bit 4 = M[4]=0
    /// (AArch64), bits 3..0 = M[3:0]=0b0101 = EL1h. Result = 0x3c5.
    pub fn set_boot_cpsr(&self) -> hvf::Result<()> {
        self.vcpu.set_reg(av::hv_reg_t::CPSR, 0x3c5)
    }

    /// Set X0 (used for FDT pointer per Linux aarch64 boot protocol).
    pub fn set_x0(&self, val: u64) -> hvf::Result<()> {
        self.vcpu.set_reg(av::hv_reg_t::X0, val)
    }

    /// Drive the vCPU. Returns (reason, ESR_EL2 syndrome,
    /// faulting GPA for data aborts, faulting VA).
    pub fn run_once(&self) -> hvf::Result<(ExitReason, u64, u64, u64)> {
        let exit = self.vcpu.run()?;
        let reason = ExitReason::from(exit.reason as u32);
        let syndrome = exit.exception.syndrome;
        let gpa = exit.exception.physical_address;
        let va = exit.exception.virtual_address;
        Ok((reason, syndrome, gpa, va))
    }
}

impl Drop for MicroVm {
    fn drop(&mut self) {
        // Guard against a null ram_host (e.g. a failed warm remap that
        // already unmapped the old region): munmap on a stale or null
        // pointer must not run, since the VA range could have been
        // reused by the allocator in the meantime.
        if self.ram_host.is_null() {
            return;
        }
        // SAFETY: we own ram_host and ram_size is the exact length of
        // the mapping it came from.
        unsafe {
            libc::munmap(self.ram_host as _, self.ram_size);
        }
    }
}

/// Compute the GPA at which to load the initrd.
///
/// Placement policy: as high in RAM as possible — just below the FDT
/// region (which occupies the top `layout::FDT_MAX_SIZE` bytes of
/// RAM), rounded down to a 4 KiB boundary with an extra page of
/// slack. Loading high keeps the initrd well clear of the kernel's
/// decompression and BSS growth beyond `kernel_len`.
pub fn initrd_gpa(ram_gpa: u64, ram_size: u64, kernel_len: u64, initrd_len: u64) -> u64 {
    let fdt_base = ram_gpa + ram_size - layout::FDT_MAX_SIZE as u64;
    let kernel_end = ram_gpa + layout::KERNEL_LOAD_OFFSET + kernel_len;
    // Round down past a full page of slack so the initrd's end stays
    // at or below the FDT base.
    let gpa = (fdt_base - initrd_len - 0xfff) & !0xfff;
    // Sanity: the high placement must not reach back into the kernel.
    debug_assert!(gpa > kernel_end);
    gpa
}

/// Test fixture: a tiny aarch64 program that loads 0x42 into X0 and
/// triggers a hypervisor call. Used by the proof-of-life path
/// (no kernel needed). On exit: `ExitReason::Exception` with
/// `ESR_EL2` syndrome carrying EC=0x16 (HVC executed in AArch64).
///
/// Instructions are stored little-endian, as the CPU fetches them.
pub const TEST_PROGRAM: [u8; 8] = [
    0x42, 0x08, 0x80, 0xd2, // mov x0, #0x42  (0xd2800842 LE)
    0x02, 0x00, 0x00, 0xd4, // hvc #0         (0xd4000002 LE)
];

/// Boot a Linux kernel + optional initrd on this MicroVm. Returns
/// after staging memory + registers; caller drives `run_once` in a
/// loop and dispatches MMIO traps.
///
/// Linux aarch64 boot protocol (Documentation/arm64/booting.rst):
///   - kernel loaded at RAM_START + KERNEL_LOAD_OFFSET (0x80000)
///   - X0 = FDT physical address (must be RAM, max 2 MiB FDT)
///   - X1, X2, X3 = 0
///   - PC = kernel entry (= load address)
///   - PSTATE = EL1h, DAIF masked, MMU off
pub fn boot_linux(
    vm: &MicroVm,
    kernel: &[u8],
    initrd: Option<&[u8]>,
    fdt: &[u8],
) -> hvf::Result<()> {
    let kernel_gpa = vm.ram_gpa + layout::KERNEL_LOAD_OFFSET;
    // SAFETY: caller verified sizes fit within RAM.
    unsafe {
        vm.write_ram(kernel_gpa, kernel);
    }
    // FDT at top of RAM minus FDT_MAX_SIZE.
    let fdt_gpa = vm.ram_gpa + vm.ram_size as u64 - layout::FDT_MAX_SIZE as u64;
    // SAFETY: same.
    unsafe {
        vm.write_ram(fdt_gpa, fdt);
    }
    // Initrd placed BELOW the FDT and above the kernel (the kernel
    // decompresses + relocates well past its file size — read
    // image_size from the kernel header to find the actual end).
    // Use the conservative initrd_gpa() helper so main.rs and we
    // agree.
    if let Some(initrd) = initrd {
        let initrd_gpa = initrd_gpa(
            vm.ram_gpa,
            vm.ram_size as u64,
            kernel.len() as u64,
            initrd.len() as u64,
        );
        // SAFETY: caller-verified.
        unsafe {
            vm.write_ram(initrd_gpa, initrd);
        }
    }
    vm.set_boot_cpsr()?;
    vm.set_x0(fdt_gpa)?;
    vm.vcpu.set_reg(av::hv_reg_t::X1, 0)?;
    vm.vcpu.set_reg(av::hv_reg_t::X2, 0)?;
    vm.vcpu.set_reg(av::hv_reg_t::X3, 0)?;
    vm.set_pc(kernel_gpa)?;
    Ok(())
}