supermachine 0.7.69

//! The hypervisor backend contract.
//!
//! This is the portable seam between the VMM and whatever hypervisor it
//! runs on: Apple Hypervisor.framework (HVF) on macOS today, KVM on Linux
//! next. It is deliberately drawn HIGH — guest memory, vCPU register
//! state, and the run/exit primitive live here; everything BELOW it (how a
//! backend injects interrupts, services device doorbells, or runs its
//! dataplane) is the backend's own business, so each platform can use its
//! fastest native mechanism (userspace on HVF; irqfd / ioeventfd / vhost
//! in-kernel on KVM) without the portable layer forcing a slower pattern.
//!
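//! Dispatch is static: backends are bound as a concrete type at compile
//! time (see the per-platform `ActiveVm`), so trait calls monomorphize to
//! direct calls — zero overhead on the vCPU hot path. There is no `dyn`
//! on that hot path by design; the few `dyn` objects in this contract (the
//! snapshot stream reader/writer, the IRQ-raise closure, the DAX mapper)
//! live outside the run/exit primitive.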
//!
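//! Scope: the register model is aarch64-architectural (both HVF and
//! aarch64 KVM expose the same ARM registers; each backend maps them to
//! its native id). The x86_64 KVM backend extends this contract with the
//! `VcpuExit::{Io, Mmio, Halt}` exits and treats the aarch64 register
//! accessors as vestigial (see the Linux/x86_64 `ActiveVm` alias).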

/// The aarch64 core registers the VMM reads or writes: the general-purpose
/// file plus a handful of special registers. Every backend translates a
/// variant into its own native register id.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CoreReg {
    /// General-purpose register X0..=X30, selected by index. The backend
    /// rejects any index above 30.
    X(u8),
    /// Frame pointer (architecturally X29).
    Fp,
    /// Link register (architecturally X30).
    Lr,
    /// Program counter.
    Pc,
    /// Processor state (PSTATE / CPSR).
    Cpsr,
    /// Floating-point status register.
    Fpsr,
    /// Floating-point control register.
    Fpcr,
}

/// The aarch64 system registers the VMM reads or writes. The list is
/// intentionally minimal — only what is actually used — and grows as
/// backends move more of the snapshot/boot surface behind this contract.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SysReg {
    /// Multiprocessor Affinity Register: the per-vCPU identity.
    MpidrEl1,
}

/// Protection flags accepted by [`HypervisorVm::map_ram`] for guest memory.
/// The bit layout is portable; every backend converts it to its own native
/// mapping flags.
pub mod prot {
    /// Read permission (bit 0).
    pub const READ: u64 = 0b001;
    /// Write permission (bit 1).
    pub const WRITE: u64 = 0b010;
    /// Execute permission (bit 2).
    pub const EXEC: u64 = 0b100;
    /// Read + write.
    pub const RW: u64 = READ | WRITE;
    /// Read + execute.
    pub const RX: u64 = READ | EXEC;
    /// Read + write + execute.
    pub const RWX: u64 = RW | EXEC;
}

/// The reason a vCPU returned from its run. Deliberately small and `Copy`
/// so that the run/exit hot path never allocates.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum VcpuExit {
    /// The run was cancelled, e.g. because another thread forced an exit.
    Canceled,
    /// A guest exception trapped to the host. `syndrome` carries the ESR
    /// value; for data aborts the two addresses locate the fault.
    Exception {
        syndrome: u64,
        phys_addr: u64,
        virt_addr: u64,
    },
    /// The virtual timer fired.
    VTimerActivated,
    /// x86 port-I/O exit (KVM_EXIT_IO). For an OUT (`write=true`) `data` is
    /// the value the guest wrote; for an IN the backend supplies the data
    /// before the next `step()`.
    Io {
        port: u16,
        write: bool,
        size: u8,
        data: u32,
    },
    /// MMIO exit (KVM_EXIT_MMIO). For a write `data` is the value the guest
    /// wrote; for a read the backend supplies the data before the next step.
    Mmio {
        phys_addr: u64,
        write: bool,
        len: u8,
        data: u64,
    },
    /// The guest halted (KVM_EXIT_HLT) or asked to shut down.
    Halt,
    /// An exit reason this VMM does not model; the payload is the raw
    /// backend code.
    Unknown(u32),
}

/// Opaque, thread-safe token naming one vCPU. It exists so that a thread
/// *other than* the vCPU's own can kick it out of [`HypervisorVcpu::step`]:
/// vCPUs are bound to their thread, yet the coordinator still has to
/// interrupt them (for example to rendezvous every vCPU for a snapshot)
/// without owning the `Vcpu` objects, so it keeps these tokens instead. The
/// representation is the backend's choice (HVF: `hv_vcpu_t`; KVM: the vCPU
/// thread's signal target).
pub trait VcpuHandle: 'static + Copy + Send + Sync {
    /// Kick every vCPU in `handles` out of `step()`. May be called from any
    /// thread. The call takes a batch deliberately: HVF handles the whole
    /// slice in one syscall (`hv_vcpus_exit`), while KVM signals each thread.
    fn force_exit(handles: &[Self]);
}

/// One virtual CPU as seen by the portable VMM code. The VMM touches a vCPU
/// exclusively through this trait, which lets the run loop and the register
/// plumbing be written a single time against the contract while every
/// backend plugs in its native implementation.
pub trait HypervisorVcpu {
    /// Error type native to the backend (the wrapper around HVF's
    /// `hv_return_t`, or KVM's errno).
    type Error;

    /// Token type another thread uses to kick this vCPU out of
    /// [`Self::step`].
    type Handle: VcpuHandle;

    /// Captured vCPU state in the backend's own shape: the register file
    /// together with the per-CPU interrupt-controller state. The portable
    /// orchestration never looks inside; it only captures, stores, and
    /// restores the value.
    type SnapshotState;

    /// Hand out a force-exit token for this vCPU that is usable from a
    /// different thread.
    fn exit_token(&self) -> Self::Handle;

    /// Capture everything needed to restore this vCPU later. Has to be
    /// called on the vCPU's own thread (a hard requirement on HVF; KVM
    /// likewise reads through the vCPU fd).
    fn capture_snapshot(&self) -> Result<Self::SnapshotState, Self::Error>;

    /// Load a state that [`Self::capture_snapshot`] produced earlier on the
    /// same backend. Has to happen before the vCPU goes back into `step()`.
    fn restore_snapshot(&self, state: &Self::SnapshotState) -> Result<(), Self::Error>;

    /// Write a captured [`SnapshotState`](Self::SnapshotState) into a snapshot
    /// stream. The orchestration emits one such record per vCPU into the
    /// snapshot container and never needs to know the backend's register-file
    /// shape, which is the groundwork for a single cross-backend snapshot
    /// pipeline. The byte layout belongs to the backend; the inverse operation
    /// is [`read_snapshot_state`](Self::read_snapshot_state).
    fn write_snapshot_state(
        state: &Self::SnapshotState,
        w: &mut dyn std::io::Write,
    ) -> std::io::Result<()>;

    /// Decode a [`SnapshotState`](Self::SnapshotState) that
    /// [`write_snapshot_state`](Self::write_snapshot_state) put into a snapshot
    /// stream. The reader is advanced past exactly this vCPU's bytes.
    fn read_snapshot_state(r: &mut dyn std::io::Read) -> std::io::Result<Self::SnapshotState>;

    /// Derive the opaque guest-clock reference that gets persisted in the
    /// snapshot. Inputs are the boot vCPU's captured
    /// [`SnapshotState`](Self::SnapshotState) and a host monotonic reading
    /// ([`HypervisorVm::host_monotonic_ticks`]) sampled at the same instant.
    /// Together with [`restore_clock`](Self::restore_clock) this keeps the
    /// guest's monotonic timebase advancing across snapshot/restore rather than
    /// jumping. The result is one `u64` in the backend's clock unit: on HVF the
    /// guest virtual counter at capture (`host_now - CNTVOFF`), on x86/KVM the
    /// guest TSC/kvmclock reference. The orchestration stores the value and
    /// does not interpret it.
    fn capture_clock_ref(state: &Self::SnapshotState, host_now: u64) -> u64;

    /// Re-anchor this vCPU's guest timer while restoring. `captured_ref` is the
    /// value [`capture_clock_ref`](Self::capture_clock_ref) persisted and
    /// `host_now` is a fresh host reading; the guest timebase is set so that it
    /// resumes from where it paused, advanced by the host time that elapsed
    /// since capture. The offset that was applied is returned for diagnostics.
    /// HVF implements this by writing CNTVOFF; x86/KVM re-anchors the
    /// TSC/kvmclock offset.
    fn restore_clock(&self, captured_ref: u64, host_now: u64) -> Result<u64, Self::Error>;

    /// Read the core register `reg`. As noted on [`CoreReg::X`], the backend
    /// rejects `X` indices above 30.
    fn get_core(&self, reg: CoreReg) -> Result<u64, Self::Error>;
    /// Write `value` to the core register `reg`. As noted on [`CoreReg::X`],
    /// the backend rejects `X` indices above 30.
    fn set_core(&self, reg: CoreReg, value: u64) -> Result<(), Self::Error>;
    /// Read the system register `reg`.
    fn get_sys(&self, reg: SysReg) -> Result<u64, Self::Error>;
    /// Write `value` to the system register `reg`.
    fn set_sys(&self, reg: SysReg, value: u64) -> Result<(), Self::Error>;

    /// Run the vCPU until it exits and report why. This is the hot path: the
    /// exit value is `Copy`, nothing is allocated, and under static dispatch
    /// the call is fully inlinable.
    fn step(&self) -> Result<VcpuExit, Self::Error>;
}

/// A virtual machine: the owner of guest physical memory and the factory for
/// vCPUs. Each platform has exactly one backend type implementing this, and
/// that type is chosen concretely at compile time — never used behind `dyn`.
pub trait HypervisorVm: Sized {
    /// Error type native to the backend; shared with [`Self::Vcpu`].
    type Error;
    /// The backend's vCPU type. Its error type is tied to [`Self::Error`].
    type Vcpu: HypervisorVcpu<Error = Self::Error>;

    /// Create the VM, including its in-kernel interrupt controller and the
    /// like.
    fn create() -> Result<Self, Self::Error>;

    /// Expose `len` bytes of host memory starting at `host_ptr` to the guest
    /// as physical memory at `gpa`, using the given [`prot`] flags.
    ///
    /// # Safety
    /// `host_ptr` must remain valid, and readable/writable as `prot`
    /// demands, for as long as the VM lives (or until
    /// [`HypervisorVm::unmap_ram`] removes the mapping).
    unsafe fn map_ram(
        &self,
        host_ptr: *mut u8,
        gpa: u64,
        len: usize,
        prot: u64,
    ) -> Result<(), Self::Error>;

    /// Remove a range that was previously handed to
    /// [`HypervisorVm::map_ram`].
    ///
    /// # Safety
    /// No vCPU may be accessing `[gpa, gpa+len)` while the call runs.
    unsafe fn unmap_ram(&self, gpa: u64, len: usize) -> Result<(), Self::Error>;

    /// Create a vCPU that is bound to the calling thread.
    fn create_vcpu(&self) -> Result<Self::Vcpu, Self::Error>;

    /// Drive one interrupt line of the guest's in-kernel interrupt controller:
    /// the SPI / GSI numbered `intid` is raised when `level` is `true` and
    /// lowered when it is `false`. The device-emulation layer calls this
    /// backend-agnostic IRQ-raise whenever a virtio device has work for the
    /// guest; it hides the concrete controller (HVF GIC `gic_set_spi` on
    /// aarch64; KVM in-kernel IOAPIC/PIC `KVM_IRQ_LINE` on x86). Edge-triggered
    /// virtio IRQs raise and then lower; level semantics are up to the backend.
    fn set_irq(&self, intid: u32, level: bool) -> Result<(), Self::Error>;

    /// The same line drive as [`set_irq`](Self::set_irq), packaged as a
    /// cloneable, `'static` handle that can be moved into device threads and
    /// MMIO notify closures (neither of which can borrow the `Vm`). The
    /// device-emulation layer keeps one clone per device and invokes
    /// `raiser(intid, level)`.
    ///
    /// On HVF the closure is a unit closure over the process-global GIC; on KVM
    /// it holds a shared handle to the per-VM in-kernel irqchip fd. It is the
    /// backend-agnostic replacement for the `gic_set_spi` / `set_irq_line`
    /// closures the orchestration scatters through device setup.
    fn irq_line(&self) -> std::sync::Arc<dyn Fn(u32, bool) + Send + Sync>;

    /// Snapshot the VM-global in-kernel interrupt-controller and timer state
    /// into an opaque blob whose format the backend defines. HVF: the GICv3
    /// distributor state (`hv_gic_state_get_data`). KVM: PIT + PIC/IOAPIC +
    /// kvmclock. The orchestration keeps the blob in the snapshot and later
    /// passes it to [`restore_intc`](Self::restore_intc). Per-vCPU interrupt
    /// state (GIC ICC/ICH regs / x86 LAPIC) is not part of this blob; it
    /// travels in the vCPU snapshot ([`HypervisorVcpu::capture_snapshot`]).
    fn capture_intc(&self) -> Result<Vec<u8>, Self::Error>;

    /// Load interrupt-controller and timer state from a blob that
    /// [`capture_intc`](Self::capture_intc) produced on the same backend.
    fn restore_intc(&self, blob: &[u8]) -> Result<(), Self::Error>;

    /// Prepare the guest to boot Linux. The kernel image (and initrd, if any)
    /// is loaded into `mem`, the backend's boot data structure is written
    /// (aarch64: the caller's `fdt`; x86: a boot_params/GDT built from
    /// `cmdline`), and the boot vCPU's registers are set to enter the kernel.
    /// `mem` is the guest-RAM slice whose byte 0 sits at `cfg.ram_gpa`. One
    /// call covers both arch boot protocols so the orchestration stays
    /// backend-agnostic. Only the boot (BSP) vCPU is set up; SMP bring-up
    /// (PSCI / MP table) remains the caller's concern.
    fn boot_linux(
        &self,
        vcpu: &Self::Vcpu,
        mem: &mut [u8],
        cfg: &LinuxBootConfig,
    ) -> Result<(), Self::Error>;

    /// The backend's DAX mapper, which maps host file pages into a guest DAX
    /// window for zero-copy virtio-fs reads (HVF: `hv_vm_map`; KVM: a
    /// `KVM_MEM_*` memslot). The receiver is `&Arc<Self>` because the KVM
    /// mapper has to keep a shared handle to the VM (for memslot bookkeeping)
    /// beyond the borrow, whereas HVF's mapper is a unit type over the
    /// process-global mapping. The device-attach layer passes the returned
    /// mapper on to a `fuse::DaxSession`.
    fn dax_mapper(self: &std::sync::Arc<Self>) -> std::sync::Arc<dyn crate::fuse::HvfMapper>;

    /// Sample the host's monotonic timebase in the backend's native clock unit
    /// (HVF: mach absolute ticks; x86/KVM: nanoseconds of `CLOCK_MONOTONIC`).
    /// The snapshot pipeline takes one reading at capture and another at
    /// restore, and feeds them to [`HypervisorVcpu::capture_clock_ref`] /
    /// [`HypervisorVcpu::restore_clock`] so that the guest timebase moves
    /// forward by the wall-clock gap instead of freezing across the snapshot.
    /// This is an associated function (no `&self`) because it describes the
    /// backend, not any particular VM.
    fn host_monotonic_ticks() -> u64;
}

/// Backend-agnostic Linux boot inputs for [`HypervisorVm::boot_linux`]. Each
/// backend uses the fields its arch boot protocol needs: aarch64/HVF consumes
/// `fdt` (and ignores `cmdline`, which lives in the FDT's `chosen/bootargs`);
/// x86/KVM consumes `cmdline` to build boot_params (and ignores `fdt`).
///
/// The struct only borrows its inputs, so it is `Copy`. Its `Debug` output
/// reports the *lengths* of the kernel / initrd / FDT blobs rather than their
/// bytes, which keeps a logged config to a single short line.
#[derive(Clone, Copy)]
pub struct LinuxBootConfig<'a> {
    /// Kernel image bytes (aarch64 Image / x86 bzImage).
    pub kernel: &'a [u8],
    /// Optional initramfs the kernel unpacks as its initial rootfs.
    pub initrd: Option<&'a [u8]>,
    /// Kernel command line (x86 boot_params). aarch64 carries it in the FDT.
    pub cmdline: &'a str,
    /// Guest-physical base of `mem` (byte 0 of the slice maps to this GPA).
    pub ram_gpa: u64,
    /// Guest RAM size in bytes (== `mem.len()`).
    pub ram_size: usize,
    /// Prebuilt device-tree blob (aarch64 boot protocol). `None`/ignored on x86.
    pub fdt: Option<&'a [u8]>,
}

impl std::fmt::Debug for LinuxBootConfig<'_> {
    /// Compact diagnostic form: blob fields are summarized by length so that
    /// a multi-megabyte kernel image is never dumped byte-by-byte.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("LinuxBootConfig")
            .field("kernel_len", &self.kernel.len())
            .field("initrd_len", &self.initrd.map(|bytes| bytes.len()))
            .field("cmdline", &self.cmdline)
            // Addresses read better in hex.
            .field("ram_gpa", &format_args!("{:#x}", self.ram_gpa))
            .field("ram_size", &self.ram_size)
            .field("fdt_len", &self.fdt.map(|bytes| bytes.len()))
            .finish()
    }
}

/// Allows the backend-agnostic orchestration to build a backend error out of
/// a free-form message while never naming the concrete error type. It is the
/// portable counterpart of the HVF `Error::Hv(-1)` "internal failure"
/// placeholder that the VMM scattered through `map_err` closures. Both backend
/// error types implement it.
pub trait BackendError {
    /// Build a backend error representing an internal failure described by
    /// `msg`. Best-effort only: HVF's `Error` has no string payload, so there
    /// the message maps to the generic `Hv(-1)`.
    fn other(msg: &str) -> Self;
}

/// Concrete VM type of the hypervisor backend chosen for this build. Binding
/// it as a type alias keeps every call statically dispatched (no `dyn`, fully
/// monomorphized). macOS on Apple Silicon selects HVF; Linux on x86_64 selects
/// KVM through the parallel set of cfg-gated aliases in this file.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub type ActiveVm = crate::hvf::Vm;

/// vCPU type belonging to the active backend. See [`ActiveVm`].
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub type ActiveVcpu = <ActiveVm as HypervisorVm>::Vcpu;

/// Cross-thread force-exit token of the active backend's vCPU. See
/// [`VcpuHandle`].
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub type ActiveVcpuHandle = <<ActiveVm as HypervisorVm>::Vcpu as HypervisorVcpu>::Handle;

/// Error type of the active backend (HVF `Error` / KVM `KvmError`). The
/// orchestration returns this rather than naming a concrete backend error.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub type ActiveError = <ActiveVm as HypervisorVm>::Error;

/// `Result` specialized to [`ActiveError`]: the backend-agnostic spelling of
/// `hvf::Result`.
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
pub type ActiveResult<T> = std::result::Result<T, ActiveError>;

/// Concrete VM type for Linux on x86_64: KVM. On this backend the aarch64
/// register methods (`CoreReg`/`SysReg`) are vestigial — the x86 orchestration
/// sets RIP/CR*/EFER directly on the concrete `KvmVcpu` and drives the run
/// loop via `step()` → `VcpuExit::{Io, Mmio, Halt}`.
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
pub type ActiveVm = crate::kvm::KvmVm;

/// vCPU type belonging to the active backend. See [`ActiveVm`].
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
pub type ActiveVcpu = <ActiveVm as HypervisorVm>::Vcpu;

/// Cross-thread force-exit token of the active backend's vCPU. See
/// [`VcpuHandle`].
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
pub type ActiveVcpuHandle = <<ActiveVm as HypervisorVm>::Vcpu as HypervisorVcpu>::Handle;

/// Error type of the active backend; the orchestration returns this rather
/// than naming a concrete backend error.
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
pub type ActiveError = <ActiveVm as HypervisorVm>::Error;

/// `Result` specialized to [`ActiveError`].
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
pub type ActiveResult<T> = std::result::Result<T, ActiveError>;