supermachine 0.4.20

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
// Source size: 444 LOC
// Status: minimal — root, cpus, memory, chosen, intc (GIC v3),
//         timer, pl011 serial, fixed clock, psci, virtio-mmio.
//         Enough to boot a kernel.
//
// Re-implementation in our own Rust using the same vm-fdt crate.
// Layout values come from `arch::aarch64::layout` so the GIC base
// addresses match what `hvf::Vm::new` registers via
// `hv_gic_config_set_*_base`.

use std::io;
use vm_fdt::FdtWriter;

use super::layout;

/// phandle assigned to the GIC (`intc`) node; also used as the root
/// node's `interrupt-parent`.
const GIC_PHANDLE: u32 = 1;
/// phandle assigned to the fixed `apb-pclk` clock node; referenced by the
/// PL011 node's `clocks` property.
const CLOCK_PHANDLE: u32 = 2;

// FDT IRQ-type constants (from Linux DT bindings).

/// First interrupt-specifier cell: private peripheral interrupt (PPI).
const GIC_FDT_IRQ_TYPE_PPI: u32 = 1;
/// First interrupt-specifier cell: shared peripheral interrupt (SPI).
const GIC_FDT_IRQ_TYPE_SPI: u32 = 0;
/// Third interrupt-specifier cell (trigger flags): rising-edge triggered.
const IRQ_TYPE_EDGE_RISING: u32 = 1;
/// Third interrupt-specifier cell (trigger flags): active-high level triggered.
const IRQ_TYPE_LEVEL_HI: u32 = 4;

/// One virtio-mmio device entry: (base GPA, IRQ INTID).
///
/// `generate` emits one `virtio_mmio@<base>` device-tree node per entry.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VirtioMmioEntry {
    /// Guest-physical base address of the device's MMIO window.
    pub base: u64,
    /// Interrupt INTID of the device. `generate` subtracts 32 from this
    /// value to form the SPI number written to the device tree.
    pub irq: u32,
}

/// Generate the FDT blob for a microVM with `n_vcpus` vCPUs,
/// `mem_size` bytes of RAM at `layout::DRAM_MEM_START_KERNEL`,
/// `cmdline` boot args, optional initrd at `initrd_gpa`
/// (`initrd_size` bytes), and a list of virtio-mmio devices.
pub fn generate(
    n_vcpus: usize,
    mem_size: u64,
    cmdline: &str,
    initrd: Option<(u64, u64)>,
    virtio_devs: &[VirtioMmioEntry],
) -> io::Result<Vec<u8>> {
    let mut fdt =
        FdtWriter::new().map_err(|e| io::Error::other(format!("FdtWriter::new: {e:?}")))?;

    let root = fdt.begin_node("").map_err(map_fdt)?;
    fdt.property_string("compatible", "linux,dummy-virt")
        .map_err(map_fdt)?;
    fdt.property_u32("#address-cells", 0x2).map_err(map_fdt)?;
    fdt.property_u32("#size-cells", 0x2).map_err(map_fdt)?;
    fdt.property_u32("interrupt-parent", GIC_PHANDLE)
        .map_err(map_fdt)?;

    // /cpus
    let cpus = fdt.begin_node("cpus").map_err(map_fdt)?;
    fdt.property_u32("#address-cells", 0x2).map_err(map_fdt)?;
    fdt.property_u32("#size-cells", 0x0).map_err(map_fdt)?;
    for i in 0..n_vcpus {
        let cpu = fdt.begin_node(&format!("cpu@{i:x}")).map_err(map_fdt)?;
        fdt.property_string("device_type", "cpu").map_err(map_fdt)?;
        fdt.property_string("compatible", "arm,arm-v8")
            .map_err(map_fdt)?;
        if n_vcpus > 1 {
            fdt.property_string("enable-method", "psci")
                .map_err(map_fdt)?;
        }
        // MPIDR_EL1 affinity bits — bottom 24 bits.
        fdt.property_u64("reg", (i as u64) & 0x7fffff)
            .map_err(map_fdt)?;
        fdt.end_node(cpu).map_err(map_fdt)?;
    }
    fdt.end_node(cpus).map_err(map_fdt)?;

    // /memory
    let mem = fdt.begin_node("memory").map_err(map_fdt)?;
    fdt.property_string("device_type", "memory")
        .map_err(map_fdt)?;
    let mem_reg = be_u64_pair(layout::DRAM_MEM_START_KERNEL, mem_size);
    fdt.property("reg", &mem_reg).map_err(map_fdt)?;
    fdt.end_node(mem).map_err(map_fdt)?;

    // /chosen — cmdline + optional initrd.
    let chosen = fdt.begin_node("chosen").map_err(map_fdt)?;
    fdt.property_string("bootargs", cmdline).map_err(map_fdt)?;
    if let Some((start, size)) = initrd {
        fdt.property_u64("linux,initrd-start", start)
            .map_err(map_fdt)?;
        fdt.property_u64("linux,initrd-end", start + size)
            .map_err(map_fdt)?;
    }
    fdt.end_node(chosen).map_err(map_fdt)?;

    // /intc — GIC v3 distributor + redistributor.
    let intc = fdt
        .begin_node(&format!("intc@{:x}", layout::GICV3_DIST_BASE))
        .map_err(map_fdt)?;
    fdt.property_string("compatible", "arm,gic-v3")
        .map_err(map_fdt)?;
    fdt.property_null("interrupt-controller").map_err(map_fdt)?;
    fdt.property_u32("#interrupt-cells", 3).map_err(map_fdt)?;
    fdt.property_u32("#address-cells", 0x2).map_err(map_fdt)?;
    fdt.property_u32("#size-cells", 0x2).map_err(map_fdt)?;
    fdt.property_u32("phandle", GIC_PHANDLE).map_err(map_fdt)?;
    // Two reg pairs: distributor, redistributor.
    let mut intc_reg = Vec::new();
    intc_reg.extend_from_slice(&be_u64_pair(
        layout::GICV3_DIST_BASE,
        layout::GICV3_DIST_SIZE,
    ));
    intc_reg.extend_from_slice(&be_u64_pair(
        layout::GICV3_REDIST_BASE,
        layout::GICV3_REDIST_STRIDE * (n_vcpus as u64),
    ));
    fdt.property("reg", &intc_reg).map_err(map_fdt)?;
    fdt.end_node(intc).map_err(map_fdt)?;

    // /timer — generic ARM timer, 4 PPI INTIDs (sec, hyp, virt, phys).
    // Each entry: type=PPI, intid - 16, flags=level-hi.
    let timer = fdt.begin_node("timer").map_err(map_fdt)?;
    fdt.property_string("compatible", "arm,armv8-timer")
        .map_err(map_fdt)?;
    fdt.property_null("always-on").map_err(map_fdt)?;
    let timer_irqs = [
        // sec, hyp, virt, phys — PPIs 13, 14, 11, 10 (intid - 16).
        (13u32, IRQ_TYPE_LEVEL_HI),
        (14, IRQ_TYPE_LEVEL_HI),
        (11, IRQ_TYPE_LEVEL_HI),
        (10, IRQ_TYPE_LEVEL_HI),
    ];
    let mut tirqs: Vec<u8> = Vec::new();
    for (intid, flags) in timer_irqs {
        tirqs.extend_from_slice(&be_u32(GIC_FDT_IRQ_TYPE_PPI));
        tirqs.extend_from_slice(&be_u32(intid));
        tirqs.extend_from_slice(&be_u32(flags));
    }
    fdt.property("interrupts", &tirqs).map_err(map_fdt)?;
    fdt.end_node(timer).map_err(map_fdt)?;

    // /pl011 — serial console at SERIAL_MMIO_BASE on SERIAL_IRQ.
    let serial = fdt
        .begin_node(&format!("pl011@{:x}", layout::SERIAL_MMIO_BASE))
        .map_err(map_fdt)?;
    fdt.property_string_list(
        "compatible",
        vec!["arm,pl011".to_string(), "arm,primecell".to_string()],
    )
    .map_err(map_fdt)?;
    let serial_reg = be_u64_pair(layout::SERIAL_MMIO_BASE, layout::SERIAL_MMIO_SIZE);
    fdt.property("reg", &serial_reg).map_err(map_fdt)?;
    let mut serial_irq = Vec::new();
    serial_irq.extend_from_slice(&be_u32(GIC_FDT_IRQ_TYPE_SPI));
    serial_irq.extend_from_slice(&be_u32(layout::SERIAL_IRQ - 32));
    serial_irq.extend_from_slice(&be_u32(IRQ_TYPE_LEVEL_HI));
    fdt.property("interrupts", &serial_irq).map_err(map_fdt)?;
    fdt.property_u32("clocks", CLOCK_PHANDLE).map_err(map_fdt)?;
    fdt.property_string("clock-names", "apb_pclk")
        .map_err(map_fdt)?;
    fdt.end_node(serial).map_err(map_fdt)?;

    // /clocks/apb_pclk — required by some kernel drivers.
    let clk = fdt.begin_node("apb-pclk").map_err(map_fdt)?;
    fdt.property_string("compatible", "fixed-clock")
        .map_err(map_fdt)?;
    fdt.property_u32("#clock-cells", 0x0).map_err(map_fdt)?;
    fdt.property_u32("clock-frequency", 24_000_000)
        .map_err(map_fdt)?;
    fdt.property_string("clock-output-names", "clk24mhz")
        .map_err(map_fdt)?;
    fdt.property_u32("phandle", CLOCK_PHANDLE)
        .map_err(map_fdt)?;
    fdt.end_node(clk).map_err(map_fdt)?;

    // /psci — Power State Coordination Interface; HVC method so SMP
    // bringup uses CPU_ON via HVC.
    let psci = fdt.begin_node("psci").map_err(map_fdt)?;
    fdt.property_string("compatible", "arm,psci-0.2")
        .map_err(map_fdt)?;
    fdt.property_string("method", "hvc").map_err(map_fdt)?;
    fdt.end_node(psci).map_err(map_fdt)?;

    // /virtio_mmio@<base> for each device. 0x200 stride.
    // IRQ_TYPE_EDGE_RISING required for HVF in-kernel GIC: it
    // auto-deasserts the SPI on guest EOI, which only works with
    // edge-triggered semantics. Level-triggered would need the
    // host to deassert manually (not supported by hv_gic_set_spi).
    for ent in virtio_devs {
        let n = fdt
            .begin_node(&format!("virtio_mmio@{:x}", ent.base))
            .map_err(map_fdt)?;
        fdt.property_string("compatible", "virtio,mmio")
            .map_err(map_fdt)?;
        let reg = be_u64_pair(ent.base, 0x200);
        fdt.property("reg", &reg).map_err(map_fdt)?;
        let mut irq = Vec::new();
        irq.extend_from_slice(&be_u32(GIC_FDT_IRQ_TYPE_SPI));
        irq.extend_from_slice(&be_u32(ent.irq - 32));
        irq.extend_from_slice(&be_u32(IRQ_TYPE_EDGE_RISING));
        fdt.property("interrupts", &irq).map_err(map_fdt)?;
        fdt.end_node(n).map_err(map_fdt)?;
    }

    fdt.end_node(root).map_err(map_fdt)?;
    fdt.finish().map_err(map_fdt)
}

/// Convert a `vm_fdt` error into an `io::Error` whose message is the
/// error's `Debug` rendering prefixed with `FDT: `.
fn map_fdt(e: vm_fdt::Error) -> io::Error {
    let message = format!("FDT: {e:?}");
    io::Error::other(message)
}

/// Encode `v` as one big-endian 32-bit device-tree cell.
fn be_u32(v: u32) -> [u8; 4] {
    u32::to_be_bytes(v)
}

/// Encode `a` followed by `b` as two consecutive big-endian 64-bit
/// values (16 bytes), the form used here for `reg` (address, size) pairs.
fn be_u64_pair(a: u64, b: u64) -> [u8; 16] {
    let mut cells = [0u8; 16];
    for (slot, word) in cells.chunks_exact_mut(8).zip([a, b]) {
        slot.copy_from_slice(&word.to_be_bytes());
    }
    cells
}