supermachine 0.7.70

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
// Source size: 444 LOC
// Status: minimal — root, cpus, memory, chosen, intc (GIC v3),
//         timer, pl011 serial, apb-pclk fixed clock, psci and
//         virtio-mmio devices. Enough to boot a kernel.
//
// Re-implementation in our own Rust using the same vm-fdt crate.
// Layout values come from `arch::aarch64::layout` so the GIC base
// addresses match what `hvf::Vm::new` registers via
// `hv_gic_config_set_*_base`.

use std::io;
use vm_fdt::FdtWriter;

use super::layout;

/// Phandle given to the GIC `intc` node; the root node's
/// `interrupt-parent` property refers to it.
const GIC_PHANDLE: u32 = 1;
/// Phandle given to the `apb-pclk` fixed-clock node; the pl011 node's
/// `clocks` property refers to it.
const CLOCK_PHANDLE: u32 = 2;

// FDT IRQ-type constants (from Linux DT bindings).
/// First `interrupts` cell marking a PPI; used for the timer interrupts.
const GIC_FDT_IRQ_TYPE_PPI: u32 = 1;
/// First `interrupts` cell marking an SPI; used for the serial port and
/// the virtio-mmio devices.
const GIC_FDT_IRQ_TYPE_SPI: u32 = 0;
/// Third `interrupts` cell (trigger flags): rising edge. Used for the
/// virtio-mmio devices.
const IRQ_TYPE_EDGE_RISING: u32 = 1;
/// Third `interrupts` cell (trigger flags): level, active high. Used for
/// the timer and the serial port.
const IRQ_TYPE_LEVEL_HI: u32 = 4;

/// One virtio-mmio device entry: (base GPA, IRQ INTID).
///
/// `generate` emits one `virtio_mmio@<base>` node per entry, with a
/// 0x200-byte `reg` window at `base` and an SPI interrupt numbered
/// `irq - 32`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VirtioMmioEntry {
    /// Guest-physical base address of the device's MMIO window.
    pub base: u64,
    /// Interrupt INTID of the device; the device tree carries `irq - 32`.
    pub irq: u32,
}

/// Generate the FDT blob for a microVM with `n_vcpus` vCPUs,
/// `mem_size` bytes of RAM at `layout::DRAM_MEM_START_KERNEL`,
/// `cmdline` boot args, optional initrd at `initrd_gpa`
/// (`initrd_size` bytes), and a list of virtio-mmio devices.
pub fn generate(
    n_vcpus: usize,
    mem_size: u64,
    cmdline: &str,
    initrd: Option<(u64, u64)>,
    virtio_devs: &[VirtioMmioEntry],
) -> io::Result<Vec<u8>> {
    let mut fdt =
        FdtWriter::new().map_err(|e| io::Error::other(format!("FdtWriter::new: {e:?}")))?;

    let root = fdt.begin_node("").map_err(map_fdt)?;
    fdt.property_string("compatible", "linux,dummy-virt")
        .map_err(map_fdt)?;
    fdt.property_u32("#address-cells", 0x2).map_err(map_fdt)?;
    fdt.property_u32("#size-cells", 0x2).map_err(map_fdt)?;
    fdt.property_u32("interrupt-parent", GIC_PHANDLE)
        .map_err(map_fdt)?;

    // /cpus
    let cpus = fdt.begin_node("cpus").map_err(map_fdt)?;
    fdt.property_u32("#address-cells", 0x2).map_err(map_fdt)?;
    fdt.property_u32("#size-cells", 0x0).map_err(map_fdt)?;
    for i in 0..n_vcpus {
        let cpu = fdt.begin_node(&format!("cpu@{i:x}")).map_err(map_fdt)?;
        fdt.property_string("device_type", "cpu").map_err(map_fdt)?;
        fdt.property_string("compatible", "arm,arm-v8")
            .map_err(map_fdt)?;
        if n_vcpus > 1 {
            fdt.property_string("enable-method", "psci")
                .map_err(map_fdt)?;
        }
        // MPIDR_EL1 affinity bits — bottom 24 bits.
        fdt.property_u64("reg", (i as u64) & 0x7fffff)
            .map_err(map_fdt)?;
        fdt.end_node(cpu).map_err(map_fdt)?;
    }
    fdt.end_node(cpus).map_err(map_fdt)?;

    // /memory
    let mem = fdt.begin_node("memory").map_err(map_fdt)?;
    fdt.property_string("device_type", "memory")
        .map_err(map_fdt)?;
    let mem_reg = be_u64_pair(layout::DRAM_MEM_START_KERNEL, mem_size);
    fdt.property("reg", &mem_reg).map_err(map_fdt)?;
    fdt.end_node(mem).map_err(map_fdt)?;

    // /chosen — cmdline + optional initrd.
    let chosen = fdt.begin_node("chosen").map_err(map_fdt)?;
    fdt.property_string("bootargs", cmdline).map_err(map_fdt)?;
    if let Some((start, size)) = initrd {
        fdt.property_u64("linux,initrd-start", start)
            .map_err(map_fdt)?;
        fdt.property_u64("linux,initrd-end", start + size)
            .map_err(map_fdt)?;
    }
    fdt.end_node(chosen).map_err(map_fdt)?;

    // /intc — GIC v3 distributor + redistributor.
    let intc = fdt
        .begin_node(&format!("intc@{:x}", layout::GICV3_DIST_BASE))
        .map_err(map_fdt)?;
    fdt.property_string("compatible", "arm,gic-v3")
        .map_err(map_fdt)?;
    fdt.property_null("interrupt-controller").map_err(map_fdt)?;
    fdt.property_u32("#interrupt-cells", 3).map_err(map_fdt)?;
    fdt.property_u32("#address-cells", 0x2).map_err(map_fdt)?;
    fdt.property_u32("#size-cells", 0x2).map_err(map_fdt)?;
    fdt.property_u32("phandle", GIC_PHANDLE).map_err(map_fdt)?;
    // Two reg pairs: distributor, redistributor.
    let mut intc_reg = Vec::new();
    intc_reg.extend_from_slice(&be_u64_pair(
        layout::GICV3_DIST_BASE,
        layout::GICV3_DIST_SIZE,
    ));
    intc_reg.extend_from_slice(&be_u64_pair(
        layout::GICV3_REDIST_BASE,
        layout::GICV3_REDIST_STRIDE * (n_vcpus as u64),
    ));
    fdt.property("reg", &intc_reg).map_err(map_fdt)?;
    fdt.end_node(intc).map_err(map_fdt)?;

    // /timer — generic ARM timer, 4 PPI INTIDs (sec, hyp, virt, phys).
    // Each entry: type=PPI, intid - 16, flags=level-hi.
    let timer = fdt.begin_node("timer").map_err(map_fdt)?;
    fdt.property_string("compatible", "arm,armv8-timer")
        .map_err(map_fdt)?;
    fdt.property_null("always-on").map_err(map_fdt)?;
    let timer_irqs = [
        // sec, hyp, virt, phys — PPIs 13, 14, 11, 10 (intid - 16).
        (13u32, IRQ_TYPE_LEVEL_HI),
        (14, IRQ_TYPE_LEVEL_HI),
        (11, IRQ_TYPE_LEVEL_HI),
        (10, IRQ_TYPE_LEVEL_HI),
    ];
    let mut tirqs: Vec<u8> = Vec::new();
    for (intid, flags) in timer_irqs {
        tirqs.extend_from_slice(&be_u32(GIC_FDT_IRQ_TYPE_PPI));
        tirqs.extend_from_slice(&be_u32(intid));
        tirqs.extend_from_slice(&be_u32(flags));
    }
    fdt.property("interrupts", &tirqs).map_err(map_fdt)?;
    fdt.end_node(timer).map_err(map_fdt)?;

    // /pl011 — serial console at SERIAL_MMIO_BASE on SERIAL_IRQ.
    let serial = fdt
        .begin_node(&format!("pl011@{:x}", layout::SERIAL_MMIO_BASE))
        .map_err(map_fdt)?;
    fdt.property_string_list(
        "compatible",
        vec!["arm,pl011".to_string(), "arm,primecell".to_string()],
    )
    .map_err(map_fdt)?;
    let serial_reg = be_u64_pair(layout::SERIAL_MMIO_BASE, layout::SERIAL_MMIO_SIZE);
    fdt.property("reg", &serial_reg).map_err(map_fdt)?;
    let mut serial_irq = Vec::new();
    serial_irq.extend_from_slice(&be_u32(GIC_FDT_IRQ_TYPE_SPI));
    serial_irq.extend_from_slice(&be_u32(layout::SERIAL_IRQ - 32));
    serial_irq.extend_from_slice(&be_u32(IRQ_TYPE_LEVEL_HI));
    fdt.property("interrupts", &serial_irq).map_err(map_fdt)?;
    fdt.property_u32("clocks", CLOCK_PHANDLE).map_err(map_fdt)?;
    fdt.property_string("clock-names", "apb_pclk")
        .map_err(map_fdt)?;
    fdt.end_node(serial).map_err(map_fdt)?;

    // /clocks/apb_pclk — required by some kernel drivers.
    let clk = fdt.begin_node("apb-pclk").map_err(map_fdt)?;
    fdt.property_string("compatible", "fixed-clock")
        .map_err(map_fdt)?;
    fdt.property_u32("#clock-cells", 0x0).map_err(map_fdt)?;
    fdt.property_u32("clock-frequency", 24_000_000)
        .map_err(map_fdt)?;
    fdt.property_string("clock-output-names", "clk24mhz")
        .map_err(map_fdt)?;
    fdt.property_u32("phandle", CLOCK_PHANDLE)
        .map_err(map_fdt)?;
    fdt.end_node(clk).map_err(map_fdt)?;

    // /psci — Power State Coordination Interface; HVC method so SMP
    // bringup uses CPU_ON via HVC.
    let psci = fdt.begin_node("psci").map_err(map_fdt)?;
    fdt.property_string("compatible", "arm,psci-0.2")
        .map_err(map_fdt)?;
    fdt.property_string("method", "hvc").map_err(map_fdt)?;
    fdt.end_node(psci).map_err(map_fdt)?;

    // /virtio_mmio@<base> for each device. 0x200 stride.
    // IRQ_TYPE_EDGE_RISING required for HVF in-kernel GIC: it
    // auto-deasserts the SPI on guest EOI, which only works with
    // edge-triggered semantics. Level-triggered would need the
    // host to deassert manually (not supported by hv_gic_set_spi).
    for ent in virtio_devs {
        let n = fdt
            .begin_node(&format!("virtio_mmio@{:x}", ent.base))
            .map_err(map_fdt)?;
        fdt.property_string("compatible", "virtio,mmio")
            .map_err(map_fdt)?;
        let reg = be_u64_pair(ent.base, 0x200);
        fdt.property("reg", &reg).map_err(map_fdt)?;
        let mut irq = Vec::new();
        irq.extend_from_slice(&be_u32(GIC_FDT_IRQ_TYPE_SPI));
        irq.extend_from_slice(&be_u32(ent.irq - 32));
        irq.extend_from_slice(&be_u32(IRQ_TYPE_EDGE_RISING));
        fdt.property("interrupts", &irq).map_err(map_fdt)?;
        fdt.end_node(n).map_err(map_fdt)?;
    }

    fdt.end_node(root).map_err(map_fdt)?;
    fdt.finish().map_err(map_fdt)
}

/// Wrap a `vm_fdt` error in an `io::Error`, carrying the error's `Debug`
/// rendering in the message after an `FDT: ` prefix.
fn map_fdt(e: vm_fdt::Error) -> io::Error {
    let message = format!("FDT: {e:?}");
    io::Error::other(message)
}

/// Encode `v` as four big-endian bytes (most significant byte first).
fn be_u32(v: u32) -> [u8; 4] {
    u32::to_be_bytes(v)
}

/// Encode two 64-bit values as 16 big-endian bytes: `a` in the first
/// eight bytes, `b` in the last eight.
fn be_u64_pair(a: u64, b: u64) -> [u8; 16] {
    // Placing `a` in the upper half of a u128 and `b` in the lower half
    // yields exactly `a`'s bytes followed by `b`'s when serialized MSB-first.
    let packed = (u128::from(a) << 64) | u128::from(b);
    packed.to_be_bytes()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Magic number stored big-endian in the first four bytes of a DTB.
    const FDT_MAGIC: u32 = 0xd00d_feed;

    /// Big-endian `u32` at byte offset 0 of the blob (the magic field).
    fn magic(b: &[u8]) -> u32 {
        u32::from_be_bytes([b[0], b[1], b[2], b[3]])
    }
    /// Big-endian `u32` at byte offset 4 of the blob (the totalsize field).
    fn totalsize(b: &[u8]) -> u32 {
        u32::from_be_bytes([b[4], b[5], b[6], b[7]])
    }
    /// Whether the bytes of `s` occur anywhere in `b`.
    fn has(b: &[u8], s: &str) -> bool {
        b.windows(s.len()).any(|w| w == s.as_bytes())
    }
    /// Number of (possibly overlapping) occurrences of `s` in `b`.
    fn count(b: &[u8], s: &str) -> usize {
        b.windows(s.len()).filter(|w| *w == s.as_bytes()).count()
    }

    /// Build a DTB with 256 MiB of RAM, a fixed cmdline and no initrd.
    ///
    /// Deliberately not named `gen`: that identifier is a reserved keyword
    /// in the Rust 2024 edition.
    fn build_dtb(n_vcpus: usize, devs: &[VirtioMmioEntry]) -> Vec<u8> {
        generate(
            n_vcpus,
            256 << 20,
            "console=ttyAMA0 root=/dev/vda",
            None,
            devs,
        )
        .unwrap()
    }

    #[test]
    fn produces_a_structurally_valid_dtb() {
        let blob = build_dtb(1, &[]);
        assert!(blob.len() > 8);
        assert_eq!(magic(&blob), FDT_MAGIC, "DTB magic");
        // The header totalsize must match the buffer the writer returned.
        assert_eq!(totalsize(&blob) as usize, blob.len(), "totalsize == len");
    }

    #[test]
    fn contains_the_required_nodes_and_compatibles() {
        let blob = build_dtb(1, &[]);
        for needle in [
            "linux,dummy-virt", // root compatible
            "arm,gic-v3",       // intc
            "arm,armv8-timer",  // timer
            "arm,pl011",        // serial
            "fixed-clock",      // apb-pclk
            "arm,psci-0.2",     // psci
        ] {
            assert!(has(&blob, needle), "DTB should contain {needle:?}");
        }
    }

    #[test]
    fn cmdline_is_embedded_as_bootargs() {
        let blob = generate(1, 256 << 20, "earlycon panic=-1 sm.flag=42", None, &[]).unwrap();
        assert!(
            has(&blob, "earlycon panic=-1 sm.flag=42"),
            "bootargs embedded"
        );
    }

    #[test]
    fn empty_cmdline_is_accepted() {
        let blob = generate(1, 256 << 20, "", None, &[]).unwrap();
        assert_eq!(magic(&blob), FDT_MAGIC);
    }

    #[test]
    fn cpu_node_count_matches_vcpus() {
        // One `cpu@<n>` node name per vCPU (node names live inline in the
        // struct block, so counting them is robust).
        assert_eq!(count(&build_dtb(1, &[]), "cpu@"), 1);
        assert_eq!(count(&build_dtb(4, &[]), "cpu@"), 4);
        assert_eq!(count(&build_dtb(16, &[]), "cpu@"), 16);
    }

    #[test]
    fn enable_method_psci_only_for_smp() {
        // Single vCPU: no enable-method (UP boot doesn't need PSCI CPU_ON).
        assert!(!has(&build_dtb(1, &[]), "enable-method"));
        // Multi vCPU: enable-method present.
        assert!(has(&build_dtb(2, &[]), "enable-method"));
    }

    #[test]
    fn high_vcpu_count_does_not_panic_or_overflow() {
        // redist stride * n_vcpus and the MPIDR affinity masking must
        // stay sane at large counts.
        let blob = build_dtb(64, &[]);
        assert_eq!(magic(&blob), FDT_MAGIC);
        assert_eq!(count(&blob, "cpu@"), 64);
    }

    #[test]
    fn zero_vcpus_is_handled_gracefully() {
        // Degenerate but must not panic: empty cpus list, redist stride*0.
        let blob = build_dtb(0, &[]);
        assert_eq!(magic(&blob), FDT_MAGIC);
        assert_eq!(count(&blob, "cpu@"), 0);
    }

    #[test]
    fn initrd_properties_present_only_when_supplied() {
        let without = generate(1, 256 << 20, "x", None, &[]).unwrap();
        assert!(!has(&without, "linux,initrd-start"));

        let with = generate(1, 256 << 20, "x", Some((0x4000_0000, 0x10_0000)), &[]).unwrap();
        assert!(has(&with, "linux,initrd-start"));
        assert!(has(&with, "linux,initrd-end"));
    }

    #[test]
    fn virtio_device_count_matches_entries() {
        let devs = [
            VirtioMmioEntry {
                base: 0x0a00_0000,
                irq: 48,
            },
            VirtioMmioEntry {
                base: 0x0a00_0200,
                irq: 49,
            },
            VirtioMmioEntry {
                base: 0x0a00_0400,
                irq: 50,
            },
        ];
        assert_eq!(count(&build_dtb(1, &devs), "virtio_mmio@"), 3);
        assert_eq!(count(&build_dtb(1, &[]), "virtio_mmio@"), 0);
    }

    #[test]
    fn be_u32_is_big_endian() {
        assert_eq!(be_u32(0x1122_3344), [0x11, 0x22, 0x33, 0x44]);
        assert_eq!(be_u32(1), [0, 0, 0, 1], "LSB last");
    }

    #[test]
    fn be_u64_pair_is_big_endian_address_then_size() {
        let p = be_u64_pair(0x1122_3344_5566_7788, 0x00aa_bb00);
        assert_eq!(&p[..8], &0x1122_3344_5566_7788u64.to_be_bytes());
        assert_eq!(&p[8..], &0x00aa_bb00u64.to_be_bytes());
        assert_eq!(p[0], 0x11, "MSB first");
    }
}