supermachine 0.7.72

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
Documentation
//! Intel MultiProcessor (MP) table — tells the guest kernel how many CPUs the
//! machine has (and where the LAPIC/IOAPIC live) so it brings up the secondary
//! cores.
//!
//! We hand the kernel no ACPI (no RSDP), so on x86 the only way it discovers
//! more than one CPU is the legacy MP table (MP spec 1.4). The kernel scans a
//! few fixed windows for the "_MP_" floating-pointer signature; we place it (and
//! the configuration table it points at) in the reserved KiB just above base
//! RAM (0x9fc00..0xa0000), which the e820 map leaves unclaimed.
//!
//! Pure logic over the guest-RAM slice (gpa 0 == `mem[0]`) — no KVM dependency,
//! unit-tested. The KVM backend calls [`write_mptable`] before starting the
//! vCPUs; KVM's in-kernel LAPIC then handles the INIT-SIPI-SIPI AP bring-up the
//! kernel drives from this table.
//!
//! Reference: Intel MultiProcessor Specification v1.4.

/// Guest-physical address of the MP floating-pointer structure: the start of
/// the reserved 0x9fc00..0xa0000 window the kernel scans for the "_MP_"
/// signature.
pub const MP_FPS_ADDR: u64 = 0x9_fc00;
/// Guest-physical address of the MP configuration table (immediately after the
/// 16-byte FPS). This is the value stored in the FPS pointer field.
const MP_CONFIG_ADDR: u64 = MP_FPS_ADDR + 16;

/// Local APIC MMIO base (default); written into the configuration-table header.
const APIC_BASE: u32 = 0xfee0_0000;
/// I/O APIC MMIO base (default); written into the I/O APIC entry.
const IOAPIC_BASE: u32 = 0xfec0_0000;

// MP configuration-table entry type tags (the first byte of every entry).
/// Entry describes one processor / local APIC.
const ENTRY_PROCESSOR: u8 = 0;
/// Entry describes one bus.
const ENTRY_BUS: u8 = 1;
/// Entry describes one I/O APIC.
const ENTRY_IOAPIC: u8 = 2;
/// Entry routes one bus interrupt to an I/O APIC input.
const ENTRY_IOINTERRUPT: u8 = 3;

// Entry sizes (bytes), used both to size the table and to step between entries.
/// Processor entry: type, APIC id, APIC version, flags, signature, features,
/// and 8 reserved bytes.
const LEN_PROCESSOR: usize = 20;
/// Bus entry: type, bus id, 6-byte space-padded bus type string.
const LEN_BUS: usize = 8;
/// I/O APIC entry: type, id, version, flags, 32-bit MMIO address.
const LEN_IOAPIC: usize = 8;
/// I/O interrupt entry: type, interrupt type, 16-bit flags, source bus id,
/// source IRQ, destination I/O APIC id, destination INTIN.
const LEN_IOINTERRUPT: usize = 8;

// Processor cpu_flags (byte 3 of a processor entry).
/// The processor is usable.
const CPU_ENABLED: u8 = 0x1;
/// The processor is the bootstrap processor.
const CPU_BSP: u8 = 0x2;

// Number of legacy ISA IRQ lines we describe routing for (0..16); one I/O
// interrupt entry is emitted per line.
const NUM_ISA_IRQS: u8 = 16;

/// Error returned by [`write_mptable`] when the MP floating pointer plus the
/// configuration table cannot be placed in the reserved window.
#[derive(Debug, PartialEq, Eq)]
pub struct MpTableTooBig {
    /// Bytes required: the 16-byte floating pointer plus the configuration table.
    pub need: usize,
    /// Bytes available in the reserved window.
    pub avail: usize,
}

impl std::error::Error for MpTableTooBig {}

impl std::fmt::Display for MpTableTooBig {
    /// Render a one-line, human-readable description carrying both byte counts.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let MpTableTooBig { need, avail } = self;
        f.write_fmt(format_args!(
            "MP table too big: need {} bytes, {} available in the reserved window",
            need, avail
        ))
    }
}

/// Return the byte that cancels the 8-bit wrapping sum of `bytes`, i.e. the
/// value `c` for which `sum(bytes) + c == 0 (mod 256)`.
fn checksum(bytes: &[u8]) -> u8 {
    let mut total = 0u8;
    for &byte in bytes {
        total = total.wrapping_add(byte);
    }
    // Two's-complement negation: total + (-total) == 0 (mod 256).
    total.wrapping_neg()
}

/// Store `v` as two little-endian bytes at `buf[off..off + 2]`.
///
/// Panics if that range is out of bounds for `buf`.
fn put_u16(buf: &mut [u8], off: usize, v: u16) {
    let encoded = v.to_le_bytes();
    let field = &mut buf[off..off + 2];
    field.copy_from_slice(&encoded);
}
/// Store `v` as four little-endian bytes at `buf[off..off + 4]`.
///
/// Panics if that range is out of bounds for `buf`.
fn put_u32(buf: &mut [u8], off: usize, v: u32) {
    let encoded = v.to_le_bytes();
    let field = &mut buf[off..off + 4];
    field.copy_from_slice(&encoded);
}

/// Write the MP floating pointer + configuration table for `num_cpus` CPUs into
/// guest RAM. CPU `0` is the bootstrap processor (BSP); APIC ids are `0..num_cpus`
/// and the I/O APIC takes id `num_cpus`.
///
/// `mem` is the full guest RAM slice (gpa 0 == `mem[0]`). The 16-byte floating
/// pointer is written at `MP_FPS_ADDR` and the configuration table (header,
/// one processor entry per CPU, one ISA bus entry, one I/O APIC entry and one
/// identity-mapped interrupt entry per ISA IRQ) directly after it.
///
/// # Errors
///
/// Returns [`MpTableTooBig`] without modifying `mem` when the floating pointer
/// plus the table do not fit in the part of the reserved 0x9fc00..0xa0000
/// window that `mem` actually covers. `need` is the total byte count required
/// and `avail` is the number of window bytes backed by `mem` (the full window
/// size when guest RAM spans it, less when guest RAM is shorter).
///
/// # Panics
///
/// Panics if `num_cpus` is 0.
pub fn write_mptable(mem: &mut [u8], num_cpus: u8) -> Result<(), MpTableTooBig> {
    // Size of the MP floating-pointer structure.
    const FPS_LEN: usize = 16;
    // Size of the configuration-table header that precedes the entries.
    const HEADER_LEN: usize = 44;
    // Exclusive end of the reserved window that starts at MP_FPS_ADDR.
    const WINDOW_END: usize = 0xa_0000;

    assert!(num_cpus >= 1, "num_cpus must be >= 1");

    let fps_at = MP_FPS_ADDR as usize;
    let table_at = MP_CONFIG_ADDR as usize;
    // The size accounting below relies on the table directly following the FPS.
    debug_assert_eq!(table_at, fps_at + FPS_LEN);

    let entries_len = num_cpus as usize * LEN_PROCESSOR
        + LEN_BUS
        + LEN_IOAPIC
        + NUM_ISA_IRQS as usize * LEN_IOINTERRUPT;
    let table_len = HEADER_LEN + entries_len;

    // Everything (FPS + table) must fit in the reserved window, and the window
    // must be backed by guest RAM. Report the space that is really usable so
    // the error never claims room that `mem` does not have.
    let window = WINDOW_END - fps_at;
    let total = FPS_LEN + table_len;
    let avail = mem.len().saturating_sub(fps_at).min(window);
    if total > avail {
        return Err(MpTableTooBig { need: total, avail });
    }

    // Build the configuration table (header + entries) in a scratch buffer so
    // it can be checksummed as a whole before being copied into guest RAM.
    let mut table = vec![0u8; table_len];

    // --- Configuration table header (44 bytes) ---
    table[0..4].copy_from_slice(b"PCMP");
    // base_table_length; cannot truncate because table_len < window (1 KiB).
    put_u16(&mut table, 4, table_len as u16);
    table[6] = 4; // spec rev 1.4
    // table[7] = checksum, filled in once every other byte is final.
    table[8..16].copy_from_slice(b"SUPRMCHN"); // oem id (8)
    table[16..28].copy_from_slice(b"x86-64 KVM  "); // product id (12)
    // oem_table_ptr u32 @28 = 0, oem_table_size u16 @32 = 0
    let entry_count = num_cpus as u16 + 1 /*bus*/ + 1 /*ioapic*/ + NUM_ISA_IRQS as u16;
    put_u16(&mut table, 34, entry_count);
    put_u32(&mut table, 36, APIC_BASE); // local APIC address
    // ext_table_length u16 @40 = 0, ext_checksum u8 @42 = 0, reserved @43 = 0

    // --- Entries ---
    let mut off = HEADER_LEN;

    // Processor entries (one per CPU).
    for apic_id in 0..num_cpus {
        table[off] = ENTRY_PROCESSOR;
        table[off + 1] = apic_id; // local APIC id
        table[off + 2] = 0x14; // local APIC version
        table[off + 3] = CPU_ENABLED | if apic_id == 0 { CPU_BSP } else { 0 };
        put_u32(&mut table, off + 4, 0x600); // cpu signature (family 6)
        put_u32(&mut table, off + 8, 0x201); // feature flags (FPU + APIC)
        // 8 reserved bytes @ off+12 stay zero.
        off += LEN_PROCESSOR;
    }

    // Bus entry (one ISA bus, id 0).
    table[off] = ENTRY_BUS;
    table[off + 1] = 0; // bus id
    table[off + 2..off + 8].copy_from_slice(b"ISA   ");
    off += LEN_BUS;

    // I/O APIC entry (id = num_cpus, enabled).
    let ioapic_id = num_cpus;
    table[off] = ENTRY_IOAPIC;
    table[off + 1] = ioapic_id;
    table[off + 2] = 0x11; // I/O APIC version
    table[off + 3] = 0x1; // flags: enabled
    put_u32(&mut table, off + 4, IOAPIC_BASE);
    off += LEN_IOAPIC;

    // I/O interrupt assignment entries: identity-map ISA IRQ i → IOAPIC INTIN i.
    for irq in 0..NUM_ISA_IRQS {
        table[off] = ENTRY_IOINTERRUPT;
        table[off + 1] = 0; // interrupt type: INT (vectored)
        put_u16(&mut table, off + 2, 0); // flags: conforming polarity/trigger
        table[off + 4] = 0; // source bus id (the ISA bus)
        table[off + 5] = irq; // source bus IRQ
        table[off + 6] = ioapic_id; // destination I/O APIC id
        table[off + 7] = irq; // destination I/O APIC INTIN
        off += LEN_IOINTERRUPT;
    }
    debug_assert_eq!(off, table_len);

    // Configuration-table checksum (whole table sums to 0). The checksum byte
    // is still zero here, so it does not perturb its own computation.
    table[7] = checksum(&table);

    // --- Floating pointer structure (16 bytes) ---
    let mut fps = [0u8; FPS_LEN];
    fps[0..4].copy_from_slice(b"_MP_");
    // Pointer to the config table; the address is below 1 MiB so it fits in u32.
    put_u32(&mut fps, 4, MP_CONFIG_ADDR as u32);
    fps[8] = 1; // length in 16-byte units
    fps[9] = 4; // spec rev 1.4
    // fps[11..16] = feature bytes; 0 means "configuration table present".
    fps[10] = checksum(&fps);

    // --- Copy into guest RAM (bounds were validated above) ---
    mem[fps_at..fps_at + FPS_LEN].copy_from_slice(&fps);
    mem[table_at..table_at + table_len].copy_from_slice(&table);
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Guest RAM size used by most tests: 1 MiB, which covers the whole
    /// reserved window.
    const RAM_LEN: usize = 1024 * 1024;

    /// Fresh zero-filled guest RAM.
    fn guest_ram() -> Vec<u8> {
        vec![0u8; RAM_LEN]
    }

    /// 8-bit wrapping sum of a region; 0 means the region checksums correctly.
    fn byte_sum(bytes: &[u8]) -> u8 {
        bytes.iter().fold(0u8, |a, &b| a.wrapping_add(b))
    }

    /// Read a little-endian u16 at guest-physical address `gpa`.
    fn read_u16(mem: &[u8], gpa: u64) -> u16 {
        let s = gpa as usize;
        u16::from_le_bytes([mem[s], mem[s + 1]])
    }

    /// Read a little-endian u32 at guest-physical address `gpa`.
    fn read_u32(mem: &[u8], gpa: u64) -> u32 {
        let s = gpa as usize;
        u32::from_le_bytes([mem[s], mem[s + 1], mem[s + 2], mem[s + 3]])
    }

    #[test]
    fn checksum_makes_region_sum_to_zero() {
        let data = [1u8, 2, 3, 4, 250];
        let mut buf = data.to_vec();
        buf.push(checksum(&data));
        assert_eq!(byte_sum(&buf), 0);
    }

    #[test]
    fn writes_valid_floating_pointer() {
        let mut mem = guest_ram();
        write_mptable(&mut mem, 4).unwrap();
        let fps = MP_FPS_ADDR as usize;
        assert_eq!(&mem[fps..fps + 4], b"_MP_");
        assert_eq!(read_u32(&mem, MP_FPS_ADDR + 4), MP_CONFIG_ADDR as u32);
        assert_eq!(mem[fps + 8], 1, "length in 16-byte units");
        assert_eq!(mem[fps + 9], 4, "spec rev");
        // The 16-byte FPS must checksum to zero.
        assert_eq!(byte_sum(&mem[fps..fps + 16]), 0, "FPS checksum");
    }

    #[test]
    fn writes_valid_config_table_for_n_cpus() {
        let n = 4u8;
        let mut mem = guest_ram();
        write_mptable(&mut mem, n).unwrap();
        let cfg = MP_CONFIG_ADDR as usize;
        assert_eq!(&mem[cfg..cfg + 4], b"PCMP");
        let len = read_u16(&mem, MP_CONFIG_ADDR + 4) as usize;
        assert_eq!(read_u32(&mem, MP_CONFIG_ADDR + 36), APIC_BASE, "lapic addr");
        // entry_count = n procs + bus + ioapic + 16 int entries.
        assert_eq!(
            read_u16(&mem, MP_CONFIG_ADDR + 34),
            n as u16 + 1 + 1 + NUM_ISA_IRQS as u16
        );
        // The whole config table checksums to zero.
        assert_eq!(byte_sum(&mem[cfg..cfg + len]), 0, "config table checksum");
    }

    #[test]
    fn processor_entries_flag_bsp_and_enable_all() {
        let n = 3u8;
        let mut mem = guest_ram();
        write_mptable(&mut mem, n).unwrap();
        let mut off = MP_CONFIG_ADDR as usize + 44; // after header
        for apic_id in 0..n {
            assert_eq!(mem[off], ENTRY_PROCESSOR);
            assert_eq!(mem[off + 1], apic_id, "apic id");
            let flags = mem[off + 3];
            assert_eq!(flags & CPU_ENABLED, CPU_ENABLED, "cpu enabled");
            assert_eq!(flags & CPU_BSP != 0, apic_id == 0, "only cpu 0 is the BSP");
            off += LEN_PROCESSOR;
        }
    }

    #[test]
    fn ioapic_entry_follows_the_processors() {
        let n = 2u8;
        let mut mem = guest_ram();
        write_mptable(&mut mem, n).unwrap();
        let ioapic = MP_CONFIG_ADDR as usize + 44 + n as usize * LEN_PROCESSOR + LEN_BUS;
        assert_eq!(mem[ioapic], ENTRY_IOAPIC);
        assert_eq!(mem[ioapic + 1], n, "ioapic id = num_cpus");
        assert_eq!(read_u32(&mem, ioapic as u64 + 4), IOAPIC_BASE);
    }

    #[test]
    fn rejects_when_table_would_overflow_the_window() {
        // A huge CPU count blows past the ~1 KiB reserved window.
        let mut mem = guest_ram();
        let err = write_mptable(&mut mem, 200).unwrap_err();
        assert!(err.need > err.avail);
    }

    #[test]
    fn accepts_a_table_that_exactly_fills_the_window() {
        // Fixed cost: 16 (FPS) + 44 (header) + 8 (bus) + 8 (ioapic) + 16 * 8
        // (irqs) = 204 bytes. 41 processor entries add 820, for exactly 1024.
        let mut mem = guest_ram();
        write_mptable(&mut mem, 41).unwrap();
        let cfg = MP_CONFIG_ADDR as usize;
        let len = read_u16(&mem, MP_CONFIG_ADDR + 4) as usize;
        assert_eq!(cfg + len, 0xa_0000, "table ends at the top of the window");
        assert_eq!(byte_sum(&mem[cfg..cfg + len]), 0, "config table checksum");

        // One more CPU no longer fits.
        let err = write_mptable(&mut guest_ram(), 42).unwrap_err();
        assert!(err.need > err.avail);
    }

    #[test]
    fn rejects_when_guest_ram_does_not_cover_the_table() {
        // RAM ends exactly where the floating pointer would start.
        let mut mem = vec![0u8; MP_FPS_ADDR as usize];
        assert!(write_mptable(&mut mem, 1).is_err());

        // RAM covers the floating pointer and table header but not the entries.
        let mut mem = vec![0u8; MP_CONFIG_ADDR as usize + 44];
        assert!(write_mptable(&mut mem, 1).is_err());
        assert!(mem.iter().all(|&b| b == 0), "nothing written on failure");
    }

    #[test]
    #[should_panic(expected = "num_cpus must be >= 1")]
    fn zero_cpus_is_a_caller_bug() {
        let mut mem = guest_ram();
        let _ = write_mptable(&mut mem, 0);
    }
}