supermachine 0.5.0

Run any OCI/Docker image as a hardware-isolated microVM on macOS via HVF (Linux KVM and Windows WHP backends in progress). Single library API, zero flags for the common case, sub-100 ms cold restore from snapshot.
// Status: compact port. Descriptor walking against raw guest mem
// pointer. No vm-memory dep — we treat guest RAM as a flat
// (host_ram_base, ram_gpa, ram_size) tuple and translate addresses.

#![allow(dead_code)]

use std::sync::atomic::Ordering;
use std::sync::Arc;

/// virtq descriptor flags.
pub const VRING_DESC_F_NEXT: u16 = 1;
pub const VRING_DESC_F_WRITE: u16 = 2;
pub const VRING_DESC_F_INDIRECT: u16 = 4;

/// Cached `$SUPERMACHINE_VQ_TRACE` flag. Same rationale as
/// `vsock_trace_enabled` in muxer.rs — we don't want a libc
/// `getenv` global-locked lookup on every descriptor pop / push.
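/// Enable with e.g. `SUPERMACHINE_VQ_TRACE=1`; only the variable's
/// presence is checked, not its value.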
#[inline]
fn vq_trace_enabled() -> bool {
    static CACHED: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);
    let v = CACHED.load(std::sync::atomic::Ordering::Relaxed);
    if v != 0 {
        return v == 2;
    }
    let on = std::env::var_os("SUPERMACHINE_VQ_TRACE").is_some();
    CACHED.store(
        if on { 2 } else { 1 },
        std::sync::atomic::Ordering::Relaxed,
    );
    on
}

/// Single virtq descriptor as seen in guest memory.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct Desc {
    pub addr: u64,
    pub len: u32,
    pub flags: u16,
    pub next: u16,
}

/// Per-queue state: descriptor table / avail ring / used ring
/// addresses, queue size, indices.
#[derive(Clone)]
pub struct Queue {
    pub size: u16,
    pub ready: bool,
    pub desc_table: u64,
    pub avail_ring: u64,
    pub used_ring: u64,
    pub last_avail_idx: u16,
    pub next_used_idx: u16,
    pub mem: GuestMem,
}

impl Queue {
    pub fn new(mem: GuestMem) -> Self {
        Self {
            size: 256,
            ready: false,
            desc_table: 0,
            avail_ring: 0,
            used_ring: 0,
            last_avail_idx: 0,
            next_used_idx: 0,
            mem,
        }
    }

    /// Read the avail-ring `idx` field (= total descriptors the
    /// driver has made available). LE u16 at offset 2 of avail ring.
    pub fn avail_idx(&self) -> u16 {
        self.mem.read_u16(self.avail_ring + 2)
    }

    /// Pop the next available descriptor chain head, if any.
    /// Returns (head_index, descriptor_chain_vec).
    pub fn pop_chain(&mut self) -> Option<(u16, Vec<Desc>)> {
        let avail = self.avail_idx();
        if avail == self.last_avail_idx {
            return None;
        }
        // The driver writes the ring entry and descriptors before bumping
        // avail.idx, so pair its write barrier with an acquire fence before
        // reading the ring slot and descriptor table.
        std::sync::atomic::fence(Ordering::Acquire);
        // Load the head index from avail.ring[last_avail_idx % size]
        let off = self.avail_ring + 4 + ((self.last_avail_idx % self.size) as u64) * 2;
        let head = self.mem.read_u16(off);
        if vq_trace_enabled() {
            eprintln!("[vq desc=0x{:x}] pop_chain: avail.idx={avail} last_avail_idx={} slot={} head={head}",
                self.desc_table, self.last_avail_idx, self.last_avail_idx % self.size);
        }
        self.last_avail_idx = self.last_avail_idx.wrapping_add(1);

        let mut chain = Vec::new();
        let mut idx = head;
        for _ in 0..self.size {
            let d_addr = self.desc_table + (idx as u64) * 16;
            let desc = Desc {
                addr: self.mem.read_u64(d_addr),
                len: self.mem.read_u32(d_addr + 8),
                flags: self.mem.read_u16(d_addr + 12),
                next: self.mem.read_u16(d_addr + 14),
            };
            chain.push(desc);
            if desc.flags & VRING_DESC_F_NEXT == 0 {
                break;
            }
            idx = desc.next;
        }
        Some((head, chain))
    }

    /// Push (id, used_len) onto the used ring + bump used.idx.
    pub fn add_used(&mut self, head: u16, used_len: u32) {
        // used.ring[next_used_idx % size] = { id: u32, len: u32 }
        let entry_off = self.used_ring + 4 + ((self.next_used_idx % self.size) as u64) * 8;
        self.mem.write_u32(entry_off, head as u32);
        self.mem.write_u32(entry_off + 4, used_len);
        if vq_trace_enabled() {
            eprintln!("[vq desc=0x{:x}] add_used: slot={} head={head} len={used_len} next_used_idx_after={}",
                self.desc_table, self.next_used_idx % self.size, self.next_used_idx + 1);
        }
        self.next_used_idx = self.next_used_idx.wrapping_add(1);
        // Release fence then publish the new idx.
        std::sync::atomic::fence(Ordering::Release);
        self.mem.write_u16(self.used_ring + 2, self.next_used_idx);
    }
}
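
// Illustrative sketch (not wired to any device in this file): the shape of
// a device-side service loop over one queue, showing how `pop_chain` and
// `add_used` pair up. `service_queue` and the `handle` callback are
// hypothetical names; a real device (net, blk, vsock) parses the chain
// itself and raises the queue's interrupt afterwards unless the driver
// suppressed notifications.
fn service_queue(q: &mut Queue, mut handle: impl FnMut(&GuestMem, &[Desc]) -> u32) {
    while let Some((head, chain)) = q.pop_chain() {
        // `handle` reads device-readable buffers (VRING_DESC_F_WRITE clear),
        // fills device-writable ones (flag set), and returns how many bytes
        // it wrote into the chain.
        let written = handle(&q.mem, chain.as_slice());
        q.add_used(head, written);
    }
}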

/// Cheaply cloneable handle to guest physical memory, backed by the
/// mmap region the VMM created. Reads and writes are unsynchronized
/// raw-pointer accesses: the guest shares the same memory, and we
/// cooperate with its virtio rmb/wmb cadence via the acquire fence in
/// `pop_chain` and the release fence in `add_used`.
#[derive(Clone)]
pub struct GuestMem {
    inner: Arc<GuestMemInner>,
}

struct GuestMemInner {
    host: *mut u8,
    base_gpa: u64,
    len: usize,
}

// SAFETY: mmap pages are stable for the VM lifetime; raw pointer
// access is intentional (the device side races the guest via virtio
// memory ordering, not Rust ownership).
unsafe impl Send for GuestMemInner {}
unsafe impl Sync for GuestMemInner {}

impl GuestMem {
    pub fn new(host: *mut u8, base_gpa: u64, len: usize) -> Self {
        Self {
            inner: Arc::new(GuestMemInner {
                host,
                base_gpa,
                len,
            }),
        }
    }

    fn translate(&self, gpa: u64, n: usize) -> *mut u8 {
        debug_assert!(gpa >= self.inner.base_gpa, "GPA below RAM base");
        let off = (gpa - self.inner.base_gpa) as usize;
        debug_assert!(off.checked_add(n).map_or(false, |end| end <= self.inner.len), "GPA OOB");
        // SAFETY: the offset stays inside the mapped region (asserted in
        // debug builds; otherwise we trust driver-supplied GPAs), and the
        // mapping lives for the VM's lifetime.
        unsafe { self.inner.host.add(off) }
    }

    pub fn read_u16(&self, gpa: u64) -> u16 {
        // SAFETY: `translate` keeps the pointer inside the guest RAM
        // mapping; unaligned guest addresses are handled by read_unaligned.
        unsafe { std::ptr::read_unaligned(self.translate(gpa, 2) as *const u16) }
    }
    pub fn read_u32(&self, gpa: u64) -> u32 {
        unsafe { std::ptr::read_unaligned(self.translate(gpa, 4) as *const u32) }
    }
    pub fn read_u64(&self, gpa: u64) -> u64 {
        unsafe { std::ptr::read_unaligned(self.translate(gpa, 8) as *const u64) }
    }
    pub fn write_u16(&self, gpa: u64, v: u16) {
        unsafe { std::ptr::write_unaligned(self.translate(gpa, 2) as *mut u16, v) }
    }
    pub fn write_u32(&self, gpa: u64, v: u32) {
        unsafe { std::ptr::write_unaligned(self.translate(gpa, 4) as *mut u32, v) }
    }
    pub fn write_u64(&self, gpa: u64, v: u64) {
        unsafe { std::ptr::write_unaligned(self.translate(gpa, 8) as *mut u64, v) }
    }

    /// Read a byte slice from guest memory.
    pub fn read_slice(&self, gpa: u64, dst: &mut [u8]) {
        let p = self.translate(gpa, dst.len()) as *const u8;
        unsafe { std::ptr::copy_nonoverlapping(p, dst.as_mut_ptr(), dst.len()) }
    }
    /// Write a byte slice into guest memory.
    pub fn write_slice(&self, gpa: u64, src: &[u8]) {
        let p = self.translate(gpa, src.len());
        unsafe { std::ptr::copy_nonoverlapping(src.as_ptr(), p, src.len()) }
    }

    /// Direct host pointer for a `(gpa, len)` range. Use when a
    /// device wants to recv/read straight into guest memory.
    /// SAFETY: caller must respect the range's bounds + lifetime.
    pub fn host_ptr(&self, gpa: u64, len: usize) -> *mut u8 {
        self.translate(gpa, len)
    }
}
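
// A minimal self-check of the split-virtqueue round trip, assuming a plain
// heap buffer can stand in for the VMM's mmap'd guest RAM (base GPA 0 for
// simplicity; real mappings start at the platform's RAM base). Ring and
// buffer offsets below are picked for the test only.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn pop_chain_and_add_used_round_trip() {
        let mut ram = vec![0u8; 0x2000];
        let ram_len = ram.len();
        let mem = GuestMem::new(ram.as_mut_ptr(), 0, ram_len);

        let mut q = Queue::new(mem.clone());
        q.size = 4;
        q.ready = true;
        q.desc_table = 0x000; // 4 descriptors * 16 bytes
        q.avail_ring = 0x100; // flags, idx, ring[4]
        q.used_ring = 0x200; // flags, idx, ring[4] of { id, len }

        // Chain: desc 0 = device-readable request at 0x400, NEXT -> desc 1;
        // desc 1 = device-writable response buffer at 0x500.
        mem.write_u64(0x00, 0x400);
        mem.write_u32(0x08, 8);
        mem.write_u16(0x0c, VRING_DESC_F_NEXT);
        mem.write_u16(0x0e, 1);
        mem.write_u64(0x10, 0x500);
        mem.write_u32(0x18, 64);
        mem.write_u16(0x1c, VRING_DESC_F_WRITE);
        mem.write_u16(0x1e, 0);

        // Driver side: avail.ring[0] = head 0, then avail.idx = 1.
        mem.write_u16(0x104, 0);
        mem.write_u16(0x102, 1);

        let (head, chain) = q.pop_chain().expect("one chain available");
        assert_eq!(head, 0);
        assert_eq!(chain.len(), 2);
        assert_eq!(chain[1].flags & VRING_DESC_F_WRITE, VRING_DESC_F_WRITE);

        // Device fills the writable buffer, then retires the chain.
        mem.write_slice(chain[1].addr, b"pong");
        q.add_used(head, 4);

        let mut resp = [0u8; 4];
        mem.read_slice(chain[1].addr, &mut resp);
        assert_eq!(&resp, b"pong");
        assert_eq!(mem.read_u16(q.used_ring + 2), 1); // used.idx
        assert_eq!(mem.read_u32(q.used_ring + 4), 0); // used.ring[0].id
        assert_eq!(mem.read_u32(q.used_ring + 8), 4); // used.ring[0].len
    }
}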