supermachine 0.7.6

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
// Status: compact port. Descriptor walking against raw guest mem
// pointer. No vm-memory dep — we treat guest RAM as a flat
// (host_ram_base, ram_gpa, ram_size) tuple and translate addresses.
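// Translation sketch (see `GuestMem::translate` below):
//   host_ptr = host_ram_base + (gpa - ram_gpa)
// valid only when gpa >= ram_gpa and (gpa - ram_gpa) + len <= ram_size.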

#![allow(dead_code)]

use std::sync::atomic::Ordering;
use std::sync::Arc;

/// virtq descriptor flags.
pub const VRING_DESC_F_NEXT: u16 = 1;
pub const VRING_DESC_F_WRITE: u16 = 2;
pub const VRING_DESC_F_INDIRECT: u16 = 4;

/// Cached `$SUPERMACHINE_VQ_TRACE` flag. Same rationale as
/// `vsock_trace_enabled` in muxer.rs — we don't want a libc
/// `getenv` global-locked lookup on every descriptor pop / push.
#[inline]
fn vq_trace_enabled() -> bool {
    // 0 = not yet checked, 1 = checked & off, 2 = checked & on.
    static CACHED: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);
    let v = CACHED.load(Ordering::Relaxed);
    if v != 0 {
        return v == 2;
    }
    let on = std::env::var_os("SUPERMACHINE_VQ_TRACE").is_some();
    CACHED.store(if on { 2 } else { 1 }, Ordering::Relaxed);
    on
}

/// Single virtq descriptor as seen in guest memory.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct Desc {
    pub addr: u64,
    pub len: u32,
    pub flags: u16,
    pub next: u16,
}

/// Per-queue state: descriptor table / avail ring / used ring
/// addresses, queue size, indices.
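///
/// Ring layouts assumed by the offset math in `pop_chain` / `add_used`
/// (virtio split ring, little-endian):
///   - desc table: `size` entries of 16 bytes (`Desc`)
///   - avail ring: u16 flags, u16 idx, then `size` u16 ring slots
///   - used ring:  u16 flags, u16 idx, then `size` entries of { u32 id, u32 len }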
#[derive(Clone)]
pub struct Queue {
    pub size: u16,
    pub ready: bool,
    pub desc_table: u64,
    pub avail_ring: u64,
    pub used_ring: u64,
    pub last_avail_idx: u16,
    pub next_used_idx: u16,
    pub mem: GuestMem,
}

impl Queue {
    pub fn new(mem: GuestMem) -> Self {
        Self {
            size: 256,
            ready: false,
            desc_table: 0,
            avail_ring: 0,
            used_ring: 0,
            last_avail_idx: 0,
            next_used_idx: 0,
            mem,
        }
    }

    /// Read the avail-ring `idx` field (= total descriptors the
    /// driver has made available). LE u16 at offset 2 of avail ring.
    pub fn avail_idx(&self) -> u16 {
        self.mem.read_u16(self.avail_ring + 2)
    }

    /// Pop the next available descriptor chain head, if any.
    /// Returns (head_index, descriptor_chain_vec).
    pub fn pop_chain(&mut self) -> Option<(u16, Vec<Desc>)> {
        let avail = self.avail_idx();
        if avail == self.last_avail_idx {
            return None;
        }
        // Pair with the guest's write barrier: the descriptor data read
        // below must not be observed older than the avail.idx we just saw.
        std::sync::atomic::fence(Ordering::Acquire);
        // Load the head index from avail.ring[last_avail_idx % size]
        let off = self.avail_ring + 4 + ((self.last_avail_idx % self.size) as u64) * 2;
        let head = self.mem.read_u16(off);
        if vq_trace_enabled() {
            eprintln!("[vq desc=0x{:x}] pop_chain: avail.idx={avail} last_avail_idx={} slot={} head={head}",
                self.desc_table, self.last_avail_idx, self.last_avail_idx % self.size);
        }
        self.last_avail_idx = self.last_avail_idx.wrapping_add(1);

        let mut chain = Vec::new();
        let mut idx = head;
        // Two guards against hostile or buggy chains: iterate at most
        // `self.size` times so a looping chain can't hang us, and treat
        // any `next` index at or beyond `self.size` as end-of-chain. The
        // descriptor table only has `self.size` entries, so an
        // out-of-range `next` is undefined per the virtio spec, and
        // following it would read past the end of the table.
        for _ in 0..self.size {
            if idx >= self.size {
                // OOB `next` index: terminate the chain here. The
                // descriptors collected so far are still consumed; the
                // guest sees the partial chain acknowledged.
                break;
            }
            let d_addr = self.desc_table + (idx as u64) * 16;
            let desc = Desc {
                addr: self.mem.read_u64(d_addr),
                len: self.mem.read_u32(d_addr + 8),
                flags: self.mem.read_u16(d_addr + 12),
                next: self.mem.read_u16(d_addr + 14),
            };
            chain.push(desc);
            if desc.flags & VRING_DESC_F_NEXT == 0 {
                break;
            }
            idx = desc.next;
        }
        Some((head, chain))
    }

    /// Push (id, used_len) onto the used ring + bump used.idx.
    pub fn add_used(&mut self, head: u16, used_len: u32) {
        // used.ring[next_used_idx % size] = { id: u32, len: u32 }
        let entry_off = self.used_ring + 4 + ((self.next_used_idx % self.size) as u64) * 8;
        self.mem.write_u32(entry_off, head as u32);
        self.mem.write_u32(entry_off + 4, used_len);
        if vq_trace_enabled() {
            eprintln!("[vq desc=0x{:x}] add_used: slot={} head={head} len={used_len} next_used_idx_after={}",
                self.desc_table, self.next_used_idx % self.size, self.next_used_idx.wrapping_add(1));
        }
        self.next_used_idx = self.next_used_idx.wrapping_add(1);
        // Release fence then publish the new idx.
        std::sync::atomic::fence(Ordering::Release);
        self.mem.write_u16(self.used_ring + 2, self.next_used_idx);
    }
}

/// Cheaply-cloneable handle to guest physical memory. Backed by
/// the mmap region the VMM created. Reads/writes are unsynchronized
/// raw pointer accesses — the guest uses the same memory but with
/// a virtio rmb/wmb cadence we cooperate with via release fences in
/// `add_used`.
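///
/// Minimal construction sketch (the pointer/size names are illustrative;
/// they come from whatever RAM mapping the VMM set up):
///
/// ```ignore
/// let mem = GuestMem::new(host_ram_base, ram_gpa, ram_size);
/// let mut queue = Queue::new(mem.clone());
/// ```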
#[derive(Clone)]
pub struct GuestMem {
    inner: Arc<GuestMemInner>,
}

struct GuestMemInner {
    host: *mut u8,
    base_gpa: u64,
    len: usize,
}

// SAFETY: mmap pages are stable for the VM lifetime; raw pointer
// access is intentional (the device side races the guest via virtio
// memory ordering, not Rust ownership).
unsafe impl Send for GuestMemInner {}
unsafe impl Sync for GuestMemInner {}

impl GuestMem {
    pub fn new(host: *mut u8, base_gpa: u64, len: usize) -> Self {
        Self {
            inner: Arc::new(GuestMemInner {
                host,
                base_gpa,
                len,
            }),
        }
    }

    /// Resolve `(gpa, n)` to a host pointer with **release-mode**
    /// bounds checking. Returns `None` when the range falls outside
    /// the VM's RAM window — typically because a hostile or buggy
    /// guest put an OOB address into a virtqueue descriptor.
    ///
    /// SECURITY: this MUST stay enabled in release. Replacing it with
    /// `debug_assert!` (as it was prior to 0.5.2) lets a guest read
    /// or write arbitrary host memory by crafting descriptors with
    /// `addr = base_gpa - K`, since `(gpa - base_gpa) as usize`
    /// underflows to a huge offset and `host.add(off)` produces a
    /// pointer well outside our mmap. The bounds check is single-
    /// digit nanoseconds (one compare + branch) on the hot virtio
    /// path; the perf cost is unmeasurable.
    fn translate(&self, gpa: u64, n: usize) -> Option<*mut u8> {
        let off = gpa.checked_sub(self.inner.base_gpa)? as usize;
        // off + n <= len, computed without overflow.
        if off > self.inner.len {
            return None;
        }
        if n > self.inner.len - off {
            return None;
        }
        // SAFETY: off+n ≤ len bounds-checked above.
        Some(unsafe { self.inner.host.add(off) })
    }

    fn translate_or_zero(&self, gpa: u64, n: usize) -> *mut u8 {
        match self.translate(gpa, n) {
            Some(p) => p,
            None => {
                eprintln!(
                    "[guest-mem] OOB access: gpa={gpa:#x} len={n} (base={:#x} len={:#x}); zero-filling",
                    self.inner.base_gpa, self.inner.len
                );
                std::ptr::null_mut()
            }
        }
    }

    pub fn read_u16(&self, gpa: u64) -> u16 {
        let p = self.translate_or_zero(gpa, 2);
        if p.is_null() {
            return 0;
        }
        unsafe { std::ptr::read_unaligned(p as *const u16) }
    }
    pub fn read_u32(&self, gpa: u64) -> u32 {
        let p = self.translate_or_zero(gpa, 4);
        if p.is_null() {
            return 0;
        }
        unsafe { std::ptr::read_unaligned(p as *const u32) }
    }
    pub fn read_u64(&self, gpa: u64) -> u64 {
        let p = self.translate_or_zero(gpa, 8);
        if p.is_null() {
            return 0;
        }
        unsafe { std::ptr::read_unaligned(p as *const u64) }
    }
    pub fn write_u16(&self, gpa: u64, v: u16) {
        if let Some(p) = self.translate(gpa, 2) {
            unsafe { std::ptr::write_unaligned(p as *mut u16, v) }
        }
    }
    pub fn write_u32(&self, gpa: u64, v: u32) {
        if let Some(p) = self.translate(gpa, 4) {
            unsafe { std::ptr::write_unaligned(p as *mut u32, v) }
        }
    }
    pub fn write_u64(&self, gpa: u64, v: u64) {
        if let Some(p) = self.translate(gpa, 8) {
            unsafe { std::ptr::write_unaligned(p as *mut u64, v) }
        }
    }

    /// Read a byte slice from guest memory. On OOB the destination is
    /// filled with zeros and a log line emitted; the worker keeps
    /// running so the guest sees deterministic (if useless) data
    /// rather than crashing the whole worker for one bad descriptor.
    pub fn read_slice(&self, gpa: u64, dst: &mut [u8]) {
        match self.translate(gpa, dst.len()) {
            Some(p) => unsafe {
                std::ptr::copy_nonoverlapping(p as *const u8, dst.as_mut_ptr(), dst.len())
            },
            None => {
                eprintln!(
                    "[guest-mem] OOB read_slice: gpa={gpa:#x} len={} (base={:#x} len={:#x}); zero-filling",
                    dst.len(), self.inner.base_gpa, self.inner.len
                );
                dst.fill(0);
            }
        }
    }
    /// Write a byte slice into guest memory. OOB writes are dropped
    /// with a log line.
    pub fn write_slice(&self, gpa: u64, src: &[u8]) {
        match self.translate(gpa, src.len()) {
            Some(p) => unsafe {
                std::ptr::copy_nonoverlapping(src.as_ptr(), p, src.len())
            },
            None => {
                eprintln!(
                    "[guest-mem] OOB write_slice: gpa={gpa:#x} len={} (base={:#x} len={:#x}); dropping",
                    src.len(), self.inner.base_gpa, self.inner.len
                );
            }
        }
    }

    /// Direct host pointer for a `(gpa, len)` range. Use when a
    /// device wants to recv/read straight into guest memory.
    /// Returns null on OOB — callers MUST check.
    pub fn host_ptr(&self, gpa: u64, len: usize) -> *mut u8 {
        self.translate(gpa, len).unwrap_or(std::ptr::null_mut())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Allocate a known-good 4 KiB host buffer + wrap it. Tests can
    /// then issue GPA reads/writes inside [0x10_0000, 0x10_1000).
    fn make_mem() -> (GuestMem, Vec<u8>) {
        let mut buf = vec![0u8; 4096];
        // Pre-fill with a sentinel so reads can tell zero-on-OOB
        // (the security path) from zero-because-empty (data).
        for (i, b) in buf.iter_mut().enumerate() {
            *b = (i & 0xff) as u8;
        }
        let mem = GuestMem::new(buf.as_mut_ptr(), 0x10_0000, buf.len());
        (mem, buf)
    }

    #[test]
    fn in_bounds_read_returns_real_data() {
        let (mem, _buf) = make_mem();
        // offset 0 in our mmap → sentinel byte 0
        assert_eq!(mem.read_u32(0x10_0000), 0x03020100);
        // last 4 bytes
        assert_eq!(mem.read_u32(0x10_0000 + 4096 - 4), {
            // bytes 4092..4096 = 0xfc, 0xfd, 0xfe, 0xff
            u32::from_le_bytes([0xfc, 0xfd, 0xfe, 0xff])
        });
    }

    #[test]
    fn underflow_is_caught_not_uaf() {
        // gpa < base → underflow. Pre-0.5.2 this would compute a
        // huge usize offset and host.add(off) would produce an
        // arbitrary pointer, then read_unaligned would dereference.
        // Now translate returns None and we return 0.
        let (mem, _buf) = make_mem();
        assert_eq!(mem.read_u32(0x10_0000 - 1), 0);
        assert_eq!(mem.read_u32(0x0), 0);
        assert_eq!(mem.read_u64(u64::MAX - 100), 0);
    }

    #[test]
    fn overflow_past_end_is_caught() {
        let (mem, _buf) = make_mem();
        // Last legitimate u32 read.
        let _ = mem.read_u32(0x10_0000 + 4096 - 4);
        // One byte past end.
        assert_eq!(mem.read_u32(0x10_0000 + 4096 - 3), 0);
        // A read_slice whose range straddles the end of RAM: the whole
        // request is rejected (start + len > ram size), so the entire
        // destination is zero-filled rather than partially copied.
        let mut dst = vec![0u8; 8];
        mem.read_slice(0x10_0000 + 4096 - 4, &mut dst[..]);
        assert!(dst.iter().all(|&b| b == 0));
    }

    #[test]
    fn oob_write_is_silently_dropped() {
        // Hostile guest tells us to write to an address before our
        // mmap. Must NOT corrupt anything; just log + drop.
        let (mem, _buf) = make_mem();
        mem.write_u32(0x0, 0xdeadbeef);
        mem.write_slice(u64::MAX - 100, &[0xff; 100]);
        // The in-bounds sentinel data must be untouched (and we must not
        // have crashed getting here).
        assert_eq!(mem.read_u32(0x10_0000), 0x03020100);
    }
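
    /// End-to-end check of the pop/push cycle against a tiny split ring
    /// hand-built inside the test buffer. The GPA layout used here
    /// (desc table at +0x000, avail ring at +0x100, used ring at +0x200)
    /// is an arbitrary choice for this test, not something the device
    /// requires.
    #[test]
    fn pop_chain_and_add_used_roundtrip() {
        let (mem, _buf) = make_mem();
        let base = 0x10_0000u64;
        let mut q = Queue::new(mem.clone());
        q.size = 4;
        q.desc_table = base;
        q.avail_ring = base + 0x100;
        q.used_ring = base + 0x200;

        // Descriptor 0: 64-byte driver buffer, chains to descriptor 1.
        let d0 = base;
        mem.write_u64(d0, base + 0x800);
        mem.write_u32(d0 + 8, 64);
        mem.write_u16(d0 + 12, VRING_DESC_F_NEXT);
        mem.write_u16(d0 + 14, 1);
        // Descriptor 1: 32-byte device-writable buffer, ends the chain.
        let d1 = base + 16;
        mem.write_u64(d1, base + 0x840);
        mem.write_u32(d1 + 8, 32);
        mem.write_u16(d1 + 12, VRING_DESC_F_WRITE);
        mem.write_u16(d1 + 14, 0);
        // avail.ring[0] = head 0, then publish avail.idx = 1.
        mem.write_u16(q.avail_ring + 4, 0);
        mem.write_u16(q.avail_ring + 2, 1);

        let (head, chain) = q.pop_chain().expect("one chain available");
        assert_eq!(head, 0);
        assert_eq!(chain.len(), 2);
        assert_eq!(chain[0].len, 64);
        assert_ne!(chain[1].flags & VRING_DESC_F_WRITE, 0);
        // Nothing further is available until the driver bumps avail.idx.
        assert!(q.pop_chain().is_none());

        // Complete the chain and check the used-ring entry layout.
        q.add_used(head, 32);
        assert_eq!(mem.read_u16(q.used_ring + 2), 1); // used.idx
        assert_eq!(mem.read_u32(q.used_ring + 4), 0); // ring[0].id
        assert_eq!(mem.read_u32(q.used_ring + 8), 32); // ring[0].len
    }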

    #[test]
    fn descriptor_with_oob_next_terminates_chain() {
        // Pre-0.5.2, a descriptor.next > queue.size produced an
        // arbitrary read (off the desc table). Now we treat OOB
        // next as end-of-chain.
        //
        // This is only a smoke check on the underlying bounds check;
        // the test below builds a real descriptor table and drives
        // `pop_chain` against an out-of-range `next`.
        let (mem, _buf) = make_mem();
        // Read at exactly the boundary — last legal 8 bytes.
        let _ = mem.read_u64(0x10_0000 + 4096 - 8);
        // Read past the boundary by 1 byte.
        assert_eq!(mem.read_u64(0x10_0000 + 4096 - 7), 0);
    }
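
    /// Full-queue variant of the smoke check above, using the same
    /// arbitrary test GPA layout as `pop_chain_and_add_used_roundtrip`:
    /// a descriptor whose NEXT flag points at an out-of-range index must
    /// yield a one-entry chain instead of reading past the table.
    #[test]
    fn pop_chain_treats_oob_next_as_end_of_chain() {
        let (mem, _buf) = make_mem();
        let base = 0x10_0000u64;
        let mut q = Queue::new(mem.clone());
        q.size = 4;
        q.desc_table = base;
        q.avail_ring = base + 0x100;
        q.used_ring = base + 0x200;

        // Descriptor 0 claims a continuation at index 9, past the
        // 4-entry descriptor table.
        mem.write_u64(base, base + 0x800);
        mem.write_u32(base + 8, 16);
        mem.write_u16(base + 12, VRING_DESC_F_NEXT);
        mem.write_u16(base + 14, 9);
        mem.write_u16(q.avail_ring + 4, 0);
        mem.write_u16(q.avail_ring + 2, 1);

        let (head, chain) = q.pop_chain().expect("one chain available");
        assert_eq!(head, 0);
        // Only the in-range descriptor is returned; the OOB `next`
        // terminates the chain.
        assert_eq!(chain.len(), 1);
        assert_eq!(chain[0].len, 16);
    }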
}