supermachine 0.4.13

Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
// Status: compact port. Implements virtio-mmio register layout v2
// (the only layout Linux uses today).
//
// Register map (virtio spec 1.1 §4.2.2):
//   0x000  MagicValue       'virt'
//   0x004  Version          2
//   0x008  DeviceID         (per-device)
//   0x00c  VendorID
//   0x010  DeviceFeatures   (read-only, paged by DeviceFeaturesSel)
//   0x014  DeviceFeaturesSel
//   0x020  DriverFeatures   (RW, paged by DriverFeaturesSel)
//   0x024  DriverFeaturesSel
//   0x030  QueueSel         (selects queue index)
//   0x034  QueueNumMax      (R: max ring entries for selected queue)
//   0x038  QueueNum         (RW: actual ring size)
//   0x044  QueueReady       (RW)
//   0x050  QueueNotify      (W: kick from driver)
//   0x060  InterruptStatus  (R)
//   0x064  InterruptACK     (W: clear bits)
//   0x070  Status           (RW)
//   0x080  QueueDescLow
//   0x084  QueueDescHigh
//   0x090  QueueDriverLow   (avail ring)
//   0x094  QueueDriverHigh
//   0x0a0  QueueDeviceLow   (used ring)
//   0x0a4  QueueDeviceHigh
//   0x100+ device-specific config space

#![allow(dead_code)]

use std::sync::{Arc, Mutex};

use super::queue::{GuestMem, Queue};
use super::VirtioDevice;
use crate::devices::mmio_bus::MmioDevice;

/// Value returned for the MagicValue register (offset 0x000): the
/// ASCII bytes "virt" when the word is laid out little-endian.
const MAGIC: u32 = 0x74726976; // 'virt'
/// Value returned for the Version register (offset 0x004), i.e. the
/// v2 register layout described at the top of this file.
const VERSION: u32 = 2;

/// Captured per-queue state (subset of `Queue`'s mutable fields).
///
/// Produced by `MmioVirtio::capture_state` and copied back into the
/// matching `Queue`, field by field, by `MmioVirtio::restore_state`.
#[derive(Clone, Debug)]
pub struct QueueSnapshot {
    /// Ring size (the QueueNum register, 0x038).
    pub size: u16,
    /// Whether the queue was marked ready (the QueueReady register, 0x044).
    pub ready: bool,
    /// 64-bit address assembled from QueueDescLow/High (0x080/0x084).
    pub desc_table: u64,
    /// 64-bit address assembled from QueueDriverLow/High (0x090/0x094),
    /// i.e. the avail ring.
    pub avail_ring: u64,
    /// 64-bit address assembled from QueueDeviceLow/High (0x0a0/0x0a4),
    /// i.e. the used ring.
    pub used_ring: u64,
    /// Device-side cursor into the avail ring.
    /// NOTE(review): presumably the index of the next avail entry the
    /// device will consume; confirm against `Queue`.
    pub last_avail_idx: u16,
    /// Device-side cursor into the used ring; the host bumps it when it
    /// publishes a used entry.
    pub next_used_idx: u16,
}

/// Captured per-device MMIO state. Restored by `MmioVirtio::restore_state`.
#[derive(Clone, Debug)]
pub struct MmioSnapshot {
    /// Feature words written by the driver: index 0 is the page
    /// selected by DriverFeaturesSel = 0, index 1 the page selected by
    /// DriverFeaturesSel = 1.
    pub driver_features: [u32; 2],
    /// Last value of the Status register (0x070). If DRIVER_OK is set,
    /// `restore_state` re-activates the device.
    pub status: u32,
    /// Pending IRQ status the guest hasn't ACK'd. Important if the
    /// host had bumped `next_used_idx` (so a new used entry is
    /// visible in guest RAM) but the guest hadn't yet processed the
    /// IRQ at capture time — guest needs that bit set on resume to
    /// know there's work.
    pub interrupt_status: u32,
    /// One entry per queue, in queue-index order.
    pub queues: Vec<QueueSnapshot>,
}

/// Mutable transport registers, guarded by the `Mutex` in `MmioVirtio`.
struct State {
    /// DeviceFeaturesSel (0x014): which 32-bit page DeviceFeatures reads.
    device_features_sel: u32,
    /// Feature words written by the driver through DriverFeatures (0x020).
    driver_features: [u32; 2],
    /// DriverFeaturesSel (0x024): which page DriverFeatures writes.
    driver_features_sel: u32,
    /// QueueSel (0x030): index of the queue the per-queue registers address.
    queue_sel: u32,
    /// Status register (0x070).
    status: u32,
    /// InterruptStatus (0x060); bits are cleared by InterruptACK (0x064).
    interrupt_status: u32,
    /// Per-queue state arrays.
    queues: Vec<Queue>,
    /// Set once the queues have been handed to the device via
    /// `activate` (first DRIVER_OK write, or a restore with DRIVER_OK
    /// set); prevents a second activation from a later Status write.
    activated: bool,
    /// Callback that signals the interrupt to the guest. In this file
    /// it is invoked (outside the lock) right after a bit has been set
    /// in `interrupt_status`.
    irq_raise: Arc<dyn Fn() + Send + Sync>,
}

/// virtio-mmio (register layout v2) front end for one `VirtioDevice`.
///
/// Implements `MmioDevice`, so guest loads/stores to the register
/// window are served from `state` or forwarded to `dev`.
pub struct MmioVirtio {
    /// Device backend: supplies IDs, features and config bytes, and
    /// receives `activate` / `notify` / `config_write` calls.
    dev: Arc<dyn VirtioDevice>,
    /// Transport register state; released before calling back into
    /// `dev` from the write path.
    state: Mutex<State>,
}

impl MmioVirtio {
    /// InterruptStatus bit 0: used-buffer notification.
    const INT_VRING: u32 = 0x1;
    /// InterruptStatus bit 1: configuration-change notification.
    const INT_CONFIG: u32 = 0x2;

    /// Builds the transport for `dev`.
    ///
    /// One `Queue` backed by `mem` is created for each queue the device
    /// reports through `num_queues()`. `irq_raise` is the callback that
    /// signals the interrupt to the guest; it is invoked by the IRQ
    /// helpers below after the matching `interrupt_status` bit is set.
    pub fn new(
        dev: Arc<dyn VirtioDevice>,
        mem: GuestMem,
        irq_raise: Arc<dyn Fn() + Send + Sync>,
    ) -> Self {
        let queues = (0..dev.num_queues())
            .map(|_| Queue::new(mem.clone()))
            .collect();
        Self {
            dev,
            state: Mutex::new(State {
                device_features_sel: 0,
                driver_features: [0; 2],
                driver_features_sel: 0,
                queue_sel: 0,
                status: 0,
                interrupt_status: 0,
                queues,
                activated: false,
                irq_raise,
            }),
        }
    }

    /// Snapshot of mutable per-device MMIO state. Captures the bits
    /// the guest set during driver init: negotiated features, status
    /// (DRIVER_OK et al.), per-queue addresses + cursors, and the
    /// not-yet-ACK'd `interrupt_status` bits (so a pending IRQ is not
    /// lost across restore). Only the selector registers are
    /// intentionally dropped — they are scratch.
    ///
    /// # Panics
    ///
    /// Panics if the state mutex is poisoned.
    pub fn capture_state(&self) -> MmioSnapshot {
        // Live cursors live in the DEVICE's queues post-activate (the
        // device clones queues out of MmioVirtio in `activate` and
        // bumps last_avail_idx / next_used_idx in its own copy). Read
        // from the device when we can; fall back to the mirror for
        // pre-activate state.
        //
        // Ask the device BEFORE taking our own lock: everywhere else in
        // this file the state lock is released before calling into the
        // device, because the device may call back into us (IRQ
        // closures lock `state`). Nesting the other way round here
        // would invert that order.
        let live = self.dev.snapshot_queues();
        let st = self.state.lock().unwrap();
        let queues: Vec<QueueSnapshot> = st
            .queues
            .iter()
            .enumerate()
            .map(|(i, mirror)| {
                let q = live.get(i).unwrap_or(mirror);
                QueueSnapshot {
                    size: q.size,
                    ready: q.ready,
                    desc_table: q.desc_table,
                    avail_ring: q.avail_ring,
                    used_ring: q.used_ring,
                    last_avail_idx: q.last_avail_idx,
                    next_used_idx: q.next_used_idx,
                }
            })
            .collect();
        MmioSnapshot {
            driver_features: st.driver_features,
            status: st.status,
            interrupt_status: st.interrupt_status,
            queues,
        }
    }

    /// Replay a captured State into a fresh MmioVirtio. If the
    /// snapshot has DRIVER_OK set, also re-activates the device with
    /// the restored queues so the muxer/device starts using the same
    /// ring addresses + cursors the guest expects.
    ///
    /// Queues are matched by index; surplus entries on either side are
    /// left untouched / ignored.
    ///
    /// # Panics
    ///
    /// Panics if the state mutex is poisoned.
    pub fn restore_state(&self, snap: &MmioSnapshot) {
        let mut st = self.state.lock().unwrap();
        st.driver_features = snap.driver_features;
        st.status = snap.status;
        st.interrupt_status = snap.interrupt_status;
        for (q, qs) in st.queues.iter_mut().zip(&snap.queues) {
            q.size = qs.size;
            q.ready = qs.ready;
            q.desc_table = qs.desc_table;
            q.avail_ring = qs.avail_ring;
            q.used_ring = qs.used_ring;
            q.last_avail_idx = qs.last_avail_idx;
            q.next_used_idx = qs.next_used_idx;
        }
        if snap.status & super::STATUS_DRIVER_OK != 0 {
            st.activated = true;
            let queues = st.queues.clone();
            // Release the lock first: the device may call back into us.
            drop(st);
            self.dev.activate(queues);
        }
    }

    /// Build a closure that asserts the config-change IRQ
    /// (interrupt_status bit 1). Used by virtio-balloon when num_pages
    /// changes — the guest reads InterruptStatus, sees bit 1, then
    /// reads the device's config register space.
    pub fn make_config_change_irq(self: &Arc<Self>) -> Arc<dyn Fn() + Send + Sync> {
        let me = Arc::clone(self);
        Arc::new(move || me.assert_irq(Self::INT_CONFIG))
    }

    /// Build the closure devices use to raise their used-buffer IRQ.
    /// It sets the device's `interrupt_status |= 1` (so the guest's
    /// IRQ handler reads VIRTIO_MMIO_INT_VRING and dispatches), then
    /// pulses the SPI line.
    pub fn make_used_buffer_irq(self: &Arc<Self>) -> Arc<dyn Fn() + Send + Sync> {
        let me = Arc::clone(self);
        Arc::new(move || me.assert_irq(Self::INT_VRING))
    }

    /// Sets `bits` in `interrupt_status`, then invokes the interrupt
    /// callback with the state lock already released.
    fn assert_irq(&self, bits: u32) {
        let raise = {
            let mut st = self.state.lock().unwrap();
            st.interrupt_status |= bits;
            Arc::clone(&st.irq_raise)
        };
        raise();
    }
}

impl MmioDevice for MmioVirtio {
    fn read(&self, offset: u64, _size: u8) -> u64 {
        let st = self.state.lock().unwrap();
        let v: u32 = match offset {
            0x000 => MAGIC,
            0x004 => VERSION,
            0x008 => self.dev.device_id(),
            0x00c => self.dev.vendor_id(),
            0x010 => {
                // DeviceFeatures, paged by sel. We expose 64 bits.
                let f = self.dev.features();
                if st.device_features_sel == 0 {
                    f as u32
                } else {
                    (f >> 32) as u32
                }
            }
            0x034 => self.dev.queue_max_size() as u32,
            0x038 => st
                .queues
                .get(st.queue_sel as usize)
                .map(|q| q.size as u32)
                .unwrap_or(0),
            0x044 => st
                .queues
                .get(st.queue_sel as usize)
                .map(|q| if q.ready { 1 } else { 0 })
                .unwrap_or(0),
            0x060 => st.interrupt_status,
            0x070 => st.status,
            0x100.. => {
                let cfg = self.dev.config();
                let off = (offset - 0x100) as usize;
                off.checked_add(4)
                    .and_then(|end| cfg.get(off..end))
                    .map(|bytes| u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]))
                    .unwrap_or(0)
            }
            _ => 0,
        };
        v as u64
    }

    fn write(&self, offset: u64, value: u64, _size: u8) {
        let mut st = self.state.lock().unwrap();
        let v32 = value as u32;
        match offset {
            0x014 => st.device_features_sel = v32,
            0x020 => {
                let i = (st.driver_features_sel & 1) as usize;
                st.driver_features[i] = v32;
            }
            0x024 => st.driver_features_sel = v32,
            0x030 => st.queue_sel = v32,
            0x038 => {
                let sel = st.queue_sel as usize;
                if let Some(q) = st.queues.get_mut(sel) {
                    q.size = v32 as u16;
                }
            }
            0x044 => {
                let sel = st.queue_sel as usize;
                if let Some(q) = st.queues.get_mut(sel) {
                    q.ready = v32 != 0;
                }
            }
            0x050 => {
                // QueueNotify โ€” guest kicks queue v32. Drop the lock
                // before invoking notify, in case the device wants to
                // call back into us (e.g. raise IRQ).
                drop(st);
                self.dev.notify(v32 as u16);
                return;
            }
            0x064 => st.interrupt_status &= !v32,
            0x070 => {
                st.status = v32;
                // On DRIVER_OK transition, hand the queues to the device.
                if v32 & super::STATUS_DRIVER_OK != 0 && !st.activated {
                    st.activated = true;
                    let queues = st.queues.clone();
                    drop(st);
                    self.dev.activate(queues);
                    return;
                }
            }
            // Per-queue address triples (low/high u32). Combine into u64.
            0x080 => set_low(&mut st, |q| &mut q.desc_table, v32),
            0x084 => set_high(&mut st, |q| &mut q.desc_table, v32),
            0x090 => set_low(&mut st, |q| &mut q.avail_ring, v32),
            0x094 => set_high(&mut st, |q| &mut q.avail_ring, v32),
            0x0a0 => set_low(&mut st, |q| &mut q.used_ring, v32),
            0x0a4 => set_high(&mut st, |q| &mut q.used_ring, v32),
            // Device-specific config writes (e.g. virtio-balloon `actual`
            // at offset 0x104). Forward to the device.
            0x100.. => {
                drop(st);
                self.dev.config_write((offset - 0x100) as usize, v32);
                return;
            }
            _ => {}
        }
    }

    fn len(&self) -> u64 {
        0x200
    }
}

/// Overwrites bits 0..=31 of one 64-bit address field of the queue
/// chosen by QueueSel, keeping bits 32..=63 as they were.
///
/// `accessor` picks the field on the queue. Does nothing when QueueSel
/// does not name an existing queue.
fn set_low(st: &mut State, accessor: impl FnOnce(&mut Queue) -> &mut u64, v: u32) {
    if let Some(addr) = st.queues.get_mut(st.queue_sel as usize).map(accessor) {
        let upper_half = *addr & 0xffff_ffff_0000_0000;
        *addr = upper_half | u64::from(v);
    }
}
/// Overwrites bits 32..=63 of one 64-bit address field of the queue
/// chosen by QueueSel, keeping bits 0..=31 as they were.
///
/// `accessor` picks the field on the queue. Does nothing when QueueSel
/// does not name an existing queue.
fn set_high(st: &mut State, accessor: impl FnOnce(&mut Queue) -> &mut u64, v: u32) {
    let index = st.queue_sel as usize;
    match st.queues.get_mut(index) {
        Some(queue) => {
            let addr = accessor(queue);
            let lower_half = *addr & u64::from(u32::MAX);
            *addr = (u64::from(v) << 32) | lower_half;
        }
        None => {}
    }
}

/// Raises the used-buffer interrupt on behalf of a device.
///
/// Meant to be called from a device's `notify` implementation once it
/// has finished processing a queue: sets bit 0 ("used buffer
/// notification") of `interrupt_status`, releases the state lock, then
/// invokes the interrupt callback.
pub fn raise_used_buffer_irq(mmio: &MmioVirtio) {
    let raise = {
        let mut guard = mmio.state.lock().unwrap();
        guard.interrupt_status |= 0x1; // bit 0 = "used buffer notification"
        guard.irq_raise.clone()
        // guard goes out of scope here, so the callback runs unlocked.
    };
    raise();
}