// supermachine 0.4.13
//
// Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF
// (Linux KVM and Windows WHP in progress). Single library API, zero
// flags for the common case, sub-100 ms cold-restore from snapshot.
// Read-only and read-write block device backed by an mmap'd file.
// RO mode is used for OCI image layers (squashfs); RW is used for
// user-supplied volumes (`--volume HOST:GUEST`). Supports
// VIRTIO_BLK_T_IN (read), VIRTIO_BLK_T_OUT (write, RW only),
// VIRTIO_BLK_T_FLUSH (msync, RW only), and VIRTIO_BLK_T_GET_ID.

use std::fs::{File, OpenOptions};
use std::os::unix::io::AsRawFd;
use std::sync::{Arc, Mutex};

use super::queue::Queue;
use super::{VirtioDevice, VIRTIO_ID_BLOCK};

// Request types: the `type` field of struct virtio_blk_req (desc[0]).
const VIRTIO_BLK_T_IN: u32 = 0;
const VIRTIO_BLK_T_OUT: u32 = 1;
const VIRTIO_BLK_T_FLUSH: u32 = 4;
const VIRTIO_BLK_T_GET_ID: u32 = 8;

// Completion status written into the chain's final (write-only) descriptor.
const VIRTIO_BLK_S_OK: u8 = 0;
const VIRTIO_BLK_S_IOERR: u8 = 1;
const VIRTIO_BLK_S_UNSUPP: u8 = 2;

/// virtio-blk feature bits.
// NOTE(review): SIZE_MAX, SEG_MAX and BLK_SIZE are declared here but the
// `features()` impl in this file only advertises VERSION_1 and (for RO
// devices) BLK_F_RO — confirm whether the others are future work or dead.
const VIRTIO_BLK_F_SIZE_MAX: u64 = 1 << 1;
const VIRTIO_BLK_F_SEG_MAX: u64 = 1 << 2;
const VIRTIO_BLK_F_RO: u64 = 1 << 5;
const VIRTIO_BLK_F_BLK_SIZE: u64 = 1 << 6;
const VIRTIO_F_VERSION_1: u64 = 1 << 32;

// Logical sector size; the request header's `sector` field counts these.
const SECTOR_SIZE: u64 = 512;

/// A virtio block device backed by an mmap'd host file.
pub struct VirtioBlk {
    /// Device name; also echoed back to the guest for
    /// `VIRTIO_BLK_T_GET_ID` and used in log prefixes.
    name: String,
    /// mmap pointer to the backing file. PROT_READ in RO mode,
    /// PROT_READ|PROT_WRITE + MAP_SHARED in RW mode.
    backing_ptr: *mut u8,
    /// Length of the mapping in bytes; device capacity is
    /// `backing_len / SECTOR_SIZE` sectors.
    backing_len: usize,
    /// True for `open_rw`; controls whether `VIRTIO_BLK_T_OUT` /
    /// `VIRTIO_BLK_T_FLUSH` are honored and whether the
    /// `VIRTIO_BLK_F_RO` feature bit is advertised.
    writable: bool,
    /// Virtqueues installed by `activate`; queue 0 is the request queue.
    queues: Mutex<Vec<Queue>>,
    /// Set by `activate`. NOTE(review): written but never read in this
    /// file — confirm it's consumed elsewhere (snapshot/restore?).
    activated: std::sync::atomic::AtomicBool,
    /// Callback that raises the guest interrupt after completed
    /// requests land on the used ring; installed via `set_irq_raise`.
    irq_raise: Mutex<Option<Arc<dyn Fn() + Send + Sync>>>,
}

// SAFETY: `backing_ptr` is the only field that defeats the auto traits.
// It points at an mmap'd region valid for the struct's lifetime, and all
// access through it in this file happens inside `drain_q` while holding
// the `queues` mutex, so cross-thread use is serialized.
// NOTE(review): this relies on no other code path dereferencing
// `backing_ptr` concurrently — confirm against the rest of the crate.
unsafe impl Send for VirtioBlk {}
unsafe impl Sync for VirtioBlk {}

impl VirtioBlk {
    /// Mount `path` as a read-only block device (mmap MAP_PRIVATE +
    /// PROT_READ; OS handles paging, we never write).
    ///
    /// # Errors
    /// Propagates the OS error if the file can't be opened, stat'd, or
    /// mapped (note an empty file fails here: mmap rejects length 0).
    pub fn open_ro(name: &str, path: &str) -> std::io::Result<Self> {
        let f = File::open(path)?;
        let len = f.metadata()?.len() as usize;
        // SAFETY: standard mmap over the fd we just opened; the mapping
        // stays valid after `f` drops (the kernel keeps the file pinned).
        let p = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                len,
                libc::PROT_READ,
                libc::MAP_PRIVATE,
                f.as_raw_fd(),
                0,
            )
        };
        if p == libc::MAP_FAILED {
            return Err(std::io::Error::last_os_error());
        }
        // Hint kernel: we'll read sequentially. Best-effort — a failed
        // madvise only loses the readahead hint, so the result is ignored.
        unsafe {
            libc::madvise(p, len, libc::MADV_SEQUENTIAL);
        }
        eprintln!("[virtio-blk:{name}] mmap ro {} bytes from {path}", len);
        Ok(Self {
            name: name.to_string(),
            backing_ptr: p as *mut u8,
            backing_len: len,
            writable: false,
            queues: Mutex::new(Vec::new()),
            activated: std::sync::atomic::AtomicBool::new(false),
            irq_raise: Mutex::new(None),
        })
    }

    /// Mount `path` as a read-write block device (mmap MAP_SHARED +
    /// PROT_READ|PROT_WRITE). Used for `--volume` persistent
    /// volumes — the host file is the canonical store; guest writes
    /// land directly in it.
    ///
    /// The file size is fixed at open time (`size_bytes`). If the
    /// file is smaller than `size_bytes`, it's grown via `truncate`
    /// before mapping. Subsequent runs reuse the same file at the
    /// same size; growing or shrinking would invalidate any
    /// filesystem the guest formatted.
    ///
    /// # Errors
    /// Propagates the OS error from open, stat, set_len, or mmap.
    pub fn open_rw(name: &str, path: &str, size_bytes: u64) -> std::io::Result<Self> {
        let f = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .open(path)?;
        let cur_len = f.metadata()?.len();
        if cur_len < size_bytes {
            f.set_len(size_bytes)?;
        }
        // Re-stat: map whatever the file's actual size is now (it may
        // exceed size_bytes if a previous run created it larger).
        let len = f.metadata()?.len() as usize;
        // SAFETY: mmap with shared mapping for the file's length.
        let p = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                len,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_SHARED,
                f.as_raw_fd(),
                0,
            )
        };
        if p == libc::MAP_FAILED {
            return Err(std::io::Error::last_os_error());
        }
        // Random access pattern: filesystems poke all over. Best-effort.
        unsafe {
            libc::madvise(p, len, libc::MADV_RANDOM);
        }
        eprintln!("[virtio-blk:{name}] mmap rw {} bytes from {path}", len);
        Ok(Self {
            name: name.to_string(),
            backing_ptr: p as *mut u8,
            backing_len: len,
            writable: true,
            queues: Mutex::new(Vec::new()),
            activated: std::sync::atomic::AtomicBool::new(false),
            irq_raise: Mutex::new(None),
        })
    }

    /// Install the callback used to raise the guest interrupt after
    /// completed requests are placed on the used ring.
    pub fn set_irq_raise(&self, f: Arc<dyn Fn() + Send + Sync>) {
        *self.irq_raise.lock().unwrap() = Some(f);
    }

    /// Process every pending request on queue 0, then raise the device
    /// interrupt if anything completed. Called from `notify`.
    fn drain_q(&self) {
        let mut qs = self.queues.lock().unwrap();
        let q = match qs.get_mut(0) {
            Some(q) => q,
            None => return,
        };
        if !q.ready {
            return;
        }
        let mut any_used = false;
        while let Some((head, chain)) = q.pop_chain() {
            // Request layout (virtio-blk):
            //   desc[0] (read-only): struct virtio_blk_req {
            //     u32 type, u32 reserved, u64 sector }  — 16 bytes
            //   desc[1..n-1]: data buffers
            //   desc[n-1]  (write-only): u8 status
            if chain.len() < 2 {
                // Malformed chain (no room for header + status): complete
                // it with len 0 so the guest doesn't wedge on a lost head.
                q.add_used(head, 0);
                any_used = true;
                continue;
            }
            let hdr = chain[0];
            let status_desc = chain[chain.len() - 1];
            // Read header.
            let req_type = q.mem.read_u32(hdr.addr);
            let _reserved = q.mem.read_u32(hdr.addr + 4);
            let sector = q.mem.read_u64(hdr.addr + 8);

            let mut status = VIRTIO_BLK_S_OK;
            let mut bytes_written: u32 = 1; // status byte
            match req_type {
                VIRTIO_BLK_T_IN => {
                    // Copy from backing[sector*512..] into each data
                    // descriptor in the middle of the chain. Checked
                    // arithmetic throughout: `sector` and descriptor
                    // lengths are guest-controlled, and an unchecked
                    // multiply/add could wrap to a small in-bounds
                    // offset and leak arbitrary host memory.
                    match sector.checked_mul(SECTOR_SIZE) {
                        None => status = VIRTIO_BLK_S_IOERR,
                        Some(base) => {
                            let mut off = base;
                            for d in &chain[1..chain.len() - 1] {
                                let want = d.len as usize;
                                let in_bounds = off
                                    .checked_add(want as u64)
                                    .map_or(false, |end| end <= self.backing_len as u64);
                                if !in_bounds {
                                    status = VIRTIO_BLK_S_IOERR;
                                    break;
                                }
                                // SAFETY: backing_ptr is mmap'd
                                // backing_len bytes; off+want bounds-checked above.
                                unsafe {
                                    let src = self.backing_ptr.add(off as usize);
                                    let slice = std::slice::from_raw_parts(src, want);
                                    q.mem.write_slice(d.addr, slice);
                                }
                                bytes_written += want as u32;
                                off += want as u64;
                            }
                        }
                    }
                }
                VIRTIO_BLK_T_FLUSH => {
                    // RO: no-op. RW: MS_SYNC because ext4's journal
                    // depends on FLUSH being a real barrier — if we
                    // ack a flush and the data isn't on disk yet,
                    // the next mount's journal replay points at
                    // garbage and the FS reports EBADMSG. macOS
                    // Apple Silicon SSDs make this tolerable
                    // perf-wise; if it bites a hot workload we'll
                    // revisit with a finer-grained writeback path.
                    if self.writable {
                        let rc = unsafe {
                            libc::msync(
                                self.backing_ptr as *mut libc::c_void,
                                self.backing_len,
                                libc::MS_SYNC,
                            )
                        };
                        // A failed msync means the data may not be
                        // durable; acking OK here would lie to the
                        // journal, so surface it as an I/O error.
                        if rc != 0 {
                            status = VIRTIO_BLK_S_IOERR;
                        }
                    }
                }
                VIRTIO_BLK_T_GET_ID => {
                    // The serial/ID field is 20 bytes in the virtio
                    // spec: left-aligned, NUL-padded, truncated. (The
                    // old space-padded format! could also exceed 20
                    // bytes for long names.)
                    let mut id = [0u8; 20];
                    let src = self.name.as_bytes();
                    let n = src.len().min(id.len());
                    id[..n].copy_from_slice(&src[..n]);
                    if let Some(d) = chain.get(1) {
                        let take = (d.len as usize).min(id.len());
                        q.mem.write_slice(d.addr, &id[..take]);
                        bytes_written += take as u32;
                    }
                }
                VIRTIO_BLK_T_OUT => {
                    if !self.writable {
                        // RO device: refuse writes rather than fault on
                        // a read-only mapping.
                        status = VIRTIO_BLK_S_UNSUPP;
                    } else {
                        // Copy each data desc into backing[sector*512..].
                        // Same checked-arithmetic rationale as T_IN: an
                        // unchecked wrap here would let the guest write
                        // arbitrary host memory.
                        match sector.checked_mul(SECTOR_SIZE) {
                            None => status = VIRTIO_BLK_S_IOERR,
                            Some(base) => {
                                let mut off = base;
                                for d in &chain[1..chain.len() - 1] {
                                    let n = d.len as usize;
                                    let in_bounds = off
                                        .checked_add(n as u64)
                                        .map_or(false, |end| end <= self.backing_len as u64);
                                    if !in_bounds {
                                        status = VIRTIO_BLK_S_IOERR;
                                        break;
                                    }
                                    // Stage through a temp buffer so guest
                                    // memory is read via q.mem's accessor.
                                    let mut tmp = vec![0u8; n];
                                    q.mem.read_slice(d.addr, &mut tmp);
                                    // SAFETY: backing_ptr is mmap'd RW for
                                    // `backing_len` bytes; bounds checked above.
                                    unsafe {
                                        let dst = self.backing_ptr.add(off as usize);
                                        std::ptr::copy_nonoverlapping(tmp.as_ptr(), dst, n);
                                    }
                                    off += n as u64;
                                }
                            }
                        }
                    }
                }
                _ => {
                    status = VIRTIO_BLK_S_UNSUPP;
                }
            }
            // Write status byte.
            q.mem.write_slice(status_desc.addr, &[status]);
            q.add_used(head, bytes_written);
            any_used = true;
        }
        if any_used {
            // Clone the callback out, then drop the queue lock before
            // invoking it so the IRQ path can't deadlock back into us.
            let f_opt = self.irq_raise.lock().unwrap().clone();
            drop(qs);
            if let Some(f) = f_opt {
                f();
            }
        }
    }
}

impl VirtioDevice for VirtioBlk {
    fn device_id(&self) -> u32 {
        VIRTIO_ID_BLOCK
    }

    fn num_queues(&self) -> usize {
        // virtio-blk needs exactly one request queue.
        1
    }

    fn config(&self) -> Vec<u8> {
        // Config space: just the leading `capacity` field (u64 sector
        // count, little-endian). Linux's virtio-blk driver tolerates a
        // short config space, so the remaining fields are omitted.
        let capacity_sectors = (self.backing_len as u64) / SECTOR_SIZE;
        Vec::from(capacity_sectors.to_le_bytes())
    }

    fn features(&self) -> u64 {
        // Always modern virtio; advertise read-only when the backing
        // mapping can't be written.
        if self.writable {
            VIRTIO_F_VERSION_1
        } else {
            VIRTIO_F_VERSION_1 | VIRTIO_BLK_F_RO
        }
    }

    fn notify(&self, _q: u16) {
        self.drain_q();
    }

    fn activate(&self, queues: Vec<Queue>) {
        let sectors = self.backing_len as u64 / SECTOR_SIZE;
        *self.queues.lock().unwrap() = queues;
        self.activated
            .store(true, std::sync::atomic::Ordering::Release);
        eprintln!(
            "[virtio-blk:{}] activated, {} sectors",
            self.name, sectors
        );
    }

    fn snapshot_queues(&self) -> Vec<Queue> {
        self.queues.lock().unwrap().clone()
    }
}