supermachine 0.7.70

// Read-only and read-write block device backed by an mmap'd file.
// RO mode is used for OCI image layers (squashfs); RW is used for
// user-supplied volumes (`--volume HOST:GUEST`). Supports
// VIRTIO_BLK_T_IN (read), VIRTIO_BLK_T_OUT (write, RW only),
// VIRTIO_BLK_T_FLUSH (msync, RW only), and VIRTIO_BLK_T_GET_ID.

use std::fs::{File, OpenOptions};
use std::os::unix::io::AsRawFd;
use std::sync::{Arc, Mutex};

use super::queue::Queue;
use super::{VirtioDevice, VIRTIO_ID_BLOCK};

// Request types: the `type` field of the 16-byte request header that
// `drain_q` reads from the first descriptor of every chain.

/// Read: copy bytes from the backing mapping into the guest's data buffers.
const VIRTIO_BLK_T_IN: u32 = 0;
/// Write: copy the guest's data buffers into the backing mapping (RW only;
/// a read-only device answers `VIRTIO_BLK_S_UNSUPP`).
const VIRTIO_BLK_T_OUT: u32 = 1;
/// Flush: `msync` the mapping on a writable device; no-op on a read-only one.
const VIRTIO_BLK_T_FLUSH: u32 = 4;
/// Identify: return the device name, right-aligned in a 20-column field.
const VIRTIO_BLK_T_GET_ID: u32 = 8;

// Status codes: the single byte `drain_q` writes to the last descriptor of
// each chain.

/// Request completed.
const VIRTIO_BLK_S_OK: u8 = 0;
/// Request failed: the sector offset overflowed or the requested span ran
/// past the end of the backing mapping.
const VIRTIO_BLK_S_IOERR: u8 = 1;
/// Unknown request type, or a write sent to a read-only device.
const VIRTIO_BLK_S_UNSUPP: u8 = 2;

// virtio-blk feature bits. `features()` advertises only
// `VIRTIO_F_VERSION_1` plus exactly one of `VIRTIO_BLK_F_FLUSH` (writable
// device) or `VIRTIO_BLK_F_RO` (read-only device); the other bits below are
// defined but not offered anywhere in this file.

/// Defined for completeness; not advertised by `features()`.
const VIRTIO_BLK_F_SIZE_MAX: u64 = 1 << 1;
/// Defined for completeness; not advertised by `features()`.
const VIRTIO_BLK_F_SEG_MAX: u64 = 1 << 2;
/// Advertised when the device was opened with `open_ro`.
const VIRTIO_BLK_F_RO: u64 = 1 << 5;
/// Defined for completeness; not advertised by `features()`.
const VIRTIO_BLK_F_BLK_SIZE: u64 = 1 << 6;
/// FLUSH support — set this bit on a writable device so the Linux
/// guest issues `VIRTIO_BLK_T_FLUSH` on `sync`/`fsync` (which the
/// host services with `msync(MS_SYNC)`; an early 0.7.30 cut also
/// issued F_FULLFSYNC, which was removed in 0.7.31 — see the FLUSH
/// arm of `drain_q` for the rationale). Without this bit, the guest
/// driver assumes the device has no flush primitive and skips the
/// barrier entirely: `sync(2)` returns from the guest with the
/// ack'd journal commit still buffered in the worker's mmap'd
/// region, and a SIGKILL of the worker before the macOS UBC flush
/// loses the metadata. Next mount surfaces `Bad message`
/// (EBADMSG) on the data blocks the on-disk file actually
/// contains. (Field-report bug fixed 0.7.30.)
const VIRTIO_BLK_F_FLUSH: u64 = 1 << 9;
/// Always advertised by `features()`.
const VIRTIO_F_VERSION_1: u64 = 1 << 32;

/// Bytes per sector: converts a request's `sector` field into a byte offset
/// and the mapping length into the sector count reported to the guest.
const SECTOR_SIZE: u64 = 512;

pub struct VirtioBlk {
    name: String,
    /// mmap pointer to the backing file. PROT_READ in RO mode,
    /// PROT_READ|PROT_WRITE + MAP_SHARED in RW mode.
    backing_ptr: *mut u8,
    backing_len: usize,
    /// True for `open_rw`; controls whether `VIRTIO_BLK_T_OUT` /
    /// `VIRTIO_BLK_T_FLUSH` are honored and whether the
    /// `VIRTIO_BLK_F_RO` feature bit is advertised.
    writable: bool,
    queues: Mutex<Vec<Queue>>,
    activated: std::sync::atomic::AtomicBool,
    irq_raise: Mutex<Option<Arc<dyn Fn() + Send + Sync>>>,
}

unsafe impl Send for VirtioBlk {}
unsafe impl Sync for VirtioBlk {}

impl VirtioBlk {
    /// Open `path` as a read-only block device called `name`.
    ///
    /// The whole file is mapped `PROT_READ` + `MAP_PRIVATE`; the OS pages
    /// it in on demand and this device never writes through the mapping.
    ///
    /// # Errors
    /// Returns the underlying I/O error if the file cannot be opened or
    /// stat'ed, or the OS error reported by a failed `mmap`.
    pub fn open_ro(name: &str, path: &str) -> std::io::Result<Self> {
        let file = File::open(path)?;
        let backing_len = file.metadata()?.len() as usize;

        // SAFETY: plain file mapping of a descriptor that is open for the
        // duration of the call; the result is checked against MAP_FAILED
        // before it is used.
        let mapping = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                backing_len,
                libc::PROT_READ,
                libc::MAP_PRIVATE,
                file.as_raw_fd(),
                0,
            )
        };
        if mapping == libc::MAP_FAILED {
            return Err(std::io::Error::last_os_error());
        }

        // Access-pattern hint only (sequential reads expected); the result
        // is deliberately not checked.
        // SAFETY: `mapping` / `backing_len` describe the mapping just made.
        unsafe {
            libc::madvise(mapping, backing_len, libc::MADV_SEQUENTIAL);
        }
        eprintln!("[virtio-blk:{name}] mmap ro {} bytes from {path}", backing_len);

        // The mapping keeps the inode referenced on its own, and no fsync
        // path needs the descriptor, so close it right away — the same
        // lifecycle the device had before the flush handler changed.
        drop(file);

        Ok(Self {
            name: name.to_owned(),
            backing_ptr: mapping.cast::<u8>(),
            backing_len,
            writable: false,
            queues: Mutex::new(Vec::new()),
            activated: std::sync::atomic::AtomicBool::new(false),
            irq_raise: Mutex::new(None),
        })
    }

    /// Open `path` as a read-write block device called `name`, creating
    /// the file if it does not exist.
    ///
    /// The file is mapped `PROT_READ | PROT_WRITE` + `MAP_SHARED`, so the
    /// host file is the canonical store for `--volume` persistent volumes
    /// and guest writes land directly in it.
    ///
    /// The device size is fixed at open time. A file shorter than
    /// `size_bytes` is grown to `size_bytes` before mapping; a file that is
    /// already at least that long is mapped at its current length. Later
    /// runs are expected to reuse the same file at the same size, because
    /// growing or shrinking it would invalidate whatever filesystem the
    /// guest formatted.
    ///
    /// # Errors
    /// Returns the underlying I/O error if the file cannot be opened,
    /// stat'ed or resized, or the OS error reported by a failed `mmap`.
    pub fn open_rw(name: &str, path: &str, size_bytes: u64) -> std::io::Result<Self> {
        let mut options = OpenOptions::new();
        options.read(true).write(true).create(true);
        // Never truncate: this file backs a persistent volume, and wiping
        // it would destroy the guest's formatted filesystem on every open.
        // The `set_len` below is the only resize, and it only ever grows.
        options.truncate(false);
        let file = options.open(path)?;

        if file.metadata()?.len() < size_bytes {
            file.set_len(size_bytes)?;
        }
        let backing_len = file.metadata()?.len() as usize;

        // SAFETY: shared mapping covering the file's current length, made
        // from a descriptor that is open for the duration of the call; the
        // result is checked against MAP_FAILED before it is used.
        let mapping = unsafe {
            libc::mmap(
                std::ptr::null_mut(),
                backing_len,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_SHARED,
                file.as_raw_fd(),
                0,
            )
        };
        if mapping == libc::MAP_FAILED {
            return Err(std::io::Error::last_os_error());
        }

        // Access-pattern hint only: a mounted filesystem touches blocks all
        // over the device. The result is deliberately not checked.
        // SAFETY: `mapping` / `backing_len` describe the mapping just made.
        unsafe {
            libc::madvise(mapping, backing_len, libc::MADV_RANDOM);
        }
        eprintln!("[virtio-blk:{name}] mmap rw {} bytes from {path}", backing_len);

        // The mapping keeps the inode referenced, and FLUSH goes through
        // msync rather than fsync, so the descriptor is no longer needed.
        drop(file);

        Ok(Self {
            name: name.to_owned(),
            backing_ptr: mapping.cast::<u8>(),
            backing_len,
            writable: true,
            queues: Mutex::new(Vec::new()),
            activated: std::sync::atomic::AtomicBool::new(false),
            irq_raise: Mutex::new(None),
        })
    }

    /// Install the callback that `drain_q` invokes after it has completed
    /// at least one request. Any previously installed callback is replaced.
    ///
    /// # Panics
    /// Panics if the `irq_raise` mutex is poisoned.
    pub fn set_irq_raise(&self, f: Arc<dyn Fn() + Send + Sync>) {
        let mut slot = self.irq_raise.lock().unwrap();
        *slot = Some(f);
    }

    /// Service every request currently available on queue 0, then raise the
    /// interrupt callback once if at least one request was completed.
    ///
    /// Returns immediately when no queue has been installed yet or queue 0
    /// is not marked ready. The `queues` lock is held for the whole drain
    /// and released just before the interrupt callback runs, so the
    /// callback never executes under that lock.
    ///
    /// Per-request outcome (the status byte written to the last descriptor):
    /// - `VIRTIO_BLK_S_OK`: request completed.
    /// - `VIRTIO_BLK_S_IOERR`: the sector offset overflowed, a span ran past
    ///   the end of the mapping, or `msync` failed during a flush.
    /// - `VIRTIO_BLK_S_UNSUPP`: unknown request type, or a write sent to a
    ///   read-only device.
    ///
    /// # Panics
    /// Panics if the `queues` or `irq_raise` mutex is poisoned.
    fn drain_q(&self) {
        /// Size of the device-ID field returned by `VIRTIO_BLK_T_GET_ID`.
        const VIRTIO_BLK_ID_BYTES: usize = 20;

        let mut qs = self.queues.lock().unwrap();
        let q = match qs.get_mut(0) {
            Some(q) => q,
            None => return,
        };
        if !q.ready {
            return;
        }
        let mut any_used = false;
        while let Some((head, chain)) = q.pop_chain() {
            // Request layout (virtio-blk):
            //   desc[0] (read-only): struct virtio_blk_req {
            //     u32 type, u32 reserved, u64 sector }  — 16 bytes
            //   desc[1..n-1]: data buffers
            //   desc[n-1]  (write-only): u8 status
            if chain.len() < 2 {
                // No room for both a header and a status byte: complete the
                // chain with zero bytes written so the guest gets it back.
                q.add_used(head, 0);
                any_used = true;
                continue;
            }
            let hdr = chain[0];
            let status_desc = chain[chain.len() - 1];
            // Everything between the header and the status byte. Empty for
            // a two-descriptor chain, so no arm below can ever mistake the
            // status descriptor for a data buffer.
            let data_descs = &chain[1..chain.len() - 1];
            // Read header.
            let req_type = q.mem.read_u32(hdr.addr);
            let _reserved = q.mem.read_u32(hdr.addr + 4);
            let sector = q.mem.read_u64(hdr.addr + 8);

            let mut status = VIRTIO_BLK_S_OK;
            let mut bytes_written: u32 = 1; // status byte
            match req_type {
                VIRTIO_BLK_T_IN => {
                    // Copy from backing[sector*512..] into each data
                    // descriptor in the middle of the chain. `sector` and
                    // the descriptor lengths are guest-controlled, so the
                    // running byte offset is tracked in u64 with CHECKED
                    // arithmetic. A crafted sector (e.g. near u64::MAX)
                    // would overflow `sector * 512`; the prior `as usize`
                    // cast then wrapped to a small value that slipped past
                    // the `off + want > backing_len` bounds check while
                    // `backing_ptr.add(off)` still used the real, huge
                    // offset — a guest→host OOB read. Any overflow or
                    // out-of-range span now fails the request with IOERR.
                    let mut off = sector.checked_mul(SECTOR_SIZE);
                    for d in data_descs {
                        let want = d.len as u64;
                        let Some(start) = off else {
                            status = VIRTIO_BLK_S_IOERR;
                            break;
                        };
                        let Some(end) = self.span_end(start, want) else {
                            status = VIRTIO_BLK_S_IOERR;
                            break;
                        };
                        // SAFETY: backing_ptr is mmap'd for backing_len
                        // bytes; `end <= backing_len` proves [start, end)
                        // is in bounds, so start/want fit in usize.
                        unsafe {
                            let src = self.backing_ptr.add(start as usize);
                            let slice = std::slice::from_raw_parts(src, want as usize);
                            q.mem.write_slice(d.addr, slice);
                        }
                        bytes_written = bytes_written.saturating_add(want as u32);
                        off = Some(end);
                    }
                }
                VIRTIO_BLK_T_FLUSH => {
                    // RO: no-op. RW: `msync(MS_SYNC)` pushes dirty
                    // mmap pages from the worker's address space
                    // into the macOS unified buffer cache. The UBC
                    // is process-independent and survives clean
                    // process exit, so this is sufficient for the
                    // ext4-journal-correctness story under normal
                    // shutdown.
                    //
                    // Why not F_FULLFSYNC: we tried that in an
                    // early 0.7.30 cut to ALSO cover hard SIGKILL
                    // of the worker (durability past UBC into the
                    // device). It's semantically correct but
                    // PROHIBITIVELY slow under write-heavy
                    // workloads — Apple's F_FULLFSYNC blocks until
                    // the SSD acks every outstanding write, and an
                    // `npm install` of 2k packages issues 10k+
                    // FLUSHes. Field-report: 30× slower bake
                    // (~5 s → ~150 s) PLUS in-guest stat() hangs
                    // post-bake when a relatime update queues
                    // behind in-flight F_FULLFSYNCs — host hits
                    // exec timeout, SIGKILLs the exec → `exit=137`
                    // with empty stdout/stderr → integrator's
                    // script reports `mount failed:`. F_FULLFSYNC
                    // removed in 0.7.31. SIGKILL durability is now
                    // best-effort; workloads that genuinely need
                    // it should run a guest-side `sync` then a
                    // graceful pool shutdown (which lets the
                    // worker exit cleanly, and macOS flushes the
                    // UBC to the device).
                    if self.writable {
                        // SAFETY: backing_ptr / backing_len describe the
                        // live mapping created in `open_rw`.
                        let rc = unsafe {
                            libc::msync(
                                self.backing_ptr as *mut libc::c_void,
                                self.backing_len,
                                libc::MS_SYNC,
                            )
                        };
                        if rc != 0 {
                            // The guest treats a successful FLUSH as a
                            // durability barrier; acknowledging one that
                            // did not happen would silently break that
                            // contract, so report the failure instead.
                            let err = std::io::Error::last_os_error();
                            eprintln!("[virtio-blk:{}] msync failed: {err}", self.name);
                            status = VIRTIO_BLK_S_IOERR;
                        }
                    }
                }
                VIRTIO_BLK_T_GET_ID => {
                    let id = format!("{:>20}", self.name);
                    let bytes = id.as_bytes();
                    // Only a real data descriptor may receive the ID; with
                    // no data descriptor the request completes with just
                    // the status byte. The copy is capped at the 20-byte ID
                    // field even when the name is longer than that.
                    if let Some(d) = data_descs.first() {
                        let take = (d.len as usize)
                            .min(bytes.len())
                            .min(VIRTIO_BLK_ID_BYTES);
                        q.mem.write_slice(d.addr, &bytes[..take]);
                        bytes_written += take as u32;
                    }
                }
                VIRTIO_BLK_T_OUT => {
                    if !self.writable {
                        status = VIRTIO_BLK_S_UNSUPP;
                    } else {
                        // Copy each data desc into backing[sector*512..].
                        // Same guest-controlled overflow hazard as T_IN
                        // above (here an OOB *write* into host memory):
                        // track the offset in u64 with checked math and
                        // reject any overflow / out-of-range span.
                        let mut off = sector.checked_mul(SECTOR_SIZE);
                        for d in data_descs {
                            let n = d.len as u64;
                            let Some(start) = off else {
                                status = VIRTIO_BLK_S_IOERR;
                                break;
                            };
                            let Some(end) = self.span_end(start, n) else {
                                status = VIRTIO_BLK_S_IOERR;
                                break;
                            };
                            // Stage the guest bytes in a host buffer first;
                            // `n` is already bounded by backing_len here.
                            let mut tmp = vec![0u8; n as usize];
                            q.mem.read_slice(d.addr, &mut tmp);
                            // SAFETY: backing_ptr is mmap'd RW for
                            // backing_len bytes; [start, end) bounds-checked,
                            // and `tmp` is a separate heap allocation so the
                            // two ranges cannot overlap.
                            unsafe {
                                let dst = self.backing_ptr.add(start as usize);
                                std::ptr::copy_nonoverlapping(tmp.as_ptr(), dst, n as usize);
                            }
                            off = Some(end);
                        }
                    }
                }
                _ => {
                    status = VIRTIO_BLK_S_UNSUPP;
                }
            }
            // Write status byte.
            q.mem.write_slice(status_desc.addr, &[status]);
            q.add_used(head, bytes_written);
            any_used = true;
        }
        if any_used {
            // Clone the callback and release the queue lock before calling
            // it, so the callback never runs while the queues are locked.
            let f_opt = self.irq_raise.lock().unwrap().clone();
            drop(qs);
            if let Some(f) = f_opt {
                f();
            }
        }
    }

    /// Exclusive end of the byte span `[start, start + len)` if the whole
    /// span lies inside the mapping; `None` if the addition overflows or
    /// the span runs past `backing_len`.
    fn span_end(&self, start: u64, len: u64) -> Option<u64> {
        start
            .checked_add(len)
            .filter(|end| *end <= self.backing_len as u64)
    }
}

impl VirtioDevice for VirtioBlk {
    /// Identifies this device as a virtio block device.
    fn device_id(&self) -> u32 {
        VIRTIO_ID_BLOCK
    }

    /// This device exposes a single request queue.
    fn num_queues(&self) -> usize {
        1
    }

    /// Device config space: just the `capacity` field — the device size in
    /// 512-byte sectors as a little-endian u64. Nothing after it is emitted;
    /// Linux's virtio-blk driver tolerates a short config space.
    fn config(&self) -> Vec<u8> {
        let capacity_sectors = self.backing_len as u64 / SECTOR_SIZE;
        Vec::from(capacity_sectors.to_le_bytes())
    }

    /// Offered feature bits: `VIRTIO_F_VERSION_1` plus exactly one
    /// mode-dependent bit.
    fn features(&self) -> u64 {
        // FLUSH is only meaningful on a writable device: a read-only
        // mapping has no dirty pages and the FLUSH handler short-circuits
        // when `!writable`, so offering it there would only cost the guest
        // a no-op virtio round-trip per sync. Read-only devices offer RO.
        let mode_bit = if self.writable {
            VIRTIO_BLK_F_FLUSH
        } else {
            VIRTIO_BLK_F_RO
        };
        VIRTIO_F_VERSION_1 | mode_bit
    }

    /// Queue kick from the guest. There is only one queue, so the index is
    /// ignored and queue 0 is drained.
    fn notify(&self, _q: u16) {
        self.drain_q()
    }

    /// Take ownership of the negotiated queues, mark the device active and
    /// log its size in sectors.
    fn activate(&self, queues: Vec<Queue>) {
        use std::sync::atomic::Ordering;

        {
            let mut installed = self.queues.lock().unwrap();
            *installed = queues;
        }
        self.activated.store(true, Ordering::Release);

        let sectors = self.backing_len as u64 / SECTOR_SIZE;
        eprintln!("[virtio-blk:{}] activated, {} sectors", self.name, sectors);
    }

    /// Return a clone of the current queue state.
    fn snapshot_queues(&self) -> Vec<Queue> {
        let installed = self.queues.lock().unwrap();
        (*installed).clone()
    }
}

#[cfg(test)]
mod tests {
    //! Drives the block device over a real virtio descriptor chain in
    //! GuestMem — the path a guest actually hits. The headline cases are
    //! the guest-controlled `sector` overflow: a crafted sector must NOT
    //! drive `backing_ptr.add()` out of bounds (host OOB read/write) but
    //! fail cleanly with VIRTIO_BLK_S_IOERR.
    use super::*;
    use crate::devices::virtio::queue::{GuestMem, VRING_DESC_F_NEXT, VRING_DESC_F_WRITE};
    use std::io::Write;

    // Layout of the fake guest memory window used by `run`. `BASE` and
    // `WIN` are handed to `GuestMem::new` together with the host buffer;
    // every `O_*` value is an offset from `BASE`.
    const BASE: u64 = 0x10_0000;
    const WIN: usize = 256 * 1024;
    // Descriptor table: 16 bytes per entry.
    const O_DESC: u64 = 0x0000;
    // Available ring.
    const O_AVAIL: u64 = 0x0800;
    // Used ring.
    const O_USED: u64 = 0x1000;
    // 16-byte request header (type, reserved, sector).
    const O_HDR: u64 = 0x2000;
    // Data buffer for the single data descriptor.
    const O_DATA: u64 = 0x3000;
    // One-byte status buffer.
    const O_STATUS: u64 = 0x4000;

    /// Build a per-test file path in the system temp dir. The name combines
    /// `tag`, the process id and the current time in nanoseconds so tests
    /// running in the same process get distinct files.
    fn temp_path(tag: &str) -> std::path::PathBuf {
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        std::env::temp_dir().join(format!("sm-blk-{tag}-{}-{nanos}.img", std::process::id()))
    }

    /// Create a writable device of `size` bytes backed by a fresh temp file.
    /// Returns the device and the file path so the caller can delete it.
    fn make_rw(size: u64) -> (VirtioBlk, std::path::PathBuf) {
        let path = temp_path("rw");
        File::create(&path).unwrap(); // empty; open_rw set_len-grows it
        let dev = VirtioBlk::open_rw("testvol", path.to_str().unwrap(), size).unwrap();
        (dev, path)
    }

    /// Create a read-only device over a temp file pre-filled with `size`
    /// zero bytes. Returns the device and the file path for cleanup.
    fn make_ro(size: u64) -> (VirtioBlk, std::path::PathBuf) {
        let path = temp_path("ro");
        let mut f = File::create(&path).unwrap();
        f.write_all(&vec![0u8; size as usize]).unwrap();
        drop(f);
        let dev = VirtioBlk::open_ro("testro", path.to_str().unwrap()).unwrap();
        (dev, path)
    }

    /// What the guest observes after one request.
    struct Resp {
        // Status byte read back from the status buffer.
        status: u8,
        // Contents of the data buffer after the request completed.
        data: Vec<u8>,
    }

    /// Issue one request through the full device path. `data` is the data
    /// descriptor's contents (for T_OUT, the bytes the guest sends; for
    /// T_IN, a zeroed receive buffer whose length is the read size).
    ///
    /// `data_writable` sets `VRING_DESC_F_WRITE` on the data descriptor.
    /// Each call builds a fresh guest window and queue, re-activates `dev`
    /// with it, and kicks queue 0.
    fn run(dev: &VirtioBlk, req_type: u32, sector: u64, data: &[u8], data_writable: bool) -> Resp {
        // Host buffer standing in for guest RAM.
        // NOTE(review): `backing` is freed when this function returns while
        // `dev` still holds the queue built over it; that is only sound as
        // long as nothing notifies the device again before the next
        // `activate` replaces the queue.
        let mut backing = vec![0u8; WIN];
        let mem = GuestMem::new(backing.as_mut_ptr(), BASE, WIN);

        // virtio_blk_req header: type(u32) reserved(u32) sector(u64).
        mem.write_u32(BASE + O_HDR, req_type);
        mem.write_u32(BASE + O_HDR + 4, 0);
        mem.write_u64(BASE + O_HDR + 8, sector);
        mem.write_slice(BASE + O_DATA, data);

        // Address of descriptor `i`. Each entry is written below as
        // addr (u64, +0), len (u32, +8), flags (u16, +12), next (u16, +14).
        let d = |i: u64| BASE + O_DESC + i * 16;
        // desc[0] header (RO) → desc[1].
        mem.write_u64(d(0), BASE + O_HDR);
        mem.write_u32(d(0) + 8, 16);
        mem.write_u16(d(0) + 12, VRING_DESC_F_NEXT);
        mem.write_u16(d(0) + 14, 1);
        // desc[1] data → desc[2]. Writable for reads (device → guest).
        let data_flags = VRING_DESC_F_NEXT | if data_writable { VRING_DESC_F_WRITE } else { 0 };
        mem.write_u64(d(1), BASE + O_DATA);
        mem.write_u32(d(1) + 8, data.len() as u32);
        mem.write_u16(d(1) + 12, data_flags);
        mem.write_u16(d(1) + 14, 2);
        // desc[2] status byte (WO), end of chain.
        mem.write_u64(d(2), BASE + O_STATUS);
        mem.write_u32(d(2) + 8, 1);
        mem.write_u16(d(2) + 12, VRING_DESC_F_WRITE);
        mem.write_u16(d(2) + 14, 0);
        // avail: ring[0] = head 0; idx = 1.
        mem.write_u16(BASE + O_AVAIL + 4, 0);
        mem.write_u16(BASE + O_AVAIL + 2, 1);

        // Point a ready 8-entry queue at the rings laid out above.
        let mut q = Queue::new(mem.clone());
        q.size = 8;
        q.ready = true;
        q.desc_table = BASE + O_DESC;
        q.avail_ring = BASE + O_AVAIL;
        q.used_ring = BASE + O_USED;

        // Hand the queue to the device and kick it; the request is
        // serviced synchronously inside `notify`.
        dev.activate(vec![q]);
        dev.notify(0);

        // Read back what the device left in the status and data buffers.
        let mut sb = [0u8; 1];
        mem.read_slice(BASE + O_STATUS, &mut sb);
        let mut out = vec![0u8; data.len()];
        mem.read_slice(BASE + O_DATA, &mut out);
        Resp {
            status: sb[0],
            data: out,
        }
    }

    /// A sector written with T_OUT reads back unchanged with T_IN.
    #[test]
    fn write_then_read_round_trips() {
        let (dev, path) = make_rw(64 * 1024);
        // Pad the marker out to exactly one sector.
        let mut payload = b"SUPERMACHINE-BLK-ROUNDTRIP".to_vec();
        payload.resize(512, 0);

        let w = run(&dev, VIRTIO_BLK_T_OUT, 1, &payload, false);
        assert_eq!(w.status, VIRTIO_BLK_S_OK, "write should succeed");

        let r = run(&dev, VIRTIO_BLK_T_IN, 1, &vec![0u8; 512], true);
        assert_eq!(r.status, VIRTIO_BLK_S_OK, "read should succeed");
        assert_eq!(r.data, payload, "read-back must match written bytes");

        std::fs::remove_file(path).ok();
    }

    #[test]
    fn huge_sector_read_is_ioerr_not_oob() {
        // sector * 512 overflows u64. Before the fix this either panicked
        // (debug overflow) or wrapped small and drove backing_ptr.add()
        // far out of bounds (host OOB read). Must be a clean IOERR.
        let (dev, path) = make_rw(64 * 1024);
        let r = run(&dev, VIRTIO_BLK_T_IN, u64::MAX, &vec![0u8; 512], true);
        assert_eq!(r.status, VIRTIO_BLK_S_IOERR, "overflowing read must IOERR");
        std::fs::remove_file(path).ok();
    }

    #[test]
    fn huge_sector_write_is_ioerr_not_oob() {
        // Same overflow, but an OOB *write* into host memory before the fix.
        let (dev, path) = make_rw(64 * 1024);
        let r = run(&dev, VIRTIO_BLK_T_OUT, u64::MAX, &vec![0xABu8; 512], false);
        assert_eq!(r.status, VIRTIO_BLK_S_IOERR, "overflowing write must IOERR");
        std::fs::remove_file(path).ok();
    }

    #[test]
    fn sector_past_end_is_ioerr() {
        // In-range multiply, but the span runs off the end of the backing.
        let (dev, path) = make_rw(64 * 1024); // 128 sectors
        let r = run(&dev, VIRTIO_BLK_T_IN, 200, &vec![0u8; 512], true);
        assert_eq!(r.status, VIRTIO_BLK_S_IOERR);
        std::fs::remove_file(path).ok();
    }

    /// Boundary check: the last full sector is readable, the one after it
    /// is not.
    #[test]
    fn last_sector_in_bounds_is_ok() {
        // sector 127 * 512 = 65024; +512 = 65536 == backing_len → valid.
        let (dev, path) = make_rw(64 * 1024);
        let r = run(&dev, VIRTIO_BLK_T_IN, 127, &vec![0u8; 512], true);
        assert_eq!(
            r.status, VIRTIO_BLK_S_OK,
            "last full sector must be readable"
        );
        // One past the last sector must fail.
        let r2 = run(&dev, VIRTIO_BLK_T_IN, 128, &vec![0u8; 512], true);
        assert_eq!(r2.status, VIRTIO_BLK_S_IOERR);
        std::fs::remove_file(path).ok();
    }

    /// A device opened with `open_ro` answers T_OUT with UNSUPP.
    #[test]
    fn readonly_device_rejects_writes() {
        let (dev, path) = make_ro(64 * 1024);
        let r = run(&dev, VIRTIO_BLK_T_OUT, 0, &vec![0xCDu8; 512], false);
        assert_eq!(
            r.status, VIRTIO_BLK_S_UNSUPP,
            "RO device must reject T_OUT with UNSUPP"
        );
        std::fs::remove_file(path).ok();
    }
}