supermachine 0.7.70

// virtio-fs device emulation. Pairs the FUSE protocol (see
// `crate::fuse`) with the virtio queue plumbing in this directory.
//
// Layout (virtio spec §5.11):
//   - device id   = 26 (VIRTIO_ID_FS)
//   - num_queues  = 1 (hiprio) + N (request). We start with N=1; the
//                   guest's virtiofs driver only uses the first request
//                   queue unless we negotiate the multiqueue feature.
//   - config space:
//        +0  tag[36]      — UTF-8, NUL-padded, identifies the mount
//                            point name as seen by the guest's
//                            `mount -t virtiofs <tag> /target`.
//        +36 num_queues   — u32, the number of request queues
//                            (excluding hiprio).
//   - shared-memory region 0 = DAX window. The guest's virtiofs driver
//     issues FUSE_SETUPMAPPING / FUSE_REMOVEMAPPING requests on the
//     request queue; we translate those into `hv_vm_map` calls into
//     this window.
//
// This file holds the FRAME of the device — ID, config, queues, the
// SHM-region descriptor — plus the queue-draining glue: `notify` routes
// a kick to `drain_hiprio_queue` or `drain_request_queue`, and the latter
// dispatches each request through `FuseServer`. Actual FUSE handling
// lives in `crate::fuse::server`.

#![allow(dead_code)]

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex};

use crate::fuse::{
    build_inval_entry, build_inval_inode, FsBackend, FuseServer, InHeader, MemoryFs, Notifier,
};

use super::queue::Queue;
use super::{ShmRegion, VirtioDevice, VIRTIO_ID_FS};

/// Feature bits this device offers to the guest.
///
/// Only `VIRTIO_F_VERSION_1` (bit 32, "modern virtio") is set. The
/// upstream Linux virtio-fs driver (5.4+) doesn't currently expose any
/// device-specific feature bits; this constant is where they would be
/// OR-ed in.
const FEATURES: u64 = 1 << 32;

/// Size in bytes of the tag field in config space (virtio spec §5.11.4).
const TAG_LEN: usize = 36;

/// Default DAX window size: 8 GiB of guest physical address space.
///
/// Plenty for current workloads and effectively free at rest — only
/// stage-2 page table entries materialize when SETUPMAPPING actually
/// maps a region into the window.
pub const DEFAULT_DAX_WINDOW_BYTES: u64 = 8 << 30;

/// Construction parameters for [`VirtioFs`].
///
/// The values are validated (by assertion) in `VirtioFs::with_backend`.
#[derive(Clone, Debug)]
pub struct VirtioFsConfig {
    /// Mount-point tag the guest will see (e.g. "shared", "myapp").
    /// Must be SHORTER than the 36-byte config field (i.e. at most 35
    /// bytes of UTF-8) — `with_backend` asserts `len < TAG_LEN`, so at
    /// least one NUL pad byte always follows the tag.
    pub tag: String,
    /// Number of request queues to expose. Must be ≥ 1; defaults to 1
    /// elsewhere. We don't currently advertise multiqueue, so the
    /// guest only kicks queue index 1 (request) and 0 (hiprio).
    pub num_request_queues: u32,
    /// Guest physical base of the DAX window. Must be 16 KiB aligned
    /// and not overlap any RAM, MMIO, or other device region (only the
    /// alignment is checked in this file).
    pub dax_window_gpa: u64,
    /// Length of the DAX window in bytes. Must be 16 KiB aligned. A
    /// length of zero means "no DAX": `shm_regions` then advertises no
    /// shared-memory region at all.
    pub dax_window_len: u64,
}

/// The virtio-fs device.
///
/// Holds the queues (handed in by `MmioVirtio::activate`), the FUSE
/// server that requests are dispatched to, and the DAX window
/// descriptor we advertise to the guest via `shm_regions()`.
///
/// Lock order: `queues` is always taken BEFORE `notif_pool`.
pub struct VirtioFs {
    /// Immutable construction parameters (tag, queue count, DAX window).
    cfg: VirtioFsConfig,
    /// Virtqueues installed by `activate`. Index 0 is the hiprio queue,
    /// index 1 the (first) request queue.
    queues: Mutex<Vec<Queue>>,
    /// Set once `activate` has installed the queues; every drain /
    /// notification path bails out early while this is false.
    activated: AtomicBool,
    /// Callback used to raise the used-buffer interrupt; `None` until
    /// `set_irq_raise` is called.
    irq_raise: Mutex<Option<Arc<dyn Fn() + Send + Sync>>>,
    /// FUSE protocol handler. Dispatches decoded requests to per-op
    /// handlers.
    /// NOTE(review): an earlier revision of this comment said only
    /// FUSE_INIT and FUSE_DESTROY do anything meaningful and everything
    /// else replies -ENOSYS — confirm against `crate::fuse` whether that
    /// is still true.
    fuse: Mutex<FuseServer>,
    /// Pool of hipriority chains the guest gave us with WRITABLE
    /// descriptors only — i.e. empty buffers the guest stashed so
    /// the device can write notifications (FUSE_NOTIFY_INVAL_*) into
    /// them. Populated on hipri queue kick; drained by
    /// push_notification. Not part of the snapshot — see
    /// `reset_for_restore`.
    notif_pool: Mutex<Vec<HeldChain>>,
}

/// One popped hipri chain held for notification use. We have to
/// hold (head, chain) so we can later call `q.add_used(head, len)`
/// when the notification is written.
struct HeldChain {
    /// Head descriptor index as returned by `pop_chain`; passed back to
    /// `add_used` when the chain is completed.
    head: u16,
    /// Copied descriptors of the chain (address, length, flags) that the
    /// notification bytes are written into.
    chain: Vec<super::queue::Desc>,
}

impl VirtioFs {
    /// Construct with a default `MemoryFs` backend. Used by tests +
    /// the not-yet-wired-up VMM path. Real mounts call `with_backend`.
    pub fn new(cfg: VirtioFsConfig) -> Self {
        Self::with_backend(cfg, Arc::new(MemoryFs::new()))
    }

    /// Build a device on top of an explicit filesystem backend. This is
    /// the real-mount entry point: the backend is the only thing that
    /// differs between an in-memory test mount and a host bind-mount.
    ///
    /// # Panics
    ///
    /// Panics if the DAX window base or length is not 16 KiB aligned, if
    /// the tag does not fit the config field with at least one NUL pad
    /// byte, or if no request queue is configured.
    pub fn with_backend(cfg: VirtioFsConfig, backend: Arc<dyn FsBackend>) -> Self {
        // Granule that both the DAX window base and its length must honor.
        const DAX_ALIGN: u64 = 0x4000;

        let gpa = cfg.dax_window_gpa;
        assert!(
            gpa % DAX_ALIGN == 0,
            "DAX window GPA must be 16 KiB aligned (got {:#x})",
            gpa
        );
        let window_len = cfg.dax_window_len;
        assert!(
            window_len % DAX_ALIGN == 0,
            "DAX window len must be 16 KiB aligned (got {:#x})",
            window_len
        );
        let tag_bytes = cfg.tag.len();
        assert!(
            tag_bytes < TAG_LEN,
            "tag too long (max {} bytes, got {})",
            TAG_LEN - 1,
            tag_bytes
        );
        assert!(
            cfg.num_request_queues != 0,
            "must have at least 1 request queue"
        );

        // Queues and the IRQ callback arrive later (`activate`,
        // `set_irq_raise`); until then the device stays inert.
        let fuse = Mutex::new(FuseServer::new(backend));
        Self {
            cfg,
            queues: Mutex::new(Vec::new()),
            activated: AtomicBool::new(false),
            irq_raise: Mutex::new(None),
            fuse,
            notif_pool: Mutex::new(Vec::new()),
        }
    }

    /// Install the callback that raises the used-buffer interrupt.
    /// Replaces any previously installed callback.
    pub fn set_irq_raise(&self, f: Arc<dyn Fn() + Send + Sync>) {
        let mut slot = self.irq_raise.lock().unwrap();
        *slot = Some(f);
    }

    /// Drop device state that must not survive a pool cycle-restore.
    ///
    /// `notif_pool` stores descriptor-chain heads and copied descriptor
    /// addresses the guest offered for FUSE notifications, and it is NOT
    /// captured in the snapshot. If it were left alone, a restored worker
    /// would inherit the PRIOR cycle's chains: a later
    /// `push_notification` would write into a GPA the restored guest has
    /// since repurposed and `add_used` a descriptor index the current
    /// guest never offered — guest memory corruption plus a desynced
    /// used ring. This mirrors the `serial`/`balloon`
    /// `reset_for_restore` hooks and is invoked from the runner's restore
    /// path. It takes only the `notif_pool` lock, so the documented
    /// `queues → notif_pool` lock order is preserved.
    pub fn reset_for_restore(&self) {
        let mut pool = self.notif_pool.lock().unwrap();
        pool.clear();
    }

    /// Access the FUSE server. Used by the VMM device-set wiring to
    /// call `set_dax` once the DAX session is constructed (the session
    /// needs the same backend that this device was built with, which
    /// is private here, plus the HVF mapper which only the VMM has).
    ///
    /// Returns the mutex itself, not a guard: the caller decides when to
    /// lock. `drain_request_queue` locks the same mutex for each
    /// dispatched request.
    pub fn fuse_server(&self) -> &Mutex<FuseServer> {
        &self.fuse
    }

    /// Drain the hipriority queue. Two classes of work:
    ///   (a) FORGET / INTERRUPT requests from the guest (readable
    ///       descriptors). Ack with 0 — our backend's forget is a
    ///       no-op anyway.
    ///   (b) Notification slots (non-empty, writable-only chains). Stash
    ///       them in `notif_pool` so `push_notification` can fill one
    ///       when a host change fires.
    ///
    /// Anything that is not a usable notification slot is returned to the
    /// guest immediately with a used length of 0. That covers chains with
    /// a readable descriptor, EMPTY chains (which could never carry a
    /// notification), and writable-only chains offered once the pool
    /// already holds a ring's worth of slots.
    ///
    /// Does nothing until the device is activated and the hiprio queue is
    /// ready. The used-buffer IRQ is raised once, after the `queues` lock
    /// has been released, if at least one chain was returned.
    fn drain_hiprio_queue(&self) {
        if !self.activated.load(Ordering::Acquire) {
            return;
        }
        let mut qs = self.queues.lock().unwrap();
        let Some(hi) = qs.get_mut(0) else { return };
        if !hi.ready {
            return;
        }
        // A well-behaved guest cannot have more chains outstanding than
        // the ring has descriptors, so the ring size bounds the pool.
        // Without the bound a hostile guest could re-offer the same
        // writable chain forever (we never return stashed chains on our
        // own) and grow `notif_pool` without limit.
        // NOTE(review): assumes `Queue::size` is the ring's descriptor
        // count — confirm against `super::queue`.
        let pool_cap = hi.size as usize;
        let mut acked_request = false;
        while let Some((head, chain)) = hi.pop_chain() {
            // Only a NON-EMPTY chain made purely of writable descriptors
            // can hold a notification. An empty chain has no readable
            // descriptor either, so it must be excluded explicitly.
            let writable_only = !chain.is_empty()
                && chain
                    .iter()
                    .all(|d| d.flags & super::queue::VRING_DESC_F_WRITE != 0);
            let stashed_len = if writable_only {
                // Lock order: queues (held above) BEFORE notif_pool.
                let mut pool = self.notif_pool.lock().unwrap();
                if pool.len() < pool_cap {
                    pool.push(HeldChain { head, chain });
                    Some(pool.len())
                } else {
                    None
                }
            } else {
                None
            };
            match stashed_len {
                // The pool guard is already released here; the length was
                // captured while it was held.
                Some(len) => log_notify(|| format!("hipri stashed: pool size now {}", len)),
                None => {
                    // FORGET / INTERRUPT, an empty chain, or pool overflow
                    // — consume and ack with no bytes written.
                    hi.add_used(head, 0);
                    acked_request = true;
                }
            }
        }
        drop(qs);
        if acked_request {
            if let Some(f) = self.irq_raise.lock().unwrap().clone() {
                f();
            }
        }
    }

    /// Push a notification payload onto the hipriority queue.
    ///
    /// Returns `true` only if a held buffer was available AND the whole
    /// payload fit into its writable descriptors. Returns `false` in
    /// three cases:
    ///   - the device is not activated or has no hiprio queue;
    ///   - the pool is empty (notification dropped — the guest's
    ///     virtio-fs driver should re-add buffers shortly);
    ///   - the held chain was too small, in which case the truncated
    ///     prefix is still written and the chain is still completed with
    ///     the number of bytes actually written.
    ///
    /// Whenever a chain is completed the used-buffer IRQ is raised after
    /// the `queues` lock has been released.
    ///
    /// Used by `Notifier` impl to deliver FUSE_NOTIFY_INVAL_* to
    /// the guest's virtio-fs driver.
    fn push_notification(&self, bytes: &[u8]) -> bool {
        if !self.activated.load(Ordering::Acquire) {
            return false;
        }
        // Lock order: queues BEFORE notif_pool. `drain_hiprio_queue`
        // holds queues while pushing into notif_pool, so the watcher
        // thread (which calls this function) MUST acquire queues
        // first to avoid lock-order inversion deadlock. Pre-0.7.7
        // this function locked notif_pool first, then queues — a
        // classic AB/BA inversion that could deadlock under heavy
        // load (Vite-style "many small writes triggering many
        // NOTE_WRITE → INVAL_INODE notifications WHILE the guest is
        // also draining its hipriority queue rapidly"). Holding
        // queues across the pool pop is a tiny extra critical
        // section in the happy path and is the same hold-duration
        // drain_hiprio_queue already takes.
        let mut qs = self.queues.lock().unwrap();
        // Resolve the hiprio queue BEFORE taking a chain out of the pool:
        // popping first would drop the chain (never `add_used`, never
        // usable again) on the no-queue early return.
        let Some(q) = qs.get_mut(0) else { return false };
        let Some(held) = self.notif_pool.lock().unwrap().pop() else {
            return false;
        };
        // Scatter the payload across the chain's writable descriptors,
        // never writing more than each descriptor's declared length.
        let mut written = 0usize;
        for d in held
            .chain
            .iter()
            .filter(|d| d.flags & super::queue::VRING_DESC_F_WRITE != 0)
        {
            if written >= bytes.len() {
                break;
            }
            let take = (bytes.len() - written).min(d.len as usize);
            q.mem.write_slice(d.addr, &bytes[written..written + take]);
            written += take;
        }
        // `written` never exceeds `bytes.len()`, which is a small
        // notification payload, so the cast cannot truncate in practice.
        q.add_used(held.head, written as u32);
        drop(qs);
        if let Some(f) = self.irq_raise.lock().unwrap().clone() {
            f();
        }
        // Report success only when the guest received the full payload.
        written == bytes.len()
    }

    /// Service every chain currently available on the request queue.
    ///
    /// Per chain:
    ///   1. Gather the readable descriptors (InHeader + payload) into one
    ///      buffer, bounded by `MAX_REQUEST_BYTES`
    ///   2. Hand header + payload to `FuseServer::dispatch`
    ///   3. Scatter the reply bytes over the writable descriptors
    ///   4. Complete the chain with the number of reply bytes written
    ///
    /// Over-cap and shorter-than-InHeader requests are logged and
    /// completed with length 0 without being dispatched. The used-buffer
    /// IRQ is raised once for the whole batch, after the `queues` lock is
    /// released. No-op until the device is activated and the queue ready.
    fn drain_request_queue(&self) {
        // FUSE requests are bounded by max_write (negotiated at FUSE_INIT,
        // default 1 MiB) plus InHeader and opcode headers. 8 MiB is a
        // ceiling a well-behaved guest never reaches, yet it keeps a
        // hostile guest from forcing huge Vec allocations through crafted
        // chains whose descriptors each claim `len = u32::MAX`.
        const MAX_REQUEST_BYTES: usize = 8 * 1024 * 1024;

        if !self.activated.load(Ordering::Acquire) {
            return;
        }
        let mut queues = self.queues.lock().unwrap();
        // Index 0 is hiprio; the request queue is index 1.
        let Some(q) = queues.get_mut(1) else { return };
        if !q.ready {
            return;
        }

        let hdr_size = core::mem::size_of::<InHeader>();
        let mut any = false;
        while let Some((head, chain)) = q.pop_chain() {
            // Evaluates to the byte count reported in the used ring; the
            // rejection paths break out early with 0.
            let used_len: u32 = 'serve: {
                // The shared helper yields None when the readable bytes
                // exceed the cap (no unbounded host allocation); an empty
                // chain yields an empty buffer, which the short-request
                // check below rejects.
                let Some(in_bytes) =
                    super::queue::read_readable_capped(&chain, &q.mem, MAX_REQUEST_BYTES)
                else {
                    eprintln!(
                        "[virtio-fs] request chain exceeds {MAX_REQUEST_BYTES} byte cap; rejecting"
                    );
                    break 'serve 0;
                };
                if in_bytes.len() < hdr_size {
                    eprintln!(
                        "[virtio-fs] short request: {} bytes < InHeader ({hdr_size})",
                        in_bytes.len()
                    );
                    break 'serve 0;
                }
                let (hdr_bytes, payload) = in_bytes.split_at(hdr_size);
                // SAFETY: `hdr_bytes` is exactly `size_of::<InHeader>()`
                // initialized bytes (length checked above) and
                // `read_unaligned` imposes no alignment requirement.
                // Assumes every bit pattern is a valid `InHeader`
                // (plain integer fields).
                let hdr: InHeader =
                    unsafe { core::ptr::read_unaligned(hdr_bytes.as_ptr() as *const InHeader) };

                // The FUSE lock is a statement-scoped temporary: it is held
                // for the dispatch call only and released before the reply
                // is written back.
                let reply = self.fuse.lock().unwrap().dispatch(&hdr, payload);

                // Bounded scatter — never writes past a short chain.
                super::queue::write_writable(&chain, &q.mem, &reply.bytes) as u32
            };
            q.add_used(head, used_len);
            any = true;
        }
        drop(queues);
        if any {
            let raise = self.irq_raise.lock().unwrap().clone();
            if let Some(f) = raise {
                f();
            }
        }
    }
}

/// Delivers host-side change notifications to the guest by writing
/// FUSE_NOTIFY_INVAL_* messages into held hiprio buffers. Delivery is
/// best-effort: the outcome is only traced via `log_notify`.
impl Notifier for VirtioFs {
    /// Tell the guest to invalidate cached data of `nodeid` for the given
    /// offset/length range.
    fn invalidate_inode(&self, nodeid: u64, off: i64, len: i64) {
        let pushed = self.push_notification(&build_inval_inode(nodeid, off, len));
        log_notify(|| {
            let pool_left = self.notif_pool.lock().unwrap().len();
            format!(
                "INVAL_INODE nodeid={nodeid} off={off} len={len} pushed={pushed} pool_left={pool_left}"
            )
        });
    }

    /// Tell the guest to invalidate the directory entry `name` under
    /// `parent_nodeid`.
    fn invalidate_entry(&self, parent_nodeid: u64, name: &[u8]) {
        let pushed = self.push_notification(&build_inval_entry(parent_nodeid, name));
        log_notify(|| {
            // Names that are not valid UTF-8 are traced as a placeholder.
            let shown = std::str::from_utf8(name).unwrap_or("<bytes>");
            let pool_left = self.notif_pool.lock().unwrap().len();
            format!(
                "INVAL_ENTRY parent={parent_nodeid} name={shown:?} pushed={pushed} pool_left={pool_left}"
            )
        });
    }
}

/// Emit one trace line for the notification path, if FUSE tracing is on.
///
/// `make_msg` is evaluated only when `crate::trace::fuse_target()`
/// returns a target, so callers pay nothing for formatting when tracing
/// is off. A target of `"1"` or `"stderr"` prints to stderr; any other
/// value is treated as a file path that is opened in append mode
/// (created if missing). Failures to open or write are ignored.
fn log_notify<F: FnOnce() -> String>(make_msg: F) {
    use std::io::Write;

    let target = match crate::trace::fuse_target() {
        Some(t) => t,
        None => return,
    };
    let line = make_msg();
    let sink = target.to_string_lossy().into_owned();
    match sink.as_str() {
        "1" | "stderr" => eprintln!("[virtio-fs] {line}"),
        path => {
            let opened = std::fs::OpenOptions::new()
                .create(true)
                .append(true)
                .open(path);
            if let Ok(mut file) = opened {
                let _ = writeln!(file, "[virtio-fs] {line}");
            }
        }
    }
}

impl VirtioDevice for VirtioFs {
    /// virtio device id of a file-system device (`VIRTIO_ID_FS`).
    fn device_id(&self) -> u32 {
        VIRTIO_ID_FS
    }

    /// Total queue count: the hiprio queue plus the configured number of
    /// request queues.
    fn num_queues(&self) -> usize {
        1 + self.cfg.num_request_queues as usize
    }

    /// Feature bits offered to the guest (see `FEATURES`).
    fn features(&self) -> u64 {
        FEATURES
    }

    /// Device config space: the NUL-padded tag followed by the request
    /// queue count as a little-endian u32.
    fn config(&self) -> Vec<u8> {
        // tag[36] || num_queues:u32
        let mut buf = vec![0u8; TAG_LEN + 4];
        let tag_bytes = self.cfg.tag.as_bytes();
        // `with_backend` already asserts the tag is shorter than TAG_LEN;
        // the clamp keeps this copy in bounds regardless.
        let take = tag_bytes.len().min(TAG_LEN - 1);
        buf[..take].copy_from_slice(&tag_bytes[..take]);
        // num_queues is the number of REQUEST queues (excluding hiprio).
        buf[TAG_LEN..TAG_LEN + 4].copy_from_slice(&self.cfg.num_request_queues.to_le_bytes());
        buf
    }

    /// Handle a queue kick from the guest. Queue 0 is the hiprio queue;
    /// every other index is routed to `drain_request_queue`, which
    /// services queue index 1 only.
    /// NOTE(review): if more than one request queue is ever configured,
    /// kicks on queues ≥ 2 will not drain those queues.
    fn notify(&self, q: u16) {
        match q {
            0 => self.drain_hiprio_queue(),
            _ => self.drain_request_queue(),
        }
    }

    /// Install the virtqueues and mark the device live.
    ///
    /// Any notification chains held from a previous activation are
    /// dropped. They were popped from the OLD queues, so completing them
    /// against the new ones would write to guest addresses and `add_used`
    /// descriptor heads the guest never offered on these queues — the
    /// same hazard `reset_for_restore` guards against.
    fn activate(&self, queues: Vec<Queue>) {
        {
            // Lock order: queues BEFORE notif_pool.
            let mut qs = self.queues.lock().unwrap();
            *qs = queues;
            self.notif_pool.lock().unwrap().clear();
        }
        self.activated.store(true, Ordering::Release);
        eprintln!(
            "[virtio-fs] activated: tag={:?} req_queues={} dax_window={:#x}..{:#x} ({} MiB)",
            self.cfg.tag,
            self.cfg.num_request_queues,
            self.cfg.dax_window_gpa,
            // Saturate: base + len is not validated at construction and a
            // log line must never panic on overflow.
            self.cfg.dax_window_gpa.saturating_add(self.cfg.dax_window_len),
            self.cfg.dax_window_len / (1024 * 1024),
        );
    }

    /// Clone the current queue state (e.g. for a snapshot). `notif_pool`
    /// is deliberately not included.
    fn snapshot_queues(&self) -> Vec<Queue> {
        self.queues.lock().unwrap().clone()
    }

    /// Shared-memory regions to advertise: region 0 is the DAX window.
    fn shm_regions(&self) -> Vec<ShmRegion> {
        // A zero-length window means "no DAX" — advertise no SHM region at all so
        // the guest mounts virtio-fs without a DAX cache (the KVM non-DAX path).
        if self.cfg.dax_window_len == 0 {
            return Vec::new();
        }
        vec![ShmRegion {
            id: 0,
            gpa: self.cfg.dax_window_gpa,
            len: self.cfg.dax_window_len,
        }]
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Config space is `tag[36] || num_queues:u32le`, tag NUL-padded.
    #[test]
    fn config_layout() {
        let dev = VirtioFs::new(VirtioFsConfig {
            tag: "shared".into(),
            num_request_queues: 1,
            dax_window_gpa: 0x80_0000_0000,
            dax_window_len: DEFAULT_DAX_WINDOW_BYTES,
        });
        let bytes = dev.config();
        assert_eq!(bytes.len(), 40);

        let (tag_field, num_queues) = bytes.split_at(36);
        // The tag occupies the front of the field...
        assert_eq!(&tag_field[..6], b"shared");
        // ...and everything after it is zero padding.
        assert!(tag_field[6..].iter().all(|&b| b == 0));
        // num_queues sits at offset 36 and equals 1.
        assert_eq!(num_queues, &1u32.to_le_bytes());
    }

    /// A non-empty DAX window is advertised as exactly one SHM region
    /// (id 0) mirroring the configured base and length.
    #[test]
    fn shm_region_advertised() {
        let cfg = VirtioFsConfig {
            tag: "t".into(),
            num_request_queues: 1,
            dax_window_gpa: 0x100_0000_0000,
            dax_window_len: 0x4000,
        };
        let regions = VirtioFs::new(cfg).shm_regions();
        assert_eq!(regions.len(), 1);
        let window = &regions[0];
        assert_eq!(window.id, 0);
        assert_eq!(window.gpa, 0x100_0000_0000);
        assert_eq!(window.len, 0x4000);
    }

    #[test]
    fn reset_for_restore_clears_notif_pool() {
        // Regression guard. notif_pool is NOT captured in the snapshot, so
        // a RESTORE-recycled pool worker that skipped the explicit clear
        // would keep the previous cycle's HeldChain entries: descriptor
        // heads and guest-physical addresses the restored guest never
        // offered. push_notification would then write into a repurposed
        // GPA and add_used a bogus index (guest memory corruption plus a
        // desynced used ring). reset_for_restore has to discard them.
        let dev = VirtioFs::new(VirtioFsConfig {
            tag: "t".into(),
            num_request_queues: 1,
            dax_window_gpa: 0x100_0000_0000,
            dax_window_len: 0x4000,
        });
        let pool_len = |d: &VirtioFs| d.notif_pool.lock().unwrap().len();

        // Pretend the guest offered one hipri notification chain last cycle.
        let stale = HeldChain {
            head: 7,
            chain: Vec::new(),
        };
        dev.notif_pool.lock().unwrap().push(stale);
        assert_eq!(pool_len(&dev), 1);

        dev.reset_for_restore();

        assert!(
            pool_len(&dev) == 0,
            "reset_for_restore must clear stale cross-cycle notification chains"
        );
    }

    // === Malformed-FUSE-over-the-wire integration ======================
    //
    // The proptest in fuse::server fuzzes `dispatch` directly. This goes
    // one layer out: it drives the FULL device path a hostile guest
    // actually hits — pop_chain → read the request off a real
    // descriptor chain in GuestMem → dispatch → write the reply back
    // across the writable descriptors → add_used — with arbitrary,
    // malformed FUSE bytes. It locks in that the device never panics or
    // reads/writes out of bounds, consumes each chain exactly once, and
    // frames every reply within the guest-provided buffer.
    use super::super::queue::{GuestMem, VRING_DESC_F_NEXT, VRING_DESC_F_WRITE};
    use crate::fuse::{InitIn, OutHeader, FUSE_KERNEL_VERSION};
    use proptest::prelude::*;

    // Guest-physical base and size of the fake guest-memory window the
    // harness builds for each request.
    const MEM_BASE: u64 = 0x10_0000;
    const MEM_LEN: usize = 64 * 1024;
    // Offsets (relative to MEM_BASE) of the vring pieces and the two data
    // buffers inside that window.
    const O_DESC: u64 = 0x000; // descriptor table
    const O_AVAIL: u64 = 0x800; // avail ring
    const O_USED: u64 = 0x1000; // used ring
    const O_REQ: u64 = 0x2000; // readable request buffer
    const O_REPLY: u64 = 0x3000; // writable reply buffer
    // Length declared on the writable reply descriptor.
    const REPLY_CAP: u32 = 4096;

    /// What `drive_request` observed after the device processed one chain.
    struct DriveOut {
        /// Used-ring `idx` after the drain (1 = exactly one chain completed).
        used_idx: u16,
        /// `id` of used-ring entry 0 — the head of the completed chain.
        used_id: u32,
        /// `len` of used-ring entry 0 — bytes the device reports written.
        written: u32,
        /// The first `min(written, REPLY_CAP)` bytes of the reply buffer.
        reply: Vec<u8>,
    }

    /// Drive ONE request through the real device. Builds a guest-memory
    /// window with a valid two-descriptor chain (readable request +
    /// writable reply), installs it as the request queue (index 1), and
    /// calls `drain_request_queue`. `desc_len_override` lets a test claim
    /// a descriptor length larger than the bytes actually present, to
    /// exercise the 8 MiB over-cap rejection without the harness itself
    /// allocating anything huge.
    ///
    /// Returns the used-ring outcome (index, id, length) together with the
    /// reply bytes read back from the writable buffer.
    fn drive_request(request: &[u8], desc_len_override: Option<u32>) -> DriveOut {
        let mut backing = vec![0u8; MEM_LEN];
        let mem = GuestMem::new(backing.as_mut_ptr(), MEM_BASE, MEM_LEN);

        // Request bytes into the readable region (clamped to the window).
        let n = request.len().min(MEM_LEN - O_REQ as usize);
        mem.write_slice(MEM_BASE + O_REQ, &request[..n]);

        // Descriptor layout written below: addr u64 @+0, len u32 @+8,
        // flags u16 @+12, next u16 @+14 (16 bytes per descriptor).
        // desc[0] readable: the request.
        let d0 = MEM_BASE + O_DESC;
        mem.write_u64(d0, MEM_BASE + O_REQ);
        mem.write_u32(d0 + 8, desc_len_override.unwrap_or(request.len() as u32));
        mem.write_u16(d0 + 12, VRING_DESC_F_NEXT);
        mem.write_u16(d0 + 14, 1);
        // desc[1] writable: the reply buffer.
        let d1 = MEM_BASE + O_DESC + 16;
        mem.write_u64(d1, MEM_BASE + O_REPLY);
        mem.write_u32(d1 + 8, REPLY_CAP);
        mem.write_u16(d1 + 12, VRING_DESC_F_WRITE);
        mem.write_u16(d1 + 14, 0);
        // avail ring: ring[0] = head 0; idx = 1.
        mem.write_u16(MEM_BASE + O_AVAIL + 4, 0);
        mem.write_u16(MEM_BASE + O_AVAIL + 2, 1);

        // Point a ready queue at the rings laid out above.
        let mut req_q = Queue::new(mem.clone());
        req_q.size = 8;
        req_q.ready = true;
        req_q.desc_table = MEM_BASE + O_DESC;
        req_q.avail_ring = MEM_BASE + O_AVAIL;
        req_q.used_ring = MEM_BASE + O_USED;

        let dev = VirtioFs::new(VirtioFsConfig {
            tag: "t".into(),
            num_request_queues: 1,
            dax_window_gpa: 0x100_0000_0000,
            dax_window_len: 0x4000,
        });
        // Install the queues and flip `activated` by hand instead of going
        // through `activate`.
        {
            let mut qs = dev.queues.lock().unwrap();
            // index 0 = hiprio, left not-ready so drain ignores it.
            qs.push(Queue::new(mem.clone()));
            // index 1 = request queue.
            qs.push(req_q);
        }
        dev.activated.store(true, Ordering::Release);

        dev.drain_request_queue();

        // Read the used-ring outcome + the reply bytes back out.
        // Used ring as read here: idx u16 @+2, entry 0 = { id u32 @+4,
        // len u32 @+8 }.
        let used_idx = mem.read_u16(MEM_BASE + O_USED + 2);
        let used_id = mem.read_u32(MEM_BASE + O_USED + 4);
        let written = mem.read_u32(MEM_BASE + O_USED + 8);
        let mut reply = vec![0u8; written.min(REPLY_CAP) as usize];
        mem.read_slice(MEM_BASE + O_REPLY, &mut reply);
        // `backing` is owned here and dropped only on return — `mem`'s
        // raw pointer stays valid for every access above.
        DriveOut {
            used_idx,
            used_id,
            written,
            reply,
        }
    }

    /// Serialize an `InHeader` for a request carrying `payload_len` payload
    /// bytes. `len` is set to header size + payload size; uid/gid/pid and
    /// padding are zero. Returns the header's in-memory bytes (native
    /// layout), without the payload.
    fn in_header_bytes(opcode: u32, unique: u64, nodeid: u64, payload_len: usize) -> Vec<u8> {
        let header_len = core::mem::size_of::<InHeader>();
        let hdr = InHeader {
            len: (header_len + payload_len) as u32,
            opcode,
            unique,
            nodeid,
            uid: 0,
            gid: 0,
            pid: 0,
            padding: 0,
        };
        // SAFETY: `hdr` is a live local for the whole borrow and the slice
        // covers exactly `size_of::<InHeader>()` bytes of it; the bytes are
        // copied out before `hdr` goes out of scope.
        let raw =
            unsafe { std::slice::from_raw_parts((&hdr as *const InHeader).cast::<u8>(), header_len) };
        raw.to_vec()
    }

    #[test]
    fn wire_valid_init_round_trips() {
        // A well-formed FUSE_INIT has to survive the whole device path:
        // error == 0 and a reply whose declared length equals what was
        // actually written.
        let init = InitIn {
            major: FUSE_KERNEL_VERSION,
            minor: 0,
            max_readahead: 0,
            flags: 0,
            flags2: 0,
            unused: [0; 11],
        };
        // SAFETY: `init` outlives the slice and the slice spans exactly
        // `size_of::<InitIn>()` bytes of it.
        let init_bytes = unsafe {
            std::slice::from_raw_parts(
                (&init as *const InitIn).cast::<u8>(),
                core::mem::size_of::<InitIn>(),
            )
        };
        let mut req = in_header_bytes(/* FUSE_INIT */ 26, 1, 0, init_bytes.len());
        req.extend_from_slice(init_bytes);

        let DriveOut {
            used_idx,
            used_id,
            written,
            reply,
        } = drive_request(&req, None);

        assert_eq!(used_idx, 1, "chain must be consumed once");
        assert_eq!(used_id, 0);
        assert!(written as usize >= core::mem::size_of::<OutHeader>());
        // OutHeader starts with `len: u32` followed by `error: i32`.
        let declared = u32::from_le_bytes(reply[0..4].try_into().unwrap());
        let error = i32::from_le_bytes(reply[4..8].try_into().unwrap());
        assert_eq!(declared, written, "reply len must match bytes written");
        assert_eq!(error, 0, "INIT should succeed");
    }

    #[test]
    fn wire_short_request_is_rejected_cleanly() {
        // Eight bytes cannot hold an InHeader: the device must complete
        // the chain with length 0 and never dispatch — no panic, no reply.
        let too_short = [0u8; 8];
        let DriveOut {
            used_idx, written, ..
        } = drive_request(&too_short, None);
        assert_eq!(used_idx, 1);
        assert_eq!(written, 0, "short request must produce no reply");
    }

    #[test]
    fn wire_oversized_descriptor_is_capped_not_allocated() {
        // The readable descriptor claims 9 MiB, above the 8 MiB request
        // cap. The chain must be rejected BEFORE any allocation/read,
        // which shows the cap shields the host from a crafted
        // huge-length descriptor.
        let claimed_len: u32 = 9 * 1024 * 1024;
        let req = in_header_bytes(26, 1, 0, 0);
        let DriveOut {
            used_idx, written, ..
        } = drive_request(&req, Some(claimed_len));
        assert_eq!(used_idx, 1);
        assert_eq!(written, 0, "over-cap chain must be rejected with no reply");
    }

    proptest! {
        // 512 generated cases per run.
        #![proptest_config(ProptestConfig::with_cases(512))]

        /// Arbitrary opcode + arbitrary payload, framed as a real request
        /// on a real descriptor chain. The device must never panic or go
        /// out of bounds, must consume the chain exactly once, must never
        /// write past the writable descriptor, and — because a full
        /// InHeader is always present — must emit a reply whose declared
        /// length is at least an OutHeader and never undershoots the
        /// bytes it actually wrote (no wire length-confusion).
        #[test]
        fn wire_arbitrary_fuse_request_is_safe(
            opcode in any::<u32>(),
            unique in any::<u64>(),
            // Mix node id 1 in alongside fully random ids.
            nodeid in prop_oneof![Just(1u64), any::<u64>()],
            payload in proptest::collection::vec(any::<u8>(), 0..1024),
        ) {
            // The header's `len` field is consistent with the payload; only
            // the opcode and payload contents are hostile.
            let mut req = in_header_bytes(opcode, unique, nodeid, payload.len());
            req.extend_from_slice(&payload);

            let out = drive_request(&req, None);
            // Exactly one chain completed, and it is the one we offered.
            prop_assert_eq!(out.used_idx, 1);
            prop_assert_eq!(out.used_id, 0);
            prop_assert!(out.written <= REPLY_CAP, "wrote past writable descriptor");
            prop_assert!(
                out.written as usize >= core::mem::size_of::<OutHeader>(),
                "reply smaller than OutHeader ({} bytes)",
                out.written,
            );
            // First four reply bytes are OutHeader.len (little-endian).
            let declared = u32::from_le_bytes(out.reply[0..4].try_into().unwrap());
            prop_assert!(
                declared >= out.written,
                "OutHeader.len {} undershoots written {}",
                declared, out.written,
            );
        }
    }
}