// supermachine 0.7.76

// virtio-fs device emulation. Pairs the FUSE protocol (see
// `crate::fuse`) with the virtio queue plumbing in this directory.
//
// Layout (virtio spec §5.11):
//   - device id   = 26 (VIRTIO_ID_FS)
//   - num_queues  = 1 (hiprio) + N (request). We start with N=1; the
//                   guest's virtiofs driver only uses the first request
//                   queue unless we negotiate the multiqueue feature.
//   - config space:
//        +0  tag[36]      — UTF-8, NUL-padded, identifies the mount
//                            point name as seen by the guest's
//                            `mount -t virtiofs <tag> /target`.
//        +36 num_queues   — u32, the number of request queues
//                            (excluding hiprio).
//   - shared-memory region 0 = DAX window. The guest's virtiofs driver
//     issues FUSE_SETUPMAPPING / FUSE_REMOVEMAPPING requests on the
//     request queue; we translate those into `hv_vm_map` calls into
//     this window.
//
// Threading: the REQUEST queue is drained by a dedicated per-device
// I/O worker thread; the guest's QueueNotify doorbell just kicks it.
// Through 0.7.74 the drain ran synchronously inside the doorbell MMIO
// exit — every FUSE op (a host stat/open/read/write) blocked the
// kicking vCPU, so a fleet of VMs doing concurrent FUSE churn
// serialised all guest I/O behind vCPU exits (the pool>=6 virtio-fs
// "wedge"; the same architectural flaw is documented for the KVM
// rootfs-over-FUSE attempt in docs/design/kvm-virtiofs-dax-2026-06-07.md).
// The hiprio queue stays synchronous on the doorbell — it only stashes
// notification buffers / acks FORGET, no host syscalls.
//
// Because the worker writes guest RAM (reply bytes + used ring) from a
// non-vCPU thread, "all vCPUs parked" no longer implies "guest RAM is
// quiescent". Anything that captures or rewrites guest RAM wholesale
// (snapshot capture, restore RAM remap) must hold [`FsIoPauseGuard`] /
// `pause_io()` for the duration: the worker parks at a FUSE-chain
// boundary, so a request is always either still in the avail ring or
// fully answered (reply written + used-ring bumped) — never popped but
// unanswered, which would leave a restored guest waiting forever on a
// reply that no longer exists. `activate()` (which `MmioVirtio::
// restore_state` calls on DRIVER_OK) kicks the worker, so requests that
// WERE captured pending in the avail ring get serviced on restore.

#![allow(dead_code)]

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Condvar, Mutex};

use crate::fuse::{
    build_inval_entry, build_inval_inode, FsBackend, FuseServer, InHeader, MemoryFs, Notifier,
};

use super::queue::Queue;
use super::{ShmRegion, VirtioDevice, VIRTIO_ID_FS};

/// virtio-fs feature bits we offer.
///
/// `VIRTIO_F_VERSION_1` (bit 32) — modern virtio. It is the only bit
/// set, and this value is what `VirtioDevice::features` returns.
/// virtio-fs doesn't currently expose device-specific feature bits in
/// the upstream Linux driver (5.4+), but we keep room for them here.
const FEATURES: u64 = 1u64 << 32;

/// Size in bytes of the tag field in config space (virtio spec
/// §5.11.4). `with_backend` asserts that a tag is strictly shorter than
/// this, so the NUL-padded field always keeps at least one trailing NUL.
const TAG_LEN: usize = 36;

/// Default DAX window size. 8 GiB of guest physical address space —
/// plenty for current workloads and effectively free at rest (only
/// stage-2 page table entries materialize when SETUPMAPPING actually
/// maps a region into the window).
pub const DEFAULT_DAX_WINDOW_BYTES: u64 = 8 * 1024 * 1024 * 1024;

/// Construction parameters for a [`VirtioFs`] device. The constraints
/// noted per field are checked by assertions in
/// `VirtioFs::with_backend`.
#[derive(Clone, Debug)]
pub struct VirtioFsConfig {
    /// Mount-point tag the guest will see (e.g. "shared", "myapp").
    /// Must be strictly shorter than 36 bytes of UTF-8 (i.e. at most
    /// 35), so the 36-byte NUL-padded config field keeps a terminator.
    pub tag: String,
    /// Number of request queues to expose. Must be ≥ 1; defaults to 1
    /// elsewhere. We don't currently advertise multiqueue, so the
    /// guest only kicks queue index 1 (request) and 0 (hiprio).
    pub num_request_queues: u32,
    /// Guest physical base of the DAX window. Must be 16 KiB aligned
    /// and not overlap any RAM, MMIO, or other device region (the
    /// overlap requirement is the caller's responsibility; only the
    /// alignment is asserted).
    pub dax_window_gpa: u64,
    /// Length of the DAX window in bytes. Must be a multiple of 16 KiB.
    /// Zero means "no DAX": `shm_regions` then advertises no
    /// shared-memory region at all.
    pub dax_window_len: u64,
}

/// The virtio-fs device.
///
/// A thin facade over [`FsCore`] (the actual device state, shared with
/// the I/O worker thread) plus the worker's join handle. The facade is
/// what the wiring layers hold (`Arc<VirtioFs>` in `Vmm::fs_devices`,
/// `Arc<dyn VirtioDevice>` inside `MmioVirtio`); the worker holds only
/// `Arc<FsCore>`, so the worker can never be the thread that drops the
/// facade (whose `Drop` joins the worker — a self-join would deadlock).
pub struct VirtioFs {
    /// Shared device state; the worker thread owns a second `Arc` to
    /// the same `FsCore`.
    core: Arc<FsCore>,
    /// I/O worker join handle; taken (exactly once) by `shutdown_io`.
    /// `None` afterwards, which is what makes `shutdown_io` idempotent.
    worker: Mutex<Option<std::thread::JoinHandle<()>>>,
}

/// Device state shared between the facade and the I/O worker thread.
struct FsCore {
    /// Immutable construction parameters.
    cfg: VirtioFsConfig,
    /// Virtqueues installed by `activate`. Index 0 is the hiprio queue,
    /// index 1 the request queue; empty until activation.
    queues: Mutex<Vec<Queue>>,
    /// Set (Release) by `activate` after the queues are installed. The
    /// drain and notification paths load it (Acquire) and do nothing
    /// while it is false.
    activated: AtomicBool,
    /// Used-buffer interrupt callback installed by `set_irq_raise`;
    /// `None` until then, in which case `raise_irq` is a no-op.
    irq_raise: Mutex<Option<Arc<dyn Fn() + Send + Sync>>>,
    /// FUSE protocol handler. Dispatches decoded requests to per-op
    /// handlers.
    fuse: Mutex<FuseServer>,
    /// Pool of hipriority chains the guest gave us with WRITABLE
    /// descriptors only — i.e. empty buffers the guest stashed so
    /// the device can write notifications (FUSE_NOTIFY_INVAL_*) into
    /// them. Populated on hipri queue kick; drained by
    /// push_notification. Lock order: `queues` BEFORE `notif_pool`.
    notif_pool: Mutex<Vec<HeldChain>>,
    /// Worker control + guest-RAM-write gate. Lock order: `io` and
    /// `queues` are never held at the same time. Taking one after the
    /// other has been released is fine (drain takes `io` first via
    /// try_enter_io, releases it, then takes `queues`) — the invariant
    /// is simply that neither lock is acquired while the other is held.
    io: Mutex<IoCtl>,
    /// Condition variable paired with `io`. Notified on every kick,
    /// stop request, final pause release, io-section exit, and at the
    /// end of each worker drain pass.
    io_cv: Condvar,
}

/// Worker/pause control state. Guarded by `FsCore::io`; waiters on any
/// of these fields block on `FsCore::io_cv`.
#[derive(Default)]
struct IoCtl {
    /// A request-queue doorbell arrived (or a restore re-activated the
    /// queues); the worker owes a drain pass. Also re-set by the drain
    /// itself when it bails out because of a pause or stop.
    kicked: bool,
    /// Worker is inside a drain pass (between `kicked` being consumed
    /// and the pass finishing). Lets `wait_io_idle` see the gap where
    /// `kicked` is already false but the pass hasn't started touching
    /// chains yet.
    draining: bool,
    /// The worker must exit; `try_enter_io` fails so no further guest
    /// RAM writes happen (also gates the PosixFs watcher's
    /// notifications). Never cleared once set.
    stop: bool,
    /// Outstanding `pause_io()` holds (nested pause is allowed — e.g.
    /// the runner's cycle-restore guard around `Vmm::restore_snapshot`'s
    /// own internal guard). While non-zero the worker does not start a
    /// drain pass and `try_enter_io` fails.
    pause_holds: u32,
    /// In-flight guest-RAM-writing sections: a FUSE chain being
    /// processed, or a notification being written. `pause_io` returns
    /// only once this is 0. Incremented by `try_enter_io`, decremented
    /// by `exit_io`.
    active_io: u32,
}

/// One popped hipri chain held for notification use. We have to
/// hold (head, chain) so we can later call `q.add_used(head, len)`
/// when the notification is written.
struct HeldChain {
    /// Head descriptor index returned by `pop_chain`; passed back to
    /// `add_used` when the chain is finally consumed.
    head: u16,
    /// Copy of the chain's descriptors (address, length, flags) taken
    /// at pop time; the notification bytes are written to these.
    chain: Vec<super::queue::Desc>,
}

/// RAII pause of a set of virtio-fs I/O workers. Construct around any
/// window that captures or rewrites guest RAM wholesale (snapshot
/// capture, restore RAM remap): each worker parks at a FUSE-chain
/// boundary for the guard's lifetime, so no reply bytes, used-ring
/// updates, notification writes, or IRQ raises land mid-window.
///
/// The pause lasts exactly as long as the guard value lives, so it must
/// be bound to a named variable (`let _guard = …`). A bare
/// `FsIoPauseGuard::new(..);` or `let _ = …` drops it on the spot and
/// resumes the workers immediately — hence `#[must_use]`.
#[must_use = "the workers resume as soon as the guard is dropped; bind it to a variable"]
pub struct FsIoPauseGuard<'a>(&'a [Arc<VirtioFs>]);

impl<'a> FsIoPauseGuard<'a> {
    /// Pause every device in `devices`, in slice order, blocking until
    /// each one has no guest-RAM-writing section in flight. The returned
    /// guard releases one pause hold per device when dropped.
    pub fn new(devices: &'a [Arc<VirtioFs>]) -> Self {
        devices.iter().for_each(|dev| dev.pause_io());
        Self(devices)
    }
}

impl Drop for FsIoPauseGuard<'_> {
    /// Release the pause hold taken on each device by `new`.
    fn drop(&mut self) {
        self.0.iter().for_each(|dev| dev.resume_io());
    }
}

impl VirtioFs {
    /// Construct with a default `MemoryFs` backend. Used by tests +
    /// the not-yet-wired-up VMM path. Real mounts call `with_backend`.
    pub fn new(cfg: VirtioFsConfig) -> Self {
        Self::with_backend(cfg, Arc::new(MemoryFs::new()))
    }

    /// Build the device around `backend` and start its I/O worker
    /// thread (named `sm-virtiofs-io`). This is the real-mount entry
    /// point; the backend is the only thing that differs between an
    /// in-memory test mount and a host bind-mount.
    ///
    /// # Panics
    ///
    /// Panics if `cfg` is invalid — the DAX window base or length is
    /// not a multiple of 16 KiB, the tag is 36 bytes or longer, or
    /// `num_request_queues` is zero — or if the worker thread cannot be
    /// spawned.
    pub fn with_backend(cfg: VirtioFsConfig, backend: Arc<dyn FsBackend>) -> Self {
        // 16 KiB alignment <=> the low 14 bits are clear.
        const ALIGN_16K_MASK: u64 = 0x3FFF;

        assert!(
            cfg.dax_window_gpa & ALIGN_16K_MASK == 0,
            "DAX window GPA must be 16 KiB aligned (got {:#x})",
            cfg.dax_window_gpa
        );
        assert!(
            cfg.dax_window_len & ALIGN_16K_MASK == 0,
            "DAX window len must be 16 KiB aligned (got {:#x})",
            cfg.dax_window_len
        );
        let tag_len = cfg.tag.len();
        assert!(
            tag_len < TAG_LEN,
            "tag too long (max {} bytes, got {})",
            TAG_LEN - 1,
            tag_len
        );
        assert!(
            cfg.num_request_queues >= 1,
            "must have at least 1 request queue"
        );

        // All validation is done before the thread is spawned, so a
        // rejected config never leaves a worker behind.
        let fuse = FuseServer::new(backend);
        let core = Arc::new(FsCore {
            cfg,
            queues: Mutex::new(Vec::new()),
            activated: AtomicBool::new(false),
            irq_raise: Mutex::new(None),
            fuse: Mutex::new(fuse),
            notif_pool: Mutex::new(Vec::new()),
            io: Mutex::new(IoCtl::default()),
            io_cv: Condvar::new(),
        });

        let worker = {
            // The worker gets its own handle on the core, never on the
            // facade.
            let core = Arc::clone(&core);
            std::thread::Builder::new()
                .name("sm-virtiofs-io".into())
                .spawn(move || FsCore::worker_loop(core))
                .expect("spawn virtio-fs io worker thread")
        };

        Self {
            core,
            worker: Mutex::new(Some(worker)),
        }
    }

    /// Install (or replace) the callback invoked to raise the device's
    /// used-buffer interrupt.
    pub fn set_irq_raise(&self, f: Arc<dyn Fn() + Send + Sync>) {
        let mut slot = self.core.irq_raise.lock().unwrap();
        *slot = Some(f);
    }

    /// Drop every stashed notification chain ahead of a pool
    /// cycle-restore.
    ///
    /// `notif_pool` records descriptor-chain heads and copied descriptor
    /// addresses the guest offered for FUSE notifications, and it is NOT
    /// part of the snapshot. If it survived a restore, a later
    /// `push_notification` would write into a GPA the restored guest has
    /// repurposed and `add_used` an index that guest never offered —
    /// guest memory corruption plus a used-ring desync. The original
    /// note says this mirrors the `serial`/`balloon` `reset_for_restore`
    /// hooks and is called from the runner's restore path.
    ///
    /// Takes only the `notif_pool` lock, so the `queues → notif_pool`
    /// lock order is not affected.
    pub fn reset_for_restore(&self) {
        let mut pool = self.core.notif_pool.lock().unwrap();
        pool.clear();
    }

    /// Access the FUSE server. Used by the VMM device-set wiring to
    /// call `set_dax` once the DAX session is constructed (the session
    /// needs the same backend that this device was built with, which
    /// is private here, plus the HVF mapper which only the VMM has).
    ///
    /// Returns the mutex itself rather than a guard. The request path
    /// takes this same lock around every `dispatch` call, so callers
    /// should hold it only briefly.
    pub fn fuse_server(&self) -> &Mutex<FuseServer> {
        &self.core.fuse
    }

    /// Take one pause hold and block until no guest-RAM-writing section
    /// is in flight, which leaves the I/O worker parked at a FUSE-chain
    /// boundary. Holds nest, so this is re-entrant; every call must be
    /// matched by a `resume_io` (or use [`FsIoPauseGuard`]).
    pub fn pause_io(&self) {
        let mut ctl = self.core.io.lock().unwrap();
        // Register the hold first: from here on `try_enter_io` refuses
        // new sections, so `active_io` can only fall.
        ctl.pause_holds += 1;
        let _quiesced = self
            .core
            .io_cv
            .wait_while(ctl, |c| c.active_io > 0)
            .unwrap();
    }

    /// Drop one `pause_io` hold. When the last hold goes away the
    /// worker is woken and re-drains any doorbell that arrived while it
    /// was paused. An unmatched call is absorbed (the count saturates
    /// at zero).
    pub fn resume_io(&self) {
        let mut ctl = self.core.io.lock().unwrap();
        let remaining = ctl.pause_holds.saturating_sub(1);
        ctl.pause_holds = remaining;
        if remaining == 0 {
            self.core.io_cv.notify_all();
        }
    }

    /// Block until the worker is idle: no kick pending, no drain pass
    /// running, and no io-section open. Test synchronization point for
    /// the async drain — a doorbell no longer implies the request was
    /// processed by the time `notify` returns. Do not call with
    /// `pause_io` held and a kick pending — that combination never goes
    /// idle.
    pub fn wait_io_idle(&self) {
        let ctl = self.core.io.lock().unwrap();
        let _idle = self
            .core
            .io_cv
            .wait_while(ctl, |c| c.kicked || c.draining || c.active_io > 0)
            .unwrap();
    }

    /// Stop + join the I/O worker and gate any further guest-RAM
    /// writes (including the PosixFs watcher's notification path).
    /// MUST be called before guest RAM is freed — `Vmm::drop` does
    /// this ahead of `MicroVm::drop`'s munmap, mirroring
    /// `vsock.shutdown()` (same use-after-free class: the worker
    /// writes reply bytes through the device's captured `GuestMem`
    /// pointer, which the `HOST_RAM_PTR` null-guard does not cover).
    /// Idempotent; also runs from `Drop` as a backstop.
    pub fn shutdown_io(&self) {
        {
            // Scoped so the `io` lock is released before joining: the
            // worker needs that lock to observe `stop` and exit.
            let mut io = self.core.io.lock().unwrap();
            io.stop = true;
            self.core.io_cv.notify_all();
        }
        // The guard temporary in the `if let` scrutinee stays alive for
        // the whole block, so the `worker` lock is held across `join`.
        // A concurrent `shutdown_io` caller therefore blocks here until
        // the worker has really exited, instead of seeing `None` and
        // returning early. Keep the lock and the join in one expression.
        if let Some(h) = self.worker.lock().unwrap().take() {
            // A worker panic surfaces as `Err`; it is deliberately
            // ignored so shutdown (and `Drop`) never re-panics.
            let _ = h.join();
        }
    }
}

// Backstop for callers that never invoked `shutdown_io` explicitly:
// stops and joins the I/O worker when the facade is dropped.
impl Drop for VirtioFs {
    fn drop(&mut self) {
        // Idempotent — if the join handle was already taken this only
        // re-sets `stop` and returns.
        self.shutdown_io();
    }
}

impl FsCore {
    /// Body of the I/O worker thread. Sleeps on `io_cv` until there is
    /// a kick to service and no pause hold is outstanding, runs one
    /// drain pass with the `io` lock released, and repeats. Returns as
    /// soon as `stop` is observed.
    fn worker_loop(core: Arc<FsCore>) {
        let mut ctl = core.io.lock().unwrap();
        while !ctl.stop {
            let runnable = ctl.kicked && ctl.pause_holds == 0;
            if !runnable {
                // Nothing to do, or paused: wait for a kick / resume /
                // stop notification and re-evaluate.
                ctl = core.io_cv.wait(ctl).unwrap();
                continue;
            }

            // Consume the kick and publish `draining` under the same
            // lock hold, so `wait_io_idle` never observes a gap.
            ctl.kicked = false;
            ctl.draining = true;
            drop(ctl);

            core.drain_request_queue();

            ctl = core.io.lock().unwrap();
            ctl.draining = false;
            // Wake wait_io_idle / pause waiters re-evaluating state.
            core.io_cv.notify_all();
        }
    }

    /// Record a request-queue doorbell (or a restore re-activation)
    /// and wake the worker.
    ///
    /// Kicks coalesce: `kicked` is a flag, not a counter, and one drain
    /// pass services every chain then available. Takes only the `io`
    /// lock, so it must not be called while `queues` is held.
    fn kick_worker(&self) {
        let mut io = self.io.lock().unwrap();
        io.kicked = true;
        // notify_all, not notify_one: `pause_io` / `wait_io_idle`
        // waiters share this condvar with the worker.
        self.io_cv.notify_all();
    }

    /// Try to open a guest-RAM-writing section. Returns `true` and
    /// counts the section in `active_io` when the device is neither
    /// stopping nor paused; returns `false` (and counts nothing)
    /// otherwise. On `false` callers drop the work (notifier) or leave
    /// a kick pending and bail (drain). Every `true` must be paired with
    /// one `exit_io`.
    fn try_enter_io(&self) -> bool {
        let mut ctl = self.io.lock().unwrap();
        let admitted = !ctl.stop && ctl.pause_holds == 0;
        if admitted {
            ctl.active_io += 1;
        }
        admitted
    }

    /// Close a guest-RAM-writing section opened by a successful
    /// `try_enter_io` and wake every condvar waiter so `pause_io` /
    /// `wait_io_idle` re-check `active_io`.
    ///
    /// Must be called exactly once per successful `try_enter_io`: the
    /// plain decrement has no underflow guard.
    fn exit_io(&self) {
        let mut io = self.io.lock().unwrap();
        io.active_io -= 1;
        self.io_cv.notify_all();
    }

    /// Invoke the used-buffer interrupt callback, if one has been
    /// installed via `set_irq_raise`; otherwise do nothing.
    ///
    /// The callback runs with the `irq_raise` lock RELEASED. Cloning
    /// the `Arc` in its own `let` statement ends the guard's lifetime
    /// at the semicolon. Writing the lock expression directly in an
    /// `if let` scrutinee would instead keep the guard alive for the
    /// whole block. That would serialise raises from the worker, the
    /// vCPU doorbell and the watcher thread on this mutex, poison it if
    /// the callback panicked, and deadlock a callback that re-entered
    /// `set_irq_raise`.
    fn raise_irq(&self) {
        let callback = self.irq_raise.lock().unwrap().clone();
        if let Some(raise) = callback {
            raise();
        }
    }

    /// Service every chain currently available on the hipriority queue
    /// (queue index 0). Each popped chain falls into one of two classes:
    ///   (a) it contains at least one readable descriptor — a FORGET /
    ///       INTERRUPT request from the guest. It is acked with a
    ///       length of 0 (our backend's forget is a no-op anyway).
    ///   (b) it is writable-only — an empty notification slot. It is
    ///       stashed in `notif_pool` for `push_notification` to fill
    ///       when a host change fires.
    /// A single IRQ is raised afterwards if anything in class (a) was
    /// acked. Does nothing before activation or while the queue is
    /// missing / not ready.
    ///
    /// Runs synchronously on the doorbell (vCPU thread): no host
    /// syscalls here, and the used-ring write for (a) is quiesced by
    /// vCPU parking like any other vCPU-thread store.
    fn drain_hiprio_queue(&self) {
        if !self.activated.load(Ordering::Acquire) {
            return;
        }
        let mut queues = self.queues.lock().unwrap();
        let hiprio = match queues.get_mut(0) {
            Some(q) if q.ready => q,
            _ => return,
        };

        let mut need_irq = false;
        while let Some((head, chain)) = hiprio.pop_chain() {
            let writable_only = chain
                .iter()
                .all(|d| d.flags & super::queue::VRING_DESC_F_WRITE != 0);
            if writable_only {
                // Notification slot — keep it for later. `notif_pool`
                // is taken while `queues` is held (queues → notif_pool).
                self.notif_pool
                    .lock()
                    .unwrap()
                    .push(HeldChain { head, chain });
                log_notify(|| {
                    format!(
                        "hipri stashed: pool size now {}",
                        self.notif_pool.lock().unwrap().len()
                    )
                });
            } else {
                // FORGET / INTERRUPT — consume and ack.
                hiprio.add_used(head, 0);
                need_irq = true;
            }
        }

        // Release `queues` before calling out to the IRQ callback.
        drop(queues);
        if need_irq {
            self.raise_irq();
        }
    }

    /// Deliver one notification payload to the guest through a stashed
    /// hipriority buffer. Returns `true` when a buffer was available and
    /// the payload was handed to it. Returns `false` when:
    ///   - the device is not activated yet;
    ///   - the device is paused for snapshot/restore or stopping (the
    ///     notification is dropped — the guest's entry/attr cache TTL is
    ///     the backstop, and a notification about pre-snapshot state
    ///     would be stale on the restored guest anyway);
    ///   - the pool is empty (dropped — the guest's virtio-fs driver
    ///     should re-add buffers shortly).
    ///
    /// Used by the `Notifier` impl to deliver FUSE_NOTIFY_INVAL_* to
    /// the guest's virtio-fs driver. Caller is the PosixFs watcher
    /// thread — a non-vCPU guest-RAM writer, hence the io-section gate.
    fn push_notification(&self, bytes: &[u8]) -> bool {
        let live = self.activated.load(Ordering::Acquire);
        if !live || !self.try_enter_io() {
            return false;
        }
        let delivered = self.push_notification_in_section(bytes);
        self.exit_io();
        delivered
    }

    /// Write `bytes` into one stashed hipriority chain, mark it used
    /// and raise the IRQ. Must run inside an io-section (see
    /// `push_notification`).
    ///
    /// Returns `false` without consuming a stashed chain when the
    /// hipriority queue is missing or not ready, or when the pool is
    /// empty. Returns `true` once a chain has been consumed. At most
    /// the chain's writable capacity is written; the used length
    /// reports the bytes actually written.
    fn push_notification_in_section(&self, bytes: &[u8]) -> bool {
        // Lock order: queues BEFORE notif_pool. `drain_hiprio_queue`
        // holds queues while pushing into notif_pool, so the watcher
        // thread (which calls this function) MUST acquire queues
        // first to avoid lock-order inversion deadlock. Pre-0.7.7
        // this function locked notif_pool first, then queues — a
        // classic AB/BA inversion that could deadlock under heavy
        // load (Vite-style "many small writes triggering many
        // NOTE_WRITE → INVAL_INODE notifications WHILE the guest is
        // also draining its hipriority queue rapidly"). Holding
        // queues across the pool pop is a tiny extra critical
        // section in the happy path and is the same hold-duration
        // drain_hiprio_queue already takes.
        let mut qs = self.queues.lock().unwrap();

        // Resolve the queue BEFORE popping a chain. A popped chain is
        // owned by this call; bailing out after the pop would drop it
        // without ever returning it to the guest via `add_used`. The
        // `ready` gate matches both drain paths: never touch the rings
        // of a queue the driver has not marked ready.
        let Some(q) = qs.get_mut(0) else { return false };
        if !q.ready {
            return false;
        }
        let Some(held) = self.notif_pool.lock().unwrap().pop() else {
            return false;
        };

        let mut written = 0usize;
        for d in held
            .chain
            .iter()
            .filter(|d| d.flags & super::queue::VRING_DESC_F_WRITE != 0)
        {
            if written >= bytes.len() {
                break;
            }
            // Never write past this descriptor's length.
            let take = (bytes.len() - written).min(d.len as usize);
            q.mem.write_slice(d.addr, &bytes[written..written + take]);
            written += take;
        }
        q.add_used(held.head, written as u32);
        // Release `queues` before calling out to the IRQ callback.
        drop(qs);
        self.raise_irq();
        true
    }

    /// Drain the request queue on the worker thread, opening a fresh
    /// io-section for every chain so `pause_io` parks us at a chain
    /// boundary (the snapshot invariant: a request is never captured
    /// popped-but-unanswered). Inside each section one chain is:
    ///   1. read — InHeader + payload from the readable descriptors
    ///   2. dispatched via FuseServer to produce a typed Reply
    ///   3. answered — reply bytes written to the writable descriptors,
    ///      split across several descriptors if needed
    ///   4. marked used with the actual reply byte length
    ///   5. signalled — the used-buffer IRQ is raised per chain (the
    ///      raise mutates MmioVirtio interrupt status + GIC state, which
    ///      the snapshot also captures, so it must stay inside the
    ///      io-section)
    ///
    /// Returns when the queue is empty / not ready, or — after leaving a
    /// kick pending — when a pause or stop closes the gate mid-batch.
    /// Does nothing before activation.
    fn drain_request_queue(&self) {
        if !self.activated.load(Ordering::Acquire) {
            return;
        }
        while self.try_enter_io() {
            let more = self.process_one_request();
            self.exit_io();
            if !more {
                return;
            }
        }
        // Paused (or stopping) mid-batch: leave a kick pending so the
        // worker re-drains after resume — both for still-queued chains
        // and for a doorbell that raced in.
        self.kick_worker();
    }

    /// Pop + fully service ONE request chain (read → dispatch → reply
    /// → add_used → IRQ). Returns false when the queue is empty/not
    /// ready. Must run inside an io-section.
    ///
    /// Malformed chains (readable bytes over the 8 MiB cap, or shorter
    /// than an `InHeader`) are still consumed: they are marked used
    /// with length 0, the IRQ is raised, and `true` is returned so the
    /// caller keeps draining.
    ///
    /// NOTE(review): the `queues` lock is held from the pop until after
    /// `add_used`, i.e. across the FUSE dispatch. A hiprio doorbell or a
    /// watcher notification that needs `queues` waits for the dispatch
    /// to finish.
    fn process_one_request(&self) -> bool {
        // FUSE requests have a hard cap on max_write (negotiated at
        // FUSE_INIT, default 1 MiB) + InHeader + opcode headers. 8 MiB
        // is a comfortable ceiling that a well-behaved guest never
        // hits but stops a hostile one from forcing huge Vec
        // allocations via crafted descriptor chains where each
        // descriptor claims `len = u32::MAX`.
        const MAX_REQUEST_BYTES: usize = 8 * 1024 * 1024;

        let mut qs = self.queues.lock().unwrap();
        // Request queue is index 1; hiprio is 0.
        let q = match qs.get_mut(1) {
            Some(q) => q,
            None => return false,
        };
        if !q.ready {
            return false;
        }
        let Some((head, chain)) = q.pop_chain() else {
            return false;
        };
        // Concatenate the READABLE descriptors (InHeader + payload may
        // span several) under the hard request cap. The shared helper
        // returns None on over-cap so a hostile chain can't force an
        // unbounded host allocation; an empty chain yields an empty
        // buffer (no index panic) that the short-request check below
        // rejects.
        let in_bytes = match super::queue::read_readable_capped(&chain, &q.mem, MAX_REQUEST_BYTES) {
            Some(b) => b,
            None => {
                eprintln!(
                    "[virtio-fs] request chain exceeds {MAX_REQUEST_BYTES} byte cap; rejecting"
                );
                // The chain is already popped, so it must be returned
                // to the guest even though it is rejected.
                q.add_used(head, 0);
                drop(qs);
                self.raise_irq();
                return true;
            }
        };

        let hdr_size = core::mem::size_of::<InHeader>();
        if in_bytes.len() < hdr_size {
            eprintln!(
                "[virtio-fs] short request: {} bytes < InHeader ({hdr_size})",
                in_bytes.len()
            );
            q.add_used(head, 0);
            drop(qs);
            self.raise_irq();
            return true;
        }
        // SAFETY: `in_bytes` holds at least `size_of::<InHeader>()`
        // bytes (checked just above), and `read_unaligned` places no
        // alignment requirement on the source pointer. This assumes
        // `InHeader` is plain integer data valid for any bit pattern —
        // TODO confirm against its definition in `crate::fuse`.
        let hdr: InHeader =
            unsafe { core::ptr::read_unaligned(in_bytes.as_ptr() as *const InHeader) };
        let payload = &in_bytes[hdr_size..];

        // FuseServer.dispatch is &self (interior-mutability) so we
        // only hold this lock for the dispatch call itself, not
        // across queue operations.
        let reply = self.fuse.lock().unwrap().dispatch(&hdr, payload);

        // Write the reply across the writable descriptors (bounded —
        // never over-writes a short chain).
        let written = super::queue::write_writable(&chain, &q.mem, &reply.bytes);
        q.add_used(head, written as u32);
        // Release `queues` before calling out to the IRQ callback.
        drop(qs);
        self.raise_irq();
        true
    }
}

// Host-change notifications → guest. Both methods are best-effort:
// the payload is built, handed to `FsCore::push_notification`, and the
// outcome is only traced (never returned or retried).
impl Notifier for VirtioFs {
    /// Send an inode invalidation for `nodeid` covering `off`/`len`
    /// (payload built by `build_inval_inode`).
    fn invalidate_inode(&self, nodeid: u64, off: i64, len: i64) {
        let bytes = build_inval_inode(nodeid, off, len);
        let ok = self.core.push_notification(&bytes);
        // The closure only runs (and only then locks `notif_pool`) when
        // FUSE tracing is enabled.
        log_notify(|| {
            format!(
                "INVAL_INODE nodeid={nodeid} off={off} len={len} pushed={ok} pool_left={}",
                self.core.notif_pool.lock().unwrap().len()
            )
        });
    }
    /// Send a directory-entry invalidation for `name` under
    /// `parent_nodeid` (payload built by `build_inval_entry`).
    fn invalidate_entry(&self, parent_nodeid: u64, name: &[u8]) {
        let bytes = build_inval_entry(parent_nodeid, name);
        let ok = self.core.push_notification(&bytes);
        log_notify(|| {
            format!(
                "INVAL_ENTRY parent={parent_nodeid} name={:?} pushed={ok} pool_left={}",
                // Names that are not valid UTF-8 are traced as "<bytes>".
                std::str::from_utf8(name).unwrap_or("<bytes>"),
                self.core.notif_pool.lock().unwrap().len()
            )
        });
    }
}

/// Emit one trace line about notification traffic, if FUSE tracing is
/// enabled.
///
/// `make_msg` is only called when `crate::trace::fuse_target()` yields
/// a target, so callers can build the message lazily. A target of `"1"`
/// or `"stderr"` prints to stderr; any other value is treated as a file
/// path that is opened in create+append mode for each line. Failures to
/// open or write the file are silently ignored.
fn log_notify<F: FnOnce() -> String>(make_msg: F) {
    use std::io::Write;

    let target = match crate::trace::fuse_target() {
        Some(t) => t,
        None => return,
    };
    let s = make_msg();
    let dest = target.to_string_lossy().into_owned();
    match dest.as_str() {
        "1" | "stderr" => eprintln!("[virtio-fs] {s}"),
        path => {
            let opened = std::fs::OpenOptions::new()
                .create(true)
                .append(true)
                .open(path);
            if let Ok(mut f) = opened {
                let _ = writeln!(f, "[virtio-fs] {s}");
            }
        }
    }
}

impl VirtioDevice for VirtioFs {
    /// virtio device id for a filesystem device (`VIRTIO_ID_FS`).
    fn device_id(&self) -> u32 {
        VIRTIO_ID_FS
    }

    /// Total virtqueue count: the hiprio queue plus the configured
    /// number of request queues.
    fn num_queues(&self) -> usize {
        1 + self.core.cfg.num_request_queues as usize
    }

    /// Feature bits offered to the driver — the module-level `FEATURES`
    /// constant (bit 32 only).
    fn features(&self) -> u64 {
        FEATURES
    }

    /// Serialize the device config space: the tag, NUL-padded to 36
    /// bytes, followed by the number of REQUEST queues (excluding
    /// hiprio) as a little-endian u32 — 40 bytes in total. The tag is
    /// clipped to 35 bytes so the field always ends in a NUL.
    fn config(&self) -> Vec<u8> {
        let cfg = &self.core.cfg;
        let tag = cfg.tag.as_bytes();
        let used = tag.len().min(TAG_LEN - 1);

        let mut out = Vec::with_capacity(TAG_LEN + 4);
        // tag[36]: the tag bytes, then zero fill up to the field size.
        out.extend_from_slice(&tag[..used]);
        out.resize(TAG_LEN, 0);
        // num_queues: u32 LE.
        out.extend_from_slice(&cfg.num_request_queues.to_le_bytes());
        out
    }

    /// Queue doorbell. Queue 0 (hiprio) is drained synchronously on the
    /// calling thread; a doorbell on any other index only kicks the I/O
    /// worker, because the FUSE ops (host syscalls) must not run on the
    /// vCPU doorbell-exit thread.
    fn notify(&self, q: u16) {
        if q == 0 {
            self.core.drain_hiprio_queue();
        } else {
            self.core.kick_worker();
        }
    }

    /// Install the driver-configured virtqueues, mark the device
    /// activated, log the configuration, and kick the I/O worker.
    ///
    /// The queues are stored before `activated` is published (Release),
    /// so a drain that observes `activated` also sees the new queues.
    fn activate(&self, queues: Vec<Queue>) {
        *self.core.queues.lock().unwrap() = queues;
        self.core.activated.store(true, Ordering::Release);
        let cfg = &self.core.cfg;
        eprintln!(
            "[virtio-fs] activated: tag={:?} req_queues={} dax_window={:#x}..{:#x} ({} MiB)",
            cfg.tag,
            cfg.num_request_queues,
            cfg.dax_window_gpa,
            // The constructor checks alignment only, never that
            // base + len fits in a u64. Saturate so this log line
            // cannot overflow-panic in a debug build.
            cfg.dax_window_gpa.saturating_add(cfg.dax_window_len),
            cfg.dax_window_len / (1024 * 1024),
        );
        // A restore re-activates via MmioVirtio::restore_state with the
        // snapshot's queue cursors — if the snapshot caught a request
        // pending in the avail ring (guest was WFI-waiting on a FUSE
        // reply), this kick services it; without it the restored guest
        // would wait forever. Harmless on cold boot (empty queue).
        self.core.kick_worker();
    }

    /// Return a clone of the current queue set, taken under the
    /// `queues` lock.
    fn snapshot_queues(&self) -> Vec<Queue> {
        self.core.queues.lock().unwrap().clone()
    }

    /// Shared-memory regions to advertise: region 0 spanning the
    /// configured DAX window, or nothing at all when the window length
    /// is zero. A zero-length window means "no DAX", so the guest mounts
    /// virtio-fs without a DAX cache (the KVM non-DAX path).
    fn shm_regions(&self) -> Vec<ShmRegion> {
        let cfg = &self.core.cfg;
        match cfg.dax_window_len {
            0 => Vec::new(),
            len => vec![ShmRegion {
                id: 0,
                gpa: cfg.dax_window_gpa,
                len,
            }],
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Config space is 40 bytes: the tag NUL-padded to 36 bytes, then
    /// the request-queue count as a little-endian u32.
    #[test]
    fn config_layout() {
        let dev = VirtioFs::new(VirtioFsConfig {
            tag: "shared".into(),
            num_request_queues: 1,
            dax_window_gpa: 0x80_0000_0000,
            dax_window_len: DEFAULT_DAX_WINDOW_BYTES,
        });
        let bytes = dev.config();
        assert_eq!(bytes.len(), 40);

        let (tag_field, num_queues_field) = bytes.split_at(36);
        // tag prefix
        assert_eq!(&tag_field[..6], b"shared");
        // remainder of tag region is zero-padded
        assert!(tag_field[6..].iter().all(|&b| b == 0));
        // num_queues at offset 36 = 1
        assert_eq!(num_queues_field, &1u32.to_le_bytes());
    }

    /// A non-zero DAX window is advertised as exactly one shared-memory
    /// region with id 0 and the configured base and length.
    #[test]
    fn shm_region_advertised() {
        let dev = VirtioFs::new(VirtioFsConfig {
            tag: "t".into(),
            num_request_queues: 1,
            dax_window_gpa: 0x100_0000_0000,
            dax_window_len: 0x4000,
        });
        let regs = dev.shm_regions();
        assert_eq!(regs.len(), 1);
        assert_eq!(regs[0].id, 0);
        assert_eq!(regs[0].gpa, 0x100_0000_0000);
        assert_eq!(regs[0].len, 0x4000);
    }

    /// `reset_for_restore` must empty `notif_pool`.
    #[test]
    fn reset_for_restore_clears_notif_pool() {
        // Regression: notif_pool is NOT part of the snapshot, so without
        // an explicit clear a RESTORE-recycled pool worker would carry
        // the prior cycle's stale HeldChain entries — old descriptor head
        // indices + guest-physical addrs the restored guest never offered
        // — and push_notification would later write to a repurposed GPA
        // and add_used a bogus index (guest memory corruption + used-ring
        // desync). reset_for_restore must drop them.
        let dev = VirtioFs::new(VirtioFsConfig {
            tag: "t".into(),
            num_request_queues: 1,
            dax_window_gpa: 0x100_0000_0000,
            dax_window_len: 0x4000,
        });
        // Simulate a hipri notification chain the guest offered last cycle.
        // The entry is pushed straight into the pool; no queue is
        // installed and the device is never activated.
        dev.core.notif_pool.lock().unwrap().push(HeldChain {
            head: 7,
            chain: Vec::new(),
        });
        assert_eq!(dev.core.notif_pool.lock().unwrap().len(), 1);

        dev.reset_for_restore();

        assert!(
            dev.core.notif_pool.lock().unwrap().is_empty(),
            "reset_for_restore must clear stale cross-cycle notification chains"
        );
    }

    // === Malformed-FUSE-over-the-wire integration ======================
    //
    // The proptest in fuse::server fuzzes `dispatch` directly. This goes
    // one layer out: it drives the FULL device path a hostile guest
    // actually hits — pop_chain → read the request off a real
    // descriptor chain in GuestMem → dispatch → write the reply back
    // across the writable descriptors → add_used — with arbitrary,
    // malformed FUSE bytes. It locks in that the device never panics or
    // reads/writes out of bounds, consumes each chain exactly once, and
    // frames every reply within the guest-provided buffer.
    use super::super::queue::{GuestMem, VRING_DESC_F_NEXT, VRING_DESC_F_WRITE};
    use crate::fuse::{InitIn, OutHeader, FUSE_KERNEL_VERSION};
    use proptest::prelude::*;

    const MEM_BASE: u64 = 0x10_0000;
    const MEM_LEN: usize = 64 * 1024;
    const O_DESC: u64 = 0x000;
    const O_AVAIL: u64 = 0x800;
    const O_USED: u64 = 0x1000;
    const O_REQ: u64 = 0x2000;
    const O_REPLY: u64 = 0x3000;
    const REPLY_CAP: u32 = 4096;

    /// What the device left behind in guest memory after one request,
    /// as read back by `read_drive_out`.
    struct DriveOut {
        /// Used-ring `idx` field (u16 at used ring + 2).
        used_idx: u16,
        /// `id` of used-ring element 0 (u32 at used ring + 4).
        used_id: u32,
        /// `len` of used-ring element 0 (u32 at used ring + 8).
        written: u32,
        /// First `min(written, REPLY_CAP)` bytes of the reply buffer.
        reply: Vec<u8>,
    }

    /// Build the one-request guest-memory window + queue used by the
    /// drive helpers: a valid two-descriptor chain (readable request +
    /// writable reply) with avail idx = 1.
    ///
    /// `desc_len_override`, when set, replaces the length stored in the
    /// request descriptor; otherwise the descriptor claims
    /// `request.len()` even if fewer bytes were copied because of the
    /// window clamp. Returns a ready `Queue` of size 8 over the rings
    /// at `O_DESC` / `O_AVAIL` / `O_USED`.
    fn build_request_queue(
        mem: &GuestMem,
        request: &[u8],
        desc_len_override: Option<u32>,
    ) -> Queue {
        // Request bytes into the readable region (clamped to the window).
        let n = request.len().min(MEM_LEN - O_REQ as usize);
        mem.write_slice(MEM_BASE + O_REQ, &request[..n]);

        // Descriptor fields as written below: addr u64 at +0, len u32
        // at +8, flags u16 at +12, next u16 at +14 (16 bytes each).
        // desc[0] readable: the request.
        let d0 = MEM_BASE + O_DESC;
        mem.write_u64(d0, MEM_BASE + O_REQ);
        mem.write_u32(d0 + 8, desc_len_override.unwrap_or(request.len() as u32));
        mem.write_u16(d0 + 12, VRING_DESC_F_NEXT);
        mem.write_u16(d0 + 14, 1);
        // desc[1] writable: the reply buffer.
        let d1 = MEM_BASE + O_DESC + 16;
        mem.write_u64(d1, MEM_BASE + O_REPLY);
        mem.write_u32(d1 + 8, REPLY_CAP);
        mem.write_u16(d1 + 12, VRING_DESC_F_WRITE);
        mem.write_u16(d1 + 14, 0);
        // avail ring: ring[0] = head 0; idx = 1.
        mem.write_u16(MEM_BASE + O_AVAIL + 4, 0);
        mem.write_u16(MEM_BASE + O_AVAIL + 2, 1);

        let mut req_q = Queue::new(mem.clone());
        req_q.size = 8;
        req_q.ready = true;
        req_q.desc_table = MEM_BASE + O_DESC;
        req_q.avail_ring = MEM_BASE + O_AVAIL;
        req_q.used_ring = MEM_BASE + O_USED;
        req_q
    }

    /// Read back the used-ring bookkeeping and the reply bytes from
    /// `mem`. The reply copy is clamped to `REPLY_CAP` so an
    /// over-reported `written` cannot make the harness read past the
    /// reply buffer.
    fn read_drive_out(mem: &GuestMem) -> DriveOut {
        let used_ring = MEM_BASE + O_USED;
        let used_idx = mem.read_u16(used_ring + 2);
        let used_id = mem.read_u32(used_ring + 4);
        let written = mem.read_u32(used_ring + 8);

        let reply_len = REPLY_CAP.min(written) as usize;
        let mut reply = vec![0u8; reply_len];
        mem.read_slice(MEM_BASE + O_REPLY, &mut reply);

        DriveOut {
            used_idx,
            used_id,
            written,
            reply,
        }
    }

    /// A fresh device for the tests: tag "t", one request queue, and a
    /// 0x4000-byte DAX window at GPA 0x100_0000_0000.
    fn test_dev() -> VirtioFs {
        let config = VirtioFsConfig {
            tag: "t".into(),
            num_request_queues: 1,
            dax_window_gpa: 0x100_0000_0000,
            dax_window_len: 0x4000,
        };
        VirtioFs::new(config)
    }

    /// Push a single request through the real device, synchronously.
    ///
    /// Allocates a guest-memory window, lays out a two-descriptor chain
    /// (readable request + writable reply) via `build_request_queue`,
    /// installs it as queue index 1, and calls `drain_request_queue`
    /// directly on this thread — nothing here rings the doorbell.
    /// `desc_len_override` makes desc[0] claim a length other than the
    /// bytes actually present, so the 8 MiB over-cap rejection can be
    /// exercised without the harness allocating anything huge.
    fn drive_request(request: &[u8], desc_len_override: Option<u32>) -> DriveOut {
        let mut guest_ram = vec![0u8; MEM_LEN];
        let mem = GuestMem::new(guest_ram.as_mut_ptr(), MEM_BASE, MEM_LEN);
        let request_queue = build_request_queue(&mem, request, desc_len_override);

        let dev = test_dev();
        let mut installed = dev.core.queues.lock().unwrap();
        // Index 0 is hiprio: a default (not-ready) queue the drain skips.
        installed.push(Queue::new(mem.clone()));
        // Index 1 is the request queue under test.
        installed.push(request_queue);
        // Release the lock before draining.
        drop(installed);
        dev.core.activated.store(true, Ordering::Release);

        dev.core.drain_request_queue();

        // `guest_ram` lives until this function returns, so the raw
        // pointer inside `mem` is valid for every access above and for
        // the read-back below.
        read_drive_out(&mem)
    }

    /// Serialize an `InHeader` for the given opcode / unique / nodeid
    /// into raw bytes. The header's `len` field is the header size plus
    /// `payload_len`; uid, gid, pid and padding are zero. The payload
    /// itself is not appended — callers extend the returned vector.
    fn in_header_bytes(opcode: u32, unique: u64, nodeid: u64, payload_len: usize) -> Vec<u8> {
        let header_size = core::mem::size_of::<InHeader>();
        let header = InHeader {
            len: (header_size + payload_len) as u32,
            opcode,
            unique,
            nodeid,
            uid: 0,
            gid: 0,
            pid: 0,
            padding: 0,
        };

        let header_ptr: *const InHeader = &header;
        // SAFETY: `header` is a live local for the whole lifetime of the
        // borrowed slice, and the slice spans exactly
        // `size_of::<InHeader>()` bytes of it.
        // NOTE(review): this also assumes `InHeader` has no implicit
        // padding bytes — confirm against its definition.
        let raw = unsafe { std::slice::from_raw_parts(header_ptr.cast::<u8>(), header_size) };
        raw.to_vec()
    }

    /// Build the bytes of a structurally valid FUSE_INIT request: an
    /// `InHeader` (opcode 26, unique 1, nodeid 0) followed by an
    /// `InitIn` advertising `FUSE_KERNEL_VERSION` with every other
    /// field zero.
    fn valid_init_request() -> Vec<u8> {
        let init_in = InitIn {
            major: FUSE_KERNEL_VERSION,
            minor: 0,
            max_readahead: 0,
            flags: 0,
            flags2: 0,
            unused: [0; 11],
        };

        let init_ptr: *const InitIn = &init_in;
        // SAFETY: `init_in` outlives the borrowed slice, which spans
        // exactly `size_of::<InitIn>()` bytes of it.
        // NOTE(review): assumes `InitIn` has no implicit padding bytes —
        // confirm against its definition.
        let payload = unsafe {
            std::slice::from_raw_parts(init_ptr.cast::<u8>(), core::mem::size_of::<InitIn>())
        };

        // Opcode 26 is FUSE_INIT.
        let mut request = in_header_bytes(26, 1, 0, payload.len());
        request.extend_from_slice(payload);
        request
    }

    #[test]
    fn wire_valid_init_round_trips() {
        // A structurally-valid FUSE_INIT must round-trip the full device
        // path with error==0 and a self-consistent reply length.
        let out = drive_request(&valid_init_request(), None);
        assert_eq!(out.used_idx, 1, "chain must be consumed once");
        assert_eq!(out.used_id, 0);
        // Guards the fixed-offset header slices below: `reply` holds
        // min(written, REPLY_CAP) bytes.
        assert!(out.written as usize >= core::mem::size_of::<OutHeader>());
        // The reply starts with the OutHeader: a little-endian u32 length
        // followed by a little-endian i32 error.
        let declared = u32::from_le_bytes(out.reply[0..4].try_into().unwrap());
        let error = i32::from_le_bytes(out.reply[4..8].try_into().unwrap());
        assert_eq!(
            declared, out.written,
            "reply len must match bytes written"
        );
        assert_eq!(error, 0, "INIT should succeed");
    }

    #[test]
    fn notify_drains_request_queue_on_the_worker_thread() {
        // The async-drain contract: the doorbell (`notify(1)`) returns
        // immediately and the WORKER services the request; wait_io_idle
        // is the test's synchronization point. This is the path a real
        // guest hits — pre-fix the drain ran inside the doorbell exit.
        let mut backing = vec![0u8; MEM_LEN];
        let mem = GuestMem::new(backing.as_mut_ptr(), MEM_BASE, MEM_LEN);
        let req_q = build_request_queue(&mem, &valid_init_request(), None);
        // Unpublish: activate below must find an empty avail ring so the
        // request is provably serviced by the NOTIFY, not the activate
        // kick (covered by its own test).
        mem.write_u16(MEM_BASE + O_AVAIL + 2, 0);

        let dev = test_dev();
        dev.activate(vec![Queue::new(mem.clone()), req_q]);
        dev.wait_io_idle(); // absorb the activate kick (empty queue)
        assert_eq!(read_drive_out(&mem).used_idx, 0);

        // Publish the request and ring the doorbell.
        mem.write_u16(MEM_BASE + O_AVAIL + 2, 1);
        dev.notify(1);
        dev.wait_io_idle();

        let out = read_drive_out(&mem);
        assert_eq!(out.used_idx, 1, "worker must have serviced the request");
        // Check the reply size before slicing the header out of it, so a
        // short or empty reply fails with a diagnostic instead of an
        // opaque slice-index panic.
        assert!(
            out.written as usize >= core::mem::size_of::<OutHeader>(),
            "worker reply must carry a full OutHeader"
        );
        let error = i32::from_le_bytes(out.reply[4..8].try_into().unwrap());
        assert_eq!(error, 0, "INIT should succeed via the worker path");
    }

    #[test]
    fn activate_kick_services_request_captured_pending_in_avail_ring() {
        // Regression oracle for the restore hang. A snapshot may capture
        // a FUSE request that is still sitting in the avail ring while
        // the guest WFI-waits on its reply. MmioVirtio::restore_state
        // re-activates the device with those cursors, so activate()
        // itself has to kick the worker — if it does not, the restored
        // guest waits forever.
        let mut guest_ram = vec![0u8; MEM_LEN];
        let mem = GuestMem::new(guest_ram.as_mut_ptr(), MEM_BASE, MEM_LEN);
        let pending_q = build_request_queue(&mem, &valid_init_request(), None);
        let hiprio_q = Queue::new(mem.clone());

        let dev = test_dev();
        // Deliberately no notify(): activation alone must get the
        // already-published request serviced.
        dev.activate(vec![hiprio_q, pending_q]);
        dev.wait_io_idle();

        let DriveOut { used_idx, .. } = read_drive_out(&mem);
        assert_eq!(
            used_idx, 1,
            "activate must drain requests captured pending in the snapshot"
        );
    }

    #[test]
    fn pause_io_defers_doorbells_and_resume_replays_them() {
        // The snapshot/restore gate: a doorbell arriving while paused
        // must not be serviced (no guest-RAM writes during the pause
        // window) and must NOT be lost — resume_io replays it.
        //
        // Statement order below is load-bearing: the request is only
        // published (avail idx 0 -> 1) after pause_io(), and the used
        // ring is sampled once before and once after resume_io().
        let mut backing = vec![0u8; MEM_LEN];
        let mem = GuestMem::new(backing.as_mut_ptr(), MEM_BASE, MEM_LEN);
        let req_q = build_request_queue(&mem, &valid_init_request(), None);

        // Unpublish so the activate kick finds an empty queue; the
        // request is published only once the device is paused.
        mem.write_u16(MEM_BASE + O_AVAIL + 2, 0);

        let dev = test_dev();
        dev.activate(vec![Queue::new(mem.clone()), req_q]);
        // Let the activate kick run to completion against the empty
        // queue before pausing.
        dev.wait_io_idle();

        dev.pause_io();
        // Publish the request and ring the doorbell while paused.
        mem.write_u16(MEM_BASE + O_AVAIL + 2, 1);
        dev.notify(1);
        // Give a wrong implementation a chance to misbehave.
        std::thread::sleep(std::time::Duration::from_millis(50));
        // Used idx still 0: nothing was written back during the pause.
        assert_eq!(
            read_drive_out(&mem).used_idx,
            0,
            "no request may be serviced while paused"
        );
        dev.resume_io();
        dev.wait_io_idle();
        // Used idx now 1: the doorbell rung during the pause was not lost.
        assert_eq!(
            read_drive_out(&mem).used_idx,
            1,
            "the deferred doorbell must be replayed after resume"
        );
    }

    #[test]
    fn push_notification_is_dropped_while_paused() {
        // push_notification is how the PosixFs watcher thread writes
        // guest RAM, so it has to honour the same pause that protects
        // snapshot capture (before the fix this was a torn-snapshot /
        // use-after-free window).
        let dev = test_dev();
        dev.core.activated.store(true, Ordering::Release);

        dev.pause_io();
        let payload = [0u8; 8];
        let accepted = dev.core.push_notification(&payload);
        assert!(!accepted, "notification must be dropped during a pause window");
        dev.resume_io();
    }

    #[test]
    fn shutdown_io_is_idempotent_and_gates_io() {
        let dev = test_dev();
        // Shut down twice back to back: the second call must be a no-op.
        for _ in 0..2 {
            dev.shutdown_io();
        }

        // Even with the device flagged as activated, nothing may reach
        // guest RAM once I/O has been shut down.
        dev.core.activated.store(true, Ordering::Release);
        let payload = [0u8; 8];
        let accepted = dev.core.push_notification(&payload);
        assert!(!accepted, "no guest-RAM writes after shutdown");

        // A doorbell with the worker gone must neither panic nor hang.
        dev.notify(1);
    }

    #[test]
    fn wire_short_request_is_rejected_cleanly() {
        // Eight bytes is less than an InHeader: the device must ack the
        // chain with len 0 and never dispatch — no panic, no reply.
        let truncated = [0u8; 8];
        let DriveOut {
            used_idx, written, ..
        } = drive_request(&truncated, None);
        assert_eq!(used_idx, 1);
        assert_eq!(written, 0, "short request must produce no reply");
    }

    #[test]
    fn wire_oversized_descriptor_is_capped_not_allocated() {
        // The readable descriptor claims 9 MiB, above the 8 MiB request
        // cap. The chain must be rejected BEFORE any allocation/read,
        // which is what shields the host from a crafted huge-length
        // descriptor. Only a bare header is actually present in memory.
        let claimed_len: u32 = 9 * 1024 * 1024;
        let header_only = in_header_bytes(26, 1, 0, 0);

        let DriveOut {
            used_idx, written, ..
        } = drive_request(&header_only, Some(claimed_len));
        assert_eq!(used_idx, 1);
        assert_eq!(
            written, 0,
            "over-cap chain must be rejected with no reply"
        );
    }

    proptest! {
        // 512 generated cases per run.
        #![proptest_config(ProptestConfig::with_cases(512))]

        /// Arbitrary opcode + arbitrary payload, framed as a real request
        /// on a real descriptor chain. The device must never panic or go
        /// out of bounds, must consume the chain exactly once, must never
        /// write past the writable descriptor, and — because a full
        /// InHeader is always present — must emit a reply whose declared
        /// length is at least an OutHeader and never undershoots the
        /// bytes it actually wrote (no wire length-confusion).
        #[test]
        fn wire_arbitrary_fuse_request_is_safe(
            opcode in any::<u32>(),
            unique in any::<u64>(),
            // Either the fixed value 1 or a fully arbitrary node id.
            nodeid in prop_oneof![Just(1u64), any::<u64>()],
            // Up to 1023 payload bytes after the header.
            payload in proptest::collection::vec(any::<u8>(), 0..1024),
        ) {
            // The header's `len` field covers header + payload, matching
            // the bytes actually placed on the chain.
            let mut req = in_header_bytes(opcode, unique, nodeid, payload.len());
            req.extend_from_slice(&payload);

            let out = drive_request(&req, None);
            // Exactly one used entry, naming head descriptor 0.
            prop_assert_eq!(out.used_idx, 1);
            prop_assert_eq!(out.used_id, 0);
            prop_assert!(out.written <= REPLY_CAP, "wrote past writable descriptor");
            prop_assert!(
                out.written as usize >= core::mem::size_of::<OutHeader>(),
                "reply smaller than OutHeader ({} bytes)",
                out.written,
            );
            // `reply` holds min(written, REPLY_CAP) bytes, so after the
            // assertion above it contains at least
            // size_of::<OutHeader>() bytes; the fixed [0..4] slice
            // relies on that.
            let declared = u32::from_le_bytes(out.reply[0..4].try_into().unwrap());
            prop_assert!(
                declared >= out.written,
                "OutHeader.len {} undershoots written {}",
                declared, out.written,
            );
        }
    }
}