supermachine 0.5.1

// POSIX filesystem backend — translates FUSE ops to host syscalls on
// a single rooted subtree.
//
// Each mount has a host-side `root` directory; the backend exposes that
// subtree to the guest via FUSE. Inode numbers in the FUSE protocol are
// allocated by the backend (NOT the host filesystem's st_ino) so:
//   - We can change the host filesystem under us without invalidating
//     guest-side caches (which key on nodeid).
//   - Multiple mount instances of the same host path each get their
//     own nodeid namespace.
//
// The backend keeps a small bidirectional map:
//   nodeid -> InodeInfo { host_path, kind }
//   (parent_nodeid, name) -> nodeid    (lookup cache)
//
// Handle table: open files map fh -> OwnedFd (closed when the entry is
// removed). Opening the same path twice gives different fhs, mirroring
// open(2) semantics.
//
// Symlinks are followed during traversal — virtio-fs's typical use case
// is "expose this directory tree read-only-ish", not "expose a chroot
// jail". A later slice can add no-follow + open_by_handle for security.

use std::collections::BTreeMap;
use std::ffi::{CString, OsStr, OsString};
use std::os::unix::ffi::OsStrExt;
use std::os::unix::fs::MetadataExt;
use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex};

use super::backend::{
    DirEntry, Entry, Errno, FsBackend, StatFs, EACCES, EBADF, EINVAL, ENOENT, ENOSPC, ENOTDIR,
    EISDIR, EIO,
};
use super::notify::Notifier;
use super::protocol::{
    Attr, DT_BLK, DT_CHR, DT_DIR, DT_FIFO, DT_LNK, DT_REG, DT_SOCK, DT_UNKNOWN,
    FUSE_ROOT_ID, S_IFBLK, S_IFCHR, S_IFDIR, S_IFIFO, S_IFLNK, S_IFMT, S_IFREG, S_IFSOCK,
};

/// What kind of host object backs a given nodeid. We cache this so
/// hot-path `getattr` doesn't always have to stat.
///
/// NOTE(review): in the code visible here the cached kind is only read
/// by `open` (to refuse directories); `getattr` still stats the host
/// path on every call — confirm whether the fast path lives elsewhere.
#[derive(Clone, Copy, Debug)]
enum Kind {
    /// Regular file.
    File,
    /// Directory.
    Dir,
    /// Symbolic link. Only produced when the metadata was obtained
    /// without following links.
    Symlink,
    /// Anything else: devices, FIFOs, sockets, ...
    Other,
}

/// Per-nodeid bookkeeping: where the object lives on the host and what
/// kind of object it was when the nodeid was allocated.
#[derive(Clone)]
struct InodeInfo {
    /// Host path recorded when the nodeid was created
    /// (parent's host path joined with the child name).
    host_path: PathBuf,
    /// Object kind captured at creation time; not refreshed afterwards
    /// in the code visible here.
    kind: Kind,
}

/// One DAX-mmap'd region of a host file. Multiple slots may share
/// one Mmap if SETUPMAPPING calls overlap (we deduplicate on host
/// path + foffset boundaries in a later optimization; for now each
/// dax_map allocates a fresh mmap).
struct Mmap {
    /// Base address returned by `mmap`.
    ptr: *mut u8,
    /// Length in bytes that was passed to `mmap`; reused for `munmap`.
    len: usize,
}

// SAFETY: ptr is a process-local pointer used only by DaxSession's
// HvfMapper. Mmap-as-DAX-source isn't dereferenced from rust here.
// The raw pointer is the only field that blocks the auto traits.
unsafe impl Send for Mmap {}
unsafe impl Sync for Mmap {}

/// All mutable backend state, guarded by the single `PosixFs::st` mutex.
struct State {
    /// nodeid -> InodeInfo.
    inodes: BTreeMap<u64, InodeInfo>,
    /// (parent, name) -> nodeid. Populated by lookup.
    children: BTreeMap<(u64, Vec<u8>), u64>,
    /// fh -> OwnedFd. Closed on `release` / `releasedir`.
    handles: BTreeMap<u64, OwnedFd>,
    /// Active DAX mappings indexed by host_va. Owns the mmap; dropping
    /// hits munmap.
    dax_mmaps: BTreeMap<usize, Mmap>,
    /// Next nodeid to hand out; starts just above FUSE_ROOT_ID.
    next_nodeid: u64,
    /// Next file handle to hand out; starts at 1.
    next_fh: u64,
}

impl Drop for State {
    /// Unmaps every DAX region that is still registered when the state
    /// is torn down. `munmap` failures are ignored.
    fn drop(&mut self) {
        let regions = std::mem::take(&mut self.dax_mmaps);
        for region in regions.into_values() {
            // SAFETY: each entry holds the pointer/length pair recorded
            // by `dax_map` for a mapping that has not been unmapped yet.
            unsafe {
                libc::munmap(region.ptr.cast(), region.len);
            }
        }
    }
}

/// POSIX-backed FUSE filesystem rooted at `root`. All paths are
/// constrained to live under `root` — we don't `chroot`, we just
/// resolve names manually so we can refuse `..` escapes.
///
/// NOTE(review): name validation rejects `.`/`..`/`/` components, but
/// host-side symlinks are still followed by the path-based syscalls
/// used here — confirm that is acceptable for the deployment.
pub struct PosixFs {
    /// Inode table, lookup cache, handle table and DAX mappings.
    st: Mutex<State>,
    /// Background thread that watches file-content-change events via
    /// kqueue and dispatches them to the guest as
    /// FUSE_NOTIFY_INVAL_INODE messages. `None` until `set_notifier`
    /// has been called.
    watcher: Mutex<Option<Watcher>>,
}

/// State for the kqueue watcher thread. Dropping it signals the thread
/// to stop, joins it, and closes the kqueue fd.
struct Watcher {
    /// Shared with the watcher thread via Arc. The thread reads kq,
    /// looks up the (kq_ident → nodeid) map, and calls notifier.
    /// We hold a strong ref so dropping PosixFs stops the thread.
    inner: Arc<WatcherInner>,
    /// Join handle for clean shutdown. Taken (set to `None`) in `drop`.
    handle: Option<std::thread::JoinHandle<()>>,
}

/// Data shared between `PosixFs` and the watcher thread.
struct WatcherInner {
    /// kqueue fd; the watcher thread waits on this.
    /// Closed by `Watcher::drop`, not by this struct.
    kq: libc::c_int,
    /// Stop signal — set by drop, polled by the watcher thread.
    stop: AtomicBool,
    /// Notifier the watcher thread invokes when a kevent fires. None
    /// until set_notifier is called.
    notifier: Mutex<Option<Arc<dyn Notifier>>>,
    /// Map kqueue ident (an fd we duped specifically for kqueue) →
    /// (nodeid, parent_nodeid, name, owned_fd). parent_nodeid + name
    /// let us also invalidate the dentry cache so re-opens see the
    /// new inode after an atomic rename.
    watched: Mutex<BTreeMap<libc::c_int, WatchedEntry>>,
}

/// One file currently registered with the kqueue.
struct WatchedEntry {
    /// Guest-visible nodeid to invalidate when the file changes.
    nodeid: u64,
    /// Parent directory nodeid, used for dentry invalidation.
    parent_nodeid: u64,
    /// Name of the entry inside the parent, as raw bytes.
    name: Vec<u8>,
    /// Keeps the watch fd open; the kqueue registration goes away when
    /// this is dropped and the fd is closed.
    _owned_fd: OwnedFd,
}

impl Drop for Watcher {
    /// Shuts the watcher thread down: raise the stop flag, wake the
    /// thread out of its blocking `kevent`, join it, then close the
    /// kqueue descriptor.
    fn drop(&mut self) {
        let kq = self.inner.kq;
        self.inner.stop.store(true, Ordering::Release);

        // A triggered USER event makes the pending kevent() call return.
        let wake = libc::kevent {
            ident: 0,
            filter: libc::EVFILT_USER,
            flags: libc::EV_ADD | libc::EV_ONESHOT | libc::EV_RECEIPT,
            fflags: libc::NOTE_TRIGGER,
            data: 0,
            udata: std::ptr::null_mut(),
        };
        // SAFETY: `wake` is a valid one-element changelist; no event
        // list is supplied so the kernel writes nothing back.
        unsafe {
            let _ = libc::kevent(kq, &wake, 1, std::ptr::null_mut(), 0, std::ptr::null());
        }

        // A panicked watcher thread is not fatal for teardown.
        if let Some(thread) = self.handle.take() {
            let _ = thread.join();
        }

        // SAFETY: the thread has been joined, nobody else uses `kq`.
        unsafe {
            libc::close(kq);
        }
    }
}

impl PosixFs {
    /// Build a backend that exposes `root` as the FUSE root directory.
    ///
    /// The root is stat'ed up front: a missing path propagates the
    /// underlying I/O error and a non-directory yields
    /// `ErrorKind::NotADirectory`, so a bad mount fails immediately.
    pub fn new(root: impl Into<PathBuf>) -> Result<Self, std::io::Error> {
        let root: PathBuf = root.into();
        if !std::fs::metadata(&root)?.is_dir() {
            let msg = format!("posix-fs root is not a directory: {}", root.display());
            return Err(std::io::Error::new(std::io::ErrorKind::NotADirectory, msg));
        }

        // The inode table starts with just the root entry.
        let root_inode = InodeInfo {
            host_path: root,
            kind: Kind::Dir,
        };
        let state = State {
            inodes: BTreeMap::from([(FUSE_ROOT_ID, root_inode)]),
            children: BTreeMap::new(),
            handles: BTreeMap::new(),
            dax_mmaps: BTreeMap::new(),
            next_nodeid: FUSE_ROOT_ID + 1,
            next_fh: 1,
        };
        Ok(Self {
            st: Mutex::new(state),
            watcher: Mutex::new(None),
        })
    }

    /// Attach a Notifier and start a kqueue background thread that
    /// watches host-side changes to files this PosixFs has surfaced
    /// to the guest. On NOTE_DELETE / NOTE_RENAME / NOTE_WRITE /
    /// NOTE_EXTEND / NOTE_ATTRIB, dispatches FUSE_NOTIFY_INVAL_INODE
    /// to the guest so its dentry + page caches re-read on next access.
    ///
    /// Watching is per-inode: a file gets watched on first OPEN or
    /// SETUPMAPPING. This covers the dev-loop pattern (editor saves a
    /// file the guest is using); for rarely-accessed files we rely on
    /// the guest's 1s attr_valid timeout. Recursive directory watching
    /// is a follow-up if needed.
    ///
    /// Calling this again while the watcher is already running only
    /// swaps the notifier; no second thread or kqueue is created.
    ///
    /// # Errors
    /// Returns the OS error if `kqueue()` fails, or an error describing
    /// the spawn failure if the watcher thread cannot be started. In the
    /// latter case the freshly created kqueue fd is closed again.
    pub fn set_notifier(&self, notifier: Arc<dyn Notifier>) -> Result<(), std::io::Error> {
        let mut watcher_slot = self.watcher.lock().unwrap();
        if let Some(w) = watcher_slot.as_ref() {
            // Already running; just swap the notifier.
            *w.inner.notifier.lock().unwrap() = Some(notifier);
            return Ok(());
        }
        // SAFETY: kqueue() takes no arguments and returns a new fd or -1.
        let kq = unsafe { libc::kqueue() };
        if kq < 0 {
            return Err(std::io::Error::last_os_error());
        }
        let inner = Arc::new(WatcherInner {
            kq,
            stop: AtomicBool::new(false),
            notifier: Mutex::new(Some(notifier)),
            watched: Mutex::new(BTreeMap::new()),
        });
        let thread_inner = Arc::clone(&inner);
        let spawned = std::thread::Builder::new()
            .name("supermachine-posixfs-watch".to_owned())
            .spawn(move || run_watcher(thread_inner));
        let handle = match spawned {
            Ok(handle) => handle,
            Err(e) => {
                // No `Watcher` was built, so its Drop will never run:
                // close the kqueue here or the descriptor leaks.
                // SAFETY: the thread never started, so `kq` is unused.
                unsafe {
                    libc::close(kq);
                }
                return Err(std::io::Error::other(e.to_string()));
            }
        };
        *watcher_slot = Some(Watcher {
            inner,
            handle: Some(handle),
        });
        Ok(())
    }

    /// Add `path` to the kqueue watch list, associated with `nodeid`.
    /// Best-effort: returns without error if watching is disabled, the
    /// nodeid has no (parent, name) entry in the lookup cache (e.g. the
    /// root), the file can't be opened, or the kevent registration
    /// fails. Calls should be cheap (one table scan + open + 1 kevent)
    /// so it's safe to invoke on every OPEN/SETUPMAPPING.
    ///
    /// An existing watch for the same nodeid is only replaced once the
    /// new registration has succeeded, so a transient failure does not
    /// lose change notifications for the file.
    fn watch_inode(&self, nodeid: u64, path: &std::path::Path) {
        let watcher = self.watcher.lock().unwrap();
        let Some(w) = watcher.as_ref() else { return };

        // Look up parent_nodeid + name so the watcher thread can emit
        // FUSE_NOTIFY_INVAL_ENTRY on rename/delete. Required to flush
        // the guest's dentry cache (1 s default TTL otherwise).
        // Done before opening anything so a miss costs no syscalls.
        let (parent_nodeid, name) = {
            let st = self.st.lock().unwrap();
            // Reverse-lookup in the (parent, name) → nodeid table.
            let found = st
                .children
                .iter()
                .find(|(_, id)| **id == nodeid)
                .map(|((p, n), _)| (*p, n.clone()));
            // Root or unknown — skip watching altogether.
            let Some(found) = found else { return };
            found
        };

        // Open a dedicated fd so the watch persists independent of
        // the guest's open file handle.
        let Ok(c) = CString::new(path.as_os_str().as_bytes()) else {
            return;
        };
        // SAFETY: `c` is a valid NUL-terminated path.
        let fd = unsafe { libc::open(c.as_ptr(), libc::O_RDONLY | libc::O_EVTONLY) };
        if fd < 0 {
            return;
        }
        // SAFETY: `fd` was just returned by open() and is owned by us.
        let owned = unsafe { OwnedFd::from_raw_fd(fd) };

        // Hold the table lock across registration + insert so the
        // watcher thread cannot see an event for `fd` before the entry
        // describing it exists.
        let mut watched = w.inner.watched.lock().unwrap();

        let ev = libc::kevent {
            ident: fd as libc::uintptr_t,
            filter: libc::EVFILT_VNODE,
            flags: libc::EV_ADD | libc::EV_CLEAR,
            fflags: libc::NOTE_DELETE
                | libc::NOTE_RENAME
                | libc::NOTE_WRITE
                | libc::NOTE_EXTEND
                | libc::NOTE_ATTRIB,
            data: 0,
            udata: std::ptr::null_mut(),
        };
        // SAFETY: `ev` is a valid one-element changelist; no event list
        // is supplied so nothing is written back.
        let rc = unsafe {
            libc::kevent(
                w.inner.kq,
                &ev,
                1,
                std::ptr::null_mut(),
                0,
                std::ptr::null(),
            )
        };
        if rc < 0 {
            // `owned` is dropped here; any previous watch stays active.
            return;
        }

        // Already watching this nodeid? Drop old (closing its fd removes
        // the kernel registration), install new.
        watched.retain(|_, e| e.nodeid != nodeid);
        watched.insert(
            fd,
            WatchedEntry {
                nodeid,
                parent_nodeid,
                name,
                _owned_fd: owned,
            },
        );
    }

    /// Host path recorded for `nodeid`, or `ENOENT` if the nodeid is
    /// not in the inode table.
    fn host_path_of(&self, nodeid: u64) -> Result<PathBuf, Errno> {
        let guard = self.st.lock().unwrap();
        match guard.inodes.get(&nodeid) {
            Some(info) => Ok(info.host_path.clone()),
            None => Err(ENOENT),
        }
    }

    /// Cached object kind for `nodeid`, or `ENOENT` if the nodeid is
    /// not in the inode table.
    fn kind_of(&self, nodeid: u64) -> Result<Kind, Errno> {
        let guard = self.st.lock().unwrap();
        match guard.inodes.get(&nodeid) {
            Some(info) => Ok(info.kind),
            None => Err(ENOENT),
        }
    }
}

/// Negated `errno` of the most recent failed OS call on this thread;
/// falls back to `-EIO` when no raw OS code is available.
fn errno_now() -> Errno {
    let code = std::io::Error::last_os_error().raw_os_error();
    -(code.unwrap_or(libc::EIO))
}

fn attr_from_meta(ino: u64, md: &std::fs::Metadata) -> Attr {
    let mode_full = md.mode();
    let perm_bits = mode_full & 0o7777;
    let typ_bits = if md.is_dir() {
        S_IFDIR
    } else if md.is_file() {
        S_IFREG
    } else if md.file_type().is_symlink() {
        S_IFLNK
    } else {
        match mode_full & S_IFMT {
            S_IFBLK => S_IFBLK,
            S_IFCHR => S_IFCHR,
            S_IFIFO => S_IFIFO,
            S_IFSOCK => S_IFSOCK,
            _ => 0,
        }
    };
    Attr {
        ino,
        size: md.size(),
        blocks: md.blocks(),
        atime: md.atime() as u64,
        mtime: md.mtime() as u64,
        ctime: md.ctime() as u64,
        atimensec: md.atime_nsec() as u32,
        mtimensec: md.mtime_nsec() as u32,
        ctimensec: md.ctime_nsec() as u32,
        mode: typ_bits | perm_bits,
        nlink: md.nlink() as u32,
        uid: md.uid(),
        gid: md.gid(),
        rdev: md.rdev() as u32,
        blksize: md.blksize() as u32,
        flags: 0,
    }
}

/// Classify host metadata into the backend's coarse `Kind`.
/// Directory wins over file, file over symlink; everything else is
/// `Kind::Other`.
fn kind_from_meta(md: &std::fs::Metadata) -> Kind {
    let ft = md.file_type();
    match (ft.is_dir(), ft.is_file(), ft.is_symlink()) {
        (true, _, _) => Kind::Dir,
        (_, true, _) => Kind::File,
        (_, _, true) => Kind::Symlink,
        _ => Kind::Other,
    }
}

/// Validate a single path component supplied by the guest.
///
/// Rejects (with `EINVAL`) the empty name, `.`, `..`, and anything
/// containing a `/`, so a name can never step outside its parent.
fn name_safe(name: &OsStr) -> Result<(), Errno> {
    match name.as_bytes() {
        [] | [b'.'] | [b'.', b'.'] => Err(EINVAL),
        component if component.contains(&b'/') => Err(EINVAL),
        _ => Ok(()),
    }
}

impl FsBackend for PosixFs {
    /// Resolve `name` inside directory `parent` and return its entry.
    ///
    /// The host object is stat'ed (following symlinks) on every call.
    /// A nodeid is allocated the first time a (parent, name) pair is
    /// seen and reused afterwards. Both validity timeouts are 1.
    fn lookup(&self, parent: u64, name: &OsStr) -> Result<Entry, Errno> {
        name_safe(name)?;
        let path = self.host_path_of(parent)?.join(name);
        let md = match std::fs::metadata(&path) {
            Ok(md) => md,
            Err(e) => return Err(-e.raw_os_error().unwrap_or(libc::EIO)),
        };

        let mut st = self.st.lock().unwrap();
        let key = (parent, name.as_bytes().to_vec());
        let cached = st.children.get(&key).copied();
        // First sighting of this child: mint a nodeid and record it in
        // both the inode table and the lookup cache.
        let nodeid = cached.unwrap_or_else(|| {
            let fresh = st.next_nodeid;
            st.next_nodeid += 1;
            st.inodes.insert(
                fresh,
                InodeInfo {
                    host_path: path,
                    kind: kind_from_meta(&md),
                },
            );
            st.children.insert(key, fresh);
            fresh
        });

        Ok(Entry {
            nodeid,
            generation: 0,
            attr: attr_from_meta(nodeid, &md),
            entry_valid: 1,
            attr_valid: 1,
        })
    }

    /// FORGET handler: intentionally a no-op. Both arguments are
    /// ignored and the inode table is never pruned.
    fn forget(&self, _nodeid: u64, _nlookup: u64) {
        // We retain inode entries indefinitely for path stability.
        // Real production would reference-count and prune. Tests
        // don't depend on prune so we no-op.
    }

    /// Fresh attributes for `nodeid`, obtained by stat'ing its recorded
    /// host path (symlinks followed). The file handle is not consulted.
    fn getattr(&self, nodeid: u64, _fh: Option<u64>) -> Result<Attr, Errno> {
        let host_path = self.host_path_of(nodeid)?;
        match std::fs::metadata(&host_path) {
            Ok(md) => Ok(attr_from_meta(nodeid, &md)),
            Err(e) => Err(-e.raw_os_error().unwrap_or(libc::EIO)),
        }
    }

    /// Open the file behind `nodeid` and return a new file handle.
    ///
    /// Directories are refused with `EISDIR`. Only the access-mode bits
    /// of `flags` (`O_ACCMODE`) reach the host `open`; every other flag,
    /// including CREAT/EXCL, is masked off because OPEN must not create.
    fn open(&self, nodeid: u64, flags: u32) -> Result<u64, Errno> {
        let path = self.host_path_of(nodeid)?;
        if matches!(self.kind_of(nodeid)?, Kind::Dir) {
            return Err(EISDIR);
        }
        // Best-effort kqueue watch so host-side edits are pushed to the
        // guest as invalidations.
        self.watch_inode(nodeid, &path);

        let c_path = CString::new(path.as_os_str().as_bytes()).map_err(|_| EINVAL)?;
        // SAFETY: `c_path` is a valid NUL-terminated string.
        let fd = unsafe { libc::open(c_path.as_ptr(), flags as i32 & libc::O_ACCMODE) };
        if fd < 0 {
            return Err(errno_now());
        }
        // SAFETY: open returned a valid fd above.
        let owned = unsafe { OwnedFd::from_raw_fd(fd) };

        let mut st = self.st.lock().unwrap();
        let fh = st.next_fh;
        st.next_fh = fh + 1;
        st.handles.insert(fh, owned);
        Ok(fh)
    }

    /// Read up to `size` bytes at `offset` from the file behind `fh`.
    ///
    /// Returns the bytes actually read (possibly fewer than requested,
    /// empty at EOF). `EBADF` if `fh` is unknown. The state lock is
    /// released before the `pread` call.
    fn read(&self, _nodeid: u64, fh: u64, offset: u64, size: u32) -> Result<Vec<u8>, Errno> {
        let raw = match self.st.lock().unwrap().handles.get(&fh) {
            Some(handle) => handle.as_raw_fd(),
            None => return Err(EBADF),
        };
        let mut buf = vec![0u8; size as usize];
        // SAFETY: `buf` is writable for `buf.len()` bytes.
        let got = unsafe { libc::pread(raw, buf.as_mut_ptr().cast(), buf.len(), offset as libc::off_t) };
        if got < 0 {
            return Err(errno_now());
        }
        buf.truncate(got as usize);
        Ok(buf)
    }

    /// Drop file handle `fh`, closing its descriptor. `EBADF` if the
    /// handle is unknown.
    fn release(&self, _nodeid: u64, fh: u64) -> Result<(), Errno> {
        match self.st.lock().unwrap().handles.remove(&fh) {
            Some(_closed) => Ok(()),
            None => Err(EBADF),
        }
    }

    /// Write `data` at `offset` into the file behind `fh` with a single
    /// `pwrite`, returning the number of bytes the host accepted (which
    /// may be short). `EBADF` if `fh` is unknown.
    fn write(&self, _nodeid: u64, fh: u64, offset: u64, data: &[u8]) -> Result<u32, Errno> {
        let raw = match self.st.lock().unwrap().handles.get(&fh) {
            Some(handle) => handle.as_raw_fd(),
            None => return Err(EBADF),
        };
        // SAFETY: `data` is readable for `data.len()` bytes.
        let written = unsafe { libc::pwrite(raw, data.as_ptr().cast(), data.len(), offset as libc::off_t) };
        if written < 0 {
            Err(errno_now())
        } else {
            Ok(written as u32)
        }
    }

    /// Flush the file behind `fh` to the host with `fsync(2)`.
    ///
    /// `datasync` is accepted for interface compatibility but makes no
    /// difference: macOS doesn't have fdatasync, and fsync is the
    /// stronger form, so both modes issue the same call.
    /// `EBADF` if `fh` is unknown; otherwise the host errno on failure.
    fn fsync(&self, _nodeid: u64, fh: u64, datasync: bool) -> Result<(), Errno> {
        let _ = datasync;
        let raw = match self.st.lock().unwrap().handles.get(&fh) {
            Some(handle) => handle.as_raw_fd(),
            None => return Err(EBADF),
        };
        // SAFETY: plain syscall on an integer fd.
        if unsafe { libc::fsync(raw) } != 0 {
            return Err(errno_now());
        }
        Ok(())
    }

    /// Open the directory behind `nodeid` (read-only, `O_DIRECTORY`) and
    /// return a new handle from the shared handle table. The guest's
    /// flags are ignored.
    fn opendir(&self, nodeid: u64, _flags: u32) -> Result<u64, Errno> {
        let dir_path = self.host_path_of(nodeid)?;
        let c_path = CString::new(dir_path.as_os_str().as_bytes()).map_err(|_| EINVAL)?;
        // SAFETY: `c_path` is a valid NUL-terminated string.
        let fd = unsafe { libc::open(c_path.as_ptr(), libc::O_RDONLY | libc::O_DIRECTORY) };
        if fd < 0 {
            return Err(errno_now());
        }
        // SAFETY: `fd` was just returned by a successful open().
        let owned = unsafe { OwnedFd::from_raw_fd(fd) };

        let mut st = self.st.lock().unwrap();
        let fh = st.next_fh;
        st.next_fh = fh + 1;
        st.handles.insert(fh, owned);
        Ok(fh)
    }

    /// List the directory behind `nodeid`, skipping the first `offset`
    /// positions of the host enumeration.
    ///
    /// The listing is rebuilt from the host on every call (no per-fh
    /// cache); `_fh` and `_size` are not used. Entries that fail to
    /// read are dropped but still consume a position. The reported
    /// `ino` is the host's inode number (0 if it cannot be read): a
    /// nodeid only exists once the guest issues a LOOKUP.
    fn readdir(
        &self,
        nodeid: u64,
        _fh: u64,
        offset: u64,
        _size: u32,
    ) -> Result<Vec<DirEntry>, Errno> {
        let dir_path = self.host_path_of(nodeid)?;
        let listing = match std::fs::read_dir(&dir_path) {
            Ok(it) => it,
            Err(e) => return Err(-e.raw_os_error().unwrap_or(libc::EIO)),
        };

        let entries = listing
            .enumerate()
            .skip_while(|(idx, _)| (*idx as u64) < offset)
            .filter_map(|(_, res)| res.ok())
            .map(|entry| {
                let typ = match entry.file_type() {
                    Err(_) => DT_UNKNOWN,
                    Ok(ft) => {
                        if ft.is_dir() {
                            DT_DIR
                        } else if ft.is_file() {
                            DT_REG
                        } else if ft.is_symlink() {
                            DT_LNK
                        } else if ft.is_block_device() {
                            DT_BLK
                        } else if ft.is_char_device() {
                            DT_CHR
                        } else if ft.is_fifo() {
                            DT_FIFO
                        } else if ft.is_socket() {
                            DT_SOCK
                        } else {
                            DT_UNKNOWN
                        }
                    }
                };
                let ino = entry.metadata().map(|m| m.ino()).unwrap_or(0);
                DirEntry {
                    ino,
                    name: entry.file_name().as_bytes().to_vec(),
                    typ,
                }
            })
            .collect();
        Ok(entries)
    }

    /// Drop directory handle `fh`, closing its descriptor. `EBADF` if
    /// the handle is unknown.
    fn releasedir(&self, _nodeid: u64, fh: u64) -> Result<(), Errno> {
        match self.st.lock().unwrap().handles.remove(&fh) {
            Some(_closed) => Ok(()),
            None => Err(EBADF),
        }
    }

    /// Report host filesystem statistics for the volume containing
    /// `nodeid`'s path. `namelen` is fixed at 255 and `frsize` mirrors
    /// the block size.
    fn statfs(&self, nodeid: u64) -> Result<StatFs, Errno> {
        let host_path = self.host_path_of(nodeid)?;
        let c_path = CString::new(host_path.as_os_str().as_bytes()).map_err(|_| EINVAL)?;
        // SAFETY: `libc::statfs` is plain data for which all-zero is valid.
        let mut raw: libc::statfs = unsafe { std::mem::zeroed() };
        // SAFETY: both pointers are valid for the duration of the call.
        let rc = unsafe { libc::statfs(c_path.as_ptr(), &mut raw) };
        if rc < 0 {
            return Err(errno_now());
        }
        let block_size = raw.f_bsize as u32;
        Ok(StatFs {
            blocks: raw.f_blocks,
            bfree: raw.f_bfree,
            bavail: raw.f_bavail,
            files: raw.f_files,
            ffree: raw.f_ffree,
            bsize: block_size,
            namelen: 255,
            frsize: block_size,
        })
    }

    /// Atomically create and open `name` inside `parent`.
    ///
    /// The host file is created with `O_CREAT | O_EXCL` plus the access
    /// mode from `flags`, so an existing name fails with the host errno
    /// (`EEXIST`). On success a new nodeid and file handle are
    /// registered and returned together with the entry attributes.
    ///
    /// Attributes come from the descriptor that was just opened (fstat)
    /// rather than from a second path lookup, so they always describe
    /// the file this call created even if the path is renamed or
    /// removed concurrently on the host.
    fn create(
        &self,
        parent: u64,
        name: &OsStr,
        mode: u32,
        flags: u32,
    ) -> Result<(crate::fuse::backend::Entry, u64), Errno> {
        name_safe(name)?;
        let parent_path = self.host_path_of(parent)?;
        let full = parent_path.join(name);
        let c = CString::new(full.as_os_str().as_bytes()).map_err(|_| EINVAL)?;
        let access = flags as i32 & libc::O_ACCMODE;
        // SAFETY: `c` is a valid NUL-terminated path; the mode is passed
        // as the variadic third argument required by O_CREAT.
        let fd = unsafe {
            libc::open(
                c.as_ptr(),
                access | libc::O_CREAT | libc::O_EXCL,
                mode as libc::c_uint,
            )
        };
        if fd < 0 {
            return Err(errno_now());
        }
        // SAFETY: `fd` was just returned by a successful open().
        let file = std::fs::File::from(unsafe { OwnedFd::from_raw_fd(fd) });
        let md = file
            .metadata()
            .map_err(|e| -e.raw_os_error().unwrap_or(libc::EIO))?;
        let owned = OwnedFd::from(file);

        let mut st = self.st.lock().unwrap();
        let nodeid = st.next_nodeid;
        st.next_nodeid += 1;
        st.inodes.insert(
            nodeid,
            InodeInfo {
                host_path: full,
                kind: kind_from_meta(&md),
            },
        );
        st.children.insert((parent, name.as_bytes().to_vec()), nodeid);
        let fh = st.next_fh;
        st.next_fh += 1;
        st.handles.insert(fh, owned);
        let attr = attr_from_meta(nodeid, &md);
        Ok((
            crate::fuse::backend::Entry {
                nodeid,
                generation: 0,
                attr,
                entry_valid: 1,
                attr_valid: 1,
            },
            fh,
        ))
    }

    /// Create directory `name` inside `parent` with permission bits
    /// `mode`, register a nodeid for it and return its entry.
    fn mkdir(&self, parent: u64, name: &OsStr, mode: u32) -> Result<crate::fuse::backend::Entry, Errno> {
        name_safe(name)?;
        let target = self.host_path_of(parent)?.join(name);
        let c_path = CString::new(target.as_os_str().as_bytes()).map_err(|_| EINVAL)?;
        // SAFETY: `c_path` is a valid NUL-terminated string.
        if unsafe { libc::mkdir(c_path.as_ptr(), mode as libc::mode_t) } != 0 {
            return Err(errno_now());
        }
        let md = match std::fs::metadata(&target) {
            Ok(md) => md,
            Err(e) => return Err(-e.raw_os_error().unwrap_or(libc::EIO)),
        };

        let mut st = self.st.lock().unwrap();
        let nodeid = st.next_nodeid;
        st.next_nodeid = nodeid + 1;
        let info = InodeInfo {
            host_path: target,
            kind: Kind::Dir,
        };
        st.inodes.insert(nodeid, info);
        st.children.insert((parent, name.as_bytes().to_vec()), nodeid);

        let attr = attr_from_meta(nodeid, &md);
        Ok(crate::fuse::backend::Entry {
            nodeid,
            generation: 0,
            attr,
            entry_valid: 1,
            attr_valid: 1,
        })
    }

    /// Remove file `name` from `parent` on the host and forget the
    /// cached (parent, name) → nodeid mapping. The inode table entry is
    /// left in place.
    fn unlink(&self, parent: u64, name: &OsStr) -> Result<(), Errno> {
        name_safe(name)?;
        let target = self.host_path_of(parent)?.join(name);
        let c_path = CString::new(target.as_os_str().as_bytes()).map_err(|_| EINVAL)?;
        // SAFETY: `c_path` is a valid NUL-terminated string.
        if unsafe { libc::unlink(c_path.as_ptr()) } != 0 {
            return Err(errno_now());
        }
        let key = (parent, name.as_bytes().to_vec());
        self.st.lock().unwrap().children.remove(&key);
        Ok(())
    }

    /// Remove directory `name` from `parent` on the host and forget the
    /// cached (parent, name) → nodeid mapping. The inode table entry is
    /// left in place.
    fn rmdir(&self, parent: u64, name: &OsStr) -> Result<(), Errno> {
        name_safe(name)?;
        let target = self.host_path_of(parent)?.join(name);
        let c_path = CString::new(target.as_os_str().as_bytes()).map_err(|_| EINVAL)?;
        // SAFETY: `c_path` is a valid NUL-terminated string.
        if unsafe { libc::rmdir(c_path.as_ptr()) } != 0 {
            return Err(errno_now());
        }
        let key = (parent, name.as_bytes().to_vec());
        self.st.lock().unwrap().children.remove(&key);
        Ok(())
    }

    /// Map `len` bytes of a host file starting at `foffset` and return
    /// the host virtual address; the mapping is tracked until
    /// `dax_unmap` (or until the backend state is dropped).
    ///
    /// * `fh == u64::MAX` — inode-level SETUPMAPPING (the guest's
    ///   iomap-driven `dax=always` read path has no userspace fd): the
    ///   file is opened on demand from the inode's host path, RDWR
    ///   first, RDONLY as a fallback for files we can't open RW.
    /// * otherwise `fh` must be a handle from a prior FUSE_OPEN
    ///   (`EBADF` if unknown).
    ///
    /// The host mapping is always PROT_READ|PROT_WRITE (HVF requires
    /// writable host backing; guest-side stage-2 protection enforces
    /// read-only semantics). `prot` is therefore not used here.
    ///
    /// A shared writable mapping needs a descriptor opened for writing.
    /// When the descriptor is read-only the kernel rejects MAP_SHARED
    /// with EACCES; in that case we retry with MAP_PRIVATE so read-only
    /// files (and read-only handles) can still be mapped. Writes to a
    /// private mapping never reach the file.
    fn dax_map(
        &self,
        nodeid: u64,
        fh: u64,
        foffset: u64,
        len: u64,
        prot: u32,
    ) -> Result<*mut u8, Errno> {
        // `_fresh_fd` keeps an on-demand descriptor alive until mmap has
        // run; the kernel holds its own reference afterwards, so letting
        // it drop at the end of this function is safe.
        let (raw, _fresh_fd) = if fh == u64::MAX {
            let path = self.host_path_of(nodeid)?;
            let c = CString::new(path.as_os_str().as_bytes()).map_err(|_| EINVAL)?;
            // SAFETY: `c` is a valid NUL-terminated path.
            let mut fd = unsafe { libc::open(c.as_ptr(), libc::O_RDWR) };
            if fd < 0 {
                // SAFETY: as above.
                fd = unsafe { libc::open(c.as_ptr(), libc::O_RDONLY) };
            }
            if fd < 0 {
                return Err(errno_now());
            }
            // SAFETY: fd is fresh from open().
            let owned = unsafe { OwnedFd::from_raw_fd(fd) };
            (owned.as_raw_fd(), Some(owned))
        } else {
            let st = self.st.lock().unwrap();
            let raw = st.handles.get(&fh).ok_or(EBADF)?.as_raw_fd();
            (raw, None)
        };

        let _ = prot; // recorded for hv_vm_map elsewhere
        let host_prot = libc::PROT_READ | libc::PROT_WRITE;
        let map_with = |flags: libc::c_int| {
            // SAFETY: null hint, kernel picks the address; `raw` is an
            // open descriptor for the duration of the call.
            unsafe {
                libc::mmap(
                    std::ptr::null_mut(),
                    len as usize,
                    host_prot,
                    flags,
                    raw,
                    foffset as libc::off_t,
                )
            }
        };

        let mut ptr = map_with(libc::MAP_SHARED);
        if ptr == libc::MAP_FAILED {
            let os_err = std::io::Error::last_os_error().raw_os_error();
            if os_err != Some(libc::EACCES) {
                return Err(-(os_err.unwrap_or(libc::EIO)));
            }
            // Read-only descriptor: fall back to a private mapping.
            ptr = map_with(libc::MAP_PRIVATE);
            if ptr == libc::MAP_FAILED {
                return Err(errno_now());
            }
        }

        let mut st = self.st.lock().unwrap();
        st.dax_mmaps.insert(
            ptr as usize,
            Mmap {
                ptr: ptr as *mut u8,
                len: len as usize,
            },
        );
        Ok(ptr as *mut u8)
    }

    /// Tear down the DAX mapping that starts at `host_va`.
    ///
    /// The length recorded at map time is used for `munmap`; `_len` is
    /// ignored. `EINVAL` if no mapping is registered at that address.
    /// The tracking entry is removed before `munmap` is attempted.
    fn dax_unmap(&self, _nodeid: u64, host_va: *mut u8, _len: u64) -> Result<(), Errno> {
        let mut st = self.st.lock().unwrap();
        let Some(region) = st.dax_mmaps.remove(&(host_va as usize)) else {
            return Err(EINVAL);
        };
        // SAFETY: pointer/length pair recorded by `dax_map`.
        match unsafe { libc::munmap(region.ptr.cast(), region.len) } {
            0 => Ok(()),
            _ => Err(errno_now()),
        }
    }
}

/// Body of the watcher thread.
///
/// Blocks in `kevent()` on the shared kqueue and, for every vnode event
/// on a watched fd, asks the notifier to invalidate the inode's data,
/// its attributes and its dentry. Exits when the stop flag is set or
/// when `kevent()` fails with anything other than EINTR.
fn run_watcher(inner: Arc<WatcherInner>) {
    let kq = inner.kq;

    // Pre-register the USER event that `Watcher::drop` triggers to wake us.
    let wake_reg = libc::kevent {
        ident: 0,
        filter: libc::EVFILT_USER,
        flags: libc::EV_ADD | libc::EV_CLEAR,
        fflags: 0,
        data: 0,
        udata: std::ptr::null_mut(),
    };
    // SAFETY: one-element changelist, no event list.
    unsafe {
        libc::kevent(kq, &wake_reg, 1, std::ptr::null_mut(), 0, std::ptr::null());
    }

    // SAFETY: `libc::kevent` is plain data; all-zero is a valid value.
    let mut events: [libc::kevent; 16] = unsafe { std::mem::zeroed() };
    while !inner.stop.load(Ordering::Acquire) {
        // SAFETY: `events` is writable for `events.len()` entries; a
        // null timeout blocks until something is ready.
        let ready = unsafe {
            libc::kevent(
                kq,
                std::ptr::null(),
                0,
                events.as_mut_ptr(),
                events.len() as libc::c_int,
                std::ptr::null(),
            )
        };
        if ready < 0 {
            let err = std::io::Error::last_os_error();
            if err.raw_os_error() == Some(libc::EINTR) {
                continue;
            }
            eprintln!("[posix-fs watcher] kevent failed: {err}; thread exiting");
            return;
        }
        // Re-check after waking: the wake-up may be the shutdown poke.
        if inner.stop.load(Ordering::Acquire) {
            break;
        }

        for ev in events.iter().take(ready as usize) {
            if ev.filter == libc::EVFILT_USER {
                continue;
            }
            // The kqueue ident is the watch fd, which keys the table.
            let fd = ev.ident as libc::c_int;
            let target = inner
                .watched
                .lock()
                .unwrap()
                .get(&fd)
                .map(|w| (w.nodeid, w.parent_nodeid, w.name.clone()));
            let Some((nodeid, parent_nodeid, name)) = target else {
                continue;
            };

            {
                let guard = inner.notifier.lock().unwrap();
                if let Some(sink) = guard.as_ref() {
                    // Three-step invalidation, all of which are needed
                    // for an atomic rename to look synchronous to the
                    // guest's userspace:
                    //   (0, -1)     — data pages
                    //   (0, 0)      — attributes (FUSE's sentinel form)
                    //   INVAL_ENTRY — the dentry, so a re-open by name
                    //                 reaches the NEW inode.
                    sink.invalidate_inode(nodeid, 0, -1);
                    sink.invalidate_inode(nodeid, 0, 0);
                    sink.invalidate_entry(parent_nodeid, &name);
                }
            }

            // A deleted or renamed file will never fire again under
            // this name; drop the watch so its fd doesn't linger.
            let gone = ev.fflags & (libc::NOTE_DELETE | libc::NOTE_RENAME) != 0;
            if gone {
                inner.watched.lock().unwrap().remove(&fd);
            }
        }
    }
}

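// Late imports. `FileTypeExt` provides the is_block_device / is_char_device /
// is_fifo / is_socket checks used by `readdir`; the remaining items below only
// exist to keep otherwise-unused imports from producing warnings.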
use std::os::unix::fs::FileTypeExt;
#[allow(unused_imports)]
use std::convert::TryFrom;
// Compile-time-only block that references imports which are otherwise
// unused in the code visible here, so they do not trigger
// unused-import warnings.
#[allow(dead_code)]
const _: () = {
    let _ = OsString::new;
    let _ = ENOSPC;
    let _ = ENOTDIR;
    let _ = EACCES;
    let _ = EIO;
};

#[cfg(test)]
mod tests {
    use super::*;

    /// Create an empty scratch directory unique to this process and
    /// test name, wiping any leftover from a previous run.
    fn tmpdir(name: &str) -> PathBuf {
        let pid = unsafe { libc::getpid() };
        let scratch = std::env::temp_dir().join(format!("posixfs-{pid}-{name}"));
        let _ = std::fs::remove_dir_all(&scratch);
        std::fs::create_dir_all(&scratch).unwrap();
        scratch
    }

    /// LOOKUP + OPEN + READ + RELEASE against a real host file.
    #[test]
    fn lookup_and_read_real_file() {
        let root = tmpdir("t1");
        std::fs::write(root.join("hello.txt"), b"hi from posix").unwrap();
        let fs = PosixFs::new(&root).unwrap();

        let entry = fs.lookup(FUSE_ROOT_ID, OsStr::new("hello.txt")).unwrap();
        assert_eq!(entry.attr.size, 13);

        let fh = fs.open(entry.nodeid, libc::O_RDONLY as u32).unwrap();
        let contents = fs.read(entry.nodeid, fh, 0, 64).unwrap();
        assert_eq!(contents, b"hi from posix");
        fs.release(entry.nodeid, fh).unwrap();
    }

    /// READDIR reports both entries with the right d_type.
    #[test]
    fn readdir_lists_real_entries_with_types() {
        let root = tmpdir("t2");
        std::fs::write(root.join("a.txt"), b"a").unwrap();
        std::fs::create_dir_all(root.join("sub")).unwrap();
        let fs = PosixFs::new(&root).unwrap();

        let dh = fs.opendir(FUSE_ROOT_ID, 0).unwrap();
        let listing = fs.readdir(FUSE_ROOT_ID, dh, 0, 4096).unwrap();
        let mut types: std::collections::HashMap<&[u8], u32> = std::collections::HashMap::new();
        for item in &listing {
            types.insert(item.name.as_slice(), item.typ);
        }
        assert_eq!(types[&b"a.txt"[..]], DT_REG);
        assert_eq!(types[&b"sub"[..]], DT_DIR);
        fs.releasedir(FUSE_ROOT_ID, dh).unwrap();
    }

    /// `..` must never resolve.
    #[test]
    fn lookup_rejects_dotdot() {
        let fs = PosixFs::new(tmpdir("t3")).unwrap();
        let outcome = fs.lookup(FUSE_ROOT_ID, OsStr::new(".."));
        assert_eq!(outcome.unwrap_err(), EINVAL);
    }

    /// Names are single components; an embedded slash is invalid.
    #[test]
    fn lookup_rejects_slash_in_name() {
        let fs = PosixFs::new(tmpdir("t4")).unwrap();
        let outcome = fs.lookup(FUSE_ROOT_ID, OsStr::new("a/b"));
        assert_eq!(outcome.unwrap_err(), EINVAL);
    }

    /// Map a file through dax_map, check the bytes, unmap, and verify
    /// the mapping is no longer tracked.
    #[test]
    fn dax_map_then_unmap_round_trip() {
        const LEN: usize = 32 * 1024;
        let root = tmpdir("t5");
        // 32 KiB file with known pattern.
        let pattern: Vec<u8> = (0..LEN).map(|i| (i % 251) as u8).collect();
        std::fs::write(root.join("data.bin"), &pattern).unwrap();

        let fs = PosixFs::new(&root).unwrap();
        let entry = fs.lookup(FUSE_ROOT_ID, OsStr::new("data.bin")).unwrap();
        let fh = fs.open(entry.nodeid, libc::O_RDWR as u32).unwrap();

        // mmap the whole file via dax_map.
        let host_va = fs.dax_map(entry.nodeid, fh, 0, LEN as u64, 0).unwrap();
        assert!(!host_va.is_null());
        // Verify mmap contents match the file we wrote.
        let mapped = unsafe { std::slice::from_raw_parts(host_va, LEN) };
        assert_eq!(mapped, &pattern[..]);

        // Unmap — must succeed and clear internal tracking.
        fs.dax_unmap(entry.nodeid, host_va, LEN as u64).unwrap();
        // Calling dax_unmap on a host_va we don't know about must error.
        let second = fs.dax_unmap(entry.nodeid, host_va, LEN as u64);
        assert_eq!(second.unwrap_err(), EINVAL);
        fs.release(entry.nodeid, fh).unwrap();
    }

    /// Reading past EOF yields an empty buffer, not an error.
    #[test]
    fn read_eof_returns_empty() {
        let root = tmpdir("t6");
        std::fs::write(root.join("x"), b"short").unwrap();
        let fs = PosixFs::new(&root).unwrap();

        let entry = fs.lookup(FUSE_ROOT_ID, OsStr::new("x")).unwrap();
        let fh = fs.open(entry.nodeid, libc::O_RDONLY as u32).unwrap();
        let past_end = fs.read(entry.nodeid, fh, 100, 10).unwrap();
        assert!(past_end.is_empty());
        fs.release(entry.nodeid, fh).unwrap();
    }

    /// OPEN on a directory nodeid is refused with EISDIR.
    #[test]
    fn open_directory_returns_eisdir() {
        let root = tmpdir("t7");
        std::fs::create_dir_all(root.join("sub")).unwrap();
        let fs = PosixFs::new(&root).unwrap();

        let entry = fs.lookup(FUSE_ROOT_ID, OsStr::new("sub")).unwrap();
        let outcome = fs.open(entry.nodeid, libc::O_RDONLY as u32);
        assert_eq!(outcome.unwrap_err(), EISDIR);
    }
}