Skip to main content

libfuse_fs/passthrough/
mod.rs

1#![allow(clippy::useless_conversion)]
2use config::{CachePolicy, Config};
3#[cfg(target_os = "linux")]
4use file_handle::{FileHandle, OpenableFileHandle};
5
6#[cfg(target_os = "macos")]
7use self::statx::statx_timestamp;
8use futures::executor::block_on;
9use inode_store::{InodeId, InodeStore};
10#[cfg(target_os = "linux")]
11use libc::{self, statx_timestamp};
12
13use moka::future::Cache;
14use rfuse3::{Errno, raw::reply::ReplyEntry};
15use uuid::Uuid;
16
17use crate::passthrough::mmap::{MmapCachedValue, MmapChunkKey};
18use crate::util::convert_stat64_to_file_attr;
19#[cfg(target_os = "linux")]
20use mount_fd::MountFds;
21use statx::StatExt;
22use std::cmp;
23use std::io::Result;
24use std::ops::DerefMut;
25#[cfg(target_os = "macos")]
26use std::os::fd::FromRawFd;
27use std::os::unix::ffi::OsStrExt;
28use std::path::Path;
29#[cfg(target_os = "macos")]
30use std::sync::Mutex as StdMutex;
31use tracing::error;
32use tracing::{debug, warn};
33
34#[cfg(target_os = "macos")]
35use std::num::NonZeroUsize;
36#[cfg(target_os = "macos")]
37use std::sync::Weak;
38use std::sync::atomic::{AtomicBool, AtomicU32};
39use std::{
40    collections::{BTreeMap, btree_map},
41    ffi::{CStr, CString, OsString},
42    fs::File,
43    io::{self, Error},
44    marker::PhantomData,
45    os::{
46        fd::{AsFd, AsRawFd, BorrowedFd, RawFd},
47        unix::ffi::OsStringExt,
48    },
49    path::PathBuf,
50    sync::Arc,
51    sync::atomic::{AtomicU64, Ordering},
52    time::Duration,
53};
54use util::{
55    UniqueInodeGenerator, ebadf, is_dir, openat, reopen_fd_through_proc, stat_fd,
56    validate_path_component,
57};
58
59use vm_memory::bitmap::BitmapSlice;
60
61use nix::sys::resource::{Resource, getrlimit, setrlimit};
62
63pub mod async_io;
64pub mod config;
65#[cfg(target_os = "linux")]
66mod file_handle;
67mod inode_store;
68mod mmap;
69#[cfg(target_os = "linux")]
70mod mount_fd;
71mod os_compat;
72mod statx;
73pub mod util;
74
/// NUL-terminated byte string naming the current directory (`.`).
pub const CURRENT_DIR_CSTR: &[u8] = b".\0";
/// NUL-terminated byte string naming the parent directory (`..`).
pub const PARENT_DIR_CSTR: &[u8] = b"..\0";
/// Upper bound on inode numbers: 56 set bits (`2^56 - 1`).
// NOTE(review): where this bound is enforced is outside this part of the file.
pub const VFS_MAX_INO: u64 = 0xff_ffff_ffff_ffff;
/// Path to `/proc/self/mountinfo`, consumed by the Linux-only file-handle
/// path (`mount_fd::MountFds`). Linux-only — macOS lacks `/proc`.
#[cfg(target_os = "linux")]
const MOUNT_INFO_FILE: &str = "/proc/self/mountinfo";
/// Empty NUL-terminated byte string.
pub const EMPTY_CSTR: &[u8] = b"\0";
/// NUL-terminated path of the per-process fd directory (`/proc/self/fd` on Linux).
#[cfg(target_os = "linux")]
pub const PROC_SELF_FD_CSTR: &[u8] = b"/proc/self/fd\0";
/// NUL-terminated path of the per-process fd directory (`/dev/fd` on macOS).
#[cfg(target_os = "macos")]
pub const PROC_SELF_FD_CSTR: &[u8] = b"/dev/fd\0";
/// Inode number `1`; presumably the id assigned to the file system root — confirm at the use sites.
pub const ROOT_ID: u64 = 1;
use tokio::sync::{Mutex, MutexGuard, RwLock};

// Presumably the `minimum` handed to `raise_nofile_soft_limit` — TODO confirm at the call site.
const MIN_PASSTHROUGH_NOFILE_SOFT_LIMIT: u64 = 8192;
// NOTE(review): looks like headroom of descriptors kept free for non-inode use; the
// consumer is not visible in this part of the file.
const RESERVED_FILE_DESCRIPTORS: u64 = 64;
94
/// Locks `mutex` and returns the guard even when the mutex is poisoned.
///
/// A poisoned lock only means an earlier holder panicked; the guard carried by
/// the poison error is unwrapped and handed back instead of panicking.
#[cfg(target_os = "macos")]
fn recover_std_mutex<T>(mutex: &StdMutex<T>) -> std::sync::MutexGuard<'_, T> {
    match mutex.lock() {
        Ok(guard) => guard,
        Err(poisoned) => poisoned.into_inner(),
    }
}
101
/// Arguments consumed by `new_passthroughfs_layer`.
#[derive(Debug, Clone)]
pub struct PassthroughArgs<P: AsRef<Path>, M: AsRef<str>> {
    /// Directory to expose; copied into `Config::root_dir`.
    pub root_dir: P,
    /// Optional mapping specification; when present it is parsed into
    /// `Config::mapping`.
    pub mapping: Option<M>,
}
111
112pub async fn new_passthroughfs_layer<P: AsRef<Path>, M: AsRef<str>>(
113    args: PassthroughArgs<P, M>,
114) -> Result<PassthroughFs> {
115    let mut config = Config {
116        root_dir: args.root_dir.as_ref().to_path_buf(),
117        // enable xattr
118        xattr: true,
119        do_import: true,
120        ..Default::default()
121    };
122    #[cfg(target_os = "macos")]
123    if !config.macos_lazy_inode_fd {
124        // Eager-fd fallback: macOS has no `O_PATH`, so every lookup pins a
125        // real fd. Force TTLs to zero so kernel cache invalidation releases
126        // those references promptly and we don't exhaust the fd table.
127        config.entry_timeout = Duration::ZERO;
128        config.attr_timeout = Duration::ZERO;
129        config.dir_entry_timeout = Some(Duration::ZERO);
130        config.dir_attr_timeout = Some(Duration::ZERO);
131    }
132    if let Some(mapping) = args.mapping {
133        config.mapping = mapping
134            .as_ref()
135            .parse()
136            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))?;
137    }
138
139    let fs = PassthroughFs::<()>::new(config)?;
140
141    #[cfg(target_os = "linux")]
142    if fs.cfg.do_import {
143        fs.import().await?;
144    }
145    #[cfg(target_os = "macos")]
146    {
147        // On macOS, always import for now since we rely on the root node being set up?
148        // Or respect the config.
149        fs.import().await?;
150    }
151
152    Ok(fs)
153}
154
/// Inode number type, e.g. the `InodeData::inode` field.
type Inode = u64;
/// Open-handle identifier; the key type of the map inside `HandleMap`.
type Handle = u64;
157
/// Decides which soft `RLIMIT_NOFILE` value to request, if any.
///
/// Returns `None` when the soft limit already meets `minimum` or when the
/// hard limit leaves no room above the current soft limit. Otherwise returns
/// `minimum`, capped at `hard`.
fn desired_nofile_soft_limit(soft: u64, hard: u64, minimum: u64) -> Option<u64> {
    let below_minimum = soft < minimum;
    let has_headroom = hard > soft;

    (below_minimum && has_headroom).then(|| cmp::min(minimum, hard))
}
165
166fn raise_nofile_soft_limit(minimum: u64) -> u64 {
167    let Ok((soft, hard)) = getrlimit(Resource::RLIMIT_NOFILE) else {
168        return minimum;
169    };
170
171    if let Some(target) = desired_nofile_soft_limit(soft, hard, minimum) {
172        match setrlimit(Resource::RLIMIT_NOFILE, target, hard) {
173            Ok(()) => return target,
174            Err(err) => {
175                warn!(
176                    "passthroughfs: failed to raise RLIMIT_NOFILE from {soft} to {target}: {err}"
177                );
178            }
179        }
180    }
181
182    soft
183}
184
/// Maximum host inode number supported by passthroughfs: 47 set bits
/// (`2^47 - 1`).
const MAX_HOST_INO: u64 = 0x7fff_ffff_ffff;
187
/// Represents the file associated with an inode (`InodeData`).
///
/// Depending on how the inode is backed, the file is one of:
/// * a freshly opened file owned by this value (`Owned`, Linux only);
/// * a borrow of the `File` stored in `InodeData::handle` (`Ref`), valid for
///   as long as that `InodeData` is borrowed;
/// * a shared `Arc<File>` taken from the lazy-fd cache (`Arc`, macOS only).
#[derive(Debug)]
enum InodeFile<'a> {
    /// Freshly opened file, owned by this `InodeFile`. Linux constructs
    /// this from `OpenableFileHandle::open(O_PATH)`; macOS doesn't use it
    /// (lazy mode hands out `Arc(Arc<File>)`, eager mode hands out `Ref`).
    #[cfg(target_os = "linux")]
    Owned(File),
    /// Borrow of a `File` that lives inside `InodeHandle::File`.
    Ref(&'a File),
    /// Shared reference into the lazy-fd cache (`InodeHandle::Reopenable`).
    /// Avoids the per-call `dup(2)` syscall that `Owned` would require — we
    /// just bump the `Arc` refcount and let the caller borrow the underlying
    /// fd. Lifetime is `'static` because the `Arc` itself owns the `File`.
    #[cfg(target_os = "macos")]
    Arc(Arc<File>),
}
210
211impl AsRawFd for InodeFile<'_> {
212    /// Return a file descriptor for this file
213    /// Note: This fd is only valid as long as the `InodeFile` exists.
214    fn as_raw_fd(&self) -> RawFd {
215        match self {
216            #[cfg(target_os = "linux")]
217            Self::Owned(file) => file.as_raw_fd(),
218            Self::Ref(file_ref) => file_ref.as_raw_fd(),
219            #[cfg(target_os = "macos")]
220            Self::Arc(arc) => arc.as_raw_fd(),
221        }
222    }
223}
224
225impl AsFd for InodeFile<'_> {
226    fn as_fd(&self) -> BorrowedFd<'_> {
227        match self {
228            #[cfg(target_os = "linux")]
229            Self::Owned(file) => file.as_fd(),
230            Self::Ref(file_ref) => file_ref.as_fd(),
231            #[cfg(target_os = "macos")]
232            Self::Arc(arc) => arc.as_fd(),
233        }
234    }
235}
236
/// How an `InodeData` refers to its backing host object.
///
/// The set of variants depends on the target OS: `Handle` is compiled only on
/// Linux and `Reopenable` only on macOS.
#[derive(Debug)]
// NOTE(review): no justification is recorded for this `allow`; presumably not
// every variant is constructed on every target — confirm and narrow if possible.
#[allow(dead_code)]
enum InodeHandle {
    // TODO: Remove this variant once we have a way to handle files that are not
    // NOTE(review): the TODO above is cut off mid-sentence in the source;
    // restore the missing condition once it is known.
    /// Holds an open `File` directly.
    File(File),
    /// `name_to_handle_at`/`open_by_handle_at` based identity (Linux-only).
    /// macOS lacks the syscalls; the lazy-fd `Reopenable` variant is used
    /// instead.
    #[cfg(target_os = "linux")]
    Handle(Arc<OpenableFileHandle>),

    /// Lazy-fd inode reference (macOS only, gated by
    /// `Config::macos_lazy_inode_fd`).
    ///
    /// Stores an absolute host path plus an optional cached backing file,
    /// opened on first access via `InodeData::get_file()`. This lets entry/attr
    /// cache TTLs go above zero without pinning a real fd per kernel-cached
    /// inode — `O_PATH` is unavailable on macOS, so the only alternative was
    /// forcing TTL to 0.
    ///
    /// `state` is wrapped in `Arc` so `LazyFdLru` can hold a `Weak` reference
    /// and clear `cached` on eviction without touching the path or removing
    /// the inode from `InodeMap`.
    #[cfg(target_os = "macos")]
    Reopenable {
        state: Arc<StdMutex<ReopenableState>>,
    },
}
265
/// Mutable state behind `InodeHandle::Reopenable` (macOS only).
///
/// Always accessed through the `StdMutex` that wraps it; `LazyFdLru` keeps a
/// `Weak` to that mutex so it can clear `cached` on eviction.
#[cfg(target_os = "macos")]
#[derive(Debug)]
struct ReopenableState {
    /// Absolute host path used to (re)open the backing file. Mutable so that a
    /// successful `rename(2)` can update the cached path; the cached fd is
    /// invalidated alongside the path so the next `get_file()` reopens.
    path: PathBuf,
    /// Lazily-opened backing file shared via `Arc` so `get_file()` returns
    /// `InodeFile::Arc(...)` without a `dup(2)` per call. Held while the inode
    /// is referenced so hot paths (`stat`/`read`/`xattr`) skip per-call
    /// `openat`. Reset by `InodeData::update_lazy_path` or by an LRU
    /// eviction.
    cached: Option<Arc<File>>,
    /// Backref into the global LRU so a successful lazy-open can register
    /// (or promote) this inode and bump the reopen counter. `None` when the
    /// LRU layer is disabled (e.g. `macos_lazy_inode_fd == false`).
    lazy_fd_lru: Option<Arc<LazyFdLru>>,
}
284
/// Bounded LRU of cached lazy-fd backing files (macOS only).
///
/// Each entry holds a `Weak<StdMutex<ReopenableState>>`. When the LRU exceeds
/// its capacity, the oldest entry is popped: if its `Weak` still upgrades, we
/// clear `ReopenableState::cached` so the underlying `Arc<File>` drops. The
/// path is left untouched, so subsequent `get_file()` calls simply reopen.
///
/// Insertions and lookups are `O(1)` on average: the underlying
/// `lru::LruCache` is a `HashMap` plus a doubly-linked list. The guarding
/// mutex is a plain `std::sync::Mutex` (rather than `parking_lot`) to keep
/// the build dependency footprint minimal — observed contention on the LRU
/// mutex is negligible because each operation is a single map mutation.
#[cfg(target_os = "macos")]
pub(crate) struct LazyFdLru {
    /// The LRU itself, keyed by inode number.
    inner: StdMutex<lru::LruCache<Inode, Weak<StdMutex<ReopenableState>>>>,
    /// Number of times a cached fd had to be (re)opened. Includes both first
    /// opens and reopens triggered by LRU eviction. Exposed via
    /// `PassthroughFs::macos_lazy_fd_reopen_count` for tests / metrics.
    reopen_count: AtomicU64,
    /// Configured capacity (>= 1). Stored separately so the value is
    /// inspectable without locking the LruCache.
    cap: NonZeroUsize,
}
308
#[cfg(target_os = "macos")]
impl LazyFdLru {
    /// Creates an empty LRU that tracks at most `cap` inodes.
    fn new(cap: NonZeroUsize) -> Self {
        LazyFdLru {
            inner: StdMutex::new(lru::LruCache::new(cap)),
            reopen_count: AtomicU64::new(0),
            cap,
        }
    }

    /// Register a fresh open for `inode`. Promotes if the entry already
    /// exists, otherwise inserts. If insertion pushes the cache over `cap`
    /// the oldest entry is evicted: its `Weak<ReopenableState>` is upgraded
    /// (best effort) and `cached` is cleared so the underlying `Arc<File>`
    /// drops.
    ///
    /// Uses `LruCache::push` (not `put`) because `push` returns the
    /// evicted (key, value) when the cache was full — `put` evicts
    /// silently and gives us no chance to drop the cached fd.
    ///
    /// `push` *also* returns the previous entry when `inode` was already
    /// present (a promotion). In that case the returned `Weak` refers to the
    /// very state being registered, so it must not be treated as an eviction:
    /// clearing it would throw away the fd that was just cached.
    fn touch(&self, inode: Inode, weak: Weak<StdMutex<ReopenableState>>) {
        let displaced = {
            let mut guard = recover_std_mutex(&self.inner);
            let registered = Weak::clone(&weak);
            match guard.push(inode, weak) {
                // Same allocation came back: this was a promotion of the
                // entry we just (re)registered, not an eviction.
                Some((_, previous)) if Weak::ptr_eq(&previous, &registered) => None,
                Some((_, previous)) => Some(previous),
                None => None,
            }
            // The LRU lock is released here, before touching the evicted
            // inode's own mutex, to keep the critical section minimal.
        };

        if let Some(state) = displaced.and_then(|previous| previous.upgrade()) {
            recover_std_mutex(&state).cached = None;
        }
    }

    /// Drop the entry for `inode` (called from `forget_one` once refcount
    /// reaches zero). Idempotent — missing entries are a no-op. Does **not**
    /// touch `ReopenableState::cached`; the inode itself is going away.
    fn remove(&self, inode: Inode) {
        let mut guard = recover_std_mutex(&self.inner);
        let _ = guard.pop(&inode);
    }

    /// Snapshot of the reopen counter. Each lazy `open(2)` triggered by a
    /// missing/evicted cache entry bumps this by one.
    pub(crate) fn reopen_count(&self) -> u64 {
        self.reopen_count.load(Ordering::Relaxed)
    }

    /// Snapshot of the configured cap. Used by tests / metrics surfacing.
    pub(crate) fn cap(&self) -> usize {
        self.cap.get()
    }

    /// Snapshot of the current cache occupancy. Bounded by `cap()`.
    pub(crate) fn len(&self) -> usize {
        recover_std_mutex(&self.inner).len()
    }

    /// Adds one to the reopen counter.
    fn bump_reopen(&self) {
        self.reopen_count.fetch_add(1, Ordering::Relaxed);
    }
}
370
#[cfg(target_os = "macos")]
impl std::fmt::Debug for LazyFdLru {
    /// Prints only the capacity and the reopen counter; the cache contents
    /// (and therefore the LRU lock) are not touched.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let capacity = self.cap.get();
        let reopens = self.reopen_count();

        let mut out = f.debug_struct("LazyFdLru");
        out.field("cap", &capacity);
        out.field("reopen_count", &reopens);
        out.finish()
    }
}
380
impl InodeHandle {
    /// Returns the underlying `FileHandle` for the `Handle` variant, `None`
    /// for `File`.
    #[cfg(target_os = "linux")]
    fn file_handle(&self) -> Option<&FileHandle> {
        match self {
            InodeHandle::File(_) => None,
            InodeHandle::Handle(h) => Some(h.file_handle()),
        }
    }

    /// Best-effort `InodeFile` accessor that **does not handle** `Reopenable`.
    ///
    /// `Reopenable` callers must go through [`InodeData::get_file`] so the
    /// lazy-open path can run. This is here for the existing call sites that
    /// only ever see `File`/`Handle` variants (e.g. internal helpers) — they
    /// still compile by going through `InodeData::get_file` which forwards
    /// here for non-Reopenable variants.
    ///
    /// On a `Reopenable` handle this panics in debug builds and returns an
    /// error in release builds.
    fn get_file(&self) -> Result<InodeFile<'_>> {
        match self {
            InodeHandle::File(f) => Ok(InodeFile::Ref(f)),
            #[cfg(target_os = "linux")]
            InodeHandle::Handle(h) => {
                // Each call opens a new `O_PATH` file owned by the returned value.
                let f = h.open(libc::O_PATH)?;
                Ok(InodeFile::Owned(f))
            }
            #[cfg(target_os = "macos")]
            InodeHandle::Reopenable { .. } => {
                // Programmer error: every Reopenable access must go via
                // InodeData::get_file so we can drive the lazy-open path.
                #[cfg(debug_assertions)]
                panic!(
                    "InodeHandle::get_file called on Reopenable; \
                     use InodeData::get_file instead"
                );
                #[cfg(not(debug_assertions))]
                {
                    Err(io::Error::other(
                        "InodeHandle::get_file called on Reopenable; \
                         use InodeData::get_file instead",
                    ))
                }
            }
        }
    }

    /// Opens a new, caller-owned `File` for this inode with the given `flags`.
    ///
    /// `proc_self_fd` is only consulted for the `File` variant, which reopens
    /// its descriptor through `reopen_fd_through_proc`.
    fn open_file(&self, flags: libc::c_int, proc_self_fd: &File) -> Result<File> {
        match self {
            InodeHandle::File(f) => reopen_fd_through_proc(f, flags, proc_self_fd),
            #[cfg(target_os = "linux")]
            InodeHandle::Handle(h) => h.open(flags),
            #[cfg(target_os = "macos")]
            InodeHandle::Reopenable { state } => {
                // Open the backing path with the requested flags via the
                // symlink-aware helper. `state.path` is absolute, so the
                // dirfd is irrelevant. LRU bookkeeping happens one level up
                // in `InodeData::open_file`, which knows the inode number.
                // NOTE(review): the state mutex stays locked across the
                // `open(2)` below — confirm that is acceptable for slow paths.
                let mut guard = recover_std_mutex(state);
                let path = CString::new(guard.path.as_os_str().as_bytes())
                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
                let fd = lazy_open_path(&path, flags)?;
                // SAFETY: `lazy_open_path` returned `Ok`, so `fd` is a freshly
                // opened descriptor that nothing else owns yet.
                let f = unsafe { File::from_raw_fd(fd) };
                // Populate the cache opportunistically — only for plain
                // read-only opens, so we don't stash fds with surprising flags
                // (write, O_TRUNC, ...). We still need a separate fd to give
                // back to the caller (it expects ownership), so dup once at
                // cache-population time. Subsequent `get_file()` calls reuse
                // the cached `Arc<File>` with no further syscalls.
                // NOTE(review): a failing `try_clone()` fails the whole open
                // even though `f` itself was opened successfully.
                if guard.cached.is_none() && flags == libc::O_RDONLY {
                    guard.cached = Some(Arc::new(f.try_clone()?));
                }
                Ok(f)
            }
        }
    }

    /// Stats the inode; thin wrapper around `do_stat`.
    #[cfg(target_os = "linux")]
    fn stat(&self) -> Result<libc::stat64> {
        self.do_stat()
    }
    /// Stats the inode; thin wrapper around `do_stat`.
    #[cfg(target_os = "macos")]
    fn stat(&self) -> Result<libc::stat> {
        // On macOS, stat_fd returns libc::stat, which is the correct type.
        // No explicit cast from stat64 is needed if stat_fd is correctly implemented
        // to return the platform-specific stat struct.
        self.do_stat()
    }

    /// Linux: stats through an fd. `Handle` opens a temporary file via
    /// `get_file()` first.
    #[cfg(target_os = "linux")]
    fn do_stat(&self) -> Result<libc::stat64> {
        match self {
            InodeHandle::File(f) => stat_fd(f, None),
            InodeHandle::Handle(_h) => {
                let file = self.get_file()?;
                stat_fd(&file, None)
            }
        }
    }

    /// macOS: stats through the fd for `File`, and by path for `Reopenable`.
    #[cfg(target_os = "macos")]
    fn do_stat(&self) -> Result<libc::stat> {
        match self {
            InodeHandle::File(f) => stat_fd(f, None),
            InodeHandle::Reopenable { state } => {
                // Stat by path — no need to keep an fd around just for stat.
                // The guard lives only inside this block, so the state lock
                // is released before the syscall.
                let path = {
                    let guard = recover_std_mutex(state);
                    CString::new(guard.path.as_os_str().as_bytes())
                        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?
                };
                // `guard.path` is absolute, so AT_FDCWD is only a required
                // placeholder. AT_SYMLINK_NOFOLLOW matches the rest of the
                // passthrough's stat behavior.
                let mut st = std::mem::MaybeUninit::<libc::stat>::zeroed();
                // SAFETY: `path` is a valid NUL-terminated string that outlives
                // the call, and `st` points to writable storage for one
                // `libc::stat`.
                let res = unsafe {
                    libc::fstatat(
                        libc::AT_FDCWD,
                        path.as_ptr(),
                        st.as_mut_ptr(),
                        libc::AT_SYMLINK_NOFOLLOW,
                    )
                };
                if res != 0 {
                    return Err(io::Error::last_os_error());
                }
                // SAFETY: `fstatat` returned 0, and the buffer was
                // zero-initialised before the call.
                Ok(unsafe { st.assume_init() })
            }
        }
    }
}
509
/// Represents an inode in `PassthroughFs`.
#[derive(Debug)]
pub struct InodeData {
    /// Inode number; on macOS it is also the key used when registering with
    /// `LazyFdLru`.
    inode: Inode,
    // Most of these aren't actually files but ¯\_(ツ)_/¯.
    handle: InodeHandle,
    /// Alternate lookup key (see `InodeMap::get_alt_locked`, which looks
    /// entries up via `get_by_id`).
    id: InodeId,
    /// Reference count, seeded from the `refcount` argument of `InodeData::new`.
    refcount: AtomicU64,
    // File type and mode
    mode: u32,
    /// Birth time used as part of the `(ino, btime)` cache key for the
    /// Linux-only `handle_cache`. macOS doesn't construct file handles, so
    /// the field is captured for consistency but never read.
    #[cfg_attr(target_os = "macos", allow(dead_code))]
    btime: statx_timestamp,
}
526
/// macOS lazy-fd open helper.
///
/// Opens an absolute path produced by the lazy-fd cache (`Reopenable`) in up
/// to two attempts:
/// 1. with `O_NOFOLLOW`, which succeeds as long as the trailing component is
///    not a symlink;
/// 2. if that fails with `ELOOP`, again with `O_SYMLINK` so the link node
///    itself is opened (for subsequent `readlink`/xattr).
///
/// `O_CREAT` and `O_DIRECTORY` are stripped from `flags`, and `O_CLOEXEC` is
/// always set. The raw fd is returned; the caller wraps it in `File`.
///
/// **Why not `O_NOFOLLOW_ANY`** (PR-9.3 finding): `/tmp` on macOS is itself a
/// symlink to `/private/tmp`, so any cached path that went through `/tmp`
/// would fail with `ELOOP` under `O_NOFOLLOW_ANY`, and combining it with
/// `O_NOFOLLOW` returns `EINVAL` (the flags are mutually exclusive on
/// Darwin). Hardening against intermediate-symlink TOCTOU therefore needs a
/// different design — e.g. storing only realpath-resolved paths and then
/// using `O_NOFOLLOW_ANY` standalone. Tracked as future work in
/// `macos-support-matrix.md`.
///
/// The `O_NOFOLLOW` -> `O_SYMLINK` retry is an intentional compatibility
/// fallback with a narrow race if the entry is swapped between the two
/// opens. `cfg.root_dir` is canonicalized before lazy mode is enabled, so the
/// race is confined to entries under that root; removing it completely needs
/// the future `O_NOFOLLOW_ANY` design above.
#[cfg(target_os = "macos")]
fn lazy_open_path(path: &CStr, flags: libc::c_int) -> io::Result<libc::c_int> {
    let open_with = |oflags: libc::c_int| -> io::Result<libc::c_int> {
        // SAFETY: `path` is a valid NUL-terminated C string for the whole call.
        let fd = unsafe { libc::open(path.as_ptr(), oflags) };
        if fd < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(fd)
        }
    };

    // Strip flags that don't make sense for this opener.
    let base = (flags & !(libc::O_CREAT | libc::O_DIRECTORY)) | libc::O_CLOEXEC;

    match open_with(base | libc::O_NOFOLLOW) {
        // Trailing component is a symlink: open the link node itself.
        Err(err) if err.raw_os_error() == Some(libc::ELOOP) => {
            open_with((base & !libc::O_NOFOLLOW) | libc::O_SYMLINK)
        }
        outcome => outcome,
    }
}
573
574impl InodeData {
575    fn new(
576        inode: Inode,
577        f: InodeHandle,
578        refcount: u64,
579        id: InodeId,
580        mode: u32,
581        btime: statx_timestamp,
582    ) -> Self {
583        InodeData {
584            inode,
585            handle: f,
586            id,
587            refcount: AtomicU64::new(refcount),
588            mode,
589            btime,
590        }
591    }
592
593    fn get_file(&self) -> Result<InodeFile<'_>> {
594        #[cfg(target_os = "macos")]
595        if let InodeHandle::Reopenable { state } = &self.handle {
596            // Lazy-open path: lock state, ensure a cached `Arc<File>`, then
597            // hand out a refcount bump as `InodeFile::Arc`. No `dup(2)` per
598            // call — a single shared fd is reused while the inode is alive.
599            let mut guard = recover_std_mutex(state);
600            let mut touched_lru = None;
601            if guard.cached.is_none() {
602                let path = CString::new(guard.path.as_os_str().as_bytes())
603                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
604                // Use the symlink-aware helper so we don't follow links;
605                // mirrors `open_file_restricted` semantics.
606                let fd = lazy_open_path(&path, libc::O_RDONLY)?;
607                guard.cached = Some(Arc::new(unsafe { File::from_raw_fd(fd) }));
608                // Capture the LRU handle while we still hold the guard so
609                // we can register/promote after dropping the lock. Bumping
610                // `reopen_count` reflects this open (cache miss or post-
611                // eviction reopen).
612                touched_lru = guard.lazy_fd_lru.clone();
613            }
614            let arc = Arc::clone(guard.cached.as_ref().unwrap());
615            drop(guard);
616            if let Some(lru) = touched_lru {
617                lru.bump_reopen();
618                lru.touch(self.inode, Arc::downgrade(state));
619            }
620            return Ok(InodeFile::Arc(arc));
621        }
622        self.handle.get_file()
623    }
624
625    fn open_file(&self, flags: libc::c_int, proc_self_fd: &File) -> Result<File> {
626        let f = self.handle.open_file(flags, proc_self_fd)?;
627        // If `open_file` populated the lazy cache (RDONLY path), promote in
628        // the LRU on the inode's actual key. The Reopenable handle owns its
629        // own LRU backref, so we only need to consult it.
630        #[cfg(target_os = "macos")]
631        if let InodeHandle::Reopenable { state } = &self.handle {
632            let (had_cache, lru_opt) = {
633                let guard = recover_std_mutex(state);
634                (guard.cached.is_some(), guard.lazy_fd_lru.clone())
635            };
636            if had_cache && let Some(lru) = lru_opt {
637                // Open with non-RDONLY flags doesn't populate the cache,
638                // so we only `bump_reopen` for the path that actually
639                // refreshed cache. Touch is unconditional once we know
640                // the cache is populated — promotes in LRU.
641                if flags == libc::O_RDONLY {
642                    lru.bump_reopen();
643                }
644                lru.touch(self.inode, Arc::downgrade(state));
645            }
646        }
647        Ok(f)
648    }
649
650    /// macOS lazy-fd: replace the absolute path stored on a `Reopenable`
651    /// inode and invalidate the cached fd so the next `get_file()` reopens
652    /// at the new location. No-op for non-Reopenable handles.
653    ///
654    /// **Note**: an already-open `cached` fd survives `rename(2)` on POSIX,
655    /// but the path is needed if we ever lose the cache (eviction,
656    /// `update_lazy_path` itself, etc.). Keeping path and cache consistent
657    /// avoids stale-path bugs once an LRU eviction layer is added.
658    #[cfg(target_os = "macos")]
659    fn update_lazy_path(&self, new_path: PathBuf) {
660        if let InodeHandle::Reopenable { state } = &self.handle {
661            let mut guard = recover_std_mutex(state);
662            guard.path = new_path;
663            guard.cached = None;
664        }
665    }
666
667    /// Returns the absolute path of a `Reopenable` inode, if any. Used by
668    /// `do_lookup` to compute child paths and by rename to rebuild paths.
669    #[cfg(target_os = "macos")]
670    fn lazy_path(&self) -> Option<PathBuf> {
671        match &self.handle {
672            InodeHandle::Reopenable { state } => Some(recover_std_mutex(state).path.clone()),
673            _ => None,
674        }
675    }
676}
677
678/// Data structures to manage accessed inodes.
679struct InodeMap {
680    pub inodes: RwLock<InodeStore>,
681}
682
683impl InodeMap {
684    fn new() -> Self {
685        InodeMap {
686            inodes: RwLock::new(Default::default()),
687        }
688    }
689
690    async fn clear(&self) {
691        // Do not expect poisoned lock here, so safe to unwrap().
692        self.inodes.write().await.clear();
693    }
694
695    async fn get(&self, inode: Inode) -> Result<Arc<InodeData>> {
696        // Do not expect poisoned lock here, so safe to unwrap().
697        self.inodes
698            .read()
699            .await
700            .get(&inode)
701            .cloned()
702            .ok_or_else(ebadf)
703    }
704
705    fn get_inode_locked(
706        inodes: &InodeStore,
707        #[cfg_attr(target_os = "macos", allow(unused_variables))] handle: &InodeHandle,
708    ) -> Option<Inode> {
709        #[cfg(target_os = "linux")]
710        if let Some(h) = handle.file_handle() {
711            return inodes.inode_by_handle(h).copied();
712        }
713        #[cfg(target_os = "macos")]
714        let _ = inodes;
715        None
716    }
717
718    async fn get_alt(&self, id: &InodeId, handle: &InodeHandle) -> Option<Arc<InodeData>> {
719        // Do not expect poisoned lock here, so safe to unwrap().
720        let inodes = self.inodes.read().await;
721
722        Self::get_alt_locked(&inodes, id, handle)
723    }
724
725    fn get_alt_locked(
726        inodes: &InodeStore,
727        id: &InodeId,
728        #[cfg_attr(target_os = "macos", allow(unused_variables))] handle: &InodeHandle,
729    ) -> Option<Arc<InodeData>> {
730        // Linux: try the by-handle lookup first to detect inode-ID reuse.
731        // macOS: file handles don't exist, so by-id is the only key.
732        #[cfg(target_os = "linux")]
733        let by_handle = handle.file_handle().and_then(|h| inodes.get_by_handle(h));
734        #[cfg(target_os = "macos")]
735        let by_handle: Option<&Arc<InodeData>> = None;
736
737        by_handle
738            .or_else(|| {
739                inodes.get_by_id(id).filter(|_data| {
740                    // Linux only: when falling back to by-id, ensure we hit an
741                    // entry that does not have a file handle. Entries *with*
742                    // handles also have a handle alt key, so if we did not
743                    // find it by that key we must have found an entry for a
744                    // different file with a reused inode ID.
745                    #[cfg(target_os = "linux")]
746                    {
747                        _data.handle.file_handle().is_none()
748                    }
749                    #[cfg(target_os = "macos")]
750                    {
751                        true
752                    }
753                })
754            })
755            .cloned()
756    }
757
758    async fn insert(&self, data: Arc<InodeData>) {
759        let mut inodes = self.inodes.write().await;
760
761        Self::insert_locked(&mut inodes, data)
762    }
763
764    fn insert_locked(inodes: &mut InodeStore, data: Arc<InodeData>) {
765        inodes.insert(data);
766    }
767}
768
769struct HandleData {
770    inode: Inode,
771    file: File,
772    lock: Mutex<()>,
773    open_flags: AtomicU32,
774}
775
776impl HandleData {
777    fn new(inode: Inode, file: File, flags: u32) -> Self {
778        HandleData {
779            inode,
780            file,
781            lock: Mutex::new(()),
782            open_flags: AtomicU32::new(flags),
783        }
784    }
785
786    fn get_file(&self) -> &File {
787        &self.file
788    }
789
790    async fn get_file_mut(&self) -> (MutexGuard<'_, ()>, &File) {
791        (self.lock.lock().await, &self.file)
792    }
793
794    fn borrow_fd(&self) -> BorrowedFd<'_> {
795        self.file.as_fd()
796    }
797
798    async fn get_flags(&self) -> u32 {
799        self.open_flags.load(Ordering::Relaxed)
800    }
801
802    async fn set_flags(&self, flags: u32) {
803        self.open_flags.store(flags, Ordering::Relaxed);
804    }
805}
806
807struct HandleMap {
808    handles: RwLock<BTreeMap<Handle, Arc<HandleData>>>,
809}
810
811impl HandleMap {
812    fn new() -> Self {
813        HandleMap {
814            handles: RwLock::new(BTreeMap::new()),
815        }
816    }
817
818    async fn clear(&self) {
819        // Do not expect poisoned lock here, so safe to unwrap().
820        self.handles.write().await.clear();
821    }
822
823    async fn insert(&self, handle: Handle, data: HandleData) {
824        // Do not expect poisoned lock here, so safe to unwrap().
825        self.handles.write().await.insert(handle, Arc::new(data));
826    }
827
828    async fn release(&self, handle: Handle, inode: Inode) -> Result<()> {
829        // Do not expect poisoned lock here, so safe to unwrap().
830        let mut handles = self.handles.write().await;
831
832        if let btree_map::Entry::Occupied(e) = handles.entry(handle)
833            && e.get().inode == inode
834        {
835            // We don't need to close the file here because that will happen automatically when
836            // the last `Arc` is dropped.
837            e.remove();
838
839            return Ok(());
840        }
841
842        Err(ebadf())
843    }
844
845    async fn get(&self, handle: Handle, inode: Inode) -> Result<Arc<HandleData>> {
846        // Do not expect poisoned lock here, so safe to unwrap().
847        self.handles
848            .read()
849            .await
850            .get(&handle)
851            .filter(|hd| hd.inode == inode)
852            .cloned()
853            .ok_or_else(ebadf)
854    }
855}
856
/// Key into the per-fs `handle_cache` that maps `(host_inode, btime)` →
/// `Arc<FileHandle>`. Linux only; macOS doesn't construct file handles.
///
/// Field 0 is the host inode number (`st_ino`) and field 1 is the file's birth
/// time. Callers only build a key when the birth time is not all-zero.
#[cfg(target_os = "linux")]
#[derive(Debug, Hash, Eq, PartialEq)]
struct FileUniqueKey(u64, statx_timestamp);
862
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system.
///
/// To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
pub struct PassthroughFs<S: BitmapSlice + Send + Sync = ()> {
    // File descriptors for various points in the file system tree. On Linux these fds are opened
    // with the `O_PATH` option so they cannot be used for reading or writing any data. See the
    // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot
    // do with an fd opened with this flag. (On macOS the non-lazy path opens them `O_RDONLY`.)
    inode_map: InodeMap,
    // Next inode number to hand out when inode numbers are allocated sequentially.
    next_inode: AtomicU64,

    // File descriptors for open files and directories. Unlike the fds in `inode_map`, these _can_
    // be used for reading and writing data.
    handle_map: HandleMap,
    // Counter for handle numbers; starts at 1.
    next_handle: AtomicU64,

    // Used to generate unique inode numbers.
    ino_allocator: UniqueInodeGenerator,
    // Maps mount IDs to an open FD on the respective ID for the purpose of open_by_handle_at().
    #[cfg(target_os = "linux")]
    mount_fds: MountFds,

    // File descriptor pointing to the `/proc/self/fd` directory. This is used to convert an fd from
    // `inode_map` into one that can go into `handle_map`. This is accomplished by reading the
    // `/proc/self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are
    // meant to be serving doesn't have access to `/proc/self/fd`.
    proc_self_fd: File,

    // Whether writeback caching is enabled for this directory. This will only be true when
    // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
    writeback: AtomicBool,

    // Whether no_open is enabled.
    no_open: AtomicBool,

    // Whether no_opendir is enabled.
    no_opendir: AtomicBool,

    // Whether kill_priv_v2 is enabled.
    //killpriv_v2: AtomicBool,

    // Whether no_readdir is enabled.
    no_readdir: AtomicBool,

    // Whether seal_size is enabled.
    seal_size: AtomicBool,

    // Whether per-file DAX feature is enabled.
    // Init from guest kernel Init cmd of fuse fs.
    //perfile_dax: AtomicBool,

    // Entry/attribute timeouts applied to directories. `new` takes them from
    // `cfg.dir_entry_timeout` / `cfg.dir_attr_timeout`, falling back to `cfg.entry_timeout` /
    // `cfg.attr_timeout` when unset.
    dir_entry_timeout: Duration,
    dir_attr_timeout: Duration,

    // Configuration this instance was created with (after `new` reset conflicting options).
    cfg: Config,

    // Random (v4) UUID generated for this instance in `new`.
    _uuid: Uuid,

    phantom: PhantomData<S>,

    // Cache of file handles keyed by `(host inode, birth time)`.
    #[cfg(target_os = "linux")]
    handle_cache: Cache<FileUniqueKey, Arc<FileHandle>>,

    // Cache of mmap chunks, weighted by mapping length and bounded by `cfg.max_mmap_size`
    // (capacity 0 when `cfg.use_mmap` is false).
    mmap_chunks: Cache<MmapChunkKey, Arc<RwLock<mmap::MmapCachedValue>>>,

    /// LRU bounding the number of cached lazy-fd backing files. Some only
    /// when on macOS with `Config::macos_lazy_inode_fd == true`. Cloned
    /// into each `ReopenableState` so the lazy-open path can register
    /// without a backref to `self`.
    #[cfg(target_os = "macos")]
    lazy_fd_lru: Option<Arc<LazyFdLru>>,
}
938
939impl<S: BitmapSlice + Send + Sync> PassthroughFs<S> {
940    /// Create a Passthrough file system instance.
941    pub fn new(mut cfg: Config) -> Result<PassthroughFs<S>> {
942        if cfg.no_open && cfg.cache_policy != CachePolicy::Always {
943            warn!("passthroughfs: no_open only work with cache=always, reset to open mode");
944            cfg.no_open = false;
945        }
946        if cfg.writeback && cfg.cache_policy == CachePolicy::Never {
947            warn!(
948                "passthroughfs: writeback cache conflicts with cache=none, reset to no_writeback"
949            );
950            cfg.writeback = false;
951        }
952        #[cfg(target_os = "macos")]
953        if cfg.macos_lazy_inode_fd {
954            cfg.root_dir = std::fs::canonicalize(&cfg.root_dir)?;
955        }
956
957        // Safe because this is a constant value and a valid C string.
958        let proc_self_fd_cstr = unsafe { CStr::from_bytes_with_nul_unchecked(PROC_SELF_FD_CSTR) };
959
960        #[cfg(target_os = "linux")]
961        let flags = libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC;
962        #[cfg(target_os = "macos")]
963        let flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
964
965        let proc_self_fd = Self::open_file(&libc::AT_FDCWD, proc_self_fd_cstr, flags, 0)?;
966
967        let (dir_entry_timeout, dir_attr_timeout) =
968            match (cfg.dir_entry_timeout, cfg.dir_attr_timeout) {
969                (Some(e), Some(a)) => (e, a),
970                (Some(e), None) => (e, cfg.attr_timeout),
971                (None, Some(a)) => (cfg.entry_timeout, a),
972                (None, None) => (cfg.entry_timeout, cfg.attr_timeout),
973            };
974
975        #[cfg(target_os = "linux")]
976        let mount_fds = MountFds::new(None)?;
977
978        let fd_limit = raise_nofile_soft_limit(MIN_PASSTHROUGH_NOFILE_SOFT_LIMIT);
979
980        // macOS lazy-fd LRU: cap defaults to half the soft RLIMIT_NOFILE,
981        // floor of 1 (NonZeroUsize). `RESERVED_FILE_DESCRIPTORS` already
982        // accounts for ancillary fds (proc_self_fd, mount_fds, …), so we
983        // base the half-share on the post-reserve budget rather than the
984        // raw rlimit.
985        #[cfg(target_os = "macos")]
986        let lazy_fd_lru: Option<Arc<LazyFdLru>> = if cfg.macos_lazy_inode_fd {
987            let cap = match cfg.macos_lazy_fd_lru_max {
988                Some(n) => n,
989                None => {
990                    let auto = fd_limit.saturating_sub(RESERVED_FILE_DESCRIPTORS).max(2) / 2;
991                    NonZeroUsize::new(auto.try_into().unwrap_or(usize::MAX))
992                        .unwrap_or(NonZeroUsize::new(1).unwrap())
993                }
994            };
995            Some(Arc::new(LazyFdLru::new(cap)))
996        } else {
997            None
998        };
999
1000        let max_mmap_size = if cfg.use_mmap { cfg.max_mmap_size } else { 0 };
1001
1002        let mmap_cache_builder = Cache::builder()
1003            .max_capacity(max_mmap_size)
1004            .weigher(
1005                |_key: &MmapChunkKey, value: &Arc<RwLock<mmap::MmapCachedValue>>| -> u32 {
1006                    let guard = block_on(value.read());
1007                    match &*guard {
1008                        MmapCachedValue::Mmap(mmap) => mmap.len() as u32,
1009                        MmapCachedValue::MmapMut(mmap_mut) => mmap_mut.len() as u32,
1010                    }
1011                },
1012            )
1013            .time_to_idle(Duration::from_millis(60));
1014
1015        Ok(PassthroughFs {
1016            inode_map: InodeMap::new(),
1017            next_inode: AtomicU64::new(ROOT_ID + 1),
1018            ino_allocator: UniqueInodeGenerator::new(),
1019
1020            handle_map: HandleMap::new(),
1021            next_handle: AtomicU64::new(1),
1022
1023            #[cfg(target_os = "linux")]
1024            mount_fds,
1025            proc_self_fd,
1026
1027            writeback: AtomicBool::new(false),
1028            no_open: AtomicBool::new(false),
1029            no_opendir: AtomicBool::new(false),
1030            //killpriv_v2: AtomicBool::new(false),
1031            no_readdir: AtomicBool::new(cfg.no_readdir),
1032            seal_size: AtomicBool::new(cfg.seal_size),
1033            //perfile_dax: AtomicBool::new(false),
1034            dir_entry_timeout,
1035            dir_attr_timeout,
1036            cfg,
1037
1038            _uuid: Uuid::new_v4(),
1039
1040            phantom: PhantomData,
1041
1042            #[cfg(target_os = "linux")]
1043            handle_cache: moka::future::Cache::new(
1044                fd_limit.saturating_sub(RESERVED_FILE_DESCRIPTORS).max(1),
1045            ),
1046
1047            mmap_chunks: mmap_cache_builder.build(),
1048
1049            #[cfg(target_os = "macos")]
1050            lazy_fd_lru,
1051        })
1052    }
1053
1054    /// macOS only: snapshot of the lazy-fd reopen counter. Returns `None`
1055    /// when the lazy-fd path is disabled.
1056    #[cfg(target_os = "macos")]
1057    pub fn macos_lazy_fd_reopen_count(&self) -> Option<u64> {
1058        self.lazy_fd_lru.as_ref().map(|l| l.reopen_count())
1059    }
1060
1061    /// macOS only: current number of cached lazy fds (bounded by the LRU
1062    /// cap). Returns `None` when the lazy-fd path is disabled.
1063    #[cfg(target_os = "macos")]
1064    pub fn macos_lazy_fd_cache_len(&self) -> Option<usize> {
1065        self.lazy_fd_lru.as_ref().map(|l| l.len())
1066    }
1067
1068    /// macOS only: configured cap of the lazy-fd LRU. Returns `None` when
1069    /// the lazy-fd path is disabled.
1070    #[cfg(target_os = "macos")]
1071    pub fn macos_lazy_fd_cap(&self) -> Option<usize> {
1072        self.lazy_fd_lru.as_ref().map(|l| l.cap())
1073    }
1074
1075    /// Resolve `inode` to the absolute host filesystem path that backs it.
1076    ///
1077    /// On macOS in lazy mode this returns the cached `Reopenable.path`
1078    /// directly (no syscall); otherwise it resolves via `F_GETPATH` on the
1079    /// inode's eager fd. On Linux it resolves via `readlink(/proc/self/fd/N)`
1080    /// using the same `fd_path_cstr` helper.
1081    ///
1082    /// Returns `None` if the inode is unknown or can't be resolved (e.g.
1083    /// already forgotten). Used by overlayfs/unionfs `copy_regfile_up` for
1084    /// the cross-layer APFS clone fast path; safe for any caller that
1085    /// needs a host-fs path for a tracked inode.
1086    pub async fn passthrough_host_path(&self, inode: Inode) -> Option<PathBuf> {
1087        let data = self.inode_map.get(inode).await.ok()?;
1088        #[cfg(target_os = "macos")]
1089        if let Some(p) = data.lazy_path() {
1090            return Some(p);
1091        }
1092        let file = data.get_file().ok()?;
1093        let cstr = util::fd_path_cstr(file.as_raw_fd()).ok()?;
1094        Some(PathBuf::from(std::ffi::OsStr::from_bytes(cstr.to_bytes())))
1095    }
1096
1097    /// Initialize the Passthrough file system.
1098    pub async fn import(&self) -> Result<()> {
1099        let root = CString::new(self.cfg.root_dir.as_os_str().as_bytes())
1100            .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?;
1101
1102        let (handle, st) = Self::open_file_and_handle(
1103            self,
1104            &libc::AT_FDCWD,
1105            &root,
1106            #[cfg(target_os = "macos")]
1107            Some(self.cfg.root_dir.clone()),
1108        )
1109        .await
1110        .map_err(|e| {
1111            error!("fuse: import: failed to get file or handle: {e:?}");
1112
1113            e
1114        })?;
1115
1116        let id = InodeId::from_stat(&st);
1117
1118        // Safe because this doesn't modify any memory and there is no need to check the return
1119        // value because this system call always succeeds. We need to clear the umask here because
1120        // we want the client to be able to set all the bits in the mode.
1121        unsafe { libc::umask(0o000) };
1122
1123        // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
1124        self.inode_map
1125            .insert(Arc::new(InodeData::new(
1126                ROOT_ID,
1127                handle,
1128                2,
1129                id,
1130                st.st.st_mode.into(),
1131                st.btime
1132                    .ok_or_else(|| io::Error::other("birth time not available"))?,
1133            )))
1134            .await;
1135
1136        Ok(())
1137    }
1138
1139    /// Get the list of file descriptors which should be reserved across live upgrade.
1140    pub fn keep_fds(&self) -> Vec<RawFd> {
1141        vec![self.proc_self_fd.as_raw_fd()]
1142    }
1143
    /// Read-only borrow of the configuration. Used by Layer impls and other
    /// integrations that need to consult fields like `whiteout_format`.
    ///
    /// The returned value is the configuration as stored by `new`, i.e. after `new` has
    /// reset any conflicting options.
    pub fn config(&self) -> &Config {
        &self.cfg
    }
1149
1150    fn readlinkat(dfd: i32, pathname: &CStr) -> Result<PathBuf> {
1151        let mut buf = Vec::with_capacity(libc::PATH_MAX as usize);
1152
1153        // Safe because the kernel will only write data to buf and we check the return value
1154        let buf_read = unsafe {
1155            libc::readlinkat(
1156                dfd,
1157                pathname.as_ptr(),
1158                buf.as_mut_ptr() as *mut libc::c_char,
1159                buf.capacity(),
1160            )
1161        };
1162        if buf_read < 0 {
1163            error!("fuse: readlinkat error");
1164            return Err(Error::last_os_error());
1165        }
1166
1167        // Safe because we trust the value returned by kernel.
1168        unsafe { buf.set_len(buf_read as usize) };
1169        buf.shrink_to_fit();
1170
1171        // Be careful:
1172        // - readlink() does not append a terminating null byte to buf
1173        // - OsString instances are not NUL terminated
1174        Ok(PathBuf::from(OsString::from_vec(buf)))
1175    }
1176
1177    /// Get the file pathname corresponding to the Inode
1178    /// This function is used by Nydus blobfs
1179    pub async fn readlinkat_proc_file(&self, inode: Inode) -> Result<PathBuf> {
1180        let data = self.inode_map.get(inode).await?;
1181        let file = data.get_file()?;
1182        let pathname = CString::new(format!("{}", file.as_raw_fd()))
1183            .map_err(|e| Error::new(io::ErrorKind::InvalidData, e))?;
1184
1185        Self::readlinkat(self.proc_self_fd.as_raw_fd(), &pathname)
1186    }
1187
1188    fn create_file_excl(
1189        dir: &impl AsRawFd,
1190        pathname: &CStr,
1191        flags: i32,
1192        mode: u32,
1193    ) -> io::Result<Option<File>> {
1194        match openat(dir, pathname, flags | libc::O_CREAT | libc::O_EXCL, mode) {
1195            Ok(file) => Ok(Some(file)),
1196            Err(err) => {
1197                // Ignore the error if the file exists and O_EXCL is not present in `flags`.
1198                if err.kind() == io::ErrorKind::AlreadyExists {
1199                    if (flags & libc::O_EXCL) != 0 {
1200                        return Err(err);
1201                    }
1202                    return Ok(None);
1203                }
1204                Err(err)
1205            }
1206        }
1207    }
1208
1209    fn open_file(dfd: &impl AsRawFd, pathname: &CStr, flags: i32, mode: u32) -> io::Result<File> {
1210        openat(dfd, pathname, flags, mode)
1211    }
1212
1213    fn open_file_restricted(
1214        &self,
1215        dir: &impl AsRawFd,
1216        pathname: &CStr,
1217        flags: i32,
1218        mode: u32,
1219    ) -> io::Result<File> {
1220        let flags = libc::O_NOFOLLOW | libc::O_CLOEXEC | flags;
1221
1222        // TODO
1223        //if self.os_facts.has_openat2 {
1224        //    oslib::do_open_relative_to(dir, pathname, flags, mode)
1225        //} else {
1226        #[cfg(target_os = "macos")]
1227        {
1228            match openat(dir, pathname, flags, mode) {
1229                Err(err) if err.raw_os_error() == Some(libc::ELOOP) => {
1230                    let symlink_flags = (flags & !libc::O_NOFOLLOW) | libc::O_SYMLINK;
1231                    openat(dir, pathname, symlink_flags, mode)
1232                }
1233                result => result,
1234            }
1235        }
1236        #[cfg(not(target_os = "macos"))]
1237        {
1238            openat(dir, pathname, flags, mode)
1239        }
1240        //}
1241    }
1242
1243    /// Create a File or File Handle for `name` under directory `dir_fd` to support `lookup()`.
1244    async fn open_file_and_handle(
1245        &self,
1246        dir: &impl AsRawFd,
1247        name: &CStr,
1248        #[cfg(target_os = "macos")] lazy_abs_path: Option<PathBuf>,
1249    ) -> io::Result<(InodeHandle, StatExt)> {
1250        // macOS lazy-fd path: stat by name (no fd held) and return a
1251        // `Reopenable` handle that opens lazily on first I/O. This is the
1252        // mechanism that lets entry/attr cache TTLs go above zero on macOS.
1253        #[cfg(target_os = "macos")]
1254        if self.cfg.macos_lazy_inode_fd
1255            && let Some(abs_path) = lazy_abs_path
1256        {
1257            let st = statx::statx(dir, Some(name))?;
1258            return Ok((
1259                InodeHandle::Reopenable {
1260                    state: Arc::new(StdMutex::new(ReopenableState {
1261                        path: abs_path,
1262                        cached: None,
1263                        lazy_fd_lru: self.lazy_fd_lru.clone(),
1264                    })),
1265                },
1266                st,
1267            ));
1268        }
1269
1270        #[cfg(target_os = "linux")]
1271        {
1272            let path_file = self.open_file_restricted(dir, name, libc::O_PATH, 0)?;
1273            let st = statx::statx(&path_file, None)?;
1274
1275            let btime_is_valid = match st.btime {
1276                Some(ts) => ts.tv_sec != 0 || ts.tv_nsec != 0,
1277                None => false,
1278            };
1279
1280            if btime_is_valid {
1281                let key = FileUniqueKey(st.st.st_ino, st.btime.unwrap());
1282                let cache = self.handle_cache.clone();
1283                if let Some(h) = cache.get(&key).await {
1284                    let openable = self.to_openable_handle(h)?;
1285                    Ok((InodeHandle::Handle(openable), st))
1286                } else if let Some(handle_from_fd) = FileHandle::from_fd(&path_file)? {
1287                    let handle_arc = Arc::new(handle_from_fd);
1288                    cache.insert(key, Arc::clone(&handle_arc)).await;
1289                    let openable = self.to_openable_handle(handle_arc)?;
1290                    Ok((InodeHandle::Handle(openable), st))
1291                } else {
1292                    Ok((InodeHandle::File(path_file), st))
1293                }
1294            } else if let Some(handle_from_fd) = FileHandle::from_fd(&path_file)? {
1295                let handle_arc = Arc::new(handle_from_fd);
1296                let openable = self.to_openable_handle(handle_arc)?;
1297                Ok((InodeHandle::Handle(openable), st))
1298            } else {
1299                Ok((InodeHandle::File(path_file), st))
1300            }
1301        }
1302        #[cfg(target_os = "macos")]
1303        {
1304            // macOS without lazy mode: pin an `O_RDONLY` fd as
1305            // `InodeHandle::File`. file-handle path is unreachable since the
1306            // syscalls don't exist on Darwin.
1307            let path_file = self.open_file_restricted(dir, name, libc::O_RDONLY, 0)?;
1308            let st = statx::statx(&path_file, None)?;
1309            Ok((InodeHandle::File(path_file), st))
1310        }
1311    }
1312
1313    #[cfg(target_os = "linux")]
1314    fn to_openable_handle(&self, fh: Arc<FileHandle>) -> io::Result<Arc<OpenableFileHandle>> {
1315        (*Arc::as_ref(&fh))
1316            .clone()
1317            .into_openable(&self.mount_fds, |fd, flags, _mode| {
1318                reopen_fd_through_proc(&fd, flags, &self.proc_self_fd)
1319            })
1320            .map(Arc::new)
1321            .map_err(|e| {
1322                if !e.silent() {
1323                    error!("{e}");
1324                }
1325                e.into_inner()
1326            })
1327    }
1328
1329    async fn allocate_inode(
1330        &self,
1331        inodes: &InodeStore,
1332        id: &InodeId,
1333        handle: &InodeHandle,
1334    ) -> io::Result<Inode> {
1335        if !self.cfg.use_host_ino {
1336            // If the inode has already been assigned before, the new inode is not reassigned,
1337            // ensuring that the same file is always the same inode
1338            match InodeMap::get_inode_locked(inodes, handle) {
1339                Some(a) => Ok(a),
1340                None => Ok(self.next_inode.fetch_add(1, Ordering::Relaxed)),
1341            }
1342        } else {
1343            let inode = if id.ino > MAX_HOST_INO {
1344                // Prefer looking for previous mappings from memory
1345                match InodeMap::get_inode_locked(inodes, handle) {
1346                    Some(ino) => ino,
1347                    None => self.ino_allocator.get_unique_inode(id)?,
1348                }
1349            } else {
1350                self.ino_allocator.get_unique_inode(id)?
1351            };
1352            // trace!("fuse: allocate inode: {} for id: {:?}", inode, id);
1353            Ok(inode)
1354        }
1355    }
1356
    /// Look up `name` inside directory inode `parent` and build the FUSE entry reply.
    ///
    /// Steps:
    /// 1. Open (or stat) the child through `open_file_and_handle`.
    /// 2. If the inode map already tracks the file, bump its refcount; otherwise allocate
    ///    an inode number and insert a new entry with refcount 1 under the map's write lock.
    /// 3. Return the converted attributes with `ino` set to the chosen inode, uid/gid passed
    ///    through `cfg.mapping.find_mapping`, and the entry timeout (directory-specific for
    ///    directories) as `ttl`.
    ///
    /// Errors from resolving the parent, opening the child, or allocating an inode are
    /// propagated; exceeding `VFS_MAX_INO` or a missing birth time on first insertion is
    /// reported as an I/O error.
    async fn do_lookup(
        &self,
        parent: Inode,
        name: &CStr,
    ) -> std::result::Result<ReplyEntry, Errno> {
        // At the root, a name matching `PARENT_DIR_CSTR` is replaced by `CURRENT_DIR_CSTR`,
        // presumably so that ".." cannot escape the served tree (constants defined elsewhere).
        let name = if parent == ROOT_ID && name.to_bytes_with_nul().starts_with(PARENT_DIR_CSTR) {
            // Safe as this is a constant value and a valid C string.
            CStr::from_bytes_with_nul(CURRENT_DIR_CSTR).unwrap()
        } else {
            name
        };

        let dir = self.inode_map.get(parent).await?;
        let dir_file = dir.get_file()?;

        // macOS lazy mode: child path = parent's path joined with `name`.
        // The parent must itself be Reopenable (lazy mode applies to the whole
        // FS), so `lazy_path()` returns Some.
        #[cfg(target_os = "macos")]
        let lazy_abs_path = if self.cfg.macos_lazy_inode_fd {
            dir.lazy_path().map(|parent_path| {
                let name_os = std::ffi::OsStr::from_bytes(name.to_bytes());
                parent_path.join(name_os)
            })
        } else {
            None
        };

        let (inode_handle, st) = self
            .open_file_and_handle(
                &dir_file,
                name,
                #[cfg(target_os = "macos")]
                lazy_abs_path,
            )
            .await?;
        let id = InodeId::from_stat(&st);
        debug!(
            "do_lookup: parent: {}, name: {}, handle: {:?}, id: {:?}",
            parent,
            name.to_string_lossy(),
            inode_handle,
            id
        );

        // Fast path: try to take a reference on an already-tracked inode without holding
        // the map's write lock.
        let mut found = None;
        'search: loop {
            match self.inode_map.get_alt(&id, &inode_handle).await {
                // No existing entry found
                None => break 'search,
                Some(data) => {
                    let curr = data.refcount.load(Ordering::Acquire);
                    // forget_one() has just destroyed the entry, retry...
                    if curr == 0 {
                        continue 'search;
                    }

                    // Saturating add to avoid integer overflow, it's not realistic to saturate u64.
                    let new = curr.saturating_add(1);

                    // Synchronizes with forget_one(). If the exchange fails the refcount
                    // changed underneath us, so the loop looks the entry up again.
                    if data
                        .refcount
                        .compare_exchange(curr, new, Ordering::AcqRel, Ordering::Acquire)
                        .is_ok()
                    {
                        found = Some(data.inode);
                        break;
                    }
                }
            }
        }

        let inode = if let Some(v) = found {
            v
        } else {
            // The write guard covers both get_alt_locked() and insert_locked() to avoid race
            // conditions.
            let mut inodes = self.inode_map.inodes.write().await;

            // Lookup inode_map again after acquiring the inode_map lock, as there might be another
            // racing thread already added an inode with the same id while we're not holding
            // the lock. If so just use the newly added inode, otherwise the inode will be replaced
            // and results in EBADF.
            // trace!("FS {} looking up inode for id: {:?} with handle: {:?}", self.uuid, id, handle);
            match InodeMap::get_alt_locked(&inodes, &id, &inode_handle) {
                Some(data) => {
                    // An inode was added concurrently while we did not hold a lock on
                    // `self.inode_map`, so we use that instead. `inode_handle` will be dropped.
                    // trace!("FS {} found existing inode: {}", self.uuid, data.inode);
                    data.refcount.fetch_add(1, Ordering::Relaxed);
                    data.inode
                }
                None => {
                    let inode = self.allocate_inode(&inodes, &id, &inode_handle).await?;
                    // trace!("FS {} allocated new inode: {} for id: {:?}", self.uuid, inode, id);

                    if inode > VFS_MAX_INO {
                        error!("fuse: max inode number reached: {VFS_MAX_INO}");
                        return Err(io::Error::other(format!(
                            "max inode number reached: {VFS_MAX_INO}"
                        ))
                        .into());
                    }

                    // New entry starts with a refcount of 1 for this lookup.
                    InodeMap::insert_locked(
                        inodes.deref_mut(),
                        Arc::new(InodeData::new(
                            inode,
                            inode_handle,
                            1,
                            id,
                            st.st.st_mode.into(),
                            st.btime
                                .ok_or_else(|| io::Error::other("birth time not available"))?,
                        )),
                    );

                    inode
                }
            }
        };

        // Only the entry timeout is used in the reply; the attr timeout is discarded here.
        let (entry_timeout, _) = if is_dir(st.st.st_mode.into()) {
            (self.dir_entry_timeout, self.dir_attr_timeout)
        } else {
            (self.cfg.entry_timeout, self.cfg.attr_timeout)
        };

        // // Whether to enable file DAX according to the value of dax_file_size
        // let mut attr_flags: u32 = 0;
        // if let Some(dax_file_size) = self.cfg.dax_file_size {
        //     // st.stat.st_size is i64
        //     if self.perfile_dax.load().await
        //         && st.st.st_size >= 0x0
        //         && st.st.st_size as u64 >= dax_file_size
        //     {
        //         attr_flags |= FUSE_ATTR_DAX;
        //     }
        // }
        let mut attr_temp = convert_stat64_to_file_attr(st.st);
        // Report the inode number chosen above rather than the one from the stat result.
        attr_temp.ino = inode;
        attr_temp.uid = self.cfg.mapping.find_mapping(attr_temp.uid, true, true);
        attr_temp.gid = self.cfg.mapping.find_mapping(attr_temp.gid, true, false);
        Ok(ReplyEntry {
            ttl: entry_timeout,
            attr: attr_temp,
            generation: 0,
        })
    }
1506
1507    async fn forget_one(&self, inodes: &mut InodeStore, inode: Inode, count: u64) {
1508        // ROOT_ID should not be forgotten, or we're not able to access to files any more.
1509        if inode == ROOT_ID {
1510            return;
1511        }
1512
1513        if let Some(data) = inodes.get(&inode) {
1514            // Acquiring the write lock on the inode map prevents new lookups from incrementing the
1515            // refcount but there is the possibility that a previous lookup already acquired a
1516            // reference to the inode data and is in the process of updating the refcount so we need
1517            // to loop here until we can decrement successfully.
1518            loop {
1519                let curr = data.refcount.load(Ordering::Acquire);
1520
1521                // Saturating sub because it doesn't make sense for a refcount to go below zero and
1522                // we don't want misbehaving clients to cause integer overflow.
1523                let new = curr.saturating_sub(count);
1524
1525                // Synchronizes with the acquire load in `do_lookup`.
1526                if data
1527                    .refcount
1528                    .compare_exchange(curr, new, Ordering::AcqRel, Ordering::Acquire)
1529                    .is_ok()
1530                {
1531                    if new == 0 {
1532                        #[cfg(target_os = "linux")]
1533                        if data.handle.file_handle().is_some()
1534                            && (data.btime.tv_sec != 0 || data.btime.tv_nsec != 0)
1535                        {
1536                            let key = FileUniqueKey(data.id.ino, data.btime);
1537                            let cache = self.handle_cache.clone();
1538                            cache.invalidate(&key).await;
1539                        }
1540                        // Drop any LRU entry tracking this inode's lazy fd
1541                        // so capacity stays accurate. The cached `Arc<File>`
1542                        // (if any) drops with the `InodeData` once we
1543                        // remove from the map below.
1544                        #[cfg(target_os = "macos")]
1545                        if let Some(lru) = self.lazy_fd_lru.as_ref() {
1546                            lru.remove(inode);
1547                        }
1548                        // We just removed the last refcount for this inode.
1549                        // The allocated inode number should be kept in the map when use_host_ino
1550                        // is false or host inode(don't use the virtual 56bit inode) is bigger than MAX_HOST_INO.
1551                        let keep_mapping = !self.cfg.use_host_ino || data.id.ino > MAX_HOST_INO;
1552                        inodes.remove(&inode, keep_mapping);
1553                    }
1554                    break;
1555                }
1556            }
1557        }
1558    }
1559
    /// Release `handle`, which must have been opened for `inode`; delegates to
    /// `HandleMap::release`, which fails with `EBADF` when the pair does not match a
    /// registered handle.
    async fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
        self.handle_map.release(handle, inode).await
    }
1563
1564    // Validate a path component, same as the one in vfs layer, but only do the validation if this
1565    // passthroughfs is used without vfs layer, to avoid double validation.
1566    fn validate_path_component(&self, name: &CStr) -> io::Result<()> {
1567        // !self.cfg.do_import means we're under vfs, and vfs has already done the validation
1568        if !self.cfg.do_import {
1569            return Ok(());
1570        }
1571        validate_path_component(name)
1572    }
1573
    // TODO: When seal_size is set, reject operations that could change the file size or allocate
    // space beyond EOF.
1576    // fn seal_size_check(
1577    //     &self,
1578    //     opcode: Opcode,
1579    //     file_size: u64,
1580    //     offset: u64,
1581    //     size: u64,
1582    //     mode: i32,
1583    // ) -> io::Result<()> {
1584    //     if offset.checked_add(size).is_none() {
1585    //         error!(
1586    //             "fuse: {:?}: invalid `offset` + `size` ({}+{}) overflows u64::MAX",
1587    //             opcode, offset, size
1588    //         );
1589    //         return Err(einval());
1590    //     }
1591
1592    //     match opcode {
1593    //         // write should not exceed the file size.
1594    //         Opcode::Write => {
1595    //             if size + offset > file_size {
1596    //                 return Err(eperm());
1597    //             }
1598    //         }
1599
1600    //         Opcode::Fallocate => {
1601    //             let op = mode & !(libc::FALLOC_FL_KEEP_SIZE | libc::FALLOC_FL_UNSHARE_RANGE);
1602    //             match op {
1603    //                 // Allocate, punch and zero, must not change file size.
1604    //                 0 | libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_ZERO_RANGE => {
1605    //                     if size + offset > file_size {
1606    //                         return Err(eperm());
1607    //                     }
1608    //                 }
1609    //                 // collapse and insert will change file size, forbid.
1610    //                 libc::FALLOC_FL_COLLAPSE_RANGE | libc::FALLOC_FL_INSERT_RANGE => {
1611    //                     return Err(eperm());
1612    //                 }
1613    //                 // Invalid operation
1614    //                 _ => return Err(einval()),
1615    //             }
1616    //         }
1617
1618    //         // setattr operation should be handled in setattr handler.
1619    //         _ => return Err(enosys()),
1620    //     }
1621
1622    //     Ok(())
1623    // }
1624
1625    async fn get_writeback_open_flags(&self, flags: i32) -> i32 {
1626        let mut new_flags = flags;
1627        let writeback = self.writeback.load(Ordering::Relaxed);
1628
1629        // When writeback caching is enabled, the kernel may send read requests even if the
1630        // userspace program opened the file write-only. So we need to ensure that we have opened
1631        // the file for reading as well as writing.
1632        if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
1633            new_flags &= !libc::O_ACCMODE;
1634            new_flags |= libc::O_RDWR;
1635        }
1636
1637        // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
1638        // However, this breaks atomicity as the file may have changed on disk, invalidating the
1639        // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
1640        // the file. Just allow this for now as it is the user's responsibility to enable writeback
1641        // caching only for directories that are not shared. It also means that we need to clear the
1642        // `O_APPEND` flag.
1643        if writeback && flags & libc::O_APPEND != 0 {
1644            new_flags &= !libc::O_APPEND;
1645        }
1646
1647        new_flags
1648    }
1649
1650    async fn get_mmap(
1651        &self,
1652        inode: Inode,
1653        offset: u64,
1654        file: &File,
1655    ) -> Option<(Arc<RwLock<mmap::MmapCachedValue>>, u64)> {
1656        let file_size = file.metadata().unwrap().len();
1657        let key = MmapChunkKey::new(inode, offset, file_size);
1658        let aligned_offset = key.aligned_offset;
1659
1660        if let Some(cached) = self.mmap_chunks.get(&key).await {
1661            let guard = cached.read().await;
1662            let cache_len = match &*guard {
1663                MmapCachedValue::Mmap(mmap) => mmap.len() as u64,
1664                MmapCachedValue::MmapMut(mmap_mut) => mmap_mut.len() as u64,
1665            };
1666            if offset < key.aligned_offset + cache_len {
1667                return Some((cached.clone(), key.aligned_offset));
1668            }
1669        }
1670
1671        let mmap = match mmap::create_mmap(offset, file).await {
1672            Ok(v) => v,
1673            Err(e) => {
1674                error!("Failed to create mmap:{e}");
1675                return None;
1676            }
1677        };
1678        self.mmap_chunks.insert(key, mmap.clone()).await;
1679        Some((mmap, aligned_offset))
1680    }
1681
1682    async fn read_from_mmap(
1683        &self,
1684        inode: Inode,
1685        offset: u64,
1686        size: u64,
1687        file: &File,
1688        buf: &mut [u8],
1689    ) -> Result<usize> {
1690        // check the buf size
1691        if buf.len() < size as usize {
1692            return Err(std::io::Error::new(
1693                std::io::ErrorKind::InvalidInput,
1694                format!("Buffer too small: {} < {}", buf.len(), size),
1695            ));
1696        }
1697
1698        let file_size = file.metadata()?.len();
1699
1700        // check the offset
1701        if offset >= file_size {
1702            return Ok(0); // offset exceeds file size, return 0 bytes read
1703        }
1704
1705        // compute the maximum readable length
1706        let max_readable = file_size - offset;
1707        let actual_size = cmp::min(size, max_readable) as usize;
1708
1709        let mut len = actual_size;
1710        let mut current_offset = offset;
1711        let mut buf_offset = 0;
1712
1713        while len > 0 {
1714            let (chunk, chunk_start_offset) = match self.get_mmap(inode, current_offset, file).await
1715            {
1716                Some((chunk, aligned_offset)) => (chunk, aligned_offset),
1717                None => {
1718                    return Err(std::io::Error::other("Failed to get mmap chunk"));
1719                }
1720            };
1721
1722            let chunk_guard = chunk.read().await;
1723            match &*chunk_guard {
1724                MmapCachedValue::Mmap(mmap) => {
1725                    let chunk_len = mmap.len();
1726
1727                    // compute the start offset within the chunk using cached alignment
1728                    let copy_start = (current_offset - chunk_start_offset) as usize;
1729
1730                    // ensure we don't read beyond the chunk boundary
1731                    let remaining_in_chunk = chunk_len - copy_start;
1732                    let copy_len = cmp::min(len, remaining_in_chunk);
1733
1734                    // ensure we don't read beyond the buffer boundary
1735                    let copy_len = cmp::min(copy_len, buf.len() - buf_offset);
1736
1737                    if copy_len == 0 {
1738                        break; // no more data to read
1739                    }
1740
1741                    // execute data copy
1742                    buf[buf_offset..buf_offset + copy_len]
1743                        .copy_from_slice(&mmap[copy_start..copy_start + copy_len]);
1744
1745                    buf_offset += copy_len;
1746                    len -= copy_len;
1747                    current_offset += copy_len as u64;
1748                }
1749                MmapCachedValue::MmapMut(mmap_mut) => {
1750                    let chunk_len = mmap_mut.len();
1751
1752                    // compute the start offset within the chunk using cached alignment
1753                    let copy_start = (current_offset - chunk_start_offset) as usize;
1754
1755                    // ensure we don't read beyond the chunk boundary
1756                    let remaining_in_chunk = chunk_len - copy_start;
1757                    let copy_len = cmp::min(len, remaining_in_chunk);
1758
1759                    // ensure we don't read beyond the buffer boundary
1760                    let copy_len = cmp::min(copy_len, buf.len() - buf_offset);
1761
1762                    if copy_len == 0 {
1763                        break; // no more data to read
1764                    }
1765
1766                    // execute data copy
1767                    buf[buf_offset..buf_offset + copy_len]
1768                        .copy_from_slice(&mmap_mut[copy_start..copy_start + copy_len]);
1769
1770                    buf_offset += copy_len;
1771                    len -= copy_len;
1772                    current_offset += copy_len as u64;
1773                }
1774            }
1775        }
1776        Ok(buf_offset)
1777    }
1778
1779    async fn write_to_mmap(
1780        &self,
1781        inode: Inode,
1782        offset: u64,
1783        data: &[u8],
1784        file: &File,
1785    ) -> Result<usize> {
1786        let file_size = file.metadata()?.len();
1787        let len = data.len();
1788
1789        // If the file needs to be extended, do so
1790        if offset + len as u64 > file_size {
1791            let raw_fd = file.as_raw_fd();
1792            let res = unsafe { libc::ftruncate(raw_fd, (offset + len as u64) as i64) };
1793
1794            if res < 0 {
1795                return Err(std::io::Error::other("error to ftruncate"));
1796            }
1797
1798            self.invalidate_mmap_cache(inode, file_size).await;
1799        }
1800
1801        let mut remaining = len;
1802        let mut current_offset = offset;
1803        let mut data_offset = 0;
1804
1805        while remaining > 0 {
1806            let (chunk, chunk_start_offset) = match self.get_mmap(inode, current_offset, file).await
1807            {
1808                Some((chunk, aligned_offset)) => (chunk, aligned_offset),
1809                None => {
1810                    return Err(std::io::Error::other("Failed to get mmap chunk"));
1811                }
1812            };
1813
1814            let mut chunk_guard = chunk.write().await;
1815            match &mut *chunk_guard {
1816                MmapCachedValue::Mmap(_) => {
1817                    return Err(std::io::Error::new(
1818                        std::io::ErrorKind::PermissionDenied,
1819                        "Cannot write to read-only mmap",
1820                    ));
1821                }
1822                MmapCachedValue::MmapMut(mmap_mut) => {
1823                    let chunk_len = mmap_mut.len();
1824
1825                    // Calculate the start position of the current chunk using cached alignment
1826                    let copy_start = (current_offset - chunk_start_offset) as usize;
1827
1828                    // Ensure we don't write beyond the chunk boundary
1829                    let remaining_in_chunk = chunk_len - copy_start;
1830                    let copy_len = cmp::min(remaining, remaining_in_chunk);
1831
1832                    // Ensure we don't write beyond the data boundary
1833                    let copy_len = cmp::min(copy_len, data.len() - data_offset);
1834
1835                    if copy_len == 0 {
1836                        break; // No more data to write
1837                    }
1838
1839                    // Perform data copy
1840                    mmap_mut[copy_start..copy_start + copy_len]
1841                        .copy_from_slice(&data[data_offset..data_offset + copy_len]);
1842
1843                    data_offset += copy_len;
1844                    remaining -= copy_len;
1845                    current_offset += copy_len as u64;
1846                    mmap_mut.flush_async_range(copy_start, copy_len)?;
1847                }
1848            }
1849        }
1850        Ok(data_offset)
1851    }
1852
1853    async fn invalidate_mmap_cache(&self, inode: Inode, old_size: u64) {
1854        let keys_to_remove: Vec<_> = self
1855            .mmap_chunks
1856            .iter()
1857            .filter(|item| {
1858                let key = item.0.clone();
1859                key.inode == inode && key.aligned_offset + mmap::MAX_WINDOW_SIZE as u64 >= old_size
1860            })
1861            .collect();
1862
1863        for item in keys_to_remove {
1864            self.mmap_chunks.invalidate(item.0.as_ref()).await;
1865        }
1866    }
1867}
1868
1869#[cfg(test)]
1870#[allow(unused_imports)]
1871#[allow(clippy::useless_conversion)]
1872mod tests {
1873    use crate::{
1874        passthrough::{PassthroughArgs, PassthroughFs, ROOT_ID, new_passthroughfs_layer},
1875        unwrap_or_skip_eperm, unwrap_or_skip_mount_error,
1876    };
1877    use std::ffi::{CStr, OsStr, OsString};
1878
1879    use nix::unistd::{Gid, Uid, getgid, getuid};
1880    use rfuse3::{
1881        MountOptions,
1882        raw::{Filesystem, Request, Session},
1883    };
1884
1885    macro_rules! pass {
1886        () => {
1887            ()
1888        };
1889        ($($tt:tt)*) => {
1890            ()
1891        };
1892    }
1893
1894    #[test]
1895    fn nofile_limit_raise_is_capped_by_hard_limit() {
1896        assert_eq!(
1897            super::desired_nofile_soft_limit(256, 4096, 8192),
1898            Some(4096)
1899        );
1900        assert_eq!(
1901            super::desired_nofile_soft_limit(256, 16384, 8192),
1902            Some(8192)
1903        );
1904        assert_eq!(super::desired_nofile_soft_limit(8192, 16384, 8192), None);
1905    }
1906
1907    #[cfg(target_os = "macos")]
1908    struct MacFuseMountCleanup {
1909        mount_dir: std::path::PathBuf,
1910    }
1911
1912    #[cfg(target_os = "macos")]
1913    impl Drop for MacFuseMountCleanup {
1914        fn drop(&mut self) {
1915            let _ = std::process::Command::new("umount")
1916                .arg(&self.mount_dir)
1917                .status();
1918            let _ = std::process::Command::new("diskutil")
1919                .arg("unmount")
1920                .arg("force")
1921                .arg(&self.mount_dir)
1922                .status();
1923        }
1924    }
1925
1926    /// This test attempts to mount a passthrough filesystem. It is explicitly
1927    /// gated because macFUSE availability depends on local kext approval and
1928    /// should not affect the default unit-test layer.
1929    #[tokio::test]
1930    async fn test_passthrough() {
1931        if std::env::var("RUN_MACFUSE_TESTS").ok().as_deref() != Some("1") {
1932            eprintln!("skip test_passthrough: RUN_MACFUSE_TESTS!=1");
1933            return;
1934        }
1935
1936        let temp_dir = tempfile::tempdir().expect("tempdir");
1937        let source_dir = temp_dir.path().join("src");
1938        let mount_dir = temp_dir.path().join("mnt");
1939        std::fs::create_dir_all(&source_dir).expect("create source dir");
1940        std::fs::create_dir_all(&mount_dir).expect("create mount dir");
1941        #[cfg(target_os = "macos")]
1942        let _cleanup = MacFuseMountCleanup {
1943            mount_dir: mount_dir.clone(),
1944        };
1945
1946        let args = PassthroughArgs {
1947            root_dir: source_dir.clone(),
1948            mapping: None::<&str>,
1949        };
1950        let fs = match super::new_passthroughfs_layer(args).await {
1951            Ok(fs) => fs,
1952            Err(e) => {
1953                eprintln!("skip test_passthrough: init failed: {e:?}");
1954                return;
1955            }
1956        };
1957
1958        let uid = unsafe { libc::getuid() };
1959        let gid = unsafe { libc::getgid() };
1960
1961        let mut mount_options = MountOptions::default();
1962        #[cfg(target_os = "linux")]
1963        mount_options.force_readdir_plus(true);
1964        mount_options.uid(uid).gid(gid);
1965        // Intentionally DO NOT call allow_other here to avoid requiring /etc/fuse.conf config.
1966
1967        let mount_path = OsString::from(mount_dir.as_os_str());
1968
1969        let session = Session::new(mount_options);
1970        let mount_handle = unwrap_or_skip_mount_error!(
1971            session.mount(fs, mount_path).await,
1972            "mount passthrough fs"
1973        );
1974
1975        // Immediately unmount to verify we at least mounted successfully.
1976        let _ = mount_handle.unmount().await; // errors ignored
1977    }
1978
1979    #[tokio::test]
1980    async fn lookup_rejects_nul_name_without_panicking() {
1981        use rfuse3::raw::{Filesystem, Request};
1982        use std::os::unix::ffi::OsStrExt;
1983
1984        let temp_dir = tempfile::tempdir().unwrap();
1985        let fs = new_passthroughfs_layer(PassthroughArgs {
1986            root_dir: temp_dir.path(),
1987            mapping: None::<&str>,
1988        })
1989        .await
1990        .unwrap();
1991
1992        let err = fs
1993            .lookup(Request::default(), ROOT_ID, OsStr::from_bytes(b"bad\0name"))
1994            .await
1995            .unwrap_err();
1996        let ioerr = std::io::Error::from(err);
1997        assert_eq!(ioerr.raw_os_error(), Some(libc::EINVAL));
1998    }
1999
2000    #[cfg(target_os = "macos")]
2001    #[test]
2002    fn macos_lazy_new_canonicalizes_root_dir() {
2003        use super::Config;
2004        use std::os::unix::fs::symlink;
2005
2006        let temp_dir = tempfile::tempdir().unwrap();
2007        let real_root = temp_dir.path().join("real-root");
2008        let link_root = temp_dir.path().join("link-root");
2009        std::fs::create_dir(&real_root).unwrap();
2010        symlink(&real_root, &link_root).unwrap();
2011
2012        let cfg = Config {
2013            root_dir: link_root.clone(),
2014            macos_lazy_inode_fd: true,
2015            ..Default::default()
2016        };
2017        let fs = PassthroughFs::<()>::new(cfg).expect("new fs");
2018
2019        assert_eq!(fs.cfg.root_dir, real_root.canonicalize().unwrap());
2020        assert_ne!(fs.cfg.root_dir, link_root);
2021    }
2022
2023    #[cfg(target_os = "macos")]
2024    #[tokio::test]
2025    async fn macos_lookup_symlink_entry_does_not_return_eloop() {
2026        use std::os::unix::fs::symlink;
2027
2028        let temp_dir = tempfile::tempdir().unwrap();
2029        std::fs::write(temp_dir.path().join("target.txt"), "target").unwrap();
2030        symlink("target.txt", temp_dir.path().join("link.txt")).unwrap();
2031
2032        let fs = new_passthroughfs_layer(PassthroughArgs {
2033            root_dir: temp_dir.path(),
2034            mapping: None::<&str>,
2035        })
2036        .await
2037        .unwrap();
2038        let name = c"link.txt";
2039
2040        let entry = fs.do_lookup(ROOT_ID, name).await.unwrap();
2041
2042        assert_eq!(entry.attr.kind, rfuse3::FileType::Symlink);
2043    }
2044
2045    /// PR-9.3 finding: `O_NOFOLLOW_ANY` is *not* a drop-in upgrade for
2046    /// `O_NOFOLLOW` on macOS — it conflicts with combining and rejects
2047    /// `/tmp`-rooted paths because `/tmp` itself is a symlink. Instead
2048    /// of asserting a no-op behaviour, this regression guard documents
2049    /// the lazy-fd happy path: a file directly under tmpdir opens, and
2050    /// trailing-symlink retry still kicks in.
2051    #[cfg(target_os = "macos")]
2052    #[test]
2053    fn macos_lazy_open_path_two_step_works() {
2054        use std::ffi::CString;
2055        use std::os::unix::ffi::OsStrExt;
2056        use std::os::unix::fs::symlink;
2057
2058        let temp = tempfile::tempdir().unwrap();
2059        std::fs::write(temp.path().join("file.txt"), b"PR93").unwrap();
2060        symlink("file.txt", temp.path().join("link.txt")).unwrap();
2061
2062        // Regular file: O_NOFOLLOW path returns the fd.
2063        let file_c = CString::new(temp.path().join("file.txt").as_os_str().as_bytes()).unwrap();
2064        let fd = super::lazy_open_path(&file_c, libc::O_RDONLY).expect("regular open failed");
2065        assert!(fd >= 0);
2066        unsafe { libc::close(fd) };
2067
2068        // Trailing-symlink: ELOOP on first try → O_SYMLINK retry returns
2069        // an fd to the link itself.
2070        let link_c = CString::new(temp.path().join("link.txt").as_os_str().as_bytes()).unwrap();
2071        let fd = super::lazy_open_path(&link_c, libc::O_RDONLY).expect("symlink retry path failed");
2072        assert!(fd >= 0);
2073        unsafe { libc::close(fd) };
2074    }
2075
2076    /// Verifies that renaming a cached directory rewrites the cached
2077    /// `lazy_path` of every descendant, so post-eviction reopens land at
2078    /// the new location instead of `ENOENT` at the old one.
2079    #[cfg(target_os = "macos")]
2080    #[tokio::test]
2081    async fn macos_lazy_dir_rename_rewrites_descendants() {
2082        use super::Config;
2083        use rfuse3::raw::Request;
2084        use std::ffi::OsStr;
2085
2086        let temp_dir = tempfile::tempdir().unwrap();
2087        std::fs::create_dir(temp_dir.path().join("a")).unwrap();
2088        std::fs::create_dir(temp_dir.path().join("a/sub")).unwrap();
2089        std::fs::write(temp_dir.path().join("a/sub/file.txt"), b"hi").unwrap();
2090
2091        let cfg = Config {
2092            root_dir: temp_dir.path().to_path_buf(),
2093            xattr: true,
2094            do_import: true,
2095            macos_lazy_inode_fd: true,
2096            ..Default::default()
2097        };
2098        let fs = PassthroughFs::<()>::new(cfg).expect("new fs");
2099        fs.import().await.unwrap();
2100
2101        // Walk the tree to populate cached `lazy_path` on every node.
2102        let a_entry = fs.do_lookup(ROOT_ID, c"a").await.unwrap();
2103        let sub_entry = fs.do_lookup(a_entry.attr.ino, c"sub").await.unwrap();
2104        let file_entry = fs.do_lookup(sub_entry.attr.ino, c"file.txt").await.unwrap();
2105
2106        // Drive the FUSE rename trait directly — it issues the underlying
2107        // `renameat(2)` and runs the lazy-path descendant walk.
2108        use rfuse3::raw::Filesystem;
2109        fs.rename(
2110            Request::default(),
2111            ROOT_ID,
2112            OsStr::new("a"),
2113            ROOT_ID,
2114            OsStr::new("b"),
2115        )
2116        .await
2117        .unwrap();
2118
2119        // Every descendant must now resolve under "/…/b/sub/...".
2120        let new_root = temp_dir.path().canonicalize().unwrap();
2121        for ino in [a_entry.attr.ino, sub_entry.attr.ino, file_entry.attr.ino] {
2122            let data = fs.inode_map.get(ino).await.unwrap();
2123            let path = data.lazy_path().expect("Reopenable on macOS lazy mode");
2124            assert!(
2125                path.starts_with(new_root.join("b")),
2126                "inode {ino} path {path:?} should be under {:?} after rename",
2127                new_root.join("b"),
2128            );
2129        }
2130    }
2131
2132    /// Verifies the lazy-fd LRU bounds the cache at the configured cap and
2133    /// that exceeding it evicts the LRU entry. Drives `do_lookup` directly
2134    /// (no real mount) so the cache populate path runs through `get_file()`.
2135    #[cfg(target_os = "macos")]
2136    #[tokio::test]
2137    async fn macos_lazy_fd_lru_bounds_cache() {
2138        use super::Config;
2139        use std::num::NonZeroUsize;
2140
2141        let temp_dir = tempfile::tempdir().unwrap();
2142        // 4 sibling files; cap=2 so the 3rd lookup must evict.
2143        for i in 0..4 {
2144            std::fs::write(temp_dir.path().join(format!("f{i}.txt")), b"x").unwrap();
2145        }
2146
2147        let cfg = Config {
2148            root_dir: temp_dir.path().to_path_buf(),
2149            xattr: true,
2150            do_import: true,
2151            macos_lazy_inode_fd: true,
2152            macos_lazy_fd_lru_max: Some(NonZeroUsize::new(2).unwrap()),
2153            ..Default::default()
2154        };
2155        let fs = PassthroughFs::<()>::new(cfg).expect("new fs");
2156        fs.import().await.unwrap();
2157
2158        assert_eq!(fs.macos_lazy_fd_cap(), Some(2));
2159
2160        // Look up + force-open each child to populate the lazy cache.
2161        for i in 0..4 {
2162            let name = OsString::from(format!("f{i}.txt"));
2163            let bytes: Vec<u8> = name
2164                .as_os_str()
2165                .as_encoded_bytes()
2166                .iter()
2167                .copied()
2168                .chain(std::iter::once(0))
2169                .collect();
2170            let cname = CStr::from_bytes_with_nul(&bytes).unwrap();
2171            let entry = fs.do_lookup(ROOT_ID, cname).await.unwrap();
2172            // get_file() is what populates the lazy cache; emulate one
2173            // hot-path read so the LRU records this inode.
2174            let inode = entry.attr.ino;
2175            let data = fs.inode_map.get(inode).await.unwrap();
2176            let _ = data.get_file().unwrap();
2177        }
2178
2179        let len = fs.macos_lazy_fd_cache_len().expect("lru enabled");
2180        let reopens = fs.macos_lazy_fd_reopen_count().expect("lru enabled");
2181        // 4 lookups all populated cache → 4 reopens. Cache length is bounded
2182        // by cap (2). Exact LRU order isn't asserted to keep the test
2183        // resilient against insertion-order quirks.
2184        assert!(
2185            len <= 2,
2186            "cache length {len} exceeded cap 2 — LRU eviction is broken",
2187        );
2188        assert!(
2189            reopens >= 4,
2190            "expected ≥4 reopens, saw {reopens} — counter not bumping",
2191        );
2192    }
2193
2194    /// Stress: 200 files, cap=8, ensures the LRU caps both the in-memory
2195    /// cache **and** real OS fd usage, and that `forget_one()` releases
2196    /// LRU entries when refcount hits zero.
2197    ///
2198    /// Counts process fds via `/dev/fd` (macOS) — every entry under that
2199    /// directory is an open fd. After populating the cache we expect the
2200    /// per-inode fd cost to be capped at `cap`; after forget-all, fd usage
2201    /// should drop back near baseline.
2202    #[cfg(target_os = "macos")]
2203    #[tokio::test]
2204    async fn macos_lazy_fd_pressure_caps_real_fds() {
2205        use super::Config;
2206        use std::num::NonZeroUsize;
2207
2208        const FILES: usize = 200;
2209        const CAP: usize = 8;
2210        // Allow a small slack for ancillary fds (libfuse-fs itself opens
2211        // a handful: proc_self_fd, mount_fds, mmap chunks, allocator pools,
2212        // tokio runtime pipes, …). 32 covers observed steady-state churn.
2213        const FD_SLACK: usize = 32;
2214
2215        let temp_dir = tempfile::tempdir().unwrap();
2216        for i in 0..FILES {
2217            std::fs::write(temp_dir.path().join(format!("f{i:04}.txt")), b"x").unwrap();
2218        }
2219
2220        let cfg = Config {
2221            root_dir: temp_dir.path().to_path_buf(),
2222            xattr: true,
2223            do_import: true,
2224            macos_lazy_inode_fd: true,
2225            macos_lazy_fd_lru_max: Some(NonZeroUsize::new(CAP).unwrap()),
2226            ..Default::default()
2227        };
2228        let fs = PassthroughFs::<()>::new(cfg).expect("new fs");
2229        fs.import().await.unwrap();
2230
2231        // Baseline fd count *after* fs construction so we factor out the
2232        // ancillary fds that PassthroughFs::new opens (proc_self_fd, etc.).
2233        let baseline_fds = count_open_fds();
2234
2235        let mut inodes = Vec::with_capacity(FILES);
2236        for i in 0..FILES {
2237            let name = format!("f{i:04}.txt");
2238            let bytes: Vec<u8> = name
2239                .as_bytes()
2240                .iter()
2241                .copied()
2242                .chain(std::iter::once(0))
2243                .collect();
2244            let cname = CStr::from_bytes_with_nul(&bytes).unwrap();
2245            let entry = fs.do_lookup(ROOT_ID, cname).await.unwrap();
2246            let inode = entry.attr.ino;
2247            // Force the lazy-fd path to open + cache.
2248            let data = fs.inode_map.get(inode).await.unwrap();
2249            let _ = data.get_file().unwrap();
2250            inodes.push(inode);
2251        }
2252
2253        let after_lookup_fds = count_open_fds();
2254        let cache_len = fs.macos_lazy_fd_cache_len().unwrap();
2255        let reopens = fs.macos_lazy_fd_reopen_count().unwrap();
2256        assert_eq!(
2257            cache_len, CAP,
2258            "cache should saturate at cap={CAP}, saw {cache_len}",
2259        );
2260        assert!(
2261            reopens as usize >= FILES,
2262            "expected ≥ {FILES} reopens (one per lookup), saw {reopens}",
2263        );
2264        assert!(
2265            after_lookup_fds <= baseline_fds + CAP + FD_SLACK,
2266            "fd usage exploded: baseline={baseline_fds}, after={after_lookup_fds}, \
2267             cap={CAP}, slack={FD_SLACK}",
2268        );
2269
2270        // Forget every inode and confirm LRU drains and fd count returns
2271        // near baseline.
2272        let mut store = fs.inode_map.inodes.write().await;
2273        for inode in &inodes {
2274            fs.forget_one(&mut store, *inode, 1).await;
2275        }
2276        drop(store);
2277
2278        let after_forget_fds = count_open_fds();
2279        let final_cache_len = fs.macos_lazy_fd_cache_len().unwrap();
2280        assert_eq!(
2281            final_cache_len, 0,
2282            "LRU should drain after forget-all, saw {final_cache_len}",
2283        );
2284        assert!(
2285            after_forget_fds <= baseline_fds + FD_SLACK,
2286            "fd usage didn't drop after forget-all: baseline={baseline_fds}, \
2287             after_forget={after_forget_fds}, slack={FD_SLACK}",
2288        );
2289    }
2290
2291    /// Counts entries under `/dev/fd`, which on macOS exposes the calling
2292    /// process's open file descriptors. Used by the fd-pressure stress
2293    /// test to assert real (not just LRU-internal) fd accounting.
2294    #[cfg(target_os = "macos")]
2295    fn count_open_fds() -> usize {
2296        std::fs::read_dir("/dev/fd")
2297            .map(|d| d.filter_map(|e| e.ok()).count())
2298            .unwrap_or(0)
2299    }
2300
2301    // ----- macOS-only opcodes (PR-7.2) ----------------------------------
2302
2303    /// `setvolname` accepts and ignores. We only verify the trait dispatch
2304    /// resolves to a successful response (not the trait default's ENOSYS).
2305    /// A real Finder-level test would need a mounted volume + admin
2306    /// privileges.
2307    #[cfg(target_os = "macos")]
2308    #[tokio::test]
2309    async fn macos_setvolname_accepts_and_returns_ok() {
2310        use rfuse3::raw::{Filesystem, Request};
2311        use std::ffi::OsStr;
2312
2313        let temp_dir = tempfile::tempdir().unwrap();
2314        let fs = new_passthroughfs_layer(PassthroughArgs {
2315            root_dir: temp_dir.path(),
2316            mapping: None::<&str>,
2317        })
2318        .await
2319        .unwrap();
2320        let res = fs
2321            .setvolname(Request::default(), OsStr::new("MyVolume"))
2322            .await;
2323        assert!(
2324            res.is_ok(),
2325            "setvolname must not return ENOSYS, got {res:?}"
2326        );
2327    }
2328
2329    /// `getxtimes` returns `st_birthtimespec` for both fields. Test creates
2330    /// a file, looks it up, then queries getxtimes and compares against
2331    /// the on-disk creation time.
2332    #[cfg(target_os = "macos")]
2333    #[tokio::test]
2334    async fn macos_getxtimes_reports_creation_time() {
2335        use rfuse3::raw::{Filesystem, Request};
2336
2337        let temp_dir = tempfile::tempdir().unwrap();
2338        let target = temp_dir.path().join("birthcheck.txt");
2339        std::fs::write(&target, b"hi").unwrap();
2340
2341        let fs = new_passthroughfs_layer(PassthroughArgs {
2342            root_dir: temp_dir.path(),
2343            mapping: None::<&str>,
2344        })
2345        .await
2346        .unwrap();
2347        let cname = c"birthcheck.txt";
2348        let entry = fs.do_lookup(ROOT_ID, cname).await.unwrap();
2349        let times = fs
2350            .getxtimes(Request::default(), entry.attr.ino)
2351            .await
2352            .expect("getxtimes must not return ENOSYS");
2353
2354        // crtime sec must be > 0 and equal across both fields. We don't
2355        // assert an exact value (filesystem time precision varies).
2356        assert_eq!(times.bkuptime, times.crtime);
2357        assert!(
2358            times.crtime.sec > 0,
2359            "crtime should be a real birthtime, got {:?}",
2360            times.crtime,
2361        );
2362    }
2363
2364    /// `exchange` must atomically swap two siblings in place. After the
2365    /// swap, the inode bound to "a" should hold the contents that used
2366    /// to be at "b" and vice versa.
2367    #[cfg(target_os = "macos")]
2368    #[tokio::test]
2369    async fn macos_exchange_swaps_two_siblings() {
2370        use rfuse3::raw::{Filesystem, Request};
2371        use std::ffi::OsStr;
2372
2373        let temp_dir = tempfile::tempdir().unwrap();
2374        std::fs::write(temp_dir.path().join("a.txt"), b"A_PAYLOAD").unwrap();
2375        std::fs::write(temp_dir.path().join("b.txt"), b"B_PAYLOAD").unwrap();
2376
2377        let fs = new_passthroughfs_layer(PassthroughArgs {
2378            root_dir: temp_dir.path(),
2379            mapping: None::<&str>,
2380        })
2381        .await
2382        .unwrap();
2383
2384        fs.exchange(
2385            Request::default(),
2386            ROOT_ID,
2387            OsStr::new("a.txt"),
2388            ROOT_ID,
2389            OsStr::new("b.txt"),
2390            0,
2391        )
2392        .await
2393        .expect("exchange must not return ENOSYS");
2394
2395        // After RENAME_SWAP: a.txt now holds B_PAYLOAD; b.txt holds A_PAYLOAD.
2396        let after_a = std::fs::read(temp_dir.path().join("a.txt")).unwrap();
2397        let after_b = std::fs::read(temp_dir.path().join("b.txt")).unwrap();
2398        assert_eq!(
2399            after_a, b"B_PAYLOAD",
2400            "exchange did not move B's content to a.txt"
2401        );
2402        assert_eq!(
2403            after_b, b"A_PAYLOAD",
2404            "exchange did not move A's content to b.txt"
2405        );
2406    }
2407
2408    /// Resource fork xattrs are the one macOS xattr namespace where the
2409    /// kernel-supplied position matters. Preserve it instead of flattening
2410    /// every write to offset 0.
2411    #[cfg(target_os = "macos")]
2412    #[tokio::test]
2413    async fn macos_resource_fork_xattr_honors_position() {
2414        use rfuse3::raw::{Filesystem, Request};
2415        use std::ffi::OsStr;
2416
2417        let temp_dir = tempfile::tempdir().unwrap();
2418        std::fs::write(temp_dir.path().join("forked.txt"), b"data").unwrap();
2419
2420        let fs = new_passthroughfs_layer(PassthroughArgs {
2421            root_dir: temp_dir.path(),
2422            mapping: None::<&str>,
2423        })
2424        .await
2425        .unwrap();
2426        let entry = fs.do_lookup(ROOT_ID, c"forked.txt").await.unwrap();
2427        let attr = OsStr::new("com.apple.ResourceFork");
2428
2429        fs.setxattr(Request::default(), entry.attr.ino, attr, b"abcd", 0, 0)
2430            .await
2431            .unwrap();
2432        fs.setxattr(Request::default(), entry.attr.ino, attr, b"EF", 0, 2)
2433            .await
2434            .unwrap();
2435
2436        let data = fs
2437            .getxattr(Request::default(), entry.attr.ino, attr, 4)
2438            .await
2439            .unwrap();
2440        match data {
2441            rfuse3::raw::reply::ReplyXAttr::Data(bytes) => assert_eq!(&bytes[..], b"abEF"),
2442            other => panic!("expected resource-fork data, got {other:?}"),
2443        }
2444    }
2445
2446    // // Test for uid/gid mapping
2447    // async fn setup(
2448    //     mapping: Option<&str>,
2449    // ) -> (PassthroughFs, tempfile::TempDir, Uid, Gid, Uid, Gid) {
2450    //     let tmp_dir = tempfile::tempdir().unwrap();
2451    //     let src_dir = tmp_dir.path();
2452
2453    //     let cur_uid = getuid();
2454    //     let cur_gid = getgid();
2455
2456    //     let container_uid = Uid::from_raw(1000);
2457    //     let container_gid = Gid::from_raw(1000);
2458
2459    //     let args = PassthroughArgs {
2460    //         root_dir: src_dir.to_path_buf(),
2461    //         mapping: mapping,
2462    //     };
2463    //     let fs = new_passthroughfs_layer(args).await.unwrap();
2464
2465    //     (fs, tmp_dir, cur_uid, cur_gid, container_uid, container_gid)
2466    // }
2467
2468    /// Tests the reverse mapping (host -> container) for `lookup` and `getattr` operations.
2469    ///
2470    /// It sets up a mapping from the current host user to a container user (UID/GID 1000).
2471    /// Then, it creates a file owned by the host user and verifies that when FUSE looks up
2472    /// or gets attributes for this file, the returned UID/GID are correctly mapped to 1000.
2473    ///
2474    /// Unfortunately, this can not work because `do_lookup` calls `to_openable_handle` which
2475    /// requires CAP_DAC_READ_SEARCH capability, which is not available in unprivileged test environments.
2476    /// So this test is commented out for now.
2477    #[tokio::test]
2478    async fn test_lookup_and_getattr() {
2479        pass!()
2480    }
2481    // async fn test_lookup_and_getattr() {
2482    //     let cur_uid = getuid().as_raw();
2483    //     let cur_gid = getgid().as_raw();
2484    //     let mapping = format!("uidmapping={cur_uid}:1000:1,gidmapping={cur_gid}:1000:1");
2485
2486    //     let (fs, tmp_dir, ..) = setup(Some(&mapping)).await;
2487    //     let src = tmp_dir.path();
2488
2489    //     // Create a file in the source directory, owned by the current host user.
2490    //     let file_path = src.join("test_file.txt");
2491    //     std::fs::File::create(&file_path).unwrap();
2492    //     std::os::unix::fs::chown(&file_path, Some(cur_uid), Some(cur_gid)).unwrap();
2493
2494    //     // Simulate a FUSE request from the container user (UID/GID 1000).
2495    //     let req = Request::default();
2496    //     // Perform a lookup, which should trigger attribute fetching.
2497    //     let reply = fs
2498    //         .do_lookup(
2499    //             ROOT_ID,
2500    //             CStr::from_bytes_with_nul(b"test_file.txt\0").unwrap(),
2501    //         )
2502    //         .await
2503    //         .unwrap();
2504
2505    //     // Verify that the returned attributes are mapped to the container's perspective.
2506    //     assert_eq!(reply.attr.uid, 1000);
2507    //     assert_eq!(reply.attr.gid, 1000);
2508
2509    //     // Explicitly call getattr and verify the same mapping logic.
2510    //     let getattr_reply = fs.getattr(req, reply.attr.ino, None, 0).await.unwrap();
2511    //     assert_eq!(getattr_reply.attr.uid, 1000);
2512    //     assert_eq!(getattr_reply.attr.gid, 1000);
2513    // }
2514
2515    /// Tests the forward mapping (container -> host) for the `create` operation.
2516    ///
2517    /// It sets up a mapping from the current host user to a container user (UID/GID 1000).
2518    /// It then simulates a `create` request from the container user and verifies two things:
2519    /// 1. The newly created file on the host filesystem is owned by the mapped host user.
2520    /// 2. The attributes returned in the FUSE reply are correctly mapped back to the container user's ID.
2521    #[tokio::test]
2522    async fn test_create() {
2523        pass!()
2524    }
2525    // #[tokio::test]
2526    // async fn test_create() {
2527    //     let cur_uid = getuid().as_raw();
2528    //     let cur_gid = getgid().as_raw();
2529    //     let mapping = format!("uidmapping={cur_uid}:1000:1,gidmapping={cur_gid}:1000:1");
2530
2531    //     let (fs, tmp_dir, host_uid, host_gid, container_uid, container_gid) =
2532    //         setup(Some(&mapping)).await;
2533
2534    //     // Simulate a request coming from the container user (1000).
2535    //     let mut req = Request::default();
2536    //     req.uid = container_uid.as_raw();
2537    //     req.gid = container_gid.as_raw();
2538
2539    //     let file_name = OsStr::new("new_file.txt");
2540    //     let mode = libc::S_IFREG | 0o644;
2541
2542    //     // Perform the create operation.
2543    //     let created_reply = fs
2544    //         .create(req, ROOT_ID, file_name, mode, libc::O_CREAT as u32)
2545    //         .await
2546    //         .unwrap();
2547
2548    //     let file_path = tmp_dir.path().join(file_name);
2549    //     let metadata = std::fs::metadata(file_path).unwrap();
2550
2551    //     // Verify forward mapping: the file owner on the host should be the mapped host user.
2552    //     use std::os::unix::fs::MetadataExt;
2553    //     assert_eq!(Uid::from_raw(metadata.uid()), host_uid);
2554    //     assert_eq!(Gid::from_raw(metadata.gid()), host_gid);
2555
2556    //     // Verify reverse mapping in the reply: the attributes sent back to the container
2557    //     // should reflect the container's user ID.
2558    //     assert_eq!(created_reply.attr.uid, container_uid.as_raw());
2559    //     assert_eq!(created_reply.attr.gid, container_gid.as_raw());
2560    // }
2561}