Skip to main content

libfuse_fs/passthrough/
async_io.rs

1use crate::util::open_options::OpenOptions;
2use bytes::Bytes;
3use futures::stream;
4use libc::{off_t, pread, size_t};
5use rfuse3::{Errno, Inode, Result, raw::prelude::*};
6use std::{
7    ffi::{CStr, CString, OsStr, OsString},
8    fs::File,
9    io,
10    mem::MaybeUninit,
11    num::NonZeroU32,
12    os::{
13        fd::{AsRawFd, RawFd},
14        raw::c_int,
15        unix::ffi::OsStringExt,
16    },
17    sync::{Arc, atomic::Ordering},
18    time::Duration,
19};
20use tracing::{debug, error, info, trace};
21
22use vm_memory::{ByteValued, bitmap::BitmapSlice};
23
24#[cfg(target_os = "linux")]
25use crate::passthrough::{FileUniqueKey, statx::statx};
26use crate::{
27    passthrough::{CURRENT_DIR_CSTR, EMPTY_CSTR, PARENT_DIR_CSTR},
28    util::{convert_stat64_to_file_attr, filetype_from_mode},
29};
30
31use super::ebadf;
32#[cfg(target_os = "linux")]
33use super::util::fd_path_cstr;
34use super::util::{
35    self, AT_EMPTY_PATH, SLASH_ASCII, einval, enosys, is_safe_inode, osstr_to_cstr, set_creds,
36    stat_fd, stat64,
37};
38#[cfg(target_os = "macos")]
39use super::util::{is_linux_only_xattr, join_dir_and_name};
40use super::{Handle, HandleData, PassthroughFs, config::CachePolicy, os_compat::LinuxDirent64};
41#[cfg(target_os = "macos")]
42use super::{InodeData, inode_store, statx};
43#[cfg(target_os = "macos")]
44pub const O_DIRECT: libc::c_int = 0;
45#[cfg(target_os = "linux")]
46pub use libc::O_DIRECT;
47
48#[inline]
49fn osstr_to_cstr_or_einval(name: &OsStr) -> io::Result<CString> {
50    osstr_to_cstr(name).map_err(|_| einval())
51}
52
53impl<S: BitmapSlice + Send + Sync> PassthroughFs<S> {
54    async fn open_inode(&self, inode: Inode, flags: i32) -> io::Result<File> {
55        let data = self.inode_map.get(inode).await?;
56        if !is_safe_inode(data.mode) {
57            Err(ebadf())
58        } else {
59            let mut new_flags = self.get_writeback_open_flags(flags).await;
60            #[allow(clippy::bad_bit_mask)]
61            if !self.cfg.allow_direct_io && flags & O_DIRECT != 0 {
62                new_flags &= !O_DIRECT;
63            }
64            data.open_file(new_flags | libc::O_CLOEXEC, &self.proc_self_fd)
65        }
66    }
67
68    /// Check the HandleData flags against the flags from the current request
69    /// if these do not match update the file descriptor flags and store the new
70    /// result in the HandleData entry
71    async fn check_fd_flags(
72        &self,
73        data: &Arc<HandleData>,
74        fd: RawFd,
75        flags: u32,
76    ) -> io::Result<()> {
77        let open_flags = data.get_flags().await;
78        if open_flags != flags {
79            let ret = unsafe { libc::fcntl(fd, libc::F_SETFL, flags) };
80            if ret != 0 {
81                return Err(io::Error::last_os_error());
82            }
83            data.set_flags(flags).await;
84        }
85        Ok(())
86    }
87
    /// Collects the entries of directory `inode` into `entry_list`, starting at `offset`.
    ///
    /// The directory handle is obtained from `get_dirdata`, its descriptor is repositioned
    /// to `offset`, and records are then read in `BUFFER_SIZE` chunks until the read call
    /// reports zero bytes. Every parsed name is resolved through `do_lookup`, and the inode
    /// number stored in the pushed `DirectoryEntry` is the one returned by that lookup, not
    /// the raw value found in the directory record.
    ///
    /// Linux reads records with the `getdents64` syscall; macOS uses `getdirentries` and
    /// parses the records by hand.
    ///
    /// # Errors
    ///
    /// Returns the last OS error if seeking or reading the directory fails, the error from
    /// `einval()` if a name cannot be turned into a C string, and any error propagated
    /// from `get_dirdata`, `osstr_to_cstr` or `do_lookup`.
    async fn do_readdir(
        &self,
        inode: Inode,
        handle: Handle,
        offset: u64,
        entry_list: &mut Vec<std::result::Result<DirectoryEntry, Errno>>,
    ) -> io::Result<()> {
        const BUFFER_SIZE: usize = 8192;

        let data = self.get_dirdata(handle, inode, libc::O_RDONLY).await?;

        // Since we are going to work with the kernel offset, we have to acquire the file lock
        // for both the `lseek64` and `getdents64` syscalls to ensure that no other thread
        // changes the kernel offset while we are using it.
        let (_guard, dir) = data.get_file_mut().await;

        // Allocate buffer; pay attention to alignment.
        let mut buffer = vec![0u8; BUFFER_SIZE];

        // Reposition the directory descriptor to the caller-supplied offset before reading.
        #[cfg(target_os = "linux")]
        let res =
            unsafe { libc::lseek64(dir.as_raw_fd(), offset as libc::off64_t, libc::SEEK_SET) };
        #[cfg(target_os = "macos")]
        let res = unsafe { libc::lseek(dir.as_raw_fd(), offset as libc::off_t, libc::SEEK_SET) };

        if res < 0 {
            return Err(io::Error::last_os_error());
        }

        // Keep reading chunks until the read call returns 0 bytes.
        loop {
            // call getdents64 system call
            #[cfg(target_os = "linux")]
            {
                let result = unsafe {
                    libc::syscall(
                        libc::SYS_getdents64,
                        dir.as_raw_fd(),
                        buffer.as_mut_ptr() as *mut LinuxDirent64,
                        BUFFER_SIZE,
                    )
                };

                if result == -1 {
                    return Err(std::io::Error::last_os_error());
                }

                let bytes_read = result as usize;
                if bytes_read == 0 {
                    break; // no more
                }

                // Walk the records in this chunk; this `offset` shadows the parameter and
                // is a byte position inside `buffer`.
                let mut offset = 0;
                while offset < bytes_read {
                    //let (front, back) = buffer.split_at(size_of::<LinuxDirent64>());
                    //size_of::<LinuxDirent64>()
                    // `front` is the fixed-size record header, `back` is everything after it.
                    let front = &buffer[offset..offset + size_of::<LinuxDirent64>()];
                    let back = &buffer[offset + size_of::<LinuxDirent64>()..];

                    let dirent64 = LinuxDirent64::from_slice(front)
                        .expect("fuse: unable to get LinuxDirent64 from slice");

                    // The name area is whatever the record holds beyond the header.
                    let namelen = dirent64.d_reclen as usize - size_of::<LinuxDirent64>();
                    debug_assert!(
                        namelen <= back.len(),
                        "fuse: back is smaller than `namelen`"
                    );

                    let name = &back[..namelen];
                    // NOTE(review): `name` spans the record's whole name area, which
                    // presumably includes the terminator and any padding, and
                    // `do_readdirplus` uses `starts_with` for this same test — confirm that
                    // exact equality is intended here.
                    if name.eq(CURRENT_DIR_CSTR) || name.eq(PARENT_DIR_CSTR) {
                        offset += dirent64.d_reclen as usize;
                        continue;
                    }
                    let name = bytes_to_cstr(name)
                        .map_err(|e| {
                            error!("fuse: do_readdir: {e:?}");
                            einval()
                        })?
                        .to_bytes();

                    let mut entry = DirectoryEntry {
                        inode: dirent64.d_ino,
                        kind: filetype_from_mode(dirent64.d_ty as u32 * 0x1000u32),
                        name: OsString::from_vec(name.to_vec()),
                        offset: dirent64.d_off,
                    };
                    // Safe because do_readdir() has ensured dir_entry.name is a
                    // valid [u8] generated by CStr::to_bytes().
                    let name = osstr_to_cstr(&entry.name)?;
                    // trace!("do_readdir: inode={}, name={}", inode, name.to_str().unwrap());
                    let _entry = self.do_lookup(inode, &name).await?;
                    let mut inodes = self.inode_map.inodes.write().await;

                    // Presumably releases the reference that `do_lookup` just took, since
                    // a plain readdir reply carries no lookup count — TODO confirm.
                    self.forget_one(&mut inodes, _entry.attr.ino, 1).await;
                    entry.inode = _entry.attr.ino;
                    entry_list.push(Ok(entry));

                    // move to next entry
                    offset += dirent64.d_reclen as usize;
                }
            }
            #[cfg(target_os = "macos")]
            {
                unsafe extern "C" {
                    fn getdirentries(
                        fd: libc::c_int,
                        buf: *mut libc::c_char,
                        nbytes: libc::size_t,
                        basep: *mut libc::off_t,
                    ) -> libc::c_int;
                }

                // Filled in by `getdirentries`; used below to build resume offsets.
                let mut base: libc::off_t = 0;
                let result = unsafe {
                    getdirentries(
                        dir.as_raw_fd(),
                        buffer.as_mut_ptr() as *mut libc::c_char,
                        BUFFER_SIZE,
                        &mut base,
                    )
                };

                if result == -1 {
                    return Err(std::io::Error::last_os_error());
                }

                let bytes_read = result as usize;
                if bytes_read == 0 {
                    break; // no more
                }

                let mut offset = 0;
                while offset < bytes_read {
                    let p = unsafe { buffer.as_ptr().add(offset) };

                    // Stop if the 8-byte fixed header would run past the bytes read.
                    if offset + 8 > bytes_read {
                        break;
                    }

                    // Logically determined layout from logs:
                    // 0: d_ino (u32)
                    // 4: d_reclen (u16)
                    // 6: d_type (u8)
                    // 7: d_namlen (u8)
                    // 8: d_name

                    let d_ino = unsafe { std::ptr::read_unaligned(p as *const u32) } as u64;
                    let d_reclen = unsafe { std::ptr::read_unaligned(p.add(4) as *const u16) };
                    let d_type = unsafe { std::ptr::read_unaligned(p.add(6)) };
                    let d_namlen = unsafe { std::ptr::read_unaligned(p.add(7)) };

                    debug!(
                        "readdir parsed: offset={} d_ino={} d_reclen={} d_namlen={} d_type={}",
                        offset, d_ino, d_reclen, d_namlen, d_type
                    );

                    // A zero-length record would never advance `offset`; a record that
                    // extends past the bytes read is incomplete. Both end this chunk.
                    if d_reclen == 0 {
                        break;
                    }
                    if offset + d_reclen as usize > bytes_read {
                        break;
                    }

                    let name_ptr = unsafe { p.add(8) };
                    // use d_namlen
                    let safe_namlen = std::cmp::min(d_namlen as usize, d_reclen as usize - 8);
                    let name_slice = unsafe { std::slice::from_raw_parts(name_ptr, safe_namlen) };

                    if name_slice == CURRENT_DIR_CSTR || name_slice == PARENT_DIR_CSTR {
                        offset += d_reclen as usize;
                        continue;
                    }

                    // Generate a resume offset for the next readdir call.
                    let current_entry_offset = base as u64 + offset as u64 + d_reclen as u64;

                    // Extract the entry name from the buffer.

                    let name_vec = name_slice.to_vec();

                    let mut entry = DirectoryEntry {
                        inode: d_ino,
                        kind: filetype_from_mode(d_type as u32 * 0x1000),
                        name: OsString::from_vec(name_vec.clone()),
                        offset: current_entry_offset as i64,
                    };

                    // We need to process name to be sure it's valid CStr for do_lookup if needed.
                    // But OsString::from_vec handles bytes.

                    // The Linux code calls do_lookup. This adds overhead but refreshes attrs.
                    // We can try to skip it if basic ls is enough, but to be safe and consistent:
                    // Sanitize name_vec: take up to the first null byte
                    let name_bytes: Vec<u8> =
                        name_vec.iter().take_while(|&&b| b != 0).cloned().collect();

                    let name_cstr = match CString::new(name_bytes.clone()) {
                        Ok(c) => c,
                        Err(e) => {
                            error!(
                                "fuse: do_readdir: invalid name bytes after sanitization: {:?} original: {:?} error: {}",
                                name_bytes, name_vec, e
                            );
                            return Err(einval());
                        }
                    };

                    let _entry = self.do_lookup(inode, &name_cstr).await?;
                    let mut inodes = self.inode_map.inodes.write().await;
                    // Same lookup-then-forget pairing as the Linux branch above.
                    self.forget_one(&mut inodes, _entry.attr.ino, 1).await;
                    entry.inode = _entry.attr.ino;

                    entry_list.push(Ok(entry));

                    offset += d_reclen as usize;
                }
            }
        }

        Ok(())
    }
310
311    async fn do_readdirplus(
312        &self,
313        inode: Inode,
314        handle: Handle,
315        offset: u64,
316        entry_list: &mut Vec<std::result::Result<DirectoryEntryPlus, Errno>>,
317    ) -> io::Result<()> {
318        const BUFFER_SIZE: usize = 8192;
319
320        let data = self.get_dirdata(handle, inode, libc::O_RDONLY).await?;
321
322        // Since we are going to work with the kernel offset, we have to acquire the file lock
323        // for both the `lseek64` and `getdents64` syscalls to ensure that no other thread
324        // changes the kernel offset while we are using it.
325        let (_guard, dir) = data.get_file_mut().await;
326
327        // Allocate buffer; pay attention to alignment.
328        #[allow(unused_mut)]
329        let mut buffer = vec![0u8; BUFFER_SIZE];
330
331        // Syscall `getdents64` implementation
332        #[cfg(target_os = "linux")]
333        let res =
334            unsafe { libc::lseek64(dir.as_raw_fd(), offset as libc::off64_t, libc::SEEK_SET) };
335        #[cfg(target_os = "macos")]
336        let res = unsafe { libc::lseek(dir.as_raw_fd(), offset as libc::off_t, libc::SEEK_SET) };
337
338        if res < 0 {
339            return Err(io::Error::last_os_error());
340        }
341
342        loop {
343            // call getdents64 system call
344            #[cfg(target_os = "linux")]
345            let result = unsafe {
346                libc::syscall(
347                    libc::SYS_getdents64,
348                    dir.as_raw_fd(),
349                    buffer.as_mut_ptr() as *mut LinuxDirent64,
350                    BUFFER_SIZE,
351                )
352            };
353            #[cfg(target_os = "macos")]
354            let result = {
355                // Stub for now
356                unsafe { *libc::__error() = libc::ENOSYS };
357                -1
358            };
359
360            if result == -1 {
361                return Err(std::io::Error::last_os_error());
362            }
363
364            let bytes_read = result as usize;
365            if bytes_read == 0 {
366                break;
367            }
368
369            let mut offset = 0;
370            while offset < bytes_read {
371                //size_of::<LinuxDirent64>()
372                let front = &buffer[offset..offset + size_of::<LinuxDirent64>()];
373                let back = &buffer[offset + size_of::<LinuxDirent64>()..];
374                //let (front, back) = buffer.split_at(size_of::<LinuxDirent64>());
375
376                let dirent64 = LinuxDirent64::from_slice(front)
377                    .expect("fuse: unable to get LinuxDirent64 from slice");
378
379                let namelen = dirent64.d_reclen as usize - size_of::<LinuxDirent64>();
380                debug_assert!(
381                    namelen <= back.len(),
382                    "fuse: back is smaller than `namelen`"
383                );
384
385                let name = &back[..namelen];
386                if name.starts_with(CURRENT_DIR_CSTR) || name.starts_with(PARENT_DIR_CSTR) {
387                    offset += dirent64.d_reclen as usize;
388                    continue;
389                }
390                let name = bytes_to_cstr(name)
391                    .map_err(|e| {
392                        error!("fuse: do_readdir: {e:?}");
393                        einval()
394                    })?
395                    .to_bytes();
396
397                let mut entry = DirectoryEntry {
398                    inode: dirent64.d_ino,
399                    kind: filetype_from_mode((dirent64.d_ty as u16 * 0x1000u16).into()),
400                    name: OsString::from_vec(name.to_vec()),
401                    offset: dirent64.d_off,
402                };
403                // Safe because do_readdir() has ensured dir_entry.name is a
404                // valid [u8] generated by CStr::to_bytes().
405                let name = osstr_to_cstr(&entry.name)?;
406                debug!("readdir:{}", name.to_string_lossy());
407                let _entry = self.do_lookup(inode, &name).await?;
408                entry.inode = _entry.attr.ino;
409
410                entry_list.push(Ok(DirectoryEntryPlus {
411                    inode: entry.inode,
412                    generation: _entry.generation,
413                    kind: entry.kind,
414                    name: entry.name,
415                    offset: entry.offset,
416                    attr: _entry.attr,
417                    entry_ttl: _entry.ttl,
418                    attr_ttl: _entry.ttl,
419                }));
420                // add the offset.
421                offset += dirent64.d_reclen as usize;
422            }
423        }
424        Ok(())
425    }
426
427    async fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
428        let file = self.open_inode(inode, flags as i32).await?;
429
430        let data = HandleData::new(inode, file, flags);
431        let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
432        self.handle_map.insert(handle, data).await;
433
434        let mut opts = OpenOptions::empty();
435        match self.cfg.cache_policy {
436            // We only set the direct I/O option on files.
437            CachePolicy::Never => opts.set(
438                OpenOptions::DIRECT_IO,
439                flags & (libc::O_DIRECTORY as u32) == 0,
440            ),
441            CachePolicy::Metadata => {
442                if flags & (libc::O_DIRECTORY as u32) == 0 {
443                    opts |= OpenOptions::DIRECT_IO;
444                } else {
445                    opts |= OpenOptions::CACHE_DIR | OpenOptions::KEEP_CACHE;
446                }
447            }
448            CachePolicy::Always => {
449                opts |= OpenOptions::KEEP_CACHE;
450                if flags & (libc::O_DIRECTORY as u32) != 0 {
451                    opts |= OpenOptions::CACHE_DIR;
452                }
453            }
454            _ => {}
455        };
456
457        Ok((Some(handle), opts))
458    }
459
460    /// Core implementation for `getattr`.
461    ///
462    /// This is the internal function that performs the actual `stat` system call.
463    /// It contains a crucial `mapping` parameter that controls its behavior:
464    /// - `mapping: true`: Applies reverse ID mapping (host -> container) to the `uid` and `gid`.
465    ///   This is for external FUSE clients.
466    /// - `mapping: false`: Returns the raw, unmapped host attributes. This is for internal
467    ///   callers like `overlayfs`'s copy-up logic.
468    pub(crate) async fn do_getattr_inner(
469        &self,
470        inode: Inode,
471        handle: Option<Handle>,
472        mapping: bool,
473    ) -> io::Result<(stat64, Duration)> {
474        // trace!("FS {} passthrough: do_getattr: before get: inode={}, handle={:?}", self.uuid, inode, handle);
475        let data = self.inode_map.get(inode).await.map_err(|e| {
476            error!("fuse: do_getattr ino {inode} Not find err {e:?}");
477            e
478        })?;
479        // trace!("do_getattr: got data {:?}", data);
480
481        // kernel sends 0 as handle in case of no_open, and it depends on fuse server to handle
482        // this case correctly.
483        let st = if !self.no_open.load(Ordering::Relaxed)
484            && let Some(handle_id) = handle
485        {
486            let hd = self.handle_map.get(handle_id, inode).await?;
487            // trace!("FS {} passthrough: do_getattr: before stat_fd", self.uuid);
488            util::stat_fd(hd.get_file(), None)
489        } else {
490            // trace!("FS {} passthrough: do_getattr: before stat", self.uuid);
491            data.handle.stat()
492        };
493        // trace!("FS {} passthrough: do_getattr: after stat", self.uuid);
494
495        let mut st = st.map_err(|e| {
496            if e.raw_os_error() == Some(libc::ESTALE) {
497                // debug!("fuse: do_getattr stat failed ino {inode} err {e:?}");
498                // ignore
499            } else {
500                error!("fuse: do_getattr stat failed ino {inode} err {e:?}");
501            }
502            e
503        })?;
504        st.st_ino = inode;
505        if mapping {
506            st.st_uid = self.cfg.mapping.find_mapping(st.st_uid, true, true);
507            st.st_gid = self.cfg.mapping.find_mapping(st.st_gid, true, false);
508        }
509        Ok((st, self.cfg.attr_timeout))
510    }
511
512    /// Public `getattr` wrapper for FUSE clients.
513    ///
514    /// This function serves as the standard entry point for `getattr` requests from the FUSE
515    /// kernel module. It always performs ID mapping by calling [`do_getattr_inner`][Self::do_getattr_inner] with
516    /// `mapping: true` to ensure clients see attributes from the container's perspective.
517    async fn do_getattr(&self, inode: Inode, fh: Option<u64>) -> io::Result<(stat64, Duration)> {
518        let inode_data = self.inode_map.get(inode).await?;
519        if let Some(handle) = fh {
520            let hd = self.handle_map.get(handle, inode).await?;
521            let file = hd.get_file();
522            return util::stat_fd(file, None).map(|st| (st, self.cfg.attr_timeout));
523        }
524
525        let file = inode_data.get_file()?;
526        util::stat_fd(&file, None).map(|st| (st, self.cfg.attr_timeout))
527    }
528
529    /// Internal `getattr` helper that skips ID mapping.
530    ///
531    /// This helper is specifically designed for internal use by `overlayfs`. It calls
532    /// [`do_getattr_inner`][Self::do_getattr_inner] with `mapping: false` to retrieve the raw, unmodified host
533    /// attributes of a file. This is essential for the `copy_up` process to correctly
534    /// preserve the original file ownership.
535    pub async fn do_getattr_helper(
536        &self,
537        inode: Inode,
538        fh: Option<u64>,
539    ) -> io::Result<(stat64, Duration)> {
540        self.do_getattr_inner(inode, fh, false).await
541    }
542
543    async fn do_unlink(&self, parent: Inode, name: &CStr, flags: libc::c_int) -> io::Result<()> {
544        let data = self.inode_map.get(parent).await?;
545        let file = data.get_file()?;
546        #[cfg(target_os = "linux")]
547        let st = statx(&file, Some(name)).ok();
548        // Safe because this doesn't modify any memory and we check the return value.
549        let res = unsafe { libc::unlinkat(file.as_raw_fd(), name.as_ptr(), flags) };
550        if res == 0 {
551            #[cfg(target_os = "linux")]
552            if let Some(st) = st
553                && let Some(btime) = st.btime
554                && (btime.tv_sec != 0 || btime.tv_nsec != 0)
555            {
556                let key = FileUniqueKey(st.st.st_ino, btime);
557                self.handle_cache.invalidate(&key).await;
558            }
559
560            Ok(())
561        } else {
562            Err(io::Error::last_os_error())
563        }
564    }
565
566    async fn get_dirdata(
567        &self,
568        handle: Handle,
569        inode: Inode,
570        flags: libc::c_int,
571    ) -> io::Result<Arc<HandleData>> {
572        let no_open = self.no_opendir.load(Ordering::Relaxed);
573        if !no_open {
574            self.handle_map.get(handle, inode).await
575        } else {
576            let file = self.open_inode(inode, flags | libc::O_DIRECTORY).await?;
577            Ok(Arc::new(HandleData::new(inode, file, flags as u32)))
578        }
579    }
580
581    async fn get_data(
582        &self,
583        handle: Handle,
584        inode: Inode,
585        flags: libc::c_int,
586    ) -> io::Result<Arc<HandleData>> {
587        let no_open = self.no_open.load(Ordering::Relaxed);
588        if !no_open {
589            self.handle_map.get(handle, inode).await
590        } else {
591            let file = self.open_inode(inode, flags).await?;
592            Ok(Arc::new(HandleData::new(inode, file, flags as u32)))
593        }
594    }
595
596    /// Core implementation for `create`.
597    ///
598    /// It uses the provided `uid` and `gid` for credential switching if they are `Some`;
599    /// otherwise, it falls back to the credentials from the `Request`. This allows internal
600    /// callers like `overlayfs` to specify an exact host UID/GID.
601    #[allow(clippy::too_many_arguments)]
602    async fn do_create_inner(
603        &self,
604        req: Request,
605        parent: Inode,
606        name: &OsStr,
607        mode: u32,
608        flags: u32,
609        uid: Option<u32>,
610        gid: Option<u32>,
611    ) -> Result<ReplyCreated> {
612        let name = osstr_to_cstr_or_einval(name)?;
613        let name = name.as_ref();
614        self.validate_path_component(name)?;
615
616        let dir = self.inode_map.get(parent).await?;
617        let dir_file = dir.get_file()?;
618
619        let new_file = {
620            // Here we need to adjust the code order because guard doesn't allowed to cross await point
621            let flags = self.get_writeback_open_flags(flags as i32).await;
622            let _guard = set_creds(
623                uid.unwrap_or(self.cfg.mapping.get_uid(req.uid)),
624                gid.unwrap_or(self.cfg.mapping.get_gid(req.gid)),
625            )?;
626            Self::create_file_excl(&dir_file, name, flags, mode)?
627        };
628
629        let entry = self.do_lookup(parent, name).await?;
630        let file = match new_file {
631            // File didn't exist, now created by create_file_excl()
632            Some(f) => f,
633            // File exists, and args.flags doesn't contain O_EXCL. Now let's open it with
634            // open_inode().
635            None => {
636                // Cap restored when _killpriv is dropped
637                // let _killpriv = if self.killpriv_v2.load().await
638                //     && (args.fuse_flags & FOPEN_IN_KILL_SUIDGID != 0)
639                // {
640                //     self::drop_cap_fsetid()?
641                // } else {
642                //     None
643                // };
644
645                // Here we can not call self.open_inode() directly because guard doesn't allowed to cross await point
646                let data = self.inode_map.get(entry.attr.ino).await?;
647                if !is_safe_inode(data.mode) {
648                    return Err(ebadf().into());
649                }
650
651                // Calculate the final flags. This involves an async call.
652                #[allow(clippy::bad_bit_mask)]
653                let mut final_flags = self.get_writeback_open_flags(flags as i32).await;
654                #[allow(clippy::bad_bit_mask)]
655                if !self.cfg.allow_direct_io && (flags as i32) & O_DIRECT != 0 {
656                    final_flags &= !O_DIRECT;
657                }
658                final_flags |= libc::O_CLOEXEC;
659
660                {
661                    let _guard = set_creds(
662                        uid.unwrap_or(self.cfg.mapping.get_uid(req.uid)),
663                        gid.unwrap_or(self.cfg.mapping.get_gid(req.gid)),
664                    )?;
665                    // Maybe buggy because `open_file` may call `open_by_handle_at`, which requires CAP_DAC_READ_SEARCH.
666                    data.open_file(final_flags, &self.proc_self_fd)?
667                }
668            }
669        };
670
671        let ret_handle = if !self.no_open.load(Ordering::Relaxed) {
672            let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
673            let data = HandleData::new(entry.attr.ino, file, flags);
674            self.handle_map.insert(handle, data).await;
675            handle
676        } else {
677            return Err(io::Error::from_raw_os_error(libc::EACCES).into());
678        };
679
680        let mut opts = OpenOptions::empty();
681        match self.cfg.cache_policy {
682            CachePolicy::Never => opts |= OpenOptions::DIRECT_IO,
683            CachePolicy::Metadata => opts |= OpenOptions::DIRECT_IO,
684            CachePolicy::Always => opts |= OpenOptions::KEEP_CACHE,
685            _ => {}
686        };
687        Ok(ReplyCreated {
688            ttl: entry.ttl,
689            attr: entry.attr,
690            generation: entry.generation,
691            fh: ret_handle,
692            flags: opts.bits(),
693        })
694    }
695
696    /// A wrapper for `create`, used by [`copy_regfile_up`][crate::overlayfs::OverlayFs::copy_regfile_up].
697    ///
698    /// This helper is called during a copy-up operation to create a file in the upper
699    /// layer while preserving the original host UID/GID from the lower layer file.
700    #[allow(clippy::too_many_arguments)]
701    pub async fn do_create_helper(
702        &self,
703        req: Request,
704        parent: Inode,
705        name: &OsStr,
706        mode: u32,
707        flags: u32,
708        uid: u32,
709        gid: u32,
710    ) -> Result<ReplyCreated> {
711        self.do_create_inner(req, parent, name, mode, flags, Some(uid), Some(gid))
712            .await
713    }
714
715    /// Core implementation for `mkdir`.
716    ///
717    /// It uses the provided `uid` and `gid` for credential switching if they are `Some`;
718    /// otherwise, it falls back to the credentials from the `Request`.
719    #[allow(clippy::too_many_arguments)]
720    async fn do_mkdir_inner(
721        &self,
722        req: Request,
723        parent: Inode,
724        name: &OsStr,
725        mode: u32,
726        umask: u32,
727        uid: Option<u32>,
728        gid: Option<u32>,
729    ) -> Result<ReplyEntry> {
730        let name = osstr_to_cstr_or_einval(name)?;
731        let name = name.as_ref();
732        self.validate_path_component(name)?;
733
734        let data = self.inode_map.get(parent).await?;
735        let file = data.get_file()?;
736
737        let res = {
738            let _guard = set_creds(
739                uid.unwrap_or(self.cfg.mapping.get_uid(req.uid)),
740                gid.unwrap_or(self.cfg.mapping.get_gid(req.gid)),
741            )?;
742
743            // Safe because this doesn't modify any memory and we check the return value.
744            unsafe {
745                libc::mkdirat(
746                    file.as_raw_fd(),
747                    name.as_ptr(),
748                    (mode & !umask) as libc::mode_t,
749                )
750            }
751        };
752        if res < 0 {
753            return Err(io::Error::last_os_error().into());
754        }
755
756        self.do_lookup(parent, name).await
757    }
758
759    /// A wrapper for `mkdir`, used by [`create_upper_dir`][crate::overlayfs::OverlayInode::create_upper_dir] function.
760    ///
761    /// This helper is called during a copy-up operation when a parent directory needs to be
762    /// created in the upper layer, preserving the original host UID/GID.
763    #[allow(clippy::too_many_arguments)]
764    pub async fn do_mkdir_helper(
765        &self,
766        req: Request,
767        parent: Inode,
768        name: &OsStr,
769        mode: u32,
770        umask: u32,
771        uid: u32,
772        gid: u32,
773    ) -> Result<ReplyEntry> {
774        self.do_mkdir_inner(req, parent, name, mode, umask, Some(uid), Some(gid))
775            .await
776    }
777
778    /// Core implementation for `symlink`.
779    ///
780    /// It uses the provided `uid` and `gid` for credential switching if they are `Some`;
781    /// otherwise, it falls back to the credentials from the `Request`.
782    async fn do_symlink_inner(
783        &self,
784        req: Request,
785        parent: Inode,
786        name: &OsStr,
787        link: &OsStr,
788        uid: Option<u32>,
789        gid: Option<u32>,
790    ) -> Result<ReplyEntry> {
791        let name = osstr_to_cstr_or_einval(name)?;
792        let name = name.as_ref();
793        let link = osstr_to_cstr_or_einval(link)?;
794        let link = link.as_ref();
795        self.validate_path_component(name)?;
796
797        let data = self.inode_map.get(parent).await?;
798        let file = data.get_file()?;
799
800        let res = {
801            let _guard = set_creds(
802                uid.unwrap_or(self.cfg.mapping.get_uid(req.uid)),
803                gid.unwrap_or(self.cfg.mapping.get_gid(req.gid)),
804            )?;
805
806            // Safe because this doesn't modify any memory and we check the return value.
807            unsafe { libc::symlinkat(link.as_ptr(), file.as_raw_fd(), name.as_ptr()) }
808        };
809        if res == 0 {
810            self.do_lookup(parent, name).await
811        } else {
812            Err(io::Error::last_os_error().into())
813        }
814    }
815
816    /// A wrapper for `symlink`, used by [`copy_symlink_up`][crate::overlayfs::OverlayFs::copy_symlink_up] function.
817    ///
818    /// This helper is called during a copy-up operation to create a symbolic link in the
819    /// upper layer while preserving the original host UID/GID from the lower layer link.
820    pub async fn do_symlink_helper(
821        &self,
822        req: Request,
823        parent: Inode,
824        name: &OsStr,
825        link: &OsStr,
826        uid: u32,
827        gid: u32,
828    ) -> Result<ReplyEntry> {
829        self.do_symlink_inner(req, parent, name, link, Some(uid), Some(gid))
830            .await
831    }
832
833    /// macOS lazy-fd: rewrite the cached path of `src_id_before` to point at
834    /// `<new_parent>.lazy_path()/<new_name>` after a successful rename. No-op
835    /// if lazy mode is off, the source isn't tracked, or the new parent isn't
836    /// a `Reopenable` inode (which can only happen if the user mixed lazy/eager
837    /// configurations across mounts).
838    ///
839    /// When the renamed entry is a directory, every cached `Reopenable`
840    /// inode whose absolute path is a descendant of the **old** path is
841    /// rewritten to use the new path. Without this, descendants would
842    /// reopen at stale paths after an LRU eviction. The walk takes the
843    /// inode-map read lock and is `O(N_inodes)` — fine for typical
844    /// workloads since rename is rare; if it ever shows up in profiles,
845    /// switch to an explicit parent→children index.
846    #[cfg(target_os = "macos")]
847    async fn macos_lazy_after_rename(
848        &self,
849        new_parent: &Arc<InodeData>,
850        new_name: &OsStr,
851        src_id_before: Option<inode_store::InodeId>,
852    ) {
853        if !self.cfg.macos_lazy_inode_fd {
854            return;
855        }
856        let Some(id) = src_id_before else { return };
857        let Some(new_parent_path) = new_parent.lazy_path() else {
858            return;
859        };
860        let new_path = new_parent_path.join(new_name);
861        let inodes = self.inode_map.inodes.read().await;
862        let Some(data) = inodes.get_by_id(&id) else {
863            return;
864        };
865
866        // Capture the renamed target's old absolute path *before* we
867        // overwrite it — descendants share this string as a prefix.
868        let old_path = data.lazy_path();
869        let target_inode = data.inode;
870        let target_is_dir = util::is_dir(data.mode.into());
871        data.update_lazy_path(new_path.clone());
872
873        if !target_is_dir {
874            return;
875        }
876        let Some(old_path) = old_path else { return };
877
878        // Rewrite every descendant whose path starts with `old_path`. The
879        // target itself was already updated above; skip it by inode number.
880        for (other_ino, other) in inodes.iter() {
881            if *other_ino == target_inode {
882                continue;
883            }
884            let Some(other_path) = other.lazy_path() else {
885                continue;
886            };
887            // `strip_prefix` requires a path-component match (won't match
888            // "/foo" against "/foobar"), which is exactly what we want.
889            if let Ok(suffix) = other_path.strip_prefix(&old_path) {
890                let mut rewritten = new_path.clone();
891                rewritten.push(suffix);
892                other.update_lazy_path(rewritten);
893            }
894        }
895    }
896}
897
898impl Filesystem for PassthroughFs {
899    /// initialize filesystem. Called before any other filesystem method.
900    async fn init(&self, _req: Request) -> Result<ReplyInit> {
901        if self.cfg.do_import {
902            self.import().await?;
903        }
904
905        Ok(ReplyInit {
906            max_write: NonZeroU32::new(128 * 1024).unwrap(),
907        })
908    }
909
910    /// clean up filesystem. Called on filesystem exit which is fuseblk, in normal fuse filesystem,
911    /// kernel may call forget for root. There is some discuss for this
912    /// <https://github.com/bazil/fuse/issues/82#issuecomment-88126886>,
913    /// <https://sourceforge.net/p/fuse/mailman/message/31995737/>
914    async fn destroy(&self, _req: Request) {
915        self.handle_map.clear().await;
916        self.inode_map.clear().await;
917
918        if let Err(e) = self.import().await {
919            error!("fuse: failed to destroy instance, {e:?}");
920        };
921    }
922
923    /// look up a directory entry by name and get its attributes.
924    async fn lookup(&self, _req: Request, parent: Inode, name: &OsStr) -> Result<ReplyEntry> {
925        // Don't use is_safe_path_component(), allow "." and ".." for NFS export support
926        if name.to_string_lossy().as_bytes().contains(&SLASH_ASCII) {
927            return Err(einval().into());
928        }
929        let name = osstr_to_cstr_or_einval(name)?;
930        // trace!("lookup: parent={}, name={}", parent, name.to_str().unwrap());
931        self.do_lookup(parent, name.as_ref()).await
932    }
933
934    /// forget an inode. The nlookup parameter indicates the number of lookups previously
935    /// performed on this inode. If the filesystem implements inode lifetimes, it is recommended
936    /// that inodes acquire a single reference on each lookup, and lose nlookup references on each
937    /// forget. The filesystem may ignore forget calls, if the inodes don't need to have a limited
938    /// lifetime. On unmount it is not guaranteed, that all referenced inodes will receive a forget
939    /// message. When filesystem is normal(not fuseblk) and unmounting, kernel may send forget
940    /// request for root and this library will stop session after call forget. There is some
941    /// discussion for this <https://github.com/bazil/fuse/issues/82#issuecomment-88126886>,
942    /// <https://sourceforge.net/p/fuse/mailman/message/31995737/>
943    async fn forget(&self, _req: Request, inode: Inode, nlookup: u64) {
944        let mut inodes = self.inode_map.inodes.write().await;
945
946        self.forget_one(&mut inodes, inode, nlookup).await
947    }
948
949    /// get file attributes. If `fh` is None, means `fh` is not set.
950    async fn getattr(
951        &self,
952        _req: Request,
953        inode: Inode,
954        fh: Option<u64>,
955        _flags: u32,
956    ) -> Result<ReplyAttr> {
957        let re = self.do_getattr(inode, fh).await?;
958        Ok(ReplyAttr {
959            ttl: re.1,
960            attr: convert_stat64_to_file_attr(re.0),
961        })
962    }
963
964    /// set file attributes. If `fh` is None, means `fh` is not set.
965    async fn setattr(
966        &self,
967        req: Request,
968        inode: Inode,
969        fh: Option<u64>,
970        set_attr: SetAttr,
971    ) -> Result<ReplyAttr> {
972        let inode_data = self.inode_map.get(inode).await?;
973
974        enum Data {
975            Handle(Arc<HandleData>),
976            ProcPath(CString),
977        }
978
979        let file = inode_data.get_file()?;
980        let data = if self.no_open.load(Ordering::Relaxed) {
981            let pathname = CString::new(format!("{}", file.as_raw_fd()))
982                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
983            Data::ProcPath(pathname)
984        } else {
985            // If we have a handle then use it otherwise get a new fd from the inode.
986            if let Some(handle) = fh {
987                let hd = self.handle_map.get(handle, inode).await?;
988                Data::Handle(hd)
989            } else {
990                let pathname = CString::new(format!("{}", file.as_raw_fd()))
991                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
992                Data::ProcPath(pathname)
993            }
994        };
995
996        if set_attr.size.is_some() && self.seal_size.load(Ordering::Relaxed) {
997            return Err(io::Error::from_raw_os_error(libc::EPERM).into());
998        }
999
1000        if let Some(mode) = set_attr.mode {
1001            // Safe because this doesn't modify any memory and we check the return value.
1002            let res = unsafe {
1003                match data {
1004                    Data::Handle(ref h) => libc::fchmod(h.borrow_fd().as_raw_fd(), mode),
1005                    Data::ProcPath(ref p) => {
1006                        libc::fchmodat(self.proc_self_fd.as_raw_fd(), p.as_ptr(), mode, 0)
1007                    }
1008                }
1009            };
1010            if res < 0 {
1011                return Err(io::Error::last_os_error().into());
1012            }
1013        }
1014
1015        if let (Some(uid_in), Some(gid_in)) = (set_attr.uid, set_attr.gid) {
1016            //valid.intersects(SetattrValid::UID | SetattrValid::GID)
1017            let uid = self.cfg.mapping.get_uid(uid_in);
1018            let gid = self.cfg.mapping.get_gid(gid_in);
1019
1020            // Safe because this is a constant value and a valid C string.
1021            let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1022
1023            // Safe because this doesn't modify any memory and we check the return value.
1024            let res = unsafe {
1025                libc::fchownat(
1026                    file.as_raw_fd(),
1027                    empty.as_ptr(),
1028                    uid,
1029                    gid,
1030                    AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
1031                )
1032            };
1033            if res < 0 {
1034                return Err(io::Error::last_os_error().into());
1035            }
1036        }
1037
1038        if let Some(size) = set_attr.size {
1039            // Safe because this doesn't modify any memory and we check the return value.
1040            let res = match data {
1041                Data::Handle(ref h) => unsafe {
1042                    libc::ftruncate(h.borrow_fd().as_raw_fd(), size.try_into().unwrap())
1043                },
1044                _ => {
1045                    // There is no `ftruncateat` so we need to get a new fd and truncate it.
1046                    let f = self
1047                        .open_inode(inode, libc::O_NONBLOCK | libc::O_RDWR)
1048                        .await?;
1049                    unsafe { libc::ftruncate(f.as_raw_fd(), size.try_into().unwrap()) }
1050                }
1051            };
1052            if res < 0 {
1053                return Err(io::Error::last_os_error().into());
1054            }
1055        }
1056
1057        if set_attr.atime.is_some() || set_attr.mtime.is_some() {
1058            // POSIX utime() permission rules:
1059            // - utime(NULL): requires owner OR write permission
1060            // - utime(&times): requires owner only
1061            //
1062            // At FUSE level, we cannot reliably distinguish these cases because VFS
1063            // converts both to actual timestamps. We use a heuristic:
1064            // - If both nsec == 0 and timestamp is in the past: likely utime(&times)
1065            // - Otherwise: likely utime(NULL) which gets current time with nsec precision
1066
1067            // SAFETY: libc::time with null pointer is a read-only syscall that always
1068            // succeeds and doesn't modify memory.
1069            let now = unsafe { libc::time(std::ptr::null_mut()) };
1070
1071            // Heuristic: utime(&times) typically sets whole seconds (both nsec=0) to past times.
1072            // utime(NULL) sets current time which usually has non-zero nsec.
1073            // Both timestamps and both conditions must be satisfied to avoid false positives.
1074            let is_utime_times =
1075                if let (Some(atime_ts), Some(mtime_ts)) = (set_attr.atime, set_attr.mtime) {
1076                    (atime_ts.nsec == 0 && mtime_ts.nsec == 0)
1077                        && (atime_ts.sec < now && mtime_ts.sec < now)
1078                } else {
1079                    // If one is None, it's likely a specific update, treat as requiring ownership.
1080                    true
1081                };
1082
1083            let st = stat_fd(&file, None)?;
1084            let uid = self.cfg.mapping.get_uid(req.uid);
1085            let gid = self.cfg.mapping.get_gid(req.gid);
1086
1087            let is_owner = st.st_uid == uid;
1088
1089            if !is_owner {
1090                if is_utime_times {
1091                    // utime(&times): only owner allowed
1092                    return Err(io::Error::from_raw_os_error(libc::EPERM).into());
1093                } else {
1094                    // utime(NULL): check for write permission
1095                    // Check user, group, and other permissions
1096                    // NOTE: This currently only checks the primary gid. A complete POSIX-compliant
1097                    // implementation should check all supplementary groups from req.groups if available.
1098                    // However, rfuse3::Request currently doesn't expose supplementary group information.
1099                    let has_user_write = st.st_uid == uid && st.st_mode & 0o200 != 0;
1100                    let has_group_write = st.st_gid == gid && st.st_mode & 0o020 != 0;
1101                    let has_other_write = st.st_mode & 0o002 != 0;
1102
1103                    if !has_user_write && !has_group_write && !has_other_write {
1104                        return Err(io::Error::from_raw_os_error(libc::EPERM).into());
1105                    }
1106                }
1107            }
1108            let mut tvs: [libc::timespec; 2] = [
1109                libc::timespec {
1110                    tv_sec: 0,
1111                    tv_nsec: libc::UTIME_OMIT,
1112                },
1113                libc::timespec {
1114                    tv_sec: 0,
1115                    tv_nsec: libc::UTIME_OMIT,
1116                },
1117            ];
1118            if let Some(atime_ts) = set_attr.atime {
1119                tvs[0].tv_sec = atime_ts.sec;
1120                tvs[0].tv_nsec = atime_ts.nsec as i64;
1121            }
1122            if let Some(mtime_ts) = set_attr.mtime {
1123                tvs[1].tv_sec = mtime_ts.sec;
1124                tvs[1].tv_nsec = mtime_ts.nsec as i64;
1125            }
1126
1127            // Safe because this doesn't modify any memory and we check the return value.
1128            let res = match data {
1129                Data::Handle(ref h) => unsafe {
1130                    libc::futimens(h.borrow_fd().as_raw_fd(), tvs.as_ptr())
1131                },
1132                Data::ProcPath(ref p) => unsafe {
1133                    libc::utimensat(self.proc_self_fd.as_raw_fd(), p.as_ptr(), tvs.as_ptr(), 0)
1134                },
1135            };
1136            if res < 0 {
1137                return Err(io::Error::last_os_error().into());
1138            }
1139        }
1140
1141        // After any successful modification, re-stat the file to get fresh attributes.
1142        // Use `do_getattr` which correctly handles ID mapping.
1143        let (new_stat, _attr_timeout) = self.do_getattr(inode, fh).await?;
1144        // Crucially, return a ReplyAttr with a zero TTL.
1145        // This tells the kernel to invalidate its attribute cache for this inode immediately.
1146        // Subsequent `stat()` calls from clients will trigger a fresh `getattr` request.
1147        Ok(ReplyAttr {
1148            ttl: Duration::new(0, 0),
1149            attr: convert_stat64_to_file_attr(new_stat),
1150        })
1151    }
1152
1153    /// read symbolic link.
1154    async fn readlink(&self, _req: Request, inode: Inode) -> Result<ReplyData> {
1155        // Safe because this is a constant value and a valid C string.
1156        let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1157        let mut buf = Vec::<u8>::with_capacity(libc::PATH_MAX as usize);
1158        let data = self.inode_map.get(inode).await?;
1159
1160        let file = data.get_file()?;
1161
1162        // Safe because this will only modify the contents of `buf` and we check the return value.
1163        let res = unsafe {
1164            libc::readlinkat(
1165                file.as_raw_fd(),
1166                empty.as_ptr(),
1167                buf.as_mut_ptr() as *mut libc::c_char,
1168                libc::PATH_MAX as usize,
1169            )
1170        };
1171        if res < 0 {
1172            return Err(io::Error::last_os_error().into());
1173        }
1174
1175        // Safe because we trust the value returned by kernel.
1176        unsafe { buf.set_len(res as usize) };
1177
1178        Ok(ReplyData {
1179            data: Bytes::from(buf),
1180        })
1181    }
1182
1183    /// create a symbolic link.
1184    async fn symlink(
1185        &self,
1186        req: Request,
1187        parent: Inode,
1188        name: &OsStr,
1189        link: &OsStr,
1190    ) -> Result<ReplyEntry> {
1191        self.do_symlink_inner(req, parent, name, link, None, None)
1192            .await
1193    }
1194
1195    /// create file node. Create a regular file, character device, block device, fifo or socket
1196    /// node. When creating file, most cases user only need to implement
1197    /// [`create`][Filesystem::create].
1198    async fn mknod(
1199        &self,
1200        req: Request,
1201        parent: Inode,
1202        name: &OsStr,
1203        mode: u32,
1204        rdev: u32,
1205    ) -> Result<ReplyEntry> {
1206        let name = osstr_to_cstr_or_einval(name)?;
1207        let name = name.as_ref();
1208        self.validate_path_component(name)?;
1209
1210        let data = self.inode_map.get(parent).await?;
1211        let file = data.get_file()?;
1212
1213        let res = {
1214            let (_uid, _gid) = set_creds(
1215                self.cfg.mapping.get_uid(req.uid),
1216                self.cfg.mapping.get_gid(req.gid),
1217            )?;
1218
1219            // Safe because this doesn't modify any memory and we check the return value.
1220            unsafe {
1221                libc::mknodat(
1222                    file.as_raw_fd(),
1223                    name.as_ptr(),
1224                    (mode) as libc::mode_t,
1225                    rdev as libc::dev_t,
1226                )
1227            }
1228        };
1229        if res < 0 {
1230            Err(io::Error::last_os_error().into())
1231        } else {
1232            self.do_lookup(parent, name).await
1233        }
1234    }
1235
1236    /// create a directory.
1237    async fn mkdir(
1238        &self,
1239        req: Request,
1240        parent: Inode,
1241        name: &OsStr,
1242        mode: u32,
1243        umask: u32,
1244    ) -> Result<ReplyEntry> {
1245        self.do_mkdir_inner(req, parent, name, mode, umask, None, None)
1246            .await
1247    }
1248
1249    /// remove a file.
1250    async fn unlink(&self, _req: Request, parent: Inode, name: &OsStr) -> Result<()> {
1251        let name = osstr_to_cstr_or_einval(name)?;
1252        let name = name.as_ref();
1253        self.validate_path_component(name)?;
1254        self.do_unlink(parent, name, 0).await.map_err(|e| e.into())
1255    }
1256
1257    /// remove a directory.
1258    async fn rmdir(&self, _req: Request, parent: Inode, name: &OsStr) -> Result<()> {
1259        let name = osstr_to_cstr_or_einval(name)?;
1260        let name = name.as_ref();
1261        self.validate_path_component(name)?;
1262        self.do_unlink(parent, name, libc::AT_REMOVEDIR)
1263            .await
1264            .map_err(|e| e.into())
1265    }
1266
1267    /// create a hard link.
1268    async fn link(
1269        &self,
1270        _req: Request,
1271        inode: Inode,
1272        new_parent: Inode,
1273        new_name: &OsStr,
1274    ) -> Result<ReplyEntry> {
1275        trace!(
1276            "passthrough: link: inode={}, new_parent={}, new_name={}",
1277            inode,
1278            new_parent,
1279            new_name.to_string_lossy()
1280        );
1281        let newname = osstr_to_cstr_or_einval(new_name)?;
1282        let newname = newname.as_ref();
1283        self.validate_path_component(newname)?;
1284
1285        trace!("link: trying to get inode {inode}");
1286        let data = self.inode_map.get(inode).await?;
1287        trace!("link: trying to get new parent {new_parent}");
1288        let new_inode = self.inode_map.get(new_parent).await?;
1289        let file = data.get_file()?;
1290        let new_file = new_inode.get_file()?;
1291
1292        // Safe because this is a constant value and a valid C string.
1293        let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1294
1295        // Safe because this doesn't modify any memory and we check the return value.
1296        let res = unsafe {
1297            libc::linkat(
1298                file.as_raw_fd(),
1299                empty.as_ptr(),
1300                new_file.as_raw_fd(),
1301                newname.as_ptr(),
1302                AT_EMPTY_PATH,
1303            )
1304        };
1305        if res == 0 {
1306            trace!(
1307                "passthrough: link: inode={}, new_parent={}, new_name={}, res=0, trying to lookup",
1308                inode,
1309                new_parent,
1310                newname.to_string_lossy()
1311            );
1312            self.do_lookup(new_parent, newname).await
1313        } else {
1314            trace!(
1315                "passthrough: link: inode={}, new_parent={}, new_name={}, res={}",
1316                inode,
1317                new_parent,
1318                newname.to_string_lossy(),
1319                res
1320            );
1321            Err(io::Error::last_os_error().into())
1322        }
1323    }
1324
1325    /// open a file. Open flags (with the exception of `O_CREAT`, `O_EXCL` and `O_NOCTTY`) are
1326    /// available in flags. Filesystem may store an arbitrary file handle (pointer, index, etc) in
1327    /// fh, and use this in other all other file operations (read, write, flush, release, fsync).
1328    /// Filesystem may also implement stateless file I/O and not store anything in fh. There are
1329    /// also some flags (`direct_io`, `keep_cache`) which the filesystem may set, to change the way
1330    /// the file is opened. A filesystem need not implement this method if it
1331    /// sets [`MountOptions::no_open_support`][rfuse3::MountOptions::no_open_support] and if the
1332    /// kernel supports `FUSE_NO_OPEN_SUPPORT`.
1333    ///
1334    /// # Notes:
1335    ///
1336    /// See `fuse_file_info` structure in
1337    /// [fuse_common.h](https://libfuse.github.io/doxygen/include_2fuse__common_8h_source.html) for
1338    /// more details.
1339    async fn open(&self, _req: Request, inode: Inode, flags: u32) -> Result<ReplyOpen> {
1340        if self.no_open.load(Ordering::Relaxed) {
1341            info!("fuse: open is not supported.");
1342            Err(enosys().into())
1343        } else {
1344            let re = self.do_open(inode, flags).await?;
1345            Ok(ReplyOpen {
1346                fh: re.0.unwrap(),
1347                flags: re.1.bits(),
1348            })
1349        }
1350    }
1351
1352    /// read data. Read should send exactly the number of bytes requested except on EOF or error,
1353    /// otherwise the rest of the data will be substituted with zeroes. An exception to this is
1354    /// when the file has been opened in `direct_io` mode, in which case the return value of the
1355    /// read system call will reflect the return value of this operation. `fh` will contain the
1356    /// value set by the open method, or will be undefined if the open method didn't set any value.
1357    async fn read(
1358        &self,
1359        _req: Request,
1360        inode: Inode,
1361        fh: u64,
1362        offset: u64,
1363        size: u32,
1364    ) -> Result<ReplyData> {
1365        let data = self.get_data(fh, inode, libc::O_RDONLY).await?;
1366        let _guard = data.lock.lock().await;
1367        let raw_fd = data.borrow_fd().as_raw_fd();
1368
1369        let mut buf = vec![0; size as usize];
1370        let file = &data.file;
1371
1372        let res = if self.cfg.use_mmap {
1373            self.read_from_mmap(inode, offset, size as u64, file, buf.as_mut_slice())
1374                .await
1375                .ok()
1376        } else {
1377            None
1378        };
1379
1380        match res {
1381            Some(bytes_read) => {
1382                if bytes_read < size as usize {
1383                    buf.truncate(bytes_read); // Adjust the buffer size for EOF
1384                }
1385            }
1386            None => {
1387                if offset > i64::MAX as u64 {
1388                    error!("read error: offset too large: {}", offset);
1389                    return Err(Errno::from(libc::EOVERFLOW));
1390                }
1391                const ALIGN: usize = 4096;
1392                let open_flags = data.get_flags().await;
1393                #[allow(clippy::bad_bit_mask)]
1394                let ret = if (open_flags as i32 & O_DIRECT) != 0 {
1395                    let mut aligned_buf = unsafe {
1396                        let layout = std::alloc::Layout::from_size_align(size as _, ALIGN).unwrap();
1397                        let ptr = std::alloc::alloc(layout);
1398                        if ptr.is_null() {
1399                            return Err(io::Error::from_raw_os_error(libc::ENOMEM).into());
1400                        }
1401                        Vec::from_raw_parts(ptr, size as _, size as _)
1402                    };
1403                    let ret = unsafe {
1404                        pread(
1405                            raw_fd as c_int,
1406                            aligned_buf.as_mut_ptr() as *mut libc::c_void,
1407                            size as size_t,
1408                            offset as off_t,
1409                        )
1410                    };
1411
1412                    if ret >= 0 {
1413                        let bytes_read = ret as usize;
1414                        buf.as_mut_slice()[..bytes_read]
1415                            .copy_from_slice(&aligned_buf[..bytes_read]);
1416                    }
1417                    ret
1418                } else {
1419                    unsafe {
1420                        pread(
1421                            raw_fd as c_int,
1422                            buf.as_mut_ptr() as *mut libc::c_void,
1423                            size as size_t,
1424                            offset as off_t,
1425                        )
1426                    }
1427                };
1428                if ret < 0 {
1429                    let e = io::Error::last_os_error();
1430                    error!("read error: {e:?}");
1431                    error!(
1432                        "pread raw_fd={}, pointer={:p}, size={}, offset={}",
1433                        raw_fd,
1434                        buf.as_mut_ptr(),
1435                        size,
1436                        offset
1437                    );
1438                    return Err(e.into());
1439                } else {
1440                    let bytes_read = ret as usize;
1441                    buf.truncate(bytes_read);
1442                }
1443            }
1444        }
1445
1446        Ok(ReplyData {
1447            data: Bytes::from(buf),
1448        })
1449    }
1450
1451    /// write data. Write should return exactly the number of bytes requested except on error. An
1452    /// exception to this is when the file has been opened in `direct_io` mode, in which case the
1453    /// return value of the write system call will reflect the return value of this operation. `fh`
1454    /// will contain the value set by the open method, or will be undefined if the open method
1455    /// didn't set any value. When `write_flags` contains
1456    /// [`FUSE_WRITE_CACHE`][rfuse3::raw::flags::FUSE_WRITE_CACHE], means the write operation is a
1457    /// delay write.
1458    #[allow(clippy::too_many_arguments)]
1459    async fn write(
1460        &self,
1461        _req: Request,
1462        inode: Inode,
1463        fh: u64,
1464        offset: u64,
1465        data: &[u8],
1466        _write_flags: u32,
1467        flags: u32,
1468    ) -> Result<ReplyWrite> {
1469        let handle_data = self.get_data(fh, inode, libc::O_RDWR).await?;
1470        let file = &handle_data.file;
1471        let _guard = handle_data.lock.lock().await;
1472        let raw_fd = handle_data.borrow_fd().as_raw_fd();
1473
1474        let res = if self.cfg.use_mmap {
1475            self.write_to_mmap(inode, offset, data, file).await.ok()
1476        } else {
1477            None
1478        };
1479
1480        let ret = match res {
1481            Some(ret) => ret as isize,
1482            None => {
1483                let size = data.len();
1484                if offset > i64::MAX as u64 {
1485                    error!("write error: offset too large: {}", offset);
1486                    return Err(Errno::from(libc::EOVERFLOW));
1487                }
1488                self.check_fd_flags(&handle_data, raw_fd, flags).await?;
1489                let ret = unsafe {
1490                    libc::pwrite(
1491                        raw_fd as c_int,
1492                        data.as_ptr() as *const libc::c_void,
1493                        size as size_t,
1494                        offset as off_t,
1495                    )
1496                };
1497                if ret >= 0 {
1498                    ret
1499                } else {
1500                    let e = io::Error::last_os_error();
1501                    error!("write error: {e:?}");
1502                    error!(
1503                        "pwrite raw_fd={}, pointer={:p}, size={}, offset={}",
1504                        raw_fd,
1505                        data.as_ptr(),
1506                        size,
1507                        offset
1508                    );
1509                    return Err(Errno::from(e.raw_os_error().unwrap_or(-1)));
1510                }
1511            }
1512        };
1513
1514        Ok(ReplyWrite {
1515            written: ret as u32,
1516        })
1517    }
1518
1519    /// get filesystem statistics.
1520    async fn statfs(&self, _req: Request, inode: Inode) -> Result<ReplyStatFs> {
1521        let data = self.inode_map.get(inode).await?;
1522        let file = data.get_file()?;
1523
1524        #[cfg(target_os = "linux")]
1525        let statfs = {
1526            let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
1527            match unsafe { libc::fstatvfs64(file.as_raw_fd(), out.as_mut_ptr()) } {
1528                0 => unsafe { out.assume_init() },
1529                _ => return Err(io::Error::last_os_error().into()),
1530            }
1531        };
1532
1533        #[cfg(target_os = "macos")]
1534        let statfs = {
1535            let mut out = MaybeUninit::<libc::statvfs>::zeroed();
1536            match unsafe { libc::fstatvfs(file.as_raw_fd(), out.as_mut_ptr()) } {
1537                0 => unsafe { out.assume_init() },
1538                _ => return Err(io::Error::last_os_error().into()),
1539            }
1540        };
1541
1542        Ok(
1543            // Populate the ReplyStatFs structure with the necessary information
1544            ReplyStatFs {
1545                blocks: statfs.f_blocks as u64,
1546                bfree: statfs.f_bfree as u64,
1547                bavail: statfs.f_bavail as u64,
1548                files: statfs.f_files as u64,
1549                ffree: statfs.f_ffree as u64,
1550                bsize: statfs.f_bsize as u32,
1551                namelen: statfs.f_namemax as u32,
1552                frsize: statfs.f_frsize as u32,
1553            },
1554        )
1555    }
1556
1557    /// release an open file. Release is called when there are no more references to an open file:
1558    /// all file descriptors are closed and all memory mappings are unmapped. For every open call
1559    /// there will be exactly one release call. The filesystem may reply with an error, but error
1560    /// values are not returned to `close()` or `munmap()` which triggered the release. `fh` will
1561    /// contain the value set by the open method, or will be undefined if the open method didn't
1562    /// set any value. `flags` will contain the same flags as for open. `flush` means flush the
1563    /// data or not when closing file.
1564    async fn release(
1565        &self,
1566        _req: Request,
1567        inode: Inode,
1568        fh: u64,
1569        _flags: u32,
1570        _lock_owner: u64,
1571        _flush: bool,
1572    ) -> Result<()> {
1573        if self.no_open.load(Ordering::Relaxed) {
1574            Err(enosys().into())
1575        } else {
1576            self.do_release(inode, fh).await.map_err(|e| e.into())
1577        }
1578    }
1579
1580    /// synchronize file contents. If the `datasync` is true, then only the user data should be
1581    /// flushed, not the metadata.
1582    async fn fsync(&self, _req: Request, inode: Inode, fh: u64, datasync: bool) -> Result<()> {
1583        let data = self.get_data(fh, inode, libc::O_RDONLY).await?;
1584        let fd = data.borrow_fd();
1585
1586        // Safe because this doesn't modify any memory and we check the return value.
1587        let res = unsafe {
1588            if datasync {
1589                #[cfg(target_os = "linux")]
1590                {
1591                    libc::fdatasync(fd.as_raw_fd())
1592                }
1593                #[cfg(target_os = "macos")]
1594                {
1595                    libc::fsync(fd.as_raw_fd())
1596                }
1597            } else {
1598                libc::fsync(fd.as_raw_fd())
1599            }
1600        };
1601        if res == 0 {
1602            Ok(())
1603        } else {
1604            Err(io::Error::last_os_error().into())
1605        }
1606    }
1607
1608    /// set an extended attribute.
1609    async fn setxattr(
1610        &self,
1611        _req: Request,
1612        inode: Inode,
1613        name: &OsStr,
1614        value: &[u8],
1615        flags: u32,
1616        _position: u32,
1617    ) -> Result<()> {
1618        if !self.cfg.xattr {
1619            return Err(enosys().into());
1620        }
1621        let name = osstr_to_cstr_or_einval(name)?;
1622        let name = name.as_ref();
1623        #[cfg(target_os = "macos")]
1624        if is_linux_only_xattr(name) {
1625            return Err(io::Error::from_raw_os_error(libc::ENOTSUP).into());
1626        }
1627        let data = self.inode_map.get(inode).await?;
1628        let file = data.get_file()?;
1629        #[cfg(target_os = "linux")]
1630        let pathname = fd_path_cstr(file.as_raw_fd())?;
1631
1632        // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we
1633        // need to use the {set,get,remove,list}xattr variants.
1634        // Safe because this doesn't modify any memory and we check the return value.
1635        let res = match () {
1636            #[cfg(target_os = "linux")]
1637            () => unsafe {
1638                libc::setxattr(
1639                    pathname.as_ptr(),
1640                    name.as_ptr(),
1641                    value.as_ptr() as *const libc::c_void,
1642                    value.len(),
1643                    flags as libc::c_int,
1644                )
1645            },
1646            #[cfg(target_os = "macos")]
1647            () => unsafe {
1648                // `_position` is non-zero only for com.apple.ResourceFork; pass it through
1649                // so resource-fork writes work as expected.
1650                libc::fsetxattr(
1651                    file.as_raw_fd(),
1652                    name.as_ptr(),
1653                    value.as_ptr() as *const libc::c_void,
1654                    value.len(),
1655                    _position,
1656                    flags as libc::c_int,
1657                )
1658            },
1659        };
1660        if res == 0 {
1661            Ok(())
1662        } else {
1663            // Surface the real errno; the previous "fake success" hid bugs and
1664            // made conformance suites (pjdfstest) report misleading results.
1665            Err(io::Error::last_os_error().into())
1666        }
1667    }
1668
1669    /// Get an extended attribute. If `size` is too small, return `Err<ERANGE>`.
1670    /// Otherwise, use [`ReplyXAttr::Data`] to send the attribute data, or
1671    /// return an error.
1672    async fn getxattr(
1673        &self,
1674        _req: Request,
1675        inode: Inode,
1676        name: &OsStr,
1677        size: u32,
1678    ) -> Result<ReplyXAttr> {
1679        if !self.cfg.xattr {
1680            return Err(enosys().into());
1681        }
1682        let name = osstr_to_cstr_or_einval(name)?;
1683        let name = name.as_ref();
1684        #[cfg(target_os = "macos")]
1685        if is_linux_only_xattr(name) {
1686            return Err(io::Error::from_raw_os_error(libc::ENOTSUP).into());
1687        }
1688        let data = self.inode_map.get(inode).await?;
1689        let file = data.get_file()?;
1690        let mut buf = Vec::<u8>::with_capacity(size as usize);
1691        #[cfg(target_os = "linux")]
1692        let pathname = fd_path_cstr(file.as_raw_fd())?;
1693
1694        // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we
1695        // need to use the {set,get,remove,list}xattr variants.
1696        // Safe because this will only modify the contents of `buf`.
1697        let res = match () {
1698            #[cfg(target_os = "linux")]
1699            () => unsafe {
1700                libc::getxattr(
1701                    pathname.as_ptr(),
1702                    name.as_ptr(),
1703                    buf.as_mut_ptr() as *mut libc::c_void,
1704                    size as libc::size_t,
1705                )
1706            },
1707            #[cfg(target_os = "macos")]
1708            () => unsafe {
1709                libc::fgetxattr(
1710                    file.as_raw_fd(),
1711                    name.as_ptr(),
1712                    buf.as_mut_ptr() as *mut libc::c_void,
1713                    size as libc::size_t,
1714                    0,
1715                    0,
1716                )
1717            },
1718        };
1719        if res < 0 {
1720            let e = io::Error::last_os_error();
1721            // error!("getxattr error: {e:?}");
1722            return Err(e.into());
1723        }
1724
1725        if size == 0 {
1726            Ok(ReplyXAttr::Size(res as u32))
1727        } else {
1728            // Safe because we trust the value returned by kernel.
1729            unsafe { buf.set_len(res as usize) };
1730            Ok(ReplyXAttr::Data(Bytes::from(buf)))
1731        }
1732    }
1733
1734    /// List extended attribute names.
1735    ///
1736    /// If `size` is too small, return `Err<ERANGE>`.  Otherwise, use
1737    /// [`ReplyXAttr::Data`] to send the attribute list, or return an error.
1738    async fn listxattr(&self, _req: Request, inode: Inode, size: u32) -> Result<ReplyXAttr> {
1739        if !self.cfg.xattr {
1740            return Err(enosys().into());
1741        }
1742
1743        let data = self.inode_map.get(inode).await?;
1744        let file = data.get_file()?;
1745        let mut buf = Vec::<u8>::with_capacity(size as usize);
1746        #[cfg(target_os = "linux")]
1747        let pathname = fd_path_cstr(file.as_raw_fd())?;
1748
1749        // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we
1750        // need to use the {set,get,remove,list}xattr variants.
1751        // Safe because this will only modify the contents of `buf`.
1752        let res = match () {
1753            #[cfg(target_os = "linux")]
1754            () => unsafe {
1755                libc::listxattr(
1756                    pathname.as_ptr(),
1757                    buf.as_mut_ptr() as *mut libc::c_char,
1758                    size as libc::size_t,
1759                )
1760            },
1761            #[cfg(target_os = "macos")]
1762            () => unsafe {
1763                libc::flistxattr(
1764                    file.as_raw_fd(),
1765                    buf.as_mut_ptr() as *mut libc::c_char,
1766                    size as libc::size_t,
1767                    0,
1768                )
1769            },
1770        };
1771        if res < 0 {
1772            let e = io::Error::last_os_error();
1773            // error!("listxattr error: {e:?}");
1774            return Err(e.into());
1775        }
1776
1777        if size == 0 {
1778            Ok(ReplyXAttr::Size(res as u32))
1779        } else {
1780            // Safe because we trust the value returned by kernel.
1781            unsafe { buf.set_len(res as usize) };
1782            Ok(ReplyXAttr::Data(Bytes::from(buf)))
1783        }
1784    }
1785
1786    /// remove an extended attribute.
1787    async fn removexattr(&self, _req: Request, inode: Inode, name: &OsStr) -> Result<()> {
1788        if !self.cfg.xattr {
1789            return Err(enosys().into());
1790        }
1791        let name = osstr_to_cstr_or_einval(name)?;
1792        let name = name.as_ref();
1793        #[cfg(target_os = "macos")]
1794        if is_linux_only_xattr(name) {
1795            return Err(io::Error::from_raw_os_error(libc::ENOTSUP).into());
1796        }
1797        let data = self.inode_map.get(inode).await?;
1798        let file = data.get_file()?;
1799        #[cfg(target_os = "linux")]
1800        let pathname = fd_path_cstr(file.as_raw_fd())?;
1801
1802        #[cfg(target_os = "linux")]
1803        let res = unsafe { libc::removexattr(pathname.as_ptr(), name.as_ptr()) };
1804        #[cfg(target_os = "macos")]
1805        let res = unsafe { libc::fremovexattr(file.as_raw_fd(), name.as_ptr(), 0) };
1806        if res == 0 {
1807            Ok(())
1808        } else {
1809            Err(io::Error::last_os_error().into())
1810        }
1811    }
1812
1813    /// flush method. This is called on each `close()` of the opened file. Since file descriptors
1814    /// can be duplicated (`dup`, `dup2`, `fork`), for one open call there may be many flush calls.
1815    /// Filesystems shouldn't assume that flush will always be called after some writes, or that if
1816    /// will be called at all. `fh` will contain the value set by the open method, or will be
1817    /// undefined if the open method didn't set any value.
1818    ///
1819    /// # Notes:
1820    ///
1821    /// the name of the method is misleading, since (unlike fsync) the filesystem is not forced to
1822    /// flush pending writes. One reason to flush data, is if the filesystem wants to return write
1823    /// errors. If the filesystem supports file locking operations ([`setlk`][Filesystem::setlk],
1824    /// [`getlk`][Filesystem::getlk]) it should remove all locks belonging to `lock_owner`.
1825    async fn flush(&self, _req: Request, inode: Inode, fh: u64, _lock_owner: u64) -> Result<()> {
1826        if self.no_open.load(Ordering::Relaxed) {
1827            return Err(enosys().into());
1828        }
1829
1830        let data = self.handle_map.get(fh, inode).await?;
1831        trace!("flush: data.inode={}", data.inode);
1832
1833        // Since this method is called whenever an fd is closed in the client, we can emulate that
1834        // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
1835        // because this doesn't modify any memory and we check the return values.
1836        unsafe {
1837            let newfd = libc::dup(data.borrow_fd().as_raw_fd());
1838            if newfd < 0 {
1839                return Err(io::Error::last_os_error().into());
1840            }
1841
1842            if libc::close(newfd) < 0 {
1843                Err(io::Error::last_os_error().into())
1844            } else {
1845                Ok(())
1846            }
1847        }
1848        // if self.no_open.load(Ordering::Acquire) {
1849        //         return Err(enosys().into());
1850        //     }
1851
1852        // let data = self.handle_map.get(fh, inode).await?;
1853
1854        // // std flush impl
1855        // unsafe {
1856        //     let fd = data.borrow_fd().as_raw_fd();
1857        //     if libc::fsync(fd) < 0 {
1858        //         let err = io::Error::last_os_error();
1859        //         error!("Failed to fsync file descriptor {}: {}", fd, err);
1860        //         return Err(err.into());
1861        //     }
1862        // }
1863        // Ok(())
1864    }
1865
1866    /// open a directory. Filesystem may store an arbitrary file handle (pointer, index, etc) in
1867    /// `fh`, and use this in other all other directory stream operations
1868    /// ([`readdir`][Filesystem::readdir], [`releasedir`][Filesystem::releasedir],
1869    /// [`fsyncdir`][Filesystem::fsyncdir]). Filesystem may also implement stateless directory
1870    /// I/O and not store anything in `fh`.  A file system need not implement this method if it
1871    /// sets [`MountOptions::no_open_dir_support`][rfuse3::MountOptions::no_open_dir_support] and
1872    /// if the kernel supports `FUSE_NO_OPENDIR_SUPPORT`.
1873    async fn opendir(&self, _req: Request, inode: Inode, flags: u32) -> Result<ReplyOpen> {
1874        if self.no_opendir.load(Ordering::Relaxed) {
1875            info!("fuse: opendir is not supported.");
1876            Err(enosys().into())
1877        } else {
1878            let t = self
1879                .do_open(inode, flags | (libc::O_DIRECTORY as u32))
1880                .await?;
1881            let fd = t.0.unwrap();
1882            Ok(ReplyOpen {
1883                fh: fd,
1884                flags: t.1.bits(),
1885            })
1886        }
1887    }
1888
1889    /// read directory. `offset` is used to track the offset of the directory entries. `fh` will
1890    /// contain the value set by the [`opendir`][Filesystem::opendir] method, or will be
1891    /// undefined if the [`opendir`][Filesystem::opendir] method didn't set any value.
1892    async fn readdir<'a>(
1893        &'a self,
1894        _req: Request,
1895        parent: Inode,
1896        fh: u64,
1897        offset: i64,
1898    ) -> Result<
1899        ReplyDirectory<
1900            impl futures_util::stream::Stream<Item = Result<DirectoryEntry>> + Send + 'a,
1901        >,
1902    > {
1903        if self.no_readdir.load(Ordering::Relaxed) {
1904            return Err(enosys().into());
1905        }
1906        let mut entry_list = Vec::new();
1907        self.do_readdir(parent, fh, offset as u64, &mut entry_list)
1908            .await?;
1909        Ok(ReplyDirectory {
1910            entries: stream::iter(entry_list),
1911        })
1912    }
1913
1914    /// read directory entries, but with their attribute, like [`readdir`][Filesystem::readdir]
1915    /// + [`lookup`][Filesystem::lookup] at the same time.
1916    async fn readdirplus<'a>(
1917        &'a self,
1918        _req: Request,
1919        parent: Inode,
1920        fh: u64,
1921        offset: u64,
1922        _lock_owner: u64,
1923    ) -> Result<
1924        ReplyDirectoryPlus<
1925            impl futures_util::stream::Stream<Item = Result<DirectoryEntryPlus>> + Send + 'a,
1926        >,
1927    > {
1928        if self.no_readdir.load(Ordering::Relaxed) {
1929            return Err(enosys().into());
1930        }
1931        let mut entry_list = Vec::new();
1932        self.do_readdirplus(parent, fh, offset, &mut entry_list)
1933            .await?;
1934        Ok(ReplyDirectoryPlus {
1935            entries: stream::iter(entry_list),
1936        })
1937    }
1938
1939    /// release an open directory. For every [`opendir`][Filesystem::opendir] call there will
1940    /// be exactly one `releasedir` call. `fh` will contain the value set by the
1941    /// [`opendir`][Filesystem::opendir] method, or will be undefined if the
1942    /// [`opendir`][Filesystem::opendir] method didn't set any value.
1943    async fn releasedir(&self, _req: Request, inode: Inode, fh: u64, _flags: u32) -> Result<()> {
1944        if self.no_opendir.load(Ordering::Relaxed) {
1945            info!("fuse: releasedir is not supported.");
1946            Err(io::Error::from_raw_os_error(libc::ENOSYS).into())
1947        } else {
1948            self.do_release(inode, fh).await.map_err(|e| e.into())
1949        }
1950    }
1951
1952    /// synchronize directory contents. If the `datasync` is true, then only the directory contents
1953    /// should be flushed, not the metadata. `fh` will contain the value set by the
1954    /// [`opendir`][Filesystem::opendir] method, or will be undefined if the
1955    /// [`opendir`][Filesystem::opendir] method didn't set any value.
1956    async fn fsyncdir(&self, req: Request, inode: Inode, fh: u64, datasync: bool) -> Result<()> {
1957        self.fsync(req, inode, fh, datasync).await
1958    }
1959
1960    #[allow(clippy::too_many_arguments)]
1961    async fn getlk(
1962        &self,
1963        _req: Request,
1964        inode: Inode,
1965        fh: u64,
1966        _lock_owner: u64,
1967        start: u64,
1968        end: u64,
1969        r#type: u32,
1970        pid: u32,
1971    ) -> Result<ReplyLock> {
1972        if self.no_open.load(Ordering::Relaxed) {
1973            return Err(enosys().into());
1974        }
1975
1976        let data = self.handle_map.get(fh, inode).await?;
1977        let mut flock = libc::flock {
1978            l_type: r#type as libc::c_short,
1979            l_whence: libc::SEEK_SET as libc::c_short,
1980            l_start: start as libc::off_t,
1981            l_len: if end == u64::MAX {
1982                0 // 0 means until EOF
1983            } else {
1984                end.saturating_sub(start) as libc::off_t
1985            },
1986            l_pid: pid as libc::pid_t,
1987        };
1988
1989        // SAFETY: We pass a valid fd and a valid pointer to flock.
1990        let ret = unsafe { libc::fcntl(data.borrow_fd().as_raw_fd(), libc::F_GETLK, &mut flock) };
1991        if ret < 0 {
1992            return Err(io::Error::last_os_error().into());
1993        }
1994
1995        Ok(ReplyLock {
1996            start: flock.l_start as u64,
1997            end: if flock.l_len == 0 {
1998                u64::MAX
1999            } else {
2000                flock.l_start as u64 + flock.l_len as u64
2001            },
2002            r#type: flock.l_type as u32,
2003            pid: flock.l_pid as u32,
2004        })
2005    }
2006
2007    #[allow(clippy::too_many_arguments)]
2008    async fn setlk(
2009        &self,
2010        _req: Request,
2011        inode: Inode,
2012        fh: u64,
2013        _lock_owner: u64,
2014        start: u64,
2015        end: u64,
2016        r#type: u32,
2017        pid: u32,
2018        block: bool,
2019    ) -> Result<()> {
2020        if self.no_open.load(Ordering::Relaxed) {
2021            return Err(enosys().into());
2022        }
2023
2024        let data = self.handle_map.get(fh, inode).await?;
2025        let flock = libc::flock {
2026            l_type: r#type as libc::c_short,
2027            l_whence: libc::SEEK_SET as libc::c_short,
2028            l_start: start as libc::off_t,
2029            l_len: if end == u64::MAX {
2030                0 // 0 means until EOF
2031            } else {
2032                end.saturating_sub(start) as libc::off_t
2033            },
2034            l_pid: pid as libc::pid_t,
2035        };
2036
2037        let cmd = if block { libc::F_SETLKW } else { libc::F_SETLK };
2038
2039        // SAFETY: We pass a valid fd and a valid pointer to flock.
2040        let ret = unsafe { libc::fcntl(data.borrow_fd().as_raw_fd(), cmd, &flock) };
2041        if ret < 0 {
2042            return Err(io::Error::last_os_error().into());
2043        }
2044
2045        Ok(())
2046    }
2047
2048    /// check file access permissions. This will be called for the `access()` system call. If the
2049    /// `default_permissions` mount option is given, this method is not be called. This method is
2050    /// not called under Linux kernel versions 2.4.x.
2051    async fn access(&self, req: Request, inode: Inode, mask: u32) -> Result<()> {
2052        let data = self.inode_map.get(inode).await?;
2053        let st = stat_fd(&data.get_file()?, None)?;
2054        let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
2055
2056        let uid = self.cfg.mapping.get_uid(req.uid);
2057        let gid = self.cfg.mapping.get_gid(req.gid);
2058
2059        if mode == libc::F_OK {
2060            // The file exists since we were able to call `stat(2)` on it.
2061            return Ok(());
2062        }
2063
2064        if (mode & libc::R_OK) != 0
2065            && uid != 0
2066            && (st.st_uid != uid || st.st_mode & 0o400 == 0)
2067            && (st.st_gid != gid || st.st_mode & 0o040 == 0)
2068            && st.st_mode & 0o004 == 0
2069        {
2070            return Err(io::Error::from_raw_os_error(libc::EACCES).into());
2071        }
2072
2073        if (mode & libc::W_OK) != 0
2074            && uid != 0
2075            && (st.st_uid != uid || st.st_mode & 0o200 == 0)
2076            && (st.st_gid != gid || st.st_mode & 0o020 == 0)
2077            && st.st_mode & 0o002 == 0
2078        {
2079            return Err(io::Error::from_raw_os_error(libc::EACCES).into());
2080        }
2081
2082        // root can only execute something if it is executable by one of the owner, the group, or
2083        // everyone.
2084        if (mode & libc::X_OK) != 0
2085            && (uid != 0 || st.st_mode & 0o111 == 0)
2086            && (st.st_uid != uid || st.st_mode & 0o100 == 0)
2087            && (st.st_gid != gid || st.st_mode & 0o010 == 0)
2088            && st.st_mode & 0o001 == 0
2089        {
2090            return Err(io::Error::from_raw_os_error(libc::EACCES).into());
2091        }
2092
2093        Ok(())
2094    }
2095
2096    /// create and open a file. If the file does not exist, first create it with the specified
2097    /// mode, and then open it. Open flags (with the exception of `O_NOCTTY`) are available in
2098    /// flags. Filesystem may store an arbitrary file handle (pointer, index, etc) in `fh`, and use
2099    /// this in other all other file operations ([`read`][Filesystem::read],
2100    /// [`write`][Filesystem::write], [`flush`][Filesystem::flush],
2101    /// [`release`][Filesystem::release], [`fsync`][Filesystem::fsync]). There are also some flags
2102    /// (`direct_io`, `keep_cache`) which the filesystem may set, to change the way the file is
2103    /// opened. If this method is not implemented or under Linux kernel versions earlier than
2104    /// 2.6.15, the [`mknod`][Filesystem::mknod] and [`open`][Filesystem::open] methods will be
2105    /// called instead.
2106    ///
2107    /// # Notes:
2108    ///
2109    /// See `fuse_file_info` structure in
2110    /// [fuse_common.h](https://libfuse.github.io/doxygen/include_2fuse__common_8h_source.html) for
2111    /// more details.
2112    async fn create(
2113        &self,
2114        req: Request,
2115        parent: Inode,
2116        name: &OsStr,
2117        mode: u32,
2118        flags: u32,
2119    ) -> Result<ReplyCreated> {
2120        self.do_create_inner(req, parent, name, mode, flags, None, None)
2121            .await
2122    }
2123
2124    /// handle interrupt. When a operation is interrupted, an interrupt request will send to fuse
2125    /// server with the unique id of the operation.
2126    async fn interrupt(&self, _req: Request, _unique: u64) -> Result<()> {
2127        Ok(())
2128    }
2129
2130    /// forget more than one inode. This is a batch version [`forget`][Filesystem::forget]
2131    async fn batch_forget(&self, _req: Request, inodes: &[(Inode, u64)]) {
2132        let mut inodes_w = self.inode_map.inodes.write().await;
2133
2134        for i in inodes {
2135            self.forget_one(&mut inodes_w, i.0, i.1).await;
2136        }
2137    }
2138
2139    /// allocate space for an open file. This function ensures that required space is allocated for
2140    /// specified file.
2141    ///
2142    /// # Notes:
2143    ///
2144    /// more information about `fallocate`, please see **`man 2 fallocate`**
2145    async fn fallocate(
2146        &self,
2147        _req: Request,
2148        inode: Inode,
2149        fh: u64,
2150        _offset: u64,
2151        _length: u64,
2152        _mode: u32,
2153    ) -> Result<()> {
2154        // Let the Arc<HandleData> in scope, otherwise fd may get invalid.
2155        let data = self.get_data(fh, inode, libc::O_RDWR).await?;
2156        let _fd = data.borrow_fd();
2157
2158        //  if self.seal_size.load().await {
2159        //      let st = stat_fd(&fd, None)?;
2160        //      self.seal_size_check(
2161        //          Opcode::Fallocate,
2162        //          st.st_size as u64,
2163        //          offset,
2164        //          length,
2165        //          mode as i32,
2166        //      )?;
2167        //  }
2168
2169        #[cfg(target_os = "linux")]
2170        {
2171            // Safe because this doesn't modify any memory and we check the return value.
2172            let res = unsafe {
2173                libc::fallocate64(
2174                    _fd.as_raw_fd(),
2175                    _mode as libc::c_int,
2176                    _offset as libc::off64_t,
2177                    _length as libc::off64_t,
2178                )
2179            };
2180            if res == 0 {
2181                Ok(())
2182            } else {
2183                Err(io::Error::last_os_error().into())
2184            }
2185        }
2186        #[cfg(target_os = "macos")]
2187        {
2188            // macOS has no fallocate(). Mode bits beyond plain "extend" (PUNCH_HOLE,
2189            // COLLAPSE_RANGE, ZERO_RANGE, ...) have no equivalent in F_PREALLOCATE.
2190            if _mode != 0 {
2191                return Err(io::Error::from_raw_os_error(libc::ENOTSUP).into());
2192            }
2193            let raw_fd = _fd.as_raw_fd();
2194            let target_size = _offset.saturating_add(_length) as libc::off_t;
2195
2196            // Determine current size to compute how much to preallocate from EOF.
2197            let st = stat_fd(&_fd, None)?;
2198            let current_size = st.st_size as libc::off_t;
2199            if target_size > current_size {
2200                // A concurrent writer can extend the file between stat and
2201                // F_PREALLOCATE. That may over-reserve blocks, but the kernel
2202                // clamps allocation to the file and ftruncate below preserves
2203                // the requested final size. Same shared-fd race exists for
2204                // Linux fallocate; correctness does not depend on the
2205                // preallocation length being exact.
2206                let mut store = libc::fstore_t {
2207                    fst_flags: libc::F_ALLOCATEALL,
2208                    fst_posmode: libc::F_PEOFPOSMODE,
2209                    fst_offset: 0,
2210                    fst_length: target_size - current_size,
2211                    fst_bytesalloc: 0,
2212                };
2213                // Try contiguous first; on ENOSPC, retry without the contiguous hint.
2214                store.fst_flags |= libc::F_ALLOCATECONTIG;
2215                let mut res = unsafe { libc::fcntl(raw_fd, libc::F_PREALLOCATE, &mut store) };
2216                if res < 0 {
2217                    store.fst_flags &= !libc::F_ALLOCATECONTIG;
2218                    res = unsafe { libc::fcntl(raw_fd, libc::F_PREALLOCATE, &mut store) };
2219                }
2220                if res < 0 {
2221                    return Err(io::Error::last_os_error().into());
2222                }
2223                // F_PREALLOCATE reserves blocks but does not grow the file; ftruncate
2224                // is what actually advances st_size to match Linux fallocate semantics.
2225                let res = unsafe { libc::ftruncate(raw_fd, target_size) };
2226                if res < 0 {
2227                    return Err(io::Error::last_os_error().into());
2228                }
2229            }
2230            Ok(())
2231        }
2232    }
2233
2234    /// rename a file or directory.
2235    async fn rename(
2236        &self,
2237        _req: Request,
2238        parent: Inode,
2239        name: &OsStr,
2240        new_parent: Inode,
2241        new_name: &OsStr,
2242    ) -> Result<()> {
2243        let oldname = osstr_to_cstr_or_einval(name)?;
2244        let oldname = oldname.as_ref();
2245        let newname = osstr_to_cstr_or_einval(new_name)?;
2246        let newname = newname.as_ref();
2247        self.validate_path_component(oldname)?;
2248        self.validate_path_component(newname)?;
2249
2250        // Check if new_name exists and is a whiteout file
2251        let new_parent_data = self.inode_map.get(new_parent).await?;
2252        let new_parent_file = new_parent_data.get_file()?;
2253
2254        // Try to lookup newname to see if it exists
2255        // Check if new_name exists and is a whiteout file
2256        let mut st = std::mem::MaybeUninit::<libc::stat>::uninit();
2257        let res = unsafe {
2258            libc::fstatat(
2259                new_parent_file.as_raw_fd(),
2260                newname.as_ptr(),
2261                st.as_mut_ptr(),
2262                libc::AT_SYMLINK_NOFOLLOW,
2263            )
2264        };
2265
2266        if res == 0 {
2267            // If file exists, check if it's a whiteout file
2268            let st = unsafe { st.assume_init() };
2269            if (st.st_mode & libc::S_IFMT) == libc::S_IFCHR && st.st_rdev == 0 {
2270                // It's a whiteout file, delete it
2271                let unlink_res =
2272                    unsafe { libc::unlinkat(new_parent_file.as_raw_fd(), newname.as_ptr(), 0) };
2273                if unlink_res < 0 {
2274                    return Err(io::Error::last_os_error().into());
2275                }
2276            }
2277        } else {
2278            let err = io::Error::last_os_error();
2279            if err.raw_os_error() != Some(libc::ENOENT) {
2280                return Err(err.into());
2281            }
2282        }
2283
2284        let old_inode = self.inode_map.get(parent).await?;
2285        let new_inode = self.inode_map.get(new_parent).await?;
2286        let old_file = old_inode.get_file()?;
2287        let new_file = new_inode.get_file()?;
2288
2289        // macOS lazy-fd: capture the source inode id before the rename so we
2290        // can rewrite the moved inode's `ReopenableState.path` on success.
2291        // Without this, a cached InodeData would reopen the *old* path after
2292        // any cache miss (e.g. once an LRU eviction layer lands).
2293        #[cfg(target_os = "macos")]
2294        let src_id_before = if self.cfg.macos_lazy_inode_fd {
2295            statx::statx(&old_file, Some(oldname))
2296                .ok()
2297                .map(|s| inode_store::InodeId::from_stat(&s))
2298        } else {
2299            None
2300        };
2301
2302        let res = unsafe {
2303            libc::renameat(
2304                old_file.as_raw_fd(),
2305                oldname.as_ptr(),
2306                new_file.as_raw_fd(),
2307                newname.as_ptr(),
2308            )
2309        };
2310
2311        if res != 0 {
2312            return Err(io::Error::last_os_error().into());
2313        }
2314
2315        #[cfg(target_os = "macos")]
2316        self.macos_lazy_after_rename(&new_inode, new_name, src_id_before)
2317            .await;
2318
2319        Ok(())
2320    }
2321
2322    /// rename a file or directory with flags.
2323    async fn rename2(
2324        &self,
2325        _req: Request,
2326        parent: Inode,
2327        name: &OsStr,
2328        new_parent: Inode,
2329        new_name: &OsStr,
2330        flags: u32,
2331    ) -> Result<()> {
2332        let oldname = osstr_to_cstr_or_einval(name)?;
2333        let oldname = oldname.as_ref();
2334        let newname = osstr_to_cstr_or_einval(new_name)?;
2335        let newname = newname.as_ref();
2336        self.validate_path_component(oldname)?;
2337        self.validate_path_component(newname)?;
2338
2339        let old_inode = self.inode_map.get(parent).await?;
2340        let new_inode = self.inode_map.get(new_parent).await?;
2341        let old_file = old_inode.get_file()?;
2342        let new_file = new_inode.get_file()?;
2343
2344        #[cfg(target_os = "linux")]
2345        {
2346            let res = unsafe {
2347                libc::renameat2(
2348                    old_file.as_raw_fd(),
2349                    oldname.as_ptr(),
2350                    new_file.as_raw_fd(),
2351                    newname.as_ptr(),
2352                    flags,
2353                )
2354            };
2355            if res == 0 {
2356                Ok(())
2357            } else {
2358                Err(io::Error::last_os_error().into())
2359            }
2360        }
2361        #[cfg(target_os = "macos")]
2362        {
2363            // Linux uapi flag values used by FUSE wire protocol.
2364            const RENAME_NOREPLACE: u32 = 1;
2365            const RENAME_EXCHANGE: u32 = 2;
2366            const RENAME_WHITEOUT: u32 = 4;
2367
2368            // Capture source (and dest, for EXCHANGE) ids before the rename so
2369            // we can rewrite their cached lazy paths on success.
2370            let lazy = self.cfg.macos_lazy_inode_fd;
2371            let src_id_before = if lazy {
2372                statx::statx(&old_file, Some(oldname))
2373                    .ok()
2374                    .map(|s| inode_store::InodeId::from_stat(&s))
2375            } else {
2376                None
2377            };
2378            let dst_id_before = if lazy && flags == RENAME_EXCHANGE {
2379                statx::statx(&new_file, Some(newname))
2380                    .ok()
2381                    .map(|s| inode_store::InodeId::from_stat(&s))
2382            } else {
2383                None
2384            };
2385
2386            if flags == 0 {
2387                let res = unsafe {
2388                    libc::renameat(
2389                        old_file.as_raw_fd(),
2390                        oldname.as_ptr(),
2391                        new_file.as_raw_fd(),
2392                        newname.as_ptr(),
2393                    )
2394                };
2395                if res != 0 {
2396                    return Err(io::Error::last_os_error().into());
2397                }
2398                self.macos_lazy_after_rename(&new_inode, new_name, src_id_before)
2399                    .await;
2400                return Ok(());
2401            }
2402
2403            // Map Linux flags to macOS `renamex_np` flags. Combinations and
2404            // unsupported flags (e.g. WHITEOUT) return ENOTSUP — callers
2405            // should handle the fallback themselves.
2406            let macos_flags: libc::c_uint = match flags {
2407                RENAME_NOREPLACE => libc::RENAME_EXCL,
2408                RENAME_EXCHANGE => libc::RENAME_SWAP,
2409                RENAME_WHITEOUT => {
2410                    return Err(io::Error::from_raw_os_error(libc::ENOTSUP).into());
2411                }
2412                _ => return Err(io::Error::from_raw_os_error(libc::ENOTSUP).into()),
2413            };
2414
2415            // `renamex_np` only takes absolute paths — there is no `*at` form.
2416            // Resolve dir fds to absolute paths via `F_GETPATH`, then join.
2417            let old_dir = util::fd_path_cstr(old_file.as_raw_fd())?;
2418            let new_dir = util::fd_path_cstr(new_file.as_raw_fd())?;
2419            let old_full = join_dir_and_name(&old_dir, oldname)?;
2420            let new_full = join_dir_and_name(&new_dir, newname)?;
2421
2422            let res =
2423                unsafe { libc::renamex_np(old_full.as_ptr(), new_full.as_ptr(), macos_flags) };
2424            if res != 0 {
2425                return Err(io::Error::last_os_error().into());
2426            }
2427            // RENAME_EXCL: src moved to dest; same update as plain rename.
2428            // RENAME_SWAP: src and dest swap places.
2429            self.macos_lazy_after_rename(&new_inode, new_name, src_id_before)
2430                .await;
2431            if flags == RENAME_EXCHANGE {
2432                self.macos_lazy_after_rename(&old_inode, name, dst_id_before)
2433                    .await;
2434            }
2435            Ok(())
2436        }
2437    }
2438
2439    /// find next data or hole after the specified offset.
2440    async fn lseek(
2441        &self,
2442        _req: Request,
2443        inode: Inode,
2444        fh: u64,
2445        offset: u64,
2446        whence: u32,
2447    ) -> Result<ReplyLSeek> {
2448        // Let the Arc<HandleData> in scope, otherwise fd may get invalid.
2449        let data = self.handle_map.get(fh, inode).await?;
2450
2451        // Check file type to determine appropriate lseek handling
2452        let st = stat_fd(data.get_file(), None)?;
2453        let is_dir = (st.st_mode & libc::S_IFMT) == libc::S_IFDIR;
2454
2455        if is_dir {
2456            // Directory special handling: support SEEK_SET and SEEK_CUR with bounds checks.
2457            // Acquire the lock to get exclusive access
2458            let (_guard, file) = data.get_file_mut().await;
2459
2460            // Handle directory lseek operations according to POSIX standard
2461            // This enables seekdir/telldir functionality on directories
2462            match whence {
2463                // SEEK_SET: set directory offset to an absolute value
2464                x if x == libc::SEEK_SET as u32 => {
2465                    // Validate offset bounds to prevent overflow
2466                    // Directory offsets should not exceed i64::MAX
2467                    if offset > i64::MAX as u64 {
2468                        return Err(io::Error::from_raw_os_error(libc::EINVAL).into());
2469                    }
2470
2471                    // Perform the seek operation using libc::lseek64
2472                    // This directly manipulates the file descriptor's position
2473                    let res = unsafe {
2474                        #[cfg(target_os = "linux")]
2475                        {
2476                            libc::lseek64(file.as_raw_fd(), offset as libc::off64_t, libc::SEEK_SET)
2477                        }
2478                        #[cfg(target_os = "macos")]
2479                        {
2480                            libc::lseek(file.as_raw_fd(), offset as libc::off_t, libc::SEEK_SET)
2481                        }
2482                    };
2483                    if res < 0 {
2484                        return Err(io::Error::last_os_error().into());
2485                    }
2486                    Ok(ReplyLSeek { offset: res as u64 })
2487                }
2488                // SEEK_CUR: move relative to current directory offset
2489                x if x == libc::SEEK_CUR as u32 => {
2490                    // Get current position using libc::lseek64 with offset 0
2491                    let cur = unsafe {
2492                        #[cfg(target_os = "linux")]
2493                        {
2494                            libc::lseek64(file.as_raw_fd(), 0, libc::SEEK_CUR)
2495                        }
2496                        #[cfg(target_os = "macos")]
2497                        {
2498                            libc::lseek(file.as_raw_fd(), 0, libc::SEEK_CUR)
2499                        }
2500                    };
2501                    if cur < 0 {
2502                        return Err(io::Error::last_os_error().into());
2503                    }
2504                    let current = cur as u64;
2505
2506                    // Compute new offset safely to prevent arithmetic overflow
2507                    if let Some(new_offset) = current.checked_add(offset) {
2508                        // Ensure the new offset is within valid bounds
2509                        if new_offset > i64::MAX as u64 {
2510                            return Err(io::Error::from_raw_os_error(libc::EINVAL).into());
2511                        }
2512                        // Set the new offset using libc::lseek64
2513                        let res = unsafe {
2514                            #[cfg(target_os = "linux")]
2515                            {
2516                                libc::lseek64(
2517                                    file.as_raw_fd(),
2518                                    new_offset as libc::off64_t,
2519                                    libc::SEEK_SET,
2520                                )
2521                            }
2522                            #[cfg(target_os = "macos")]
2523                            {
2524                                libc::lseek(
2525                                    file.as_raw_fd(),
2526                                    new_offset as libc::off_t,
2527                                    libc::SEEK_SET,
2528                                )
2529                            }
2530                        };
2531                        if res < 0 {
2532                            return Err(io::Error::last_os_error().into());
2533                        }
2534                        Ok(ReplyLSeek { offset: new_offset })
2535                    } else {
2536                        Err(io::Error::from_raw_os_error(libc::EINVAL).into())
2537                    }
2538                }
2539                // Other whence values are invalid for directories (e.g., SEEK_END)
2540                _ => Err(io::Error::from_raw_os_error(libc::EINVAL).into()),
2541            }
2542        } else {
2543            // File seek handling for non-directory files
2544            // Acquire the lock to get exclusive access, otherwise it may break do_readdir().
2545            let (_guard, file) = data.get_file_mut().await;
2546
2547            // Safe because this doesn't modify any memory and we check the return value.
2548            // Use 64-bit seek for regular files to match kernel offsets
2549            let res = unsafe {
2550                #[cfg(target_os = "linux")]
2551                {
2552                    libc::lseek64(
2553                        file.as_raw_fd(),
2554                        offset as libc::off64_t,
2555                        whence as libc::c_int,
2556                    )
2557                }
2558                #[cfg(target_os = "macos")]
2559                {
2560                    libc::lseek(
2561                        file.as_raw_fd(),
2562                        offset as libc::off_t,
2563                        whence as libc::c_int,
2564                    )
2565                }
2566            };
2567            if res < 0 {
2568                Err(io::Error::last_os_error().into())
2569            } else {
2570                Ok(ReplyLSeek { offset: res as u64 })
2571            }
2572        }
2573    }
2574
2575    /// Copy a range of data from one file to another using the copy_file_range system call.
2576    /// This can improve performance by reducing data copying between userspace and kernel.
2577    #[allow(clippy::too_many_arguments)]
2578    async fn copy_file_range(
2579        &self,
2580        _req: Request,
2581        inode_in: Inode,
2582        fh_in: u64,
2583        offset_in: u64,
2584        inode_out: Inode,
2585        fh_out: u64,
2586        offset_out: u64,
2587        length: u64,
2588        flags: u64,
2589    ) -> Result<ReplyCopyFileRange> {
2590        // Get the handle data for both source and destination files
2591        let data_in = self.handle_map.get(fh_in, inode_in).await?;
2592        let data_out = self.handle_map.get(fh_out, inode_out).await?;
2593
2594        // Get file descriptors
2595        let _fd_in = data_in.borrow_fd().as_raw_fd();
2596        let _fd_out = data_out.borrow_fd().as_raw_fd();
2597
2598        // Validate and reject unsupported flags
2599        // Linux copy_file_range currently doesn't define any flags (should be 0)
2600        if flags != 0 {
2601            return Err(io::Error::from_raw_os_error(libc::EINVAL).into());
2602        }
2603
2604        // Convert offsets to i64, checking for overflow (offsets > i64::MAX would wrap to negative)
2605        let mut _off_in: i64 = offset_in
2606            .try_into()
2607            .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?;
2608        let mut _off_out: i64 = offset_out
2609            .try_into()
2610            .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?;
2611
2612        // Convert length to usize, checking for overflow on 32-bit systems
2613        let _len: usize = length
2614            .try_into()
2615            .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?;
2616
2617        #[cfg(target_os = "linux")]
2618        {
2619            // SAFETY: copy_file_range reads from fd_in and writes to fd_out. We pass valid
2620            // file descriptors and pointers to offset values. The syscall updates the offset
2621            // pointers to reflect the new positions after the copy, but doesn't modify the
2622            // file descriptor positions themselves (when offsets are non-NULL).
2623            let res = unsafe {
2624                libc::copy_file_range(
2625                    _fd_in,
2626                    &mut _off_in as *mut i64,
2627                    _fd_out,
2628                    &mut _off_out as *mut i64,
2629                    _len,
2630                    0,
2631                )
2632            };
2633            if res < 0 {
2634                Err(io::Error::last_os_error().into())
2635            } else {
2636                Ok(ReplyCopyFileRange {
2637                    copied: res as usize as u64,
2638                })
2639            }
2640        }
2641        #[cfg(target_os = "macos")]
2642        {
2643            // macOS has no copy_file_range. Two-tier strategy:
2644            //
2645            // 1. Whole-file copy (offset_in==offset_out==0, length>=src_size):
2646            //    use `fcopyfile(.., COPYFILE_CLONE | COPYFILE_DATA)`. The
2647            //    `CLONE` flag attempts an APFS O(1) clone; if the underlying
2648            //    FS doesn't support cloning the kernel falls back to a data
2649            //    copy (still in-kernel, faster than userspace pread/pwrite).
2650            //
2651            // 2. Partial / offset copy: pread+pwrite loop with a 64 KiB
2652            //    buffer. Surfaces short copies the way Linux
2653            //    copy_file_range does — a short return is success, fail only
2654            //    if zero bytes moved.
2655            if _off_in == 0 && _off_out == 0 {
2656                let mut src_st = std::mem::MaybeUninit::<libc::stat>::zeroed();
2657                let st_res = unsafe { libc::fstat(_fd_in, src_st.as_mut_ptr()) };
2658                if st_res == 0 {
2659                    let src_size = unsafe { src_st.assume_init() }.st_size as u64;
2660                    if src_size > 0 && length >= src_size {
2661                        let copy_res = unsafe {
2662                            libc::fcopyfile(
2663                                _fd_in,
2664                                _fd_out,
2665                                std::ptr::null_mut(),
2666                                libc::COPYFILE_CLONE | libc::COPYFILE_DATA,
2667                            )
2668                        };
2669                        if copy_res == 0 {
2670                            return Ok(ReplyCopyFileRange { copied: src_size });
2671                        }
2672                        // On any failure, drop through to the pread/pwrite
2673                        // path so we never fail outright when a fallback exists.
2674                    }
2675                }
2676            }
2677
2678            const BUF_SIZE: usize = 64 * 1024;
2679            let mut buf = vec![0u8; BUF_SIZE];
2680            let mut copied: usize = 0;
2681            while copied < _len {
2682                let want = (_len - copied).min(BUF_SIZE);
2683                let read_off = _off_in + copied as i64;
2684                let n = unsafe { libc::pread(_fd_in, buf.as_mut_ptr() as *mut _, want, read_off) };
2685                if n < 0 {
2686                    let err = io::Error::last_os_error();
2687                    if err.kind() == io::ErrorKind::Interrupted {
2688                        continue;
2689                    }
2690                    return if copied == 0 {
2691                        Err(err.into())
2692                    } else {
2693                        Ok(ReplyCopyFileRange {
2694                            copied: copied as u64,
2695                        })
2696                    };
2697                }
2698                if n == 0 {
2699                    break; // EOF on source
2700                }
2701                let n = n as usize;
2702                let mut written = 0usize;
2703                while written < n {
2704                    let write_off = _off_out + (copied + written) as i64;
2705                    let w = unsafe {
2706                        libc::pwrite(
2707                            _fd_out,
2708                            buf.as_ptr().add(written) as *const _,
2709                            n - written,
2710                            write_off,
2711                        )
2712                    };
2713                    if w < 0 {
2714                        let err = io::Error::last_os_error();
2715                        if err.kind() == io::ErrorKind::Interrupted {
2716                            continue;
2717                        }
2718                        return if copied + written == 0 {
2719                            Err(err.into())
2720                        } else {
2721                            Ok(ReplyCopyFileRange {
2722                                copied: (copied + written) as u64,
2723                            })
2724                        };
2725                    }
2726                    if w == 0 {
2727                        break;
2728                    }
2729                    written += w as usize;
2730                }
2731                copied += written;
2732                if written < n {
2733                    break; // short write — stop here, return what we have
2734                }
2735            }
2736            Ok(ReplyCopyFileRange {
2737                copied: copied as u64,
2738            })
2739        }
2740    }
2741
2742    // ------------------------------------------------------------------
2743    // macOS-only opcodes
2744    // ------------------------------------------------------------------
2745
2746    /// macOS only: Finder-issued volume rename. Backing FS doesn't have a
2747    /// notion of a volume name, so we accept and ignore. Returning an
2748    /// error here would surface as `errno` in `setattrlist(ATTR_VOL_NAME)`
2749    /// and confuse Finder.
2750    #[cfg(target_os = "macos")]
2751    async fn setvolname(&self, _req: Request, _name: &OsStr) -> Result<()> {
2752        Ok(())
2753    }
2754
2755    /// macOS only: HFS+/APFS extra-time query. macOS extends `stat(2)` with
2756    /// backup-time and creation-time fields that aren't part of POSIX. The
2757    /// kernel issues this opcode through `getattrlist(ATTR_CMN_BKUPTIME)`
2758    /// and similar.
2759    ///
2760    /// We expose `st_birthtimespec` (creation time) for both fields. macOS
2761    /// has no native concept of backup time on either HFS+ or APFS — the
2762    /// field exists for `getattrlist` API compatibility but is not
2763    /// updated by the kernel. Returning crtime is the best honest
2764    /// approximation: it's monotonic, present on every inode, and fits
2765    /// the field's semantic role (oldest meaningful timestamp on the
2766    /// inode).
2767    #[cfg(target_os = "macos")]
2768    async fn getxtimes(
2769        &self,
2770        _req: Request,
2771        inode: Inode,
2772    ) -> Result<rfuse3::raw::reply::ReplyXTimes> {
2773        let data = self.inode_map.get(inode).await?;
2774        let st = data.handle.stat()?;
2775        // libc::stat on macOS exposes st_birthtimespec.
2776        let crtime = rfuse3::Timestamp::new(st.st_birthtime as i64, st.st_birthtime_nsec as u32);
2777        Ok(rfuse3::raw::reply::ReplyXTimes {
2778            bkuptime: crtime,
2779            crtime,
2780        })
2781    }
2782
2783    /// macOS only: atomic two-entry swap. Userspace triggers this through
2784    /// `exchangedata(2)`. Implemented on top of `renamex_np(RENAME_SWAP)`,
2785    /// which is the same primitive `rename2(RENAME_EXCHANGE)` already
2786    /// uses — both the swap semantics and lazy-fd path bookkeeping are
2787    /// identical.
2788    #[cfg(target_os = "macos")]
2789    async fn exchange(
2790        &self,
2791        _req: Request,
2792        olddir: Inode,
2793        oldname: &OsStr,
2794        newdir: Inode,
2795        newname: &OsStr,
2796        _options: u64,
2797    ) -> Result<()> {
2798        let old_cstr = osstr_to_cstr_or_einval(oldname)?;
2799        let old_cstr = old_cstr.as_ref();
2800        let new_cstr = osstr_to_cstr_or_einval(newname)?;
2801        let new_cstr = new_cstr.as_ref();
2802        self.validate_path_component(old_cstr)?;
2803        self.validate_path_component(new_cstr)?;
2804
2805        let old_inode = self.inode_map.get(olddir).await?;
2806        let new_inode = self.inode_map.get(newdir).await?;
2807        let old_file = old_inode.get_file()?;
2808        let new_file = new_inode.get_file()?;
2809
2810        let lazy = self.cfg.macos_lazy_inode_fd;
2811        let src_id_before = if lazy {
2812            statx::statx(&old_file, Some(old_cstr))
2813                .ok()
2814                .map(|s| inode_store::InodeId::from_stat(&s))
2815        } else {
2816            None
2817        };
2818        let dst_id_before = if lazy {
2819            statx::statx(&new_file, Some(new_cstr))
2820                .ok()
2821                .map(|s| inode_store::InodeId::from_stat(&s))
2822        } else {
2823            None
2824        };
2825
2826        let old_dir_path = util::fd_path_cstr(old_file.as_raw_fd())?;
2827        let new_dir_path = util::fd_path_cstr(new_file.as_raw_fd())?;
2828        let old_full = join_dir_and_name(&old_dir_path, old_cstr)?;
2829        let new_full = join_dir_and_name(&new_dir_path, new_cstr)?;
2830
2831        let res =
2832            unsafe { libc::renamex_np(old_full.as_ptr(), new_full.as_ptr(), libc::RENAME_SWAP) };
2833        if res != 0 {
2834            return Err(io::Error::last_os_error().into());
2835        }
2836
2837        // After SWAP: each entry now lives at the other's old path.
2838        self.macos_lazy_after_rename(&new_inode, newname, src_id_before)
2839            .await;
2840        self.macos_lazy_after_rename(&old_inode, oldname, dst_id_before)
2841            .await;
2842        Ok(())
2843    }
2844}
2845
2846/// trim all trailing nul terminators.
2847pub fn bytes_to_cstr(buf: &[u8]) -> Result<&CStr> {
2848    // There might be multiple 0s at the end of buf, find & use the first one and trim other zeros.
2849    match buf.iter().position(|x| *x == 0) {
2850        // Convert to a `CStr` so that we can drop the '\0' byte at the end and make sure
2851        // there are no interior '\0' bytes.
2852        Some(pos) => CStr::from_bytes_with_nul(&buf[0..=pos]).map_err(|_| Errno::from(5)),
2853        None => {
2854            // Invalid input, just call CStr::from_bytes_with_nul() for suitable error code
2855            CStr::from_bytes_with_nul(buf).map_err(|_| Errno::from(5))
2856        }
2857    }
2858}