libfuse_fs/passthrough/
async_io.rs

1use crate::util::open_options::OpenOptions;
2use bytes::Bytes;
3use futures::stream;
4use libc::{off_t, pread, size_t};
5use rfuse3::{Errno, Inode, Result, raw::prelude::*};
6use std::{
7    ffi::{CStr, CString, OsStr, OsString},
8    fs::File,
9    io,
10    mem::MaybeUninit,
11    num::NonZeroU32,
12    os::{
13        fd::{AsRawFd, RawFd},
14        raw::c_int,
15        unix::ffi::OsStringExt,
16    },
17    sync::{Arc, atomic::Ordering},
18    time::Duration,
19};
20use tracing::{debug, error, info, trace};
21
22use vm_memory::{ByteValued, bitmap::BitmapSlice};
23
24use crate::{
25    passthrough::{CURRENT_DIR_CSTR, EMPTY_CSTR, FileUniqueKey, PARENT_DIR_CSTR, statx::statx},
26    util::{convert_stat64_to_file_attr, filetype_from_mode},
27};
28
29use super::{
30    Handle, HandleData, PassthroughFs, config::CachePolicy, os_compat::LinuxDirent64, util::*,
31};
32
33impl<S: BitmapSlice + Send + Sync> PassthroughFs<S> {
34    async fn open_inode(&self, inode: Inode, flags: i32) -> io::Result<File> {
35        let data = self.inode_map.get(inode).await?;
36        if !is_safe_inode(data.mode) {
37            Err(ebadf())
38        } else {
39            let mut new_flags = self.get_writeback_open_flags(flags).await;
40            if !self.cfg.allow_direct_io && flags & libc::O_DIRECT != 0 {
41                new_flags &= !libc::O_DIRECT;
42            }
43            data.open_file(new_flags | libc::O_CLOEXEC, &self.proc_self_fd)
44        }
45    }
46
47    /// Check the HandleData flags against the flags from the current request
48    /// if these do not match update the file descriptor flags and store the new
49    /// result in the HandleData entry
50    async fn check_fd_flags(
51        &self,
52        data: &Arc<HandleData>,
53        fd: RawFd,
54        flags: u32,
55    ) -> io::Result<()> {
56        let open_flags = data.get_flags().await;
57        if open_flags != flags {
58            let ret = unsafe { libc::fcntl(fd, libc::F_SETFL, flags) };
59            if ret != 0 {
60                return Err(io::Error::last_os_error());
61            }
62            data.set_flags(flags).await;
63        }
64        Ok(())
65    }
66
67    async fn do_readdir(
68        &self,
69        inode: Inode,
70        handle: Handle,
71        offset: u64,
72        entry_list: &mut Vec<std::result::Result<DirectoryEntry, Errno>>,
73    ) -> io::Result<()> {
74        const BUFFER_SIZE: usize = 8192;
75
76        let data = self.get_dirdata(handle, inode, libc::O_RDONLY).await?;
77
78        // Since we are going to work with the kernel offset, we have to acquire the file lock
79        // for both the `lseek64` and `getdents64` syscalls to ensure that no other thread
80        // changes the kernel offset while we are using it.
81        let (_guard, dir) = data.get_file_mut().await;
82
83        // Allocate buffer; pay attention to alignment.
84        let mut buffer = vec![0u8; BUFFER_SIZE];
85
86        // Safe because this doesn't modify any memory and we check the return value.
87        let res =
88            unsafe { libc::lseek64(dir.as_raw_fd(), offset as libc::off64_t, libc::SEEK_SET) };
89        if res < 0 {
90            return Err(io::Error::last_os_error());
91        }
92
93        loop {
94            // call getdents64 system call
95            let result = unsafe {
96                libc::syscall(
97                    libc::SYS_getdents64,
98                    dir.as_raw_fd(),
99                    buffer.as_mut_ptr() as *mut LinuxDirent64,
100                    BUFFER_SIZE,
101                )
102            };
103
104            if result == -1 {
105                return Err(std::io::Error::last_os_error());
106            }
107
108            let bytes_read = result as usize;
109            if bytes_read == 0 {
110                break; // no more
111            }
112
113            // push every entry .
114            let mut offset = 0;
115            while offset < bytes_read {
116                //let (front, back) = buffer.split_at(size_of::<LinuxDirent64>());
117                //size_of::<LinuxDirent64>()
118                let front = &buffer[offset..offset + size_of::<LinuxDirent64>()];
119                let back = &buffer[offset + size_of::<LinuxDirent64>()..];
120
121                let dirent64 = LinuxDirent64::from_slice(front)
122                    .expect("fuse: unable to get LinuxDirent64 from slice");
123
124                let namelen = dirent64.d_reclen as usize - size_of::<LinuxDirent64>();
125                debug_assert!(
126                    namelen <= back.len(),
127                    "fuse: back is smaller than `namelen`"
128                );
129
130                let name = &back[..namelen];
131                if name.eq(CURRENT_DIR_CSTR) || name.eq(PARENT_DIR_CSTR) {
132                    offset += dirent64.d_reclen as usize;
133                    continue;
134                }
135                let name = bytes_to_cstr(name)
136                    .map_err(|e| {
137                        error!("fuse: do_readdir: {e:?}");
138                        einval()
139                    })?
140                    .to_bytes();
141
142                let mut entry = DirectoryEntry {
143                    inode: dirent64.d_ino,
144                    kind: filetype_from_mode((dirent64.d_ty as u16 * 0x1000u16).into()),
145                    name: OsString::from_vec(name.to_vec()),
146                    offset: dirent64.d_off,
147                };
148                // Safe because do_readdir() has ensured dir_entry.name is a
149                // valid [u8] generated by CStr::to_bytes().
150                let name = osstr_to_cstr(&entry.name)?;
151                // trace!("do_readdir: inode={}, name={}", inode, name.to_str().unwrap());
152                let _entry = self.do_lookup(inode, &name).await?;
153                let mut inodes = self.inode_map.inodes.write().await;
154
155                self.forget_one(&mut inodes, _entry.attr.ino, 1).await;
156                entry.inode = _entry.attr.ino;
157                entry_list.push(Ok(entry));
158
159                // move to next entry
160                offset += dirent64.d_reclen as usize;
161            }
162        }
163
164        Ok(())
165    }
166
167    async fn do_readdirplus(
168        &self,
169        inode: Inode,
170        handle: Handle,
171        offset: u64,
172        entry_list: &mut Vec<std::result::Result<DirectoryEntryPlus, Errno>>,
173    ) -> io::Result<()> {
174        const BUFFER_SIZE: usize = 8192;
175
176        let data = self.get_dirdata(handle, inode, libc::O_RDONLY).await?;
177
178        // Since we are going to work with the kernel offset, we have to acquire the file lock
179        // for both the `lseek64` and `getdents64` syscalls to ensure that no other thread
180        // changes the kernel offset while we are using it.
181        let (_guard, dir) = data.get_file_mut().await;
182
183        // Allocate buffer; pay attention to alignment.
184        let mut buffer = vec![0u8; BUFFER_SIZE];
185
186        // Safe because this doesn't modify any memory and we check the return value.
187        let res =
188            unsafe { libc::lseek64(dir.as_raw_fd(), offset as libc::off64_t, libc::SEEK_SET) };
189        if res < 0 {
190            return Err(io::Error::last_os_error());
191        }
192        loop {
193            // call getdents64 system call
194            let result = unsafe {
195                libc::syscall(
196                    libc::SYS_getdents64,
197                    dir.as_raw_fd(),
198                    buffer.as_mut_ptr() as *mut LinuxDirent64,
199                    BUFFER_SIZE,
200                )
201            };
202
203            if result == -1 {
204                return Err(std::io::Error::last_os_error());
205            }
206
207            let bytes_read = result as usize;
208            if bytes_read == 0 {
209                break;
210            }
211
212            let mut offset = 0;
213            while offset < bytes_read {
214                //size_of::<LinuxDirent64>()
215                let front = &buffer[offset..offset + size_of::<LinuxDirent64>()];
216                let back = &buffer[offset + size_of::<LinuxDirent64>()..];
217                //let (front, back) = buffer.split_at(size_of::<LinuxDirent64>());
218
219                let dirent64 = LinuxDirent64::from_slice(front)
220                    .expect("fuse: unable to get LinuxDirent64 from slice");
221
222                let namelen = dirent64.d_reclen as usize - size_of::<LinuxDirent64>();
223                debug_assert!(
224                    namelen <= back.len(),
225                    "fuse: back is smaller than `namelen`"
226                );
227
228                let name = &back[..namelen];
229                if name.starts_with(CURRENT_DIR_CSTR) || name.starts_with(PARENT_DIR_CSTR) {
230                    offset += dirent64.d_reclen as usize;
231                    continue;
232                }
233                let name = bytes_to_cstr(name)
234                    .map_err(|e| {
235                        error!("fuse: do_readdir: {e:?}");
236                        einval()
237                    })?
238                    .to_bytes();
239
240                let mut entry = DirectoryEntry {
241                    inode: dirent64.d_ino,
242                    kind: filetype_from_mode((dirent64.d_ty as u16 * 0x1000u16).into()),
243                    name: OsString::from_vec(name.to_vec()),
244                    offset: dirent64.d_off,
245                };
246                // Safe because do_readdir() has ensured dir_entry.name is a
247                // valid [u8] generated by CStr::to_bytes().
248                let name = osstr_to_cstr(&entry.name)?;
249                debug!("readdir:{}", name.to_str().unwrap());
250                let _entry = self.do_lookup(inode, &name).await?;
251                entry.inode = _entry.attr.ino;
252
253                entry_list.push(Ok(DirectoryEntryPlus {
254                    inode: entry.inode,
255                    generation: _entry.generation,
256                    kind: entry.kind,
257                    name: entry.name,
258                    offset: entry.offset,
259                    attr: _entry.attr,
260                    entry_ttl: _entry.ttl,
261                    attr_ttl: _entry.ttl,
262                }));
263                // add the offset.
264                offset += dirent64.d_reclen as usize;
265            }
266        }
267        Ok(())
268    }
269
270    async fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
271        let file = self.open_inode(inode, flags as i32).await?;
272
273        let data = HandleData::new(inode, file, flags);
274        let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
275        self.handle_map.insert(handle, data).await;
276
277        let mut opts = OpenOptions::empty();
278        match self.cfg.cache_policy {
279            // We only set the direct I/O option on files.
280            CachePolicy::Never => opts.set(
281                OpenOptions::DIRECT_IO,
282                flags & (libc::O_DIRECTORY as u32) == 0,
283            ),
284            CachePolicy::Metadata => {
285                if flags & (libc::O_DIRECTORY as u32) == 0 {
286                    opts |= OpenOptions::DIRECT_IO;
287                } else {
288                    opts |= OpenOptions::CACHE_DIR | OpenOptions::KEEP_CACHE;
289                }
290            }
291            CachePolicy::Always => {
292                opts |= OpenOptions::KEEP_CACHE;
293                if flags & (libc::O_DIRECTORY as u32) != 0 {
294                    opts |= OpenOptions::CACHE_DIR;
295                }
296            }
297            _ => {}
298        };
299
300        Ok((Some(handle), opts))
301    }
302
303    /// Core implementation for `getattr`.
304    ///
305    /// This is the internal function that performs the actual `stat` system call.
306    /// It contains a crucial `mapping` parameter that controls its behavior:
307    /// - `mapping: true`: Applies reverse ID mapping (host -> container) to the `uid` and `gid`.
308    ///   This is for external FUSE clients.
309    /// - `mapping: false`: Returns the raw, unmapped host attributes. This is for internal
310    ///   callers like `overlayfs`'s copy-up logic.
311    async fn do_getattr_inner(
312        &self,
313        inode: Inode,
314        handle: Option<Handle>,
315        mapping: bool,
316    ) -> io::Result<(libc::stat64, Duration)> {
317        // trace!("FS {} passthrough: do_getattr: before get: inode={}, handle={:?}", self.uuid, inode, handle);
318        let data = self.inode_map.get(inode).await.map_err(|e| {
319            error!("fuse: do_getattr ino {inode} Not find err {e:?}");
320            e
321        })?;
322        // trace!("do_getattr: got data {:?}", data);
323
324        // kernel sends 0 as handle in case of no_open, and it depends on fuse server to handle
325        // this case correctly.
326        let st = if !self.no_open.load(Ordering::Relaxed)
327            && let Some(handle_id) = handle
328        {
329            let hd = self.handle_map.get(handle_id, inode).await?;
330            // trace!("FS {} passthrough: do_getattr: before stat_fd", self.uuid);
331            stat_fd(hd.get_file(), None)
332        } else {
333            // trace!("FS {} passthrough: do_getattr: before stat", self.uuid);
334            data.handle.stat()
335        };
336        // trace!("FS {} passthrough: do_getattr: after stat", self.uuid);
337
338        let mut st = st.map_err(|e| {
339            error!("fuse: do_getattr stat failed ino {inode} err {e:?}");
340            e
341        })?;
342        st.st_ino = inode;
343        if mapping {
344            st.st_uid = self.cfg.mapping.find_mapping(st.st_uid, true, true);
345            st.st_gid = self.cfg.mapping.find_mapping(st.st_gid, true, false);
346        }
347        Ok((st, self.cfg.attr_timeout))
348    }
349
350    /// Public `getattr` wrapper for FUSE clients.
351    ///
352    /// This function serves as the standard entry point for `getattr` requests from the FUSE
353    /// kernel module. It always performs ID mapping by calling [`do_getattr_inner`][Self::do_getattr_inner] with
354    /// `mapping: true` to ensure clients see attributes from the container's perspective.
355    async fn do_getattr(
356        &self,
357        inode: Inode,
358        handle: Option<Handle>,
359    ) -> io::Result<(libc::stat64, Duration)> {
360        self.do_getattr_inner(inode, handle, true).await
361    }
362
363    /// Internal `getattr` helper that skips ID mapping.
364    ///
365    /// This helper is specifically designed for internal use by `overlayfs`. It calls
366    /// [`do_getattr_inner`][Self::do_getattr_inner] with `mapping: false` to retrieve the raw, unmodified host
367    /// attributes of a file. This is essential for the `copy_up` process to correctly
368    /// preserve the original file ownership.
369    pub async fn do_getattr_helper(
370        &self,
371        inode: Inode,
372        handle: Option<Handle>,
373    ) -> io::Result<(libc::stat64, Duration)> {
374        self.do_getattr_inner(inode, handle, false).await
375    }
376
377    async fn do_unlink(&self, parent: Inode, name: &CStr, flags: libc::c_int) -> io::Result<()> {
378        let data = self.inode_map.get(parent).await?;
379        let file = data.get_file()?;
380        let st = statx(&file, Some(name)).ok();
381        // Safe because this doesn't modify any memory and we check the return value.
382        let res = unsafe { libc::unlinkat(file.as_raw_fd(), name.as_ptr(), flags) };
383        if res == 0 {
384            if let Some(st) = st
385                && let Some(btime) = st.btime
386                && (btime.tv_sec != 0 || btime.tv_nsec != 0)
387            {
388                let key = FileUniqueKey(st.st.st_ino, btime);
389                self.handle_cache.invalidate(&key).await;
390            }
391
392            Ok(())
393        } else {
394            Err(io::Error::last_os_error())
395        }
396    }
397
398    async fn get_dirdata(
399        &self,
400        handle: Handle,
401        inode: Inode,
402        flags: libc::c_int,
403    ) -> io::Result<Arc<HandleData>> {
404        let no_open = self.no_opendir.load(Ordering::Relaxed);
405        if !no_open {
406            self.handle_map.get(handle, inode).await
407        } else {
408            let file = self.open_inode(inode, flags | libc::O_DIRECTORY).await?;
409            Ok(Arc::new(HandleData::new(inode, file, flags as u32)))
410        }
411    }
412
413    async fn get_data(
414        &self,
415        handle: Handle,
416        inode: Inode,
417        flags: libc::c_int,
418    ) -> io::Result<Arc<HandleData>> {
419        let no_open = self.no_open.load(Ordering::Relaxed);
420        if !no_open {
421            self.handle_map.get(handle, inode).await
422        } else {
423            let file = self.open_inode(inode, flags).await?;
424            Ok(Arc::new(HandleData::new(inode, file, flags as u32)))
425        }
426    }
427
428    /// Core implementation for `create`.
429    ///
430    /// It uses the provided `uid` and `gid` for credential switching if they are `Some`;
431    /// otherwise, it falls back to the credentials from the `Request`. This allows internal
432    /// callers like `overlayfs` to specify an exact host UID/GID.
433    #[allow(clippy::too_many_arguments)]
434    async fn do_create_inner(
435        &self,
436        req: Request,
437        parent: Inode,
438        name: &OsStr,
439        mode: u32,
440        flags: u32,
441        uid: Option<u32>,
442        gid: Option<u32>,
443    ) -> Result<ReplyCreated> {
444        let name = osstr_to_cstr(name).unwrap();
445        let name = name.as_ref();
446        self.validate_path_component(name)?;
447
448        let dir = self.inode_map.get(parent).await?;
449        let dir_file = dir.get_file()?;
450
451        let new_file = {
452            // Here we need to adjust the code order because guard doesn't allowed to cross await point
453            let flags = self.get_writeback_open_flags(flags as i32).await;
454            let _guard = set_creds(
455                uid.unwrap_or(self.cfg.mapping.get_uid(req.uid)),
456                gid.unwrap_or(self.cfg.mapping.get_gid(req.gid)),
457            )?;
458            Self::create_file_excl(&dir_file, name, flags, mode)?
459        };
460
461        let entry = self.do_lookup(parent, name).await?;
462        let file = match new_file {
463            // File didn't exist, now created by create_file_excl()
464            Some(f) => f,
465            // File exists, and args.flags doesn't contain O_EXCL. Now let's open it with
466            // open_inode().
467            None => {
468                // Cap restored when _killpriv is dropped
469                // let _killpriv = if self.killpriv_v2.load().await
470                //     && (args.fuse_flags & FOPEN_IN_KILL_SUIDGID != 0)
471                // {
472                //     self::drop_cap_fsetid()?
473                // } else {
474                //     None
475                // };
476
477                // Here we can not call self.open_inode() directly because guard doesn't allowed to cross await point
478                let data = self.inode_map.get(entry.attr.ino).await?;
479                if !is_safe_inode(data.mode) {
480                    return Err(ebadf().into());
481                }
482
483                // Calculate the final flags. This involves an async call.
484                let mut final_flags = self.get_writeback_open_flags(flags as i32).await;
485                if !self.cfg.allow_direct_io && (flags as i32) & libc::O_DIRECT != 0 {
486                    final_flags &= !libc::O_DIRECT;
487                }
488                final_flags |= libc::O_CLOEXEC;
489
490                {
491                    let _guard = set_creds(
492                        uid.unwrap_or(self.cfg.mapping.get_uid(req.uid)),
493                        gid.unwrap_or(self.cfg.mapping.get_gid(req.gid)),
494                    )?;
495                    // Maybe buggy because `open_file` may call `open_by_handle_at`, which requires CAP_DAC_READ_SEARCH.
496                    data.open_file(final_flags, &self.proc_self_fd)?
497                }
498            }
499        };
500
501        let ret_handle = if !self.no_open.load(Ordering::Relaxed) {
502            let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
503            let data = HandleData::new(entry.attr.ino, file, flags);
504            self.handle_map.insert(handle, data).await;
505            handle
506        } else {
507            return Err(io::Error::from_raw_os_error(libc::EACCES).into());
508        };
509
510        let mut opts = OpenOptions::empty();
511        match self.cfg.cache_policy {
512            CachePolicy::Never => opts |= OpenOptions::DIRECT_IO,
513            CachePolicy::Metadata => opts |= OpenOptions::DIRECT_IO,
514            CachePolicy::Always => opts |= OpenOptions::KEEP_CACHE,
515            _ => {}
516        };
517        Ok(ReplyCreated {
518            ttl: entry.ttl,
519            attr: entry.attr,
520            generation: entry.generation,
521            fh: ret_handle,
522            flags: opts.bits(),
523        })
524    }
525
526    /// A wrapper for `create`, used by [`copy_regfile_up`][crate::overlayfs::OverlayFs::copy_regfile_up].
527    ///
528    /// This helper is called during a copy-up operation to create a file in the upper
529    /// layer while preserving the original host UID/GID from the lower layer file.
530    #[allow(clippy::too_many_arguments)]
531    pub async fn do_create_helper(
532        &self,
533        req: Request,
534        parent: Inode,
535        name: &OsStr,
536        mode: u32,
537        flags: u32,
538        uid: u32,
539        gid: u32,
540    ) -> Result<ReplyCreated> {
541        self.do_create_inner(req, parent, name, mode, flags, Some(uid), Some(gid))
542            .await
543    }
544
545    /// Core implementation for `mkdir`.
546    ///
547    /// It uses the provided `uid` and `gid` for credential switching if they are `Some`;
548    /// otherwise, it falls back to the credentials from the `Request`.
549    #[allow(clippy::too_many_arguments)]
550    async fn do_mkdir_inner(
551        &self,
552        req: Request,
553        parent: Inode,
554        name: &OsStr,
555        mode: u32,
556        umask: u32,
557        uid: Option<u32>,
558        gid: Option<u32>,
559    ) -> Result<ReplyEntry> {
560        let name = osstr_to_cstr(name).unwrap();
561        let name = name.as_ref();
562        self.validate_path_component(name)?;
563
564        let data = self.inode_map.get(parent).await?;
565        let file = data.get_file()?;
566
567        let res = {
568            let _guard = set_creds(
569                uid.unwrap_or(self.cfg.mapping.get_uid(req.uid)),
570                gid.unwrap_or(self.cfg.mapping.get_gid(req.gid)),
571            )?;
572
573            // Safe because this doesn't modify any memory and we check the return value.
574            unsafe { libc::mkdirat(file.as_raw_fd(), name.as_ptr(), mode & !umask) }
575        };
576        if res < 0 {
577            return Err(io::Error::last_os_error().into());
578        }
579
580        self.do_lookup(parent, name).await
581    }
582
583    /// A wrapper for `mkdir`, used by [`create_upper_dir`][crate::overlayfs::OverlayInode::create_upper_dir] function.
584    ///
585    /// This helper is called during a copy-up operation when a parent directory needs to be
586    /// created in the upper layer, preserving the original host UID/GID.
587    #[allow(clippy::too_many_arguments)]
588    pub async fn do_mkdir_helper(
589        &self,
590        req: Request,
591        parent: Inode,
592        name: &OsStr,
593        mode: u32,
594        umask: u32,
595        uid: u32,
596        gid: u32,
597    ) -> Result<ReplyEntry> {
598        self.do_mkdir_inner(req, parent, name, mode, umask, Some(uid), Some(gid))
599            .await
600    }
601
602    /// Core implementation for `symlink`.
603    ///
604    /// It uses the provided `uid` and `gid` for credential switching if they are `Some`;
605    /// otherwise, it falls back to the credentials from the `Request`.
606    async fn do_symlink_inner(
607        &self,
608        req: Request,
609        parent: Inode,
610        name: &OsStr,
611        link: &OsStr,
612        uid: Option<u32>,
613        gid: Option<u32>,
614    ) -> Result<ReplyEntry> {
615        let name = osstr_to_cstr(name).unwrap();
616        let name = name.as_ref();
617        let link = osstr_to_cstr(link).unwrap();
618        let link = link.as_ref();
619        self.validate_path_component(name)?;
620
621        let data = self.inode_map.get(parent).await?;
622        let file = data.get_file()?;
623
624        let res = {
625            let _guard = set_creds(
626                uid.unwrap_or(self.cfg.mapping.get_uid(req.uid)),
627                gid.unwrap_or(self.cfg.mapping.get_gid(req.gid)),
628            )?;
629
630            // Safe because this doesn't modify any memory and we check the return value.
631            unsafe { libc::symlinkat(link.as_ptr(), file.as_raw_fd(), name.as_ptr()) }
632        };
633        if res == 0 {
634            self.do_lookup(parent, name).await
635        } else {
636            Err(io::Error::last_os_error().into())
637        }
638    }
639
640    /// A wrapper for `symlink`, used by [`copy_symlink_up`][crate::overlayfs::OverlayFs::copy_symlink_up] function.
641    ///
642    /// This helper is called during a copy-up operation to create a symbolic link in the
643    /// upper layer while preserving the original host UID/GID from the lower layer link.
644    pub async fn do_symlink_helper(
645        &self,
646        req: Request,
647        parent: Inode,
648        name: &OsStr,
649        link: &OsStr,
650        uid: u32,
651        gid: u32,
652    ) -> Result<ReplyEntry> {
653        self.do_symlink_inner(req, parent, name, link, Some(uid), Some(gid))
654            .await
655    }
656}
657
658impl Filesystem for PassthroughFs {
659    /// initialize filesystem. Called before any other filesystem method.
660    async fn init(&self, _req: Request) -> Result<ReplyInit> {
661        if self.cfg.do_import {
662            self.import().await?;
663        }
664
665        Ok(ReplyInit {
666            max_write: NonZeroU32::new(128 * 1024).unwrap(),
667        })
668    }
669
670    /// clean up filesystem. Called on filesystem exit which is fuseblk, in normal fuse filesystem,
671    /// kernel may call forget for root. There is some discuss for this
672    /// <https://github.com/bazil/fuse/issues/82#issuecomment-88126886>,
673    /// <https://sourceforge.net/p/fuse/mailman/message/31995737/>
674    async fn destroy(&self, _req: Request) {
675        self.handle_map.clear().await;
676        self.inode_map.clear().await;
677
678        if let Err(e) = self.import().await {
679            error!("fuse: failed to destroy instance, {e:?}");
680        };
681    }
682
683    /// look up a directory entry by name and get its attributes.
684    async fn lookup(&self, _req: Request, parent: Inode, name: &OsStr) -> Result<ReplyEntry> {
685        // Don't use is_safe_path_component(), allow "." and ".." for NFS export support
686        if name.to_string_lossy().as_bytes().contains(&SLASH_ASCII) {
687            return Err(einval().into());
688        }
689        let name = osstr_to_cstr(name).unwrap();
690        // trace!("lookup: parent={}, name={}", parent, name.to_str().unwrap());
691        self.do_lookup(parent, name.as_ref()).await
692    }
693
694    /// forget an inode. The nlookup parameter indicates the number of lookups previously
695    /// performed on this inode. If the filesystem implements inode lifetimes, it is recommended
696    /// that inodes acquire a single reference on each lookup, and lose nlookup references on each
697    /// forget. The filesystem may ignore forget calls, if the inodes don't need to have a limited
698    /// lifetime. On unmount it is not guaranteed, that all referenced inodes will receive a forget
699    /// message. When filesystem is normal(not fuseblk) and unmounting, kernel may send forget
700    /// request for root and this library will stop session after call forget. There is some
701    /// discussion for this <https://github.com/bazil/fuse/issues/82#issuecomment-88126886>,
702    /// <https://sourceforge.net/p/fuse/mailman/message/31995737/>
703    async fn forget(&self, _req: Request, inode: Inode, nlookup: u64) {
704        let mut inodes = self.inode_map.inodes.write().await;
705
706        self.forget_one(&mut inodes, inode, nlookup).await
707    }
708
709    /// get file attributes. If `fh` is None, means `fh` is not set.
710    async fn getattr(
711        &self,
712        _req: Request,
713        inode: Inode,
714        fh: Option<u64>,
715        _flags: u32,
716    ) -> Result<ReplyAttr> {
717        let re = self.do_getattr(inode, fh).await?;
718        Ok(ReplyAttr {
719            ttl: re.1,
720            attr: convert_stat64_to_file_attr(re.0),
721        })
722    }
723
724    /// set file attributes. If `fh` is None, means `fh` is not set.
725    async fn setattr(
726        &self,
727        req: Request,
728        inode: Inode,
729        fh: Option<u64>,
730        set_attr: SetAttr,
731    ) -> Result<ReplyAttr> {
732        let inode_data = self.inode_map.get(inode).await?;
733
734        enum Data {
735            Handle(Arc<HandleData>),
736            ProcPath(CString),
737        }
738
739        let file = inode_data.get_file()?;
740        let data = if self.no_open.load(Ordering::Relaxed) {
741            let pathname = CString::new(format!("{}", file.as_raw_fd()))
742                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
743            Data::ProcPath(pathname)
744        } else {
745            // If we have a handle then use it otherwise get a new fd from the inode.
746            if let Some(handle) = fh {
747                let hd = self.handle_map.get(handle, inode).await?;
748                Data::Handle(hd)
749            } else {
750                let pathname = CString::new(format!("{}", file.as_raw_fd()))
751                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
752                Data::ProcPath(pathname)
753            }
754        };
755
756        if set_attr.size.is_some() && self.seal_size.load(Ordering::Relaxed) {
757            return Err(io::Error::from_raw_os_error(libc::EPERM).into());
758        }
759
760        if set_attr.mode.is_some() {
761            // Safe because this doesn't modify any memory and we check the return value.
762            let res = unsafe {
763                match data {
764                    Data::Handle(ref h) => {
765                        libc::fchmod(h.borrow_fd().as_raw_fd(), set_attr.mode.unwrap())
766                    }
767                    Data::ProcPath(ref p) => libc::fchmodat(
768                        self.proc_self_fd.as_raw_fd(),
769                        p.as_ptr(),
770                        set_attr.mode.unwrap(),
771                        0,
772                    ),
773                }
774            };
775            if res < 0 {
776                return Err(io::Error::last_os_error().into());
777            }
778        }
779
780        if set_attr.uid.is_some() && set_attr.gid.is_some() {
781            //valid.intersects(SetattrValid::UID | SetattrValid::GID)
782            let uid = self.cfg.mapping.get_uid(set_attr.uid.unwrap());
783            let gid = self.cfg.mapping.get_gid(set_attr.gid.unwrap());
784
785            // Safe because this is a constant value and a valid C string.
786            let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
787
788            // Safe because this doesn't modify any memory and we check the return value.
789            let res = unsafe {
790                libc::fchownat(
791                    file.as_raw_fd(),
792                    empty.as_ptr(),
793                    uid,
794                    gid,
795                    libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
796                )
797            };
798            if res < 0 {
799                return Err(io::Error::last_os_error().into());
800            }
801        }
802
803        if set_attr.size.is_some() {
804            let size = set_attr.size.unwrap();
805            // Safe because this doesn't modify any memory and we check the return value.
806            let res = match data {
807                Data::Handle(ref h) => unsafe {
808                    libc::ftruncate(h.borrow_fd().as_raw_fd(), size.try_into().unwrap())
809                },
810                _ => {
811                    // There is no `ftruncateat` so we need to get a new fd and truncate it.
812                    let f = self
813                        .open_inode(inode, libc::O_NONBLOCK | libc::O_RDWR)
814                        .await?;
815                    unsafe { libc::ftruncate(f.as_raw_fd(), size.try_into().unwrap()) }
816                }
817            };
818            if res < 0 {
819                return Err(io::Error::last_os_error().into());
820            }
821        }
822
823        if set_attr.atime.is_some() || set_attr.mtime.is_some() {
824            // POSIX utime() permission rules:
825            // - utime(NULL): requires owner OR write permission
826            // - utime(&times): requires owner only
827            //
828            // At FUSE level, we cannot reliably distinguish these cases because VFS
829            // converts both to actual timestamps. We use a heuristic:
830            // - If both nsec == 0 and timestamp is in the past: likely utime(&times)
831            // - Otherwise: likely utime(NULL) which gets current time with nsec precision
832
833            // SAFETY: libc::time with null pointer is a read-only syscall that always
834            // succeeds and doesn't modify memory.
835            let now = unsafe { libc::time(std::ptr::null_mut()) };
836
837            // Heuristic: utime(&times) typically sets whole seconds (both nsec=0) to past times.
838            // utime(NULL) sets current time which usually has non-zero nsec.
839            // Both timestamps and both conditions must be satisfied to avoid false positives.
840            let is_utime_times =
841                if let (Some(atime_ts), Some(mtime_ts)) = (set_attr.atime, set_attr.mtime) {
842                    (atime_ts.nsec == 0 && mtime_ts.nsec == 0)
843                        && (atime_ts.sec < now && mtime_ts.sec < now)
844                } else {
845                    // If one is None, it's likely a specific update, treat as requiring ownership.
846                    true
847                };
848
849            let st = stat_fd(&file, None)?;
850            let uid = self.cfg.mapping.get_uid(req.uid);
851            let gid = self.cfg.mapping.get_gid(req.gid);
852
853            let is_owner = st.st_uid == uid;
854
855            if !is_owner {
856                if is_utime_times {
857                    // utime(&times): only owner allowed
858                    return Err(io::Error::from_raw_os_error(libc::EPERM).into());
859                } else {
860                    // utime(NULL): check for write permission
861                    // Check user, group, and other permissions
862                    // NOTE: This currently only checks the primary gid. A complete POSIX-compliant
863                    // implementation should check all supplementary groups from req.groups if available.
864                    // However, rfuse3::Request currently doesn't expose supplementary group information.
865                    let has_user_write = st.st_uid == uid && st.st_mode & 0o200 != 0;
866                    let has_group_write = st.st_gid == gid && st.st_mode & 0o020 != 0;
867                    let has_other_write = st.st_mode & 0o002 != 0;
868
869                    if !has_user_write && !has_group_write && !has_other_write {
870                        return Err(io::Error::from_raw_os_error(libc::EPERM).into());
871                    }
872                }
873            }
874            let mut tvs: [libc::timespec; 2] = [
875                libc::timespec {
876                    tv_sec: 0,
877                    tv_nsec: libc::UTIME_OMIT,
878                },
879                libc::timespec {
880                    tv_sec: 0,
881                    tv_nsec: libc::UTIME_OMIT,
882                },
883            ];
884            if let Some(atime_ts) = set_attr.atime {
885                tvs[0].tv_sec = atime_ts.sec;
886                tvs[0].tv_nsec = atime_ts.nsec as i64;
887            }
888            if let Some(mtime_ts) = set_attr.mtime {
889                tvs[1].tv_sec = mtime_ts.sec;
890                tvs[1].tv_nsec = mtime_ts.nsec as i64;
891            }
892
893            // Safe because this doesn't modify any memory and we check the return value.
894            let res = match data {
895                Data::Handle(ref h) => unsafe {
896                    libc::futimens(h.borrow_fd().as_raw_fd(), tvs.as_ptr())
897                },
898                Data::ProcPath(ref p) => unsafe {
899                    libc::utimensat(self.proc_self_fd.as_raw_fd(), p.as_ptr(), tvs.as_ptr(), 0)
900                },
901            };
902            if res < 0 {
903                return Err(io::Error::last_os_error().into());
904            }
905        }
906
907        // After any successful modification, re-stat the file to get fresh attributes.
908        // Use `do_getattr` which correctly handles ID mapping.
909        let (new_stat, _attr_timeout) = self.do_getattr(inode, fh).await?;
910        // Crucially, return a ReplyAttr with a zero TTL.
911        // This tells the kernel to invalidate its attribute cache for this inode immediately.
912        // Subsequent `stat()` calls from clients will trigger a fresh `getattr` request.
913        Ok(ReplyAttr {
914            ttl: Duration::new(0, 0),
915            attr: convert_stat64_to_file_attr(new_stat),
916        })
917    }
918
919    /// read symbolic link.
920    async fn readlink(&self, _req: Request, inode: Inode) -> Result<ReplyData> {
921        // Safe because this is a constant value and a valid C string.
922        let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
923        let mut buf = Vec::<u8>::with_capacity(libc::PATH_MAX as usize);
924        let data = self.inode_map.get(inode).await?;
925
926        let file = data.get_file()?;
927
928        // Safe because this will only modify the contents of `buf` and we check the return value.
929        let res = unsafe {
930            libc::readlinkat(
931                file.as_raw_fd(),
932                empty.as_ptr(),
933                buf.as_mut_ptr() as *mut libc::c_char,
934                libc::PATH_MAX as usize,
935            )
936        };
937        if res < 0 {
938            return Err(io::Error::last_os_error().into());
939        }
940
941        // Safe because we trust the value returned by kernel.
942        unsafe { buf.set_len(res as usize) };
943
944        Ok(ReplyData {
945            data: Bytes::from(buf),
946        })
947    }
948
949    /// create a symbolic link.
950    async fn symlink(
951        &self,
952        req: Request,
953        parent: Inode,
954        name: &OsStr,
955        link: &OsStr,
956    ) -> Result<ReplyEntry> {
957        self.do_symlink_inner(req, parent, name, link, None, None)
958            .await
959    }
960
961    /// create file node. Create a regular file, character device, block device, fifo or socket
962    /// node. When creating file, most cases user only need to implement
963    /// [`create`][Filesystem::create].
964    async fn mknod(
965        &self,
966        req: Request,
967        parent: Inode,
968        name: &OsStr,
969        mode: u32,
970        rdev: u32,
971    ) -> Result<ReplyEntry> {
972        let name = osstr_to_cstr(name).unwrap();
973        let name = name.as_ref();
974        self.validate_path_component(name)?;
975
976        let data = self.inode_map.get(parent).await?;
977        let file = data.get_file()?;
978
979        let res = {
980            let (_uid, _gid) = set_creds(
981                self.cfg.mapping.get_uid(req.uid),
982                self.cfg.mapping.get_gid(req.gid),
983            )?;
984
985            // Safe because this doesn't modify any memory and we check the return value.
986            unsafe {
987                libc::mknodat(
988                    file.as_raw_fd(),
989                    name.as_ptr(),
990                    (mode) as libc::mode_t,
991                    u64::from(rdev),
992                )
993            }
994        };
995        if res < 0 {
996            Err(io::Error::last_os_error().into())
997        } else {
998            self.do_lookup(parent, name).await
999        }
1000    }
1001
1002    /// create a directory.
1003    async fn mkdir(
1004        &self,
1005        req: Request,
1006        parent: Inode,
1007        name: &OsStr,
1008        mode: u32,
1009        umask: u32,
1010    ) -> Result<ReplyEntry> {
1011        self.do_mkdir_inner(req, parent, name, mode, umask, None, None)
1012            .await
1013    }
1014
1015    /// remove a file.
1016    async fn unlink(&self, _req: Request, parent: Inode, name: &OsStr) -> Result<()> {
1017        let name = osstr_to_cstr(name).unwrap();
1018        let name = name.as_ref();
1019        self.validate_path_component(name)?;
1020        self.do_unlink(parent, name, 0).await.map_err(|e| e.into())
1021    }
1022
1023    /// remove a directory.
1024    async fn rmdir(&self, _req: Request, parent: Inode, name: &OsStr) -> Result<()> {
1025        let name = osstr_to_cstr(name).unwrap();
1026        let name = name.as_ref();
1027        self.validate_path_component(name)?;
1028        self.do_unlink(parent, name, libc::AT_REMOVEDIR)
1029            .await
1030            .map_err(|e| e.into())
1031    }
1032
1033    /// create a hard link.
1034    async fn link(
1035        &self,
1036        _req: Request,
1037        inode: Inode,
1038        new_parent: Inode,
1039        new_name: &OsStr,
1040    ) -> Result<ReplyEntry> {
1041        trace!(
1042            "passthrough: link: inode={}, new_parent={}, new_name={}",
1043            inode,
1044            new_parent,
1045            new_name.to_str().unwrap()
1046        );
1047        let newname = osstr_to_cstr(new_name).unwrap();
1048        let newname = newname.as_ref();
1049        self.validate_path_component(newname)?;
1050
1051        trace!("link: trying to get inode {inode}");
1052        let data = self.inode_map.get(inode).await?;
1053        trace!("link: trying to get new parent {new_parent}");
1054        let new_inode = self.inode_map.get(new_parent).await?;
1055        let file = data.get_file()?;
1056        let new_file = new_inode.get_file()?;
1057
1058        // Safe because this is a constant value and a valid C string.
1059        let empty = unsafe { CStr::from_bytes_with_nul_unchecked(EMPTY_CSTR) };
1060
1061        // Safe because this doesn't modify any memory and we check the return value.
1062        let res = unsafe {
1063            libc::linkat(
1064                file.as_raw_fd(),
1065                empty.as_ptr(),
1066                new_file.as_raw_fd(),
1067                newname.as_ptr(),
1068                libc::AT_EMPTY_PATH,
1069            )
1070        };
1071        if res == 0 {
1072            trace!(
1073                "passthrough: link: inode={}, new_parent={}, new_name={}, res=0, trying to lookup",
1074                inode,
1075                new_parent,
1076                newname.to_str().unwrap()
1077            );
1078            self.do_lookup(new_parent, newname).await
1079        } else {
1080            trace!(
1081                "passthrough: link: inode={}, new_parent={}, new_name={}, res={}",
1082                inode,
1083                new_parent,
1084                newname.to_str().unwrap(),
1085                res
1086            );
1087            Err(io::Error::last_os_error().into())
1088        }
1089    }
1090
1091    /// open a file. Open flags (with the exception of `O_CREAT`, `O_EXCL` and `O_NOCTTY`) are
1092    /// available in flags. Filesystem may store an arbitrary file handle (pointer, index, etc) in
1093    /// fh, and use this in other all other file operations (read, write, flush, release, fsync).
1094    /// Filesystem may also implement stateless file I/O and not store anything in fh. There are
1095    /// also some flags (`direct_io`, `keep_cache`) which the filesystem may set, to change the way
1096    /// the file is opened. A filesystem need not implement this method if it
1097    /// sets [`MountOptions::no_open_support`][rfuse3::MountOptions::no_open_support] and if the
1098    /// kernel supports `FUSE_NO_OPEN_SUPPORT`.
1099    ///
1100    /// # Notes:
1101    ///
1102    /// See `fuse_file_info` structure in
1103    /// [fuse_common.h](https://libfuse.github.io/doxygen/include_2fuse__common_8h_source.html) for
1104    /// more details.
1105    async fn open(&self, _req: Request, inode: Inode, flags: u32) -> Result<ReplyOpen> {
1106        if self.no_open.load(Ordering::Relaxed) {
1107            info!("fuse: open is not supported.");
1108            Err(enosys().into())
1109        } else {
1110            let re = self.do_open(inode, flags).await?;
1111            Ok(ReplyOpen {
1112                fh: re.0.unwrap(),
1113                flags: re.1.bits(),
1114            })
1115        }
1116    }
1117
1118    /// read data. Read should send exactly the number of bytes requested except on EOF or error,
1119    /// otherwise the rest of the data will be substituted with zeroes. An exception to this is
1120    /// when the file has been opened in `direct_io` mode, in which case the return value of the
1121    /// read system call will reflect the return value of this operation. `fh` will contain the
1122    /// value set by the open method, or will be undefined if the open method didn't set any value.
1123    async fn read(
1124        &self,
1125        _req: Request,
1126        inode: Inode,
1127        fh: u64,
1128        offset: u64,
1129        size: u32,
1130    ) -> Result<ReplyData> {
1131        let data = self.get_data(fh, inode, libc::O_RDONLY).await?;
1132        let _guard = data.lock.lock().await;
1133        let raw_fd = data.borrow_fd().as_raw_fd();
1134
1135        let mut buf = vec![0; size as usize];
1136        let file = &data.file;
1137
1138        let res = if self.cfg.use_mmap {
1139            self.read_from_mmap(inode, offset, size as u64, file, buf.as_mut_slice())
1140                .await
1141                .ok()
1142        } else {
1143            None
1144        };
1145
1146        match res {
1147            Some(bytes_read) => {
1148                if bytes_read < size as usize {
1149                    buf.truncate(bytes_read); // Adjust the buffer size for EOF
1150                }
1151            }
1152            None => {
1153                if offset > i64::MAX as u64 {
1154                    error!("read error: offset too large: {}", offset);
1155                    return Err(Errno::from(libc::EOVERFLOW));
1156                }
1157                const ALIGN: usize = 4096;
1158                let open_flags = data.get_flags().await;
1159                let ret = if (open_flags as i32 & libc::O_DIRECT) != 0 {
1160                    let mut aligned_buf = unsafe {
1161                        let layout = std::alloc::Layout::from_size_align(size as _, ALIGN).unwrap();
1162                        let ptr = std::alloc::alloc(layout);
1163                        if ptr.is_null() {
1164                            return Err(io::Error::from_raw_os_error(libc::ENOMEM).into());
1165                        }
1166                        Vec::from_raw_parts(ptr, size as _, size as _)
1167                    };
1168                    let ret = unsafe {
1169                        pread(
1170                            raw_fd as c_int,
1171                            aligned_buf.as_mut_ptr() as *mut libc::c_void,
1172                            size as size_t,
1173                            offset as off_t,
1174                        )
1175                    };
1176
1177                    if ret >= 0 {
1178                        let bytes_read = ret as usize;
1179                        buf.as_mut_slice()[..bytes_read]
1180                            .copy_from_slice(&aligned_buf[..bytes_read]);
1181                    }
1182                    ret
1183                } else {
1184                    unsafe {
1185                        pread(
1186                            raw_fd as c_int,
1187                            buf.as_mut_ptr() as *mut libc::c_void,
1188                            size as size_t,
1189                            offset as off_t,
1190                        )
1191                    }
1192                };
1193                if ret < 0 {
1194                    let e = io::Error::last_os_error();
1195                    error!("read error: {e:?}");
1196                    error!(
1197                        "pread raw_fd={}, pointer={:p}, size={}, offset={}",
1198                        raw_fd,
1199                        buf.as_mut_ptr(),
1200                        size,
1201                        offset
1202                    );
1203                    return Err(e.into());
1204                } else {
1205                    let bytes_read = ret as usize;
1206                    buf.truncate(bytes_read);
1207                }
1208            }
1209        }
1210
1211        Ok(ReplyData {
1212            data: Bytes::from(buf),
1213        })
1214    }
1215
1216    /// write data. Write should return exactly the number of bytes requested except on error. An
1217    /// exception to this is when the file has been opened in `direct_io` mode, in which case the
1218    /// return value of the write system call will reflect the return value of this operation. `fh`
1219    /// will contain the value set by the open method, or will be undefined if the open method
1220    /// didn't set any value. When `write_flags` contains
1221    /// [`FUSE_WRITE_CACHE`][rfuse3::raw::flags::FUSE_WRITE_CACHE], means the write operation is a
1222    /// delay write.
1223    #[allow(clippy::too_many_arguments)]
1224    async fn write(
1225        &self,
1226        _req: Request,
1227        inode: Inode,
1228        fh: u64,
1229        offset: u64,
1230        data: &[u8],
1231        _write_flags: u32,
1232        flags: u32,
1233    ) -> Result<ReplyWrite> {
1234        let handle_data = self.get_data(fh, inode, libc::O_RDWR).await?;
1235        let file = &handle_data.file;
1236        let _guard = handle_data.lock.lock().await;
1237        let raw_fd = handle_data.borrow_fd().as_raw_fd();
1238
1239        let res = if self.cfg.use_mmap {
1240            self.write_to_mmap(inode, offset, data, file).await.ok()
1241        } else {
1242            None
1243        };
1244
1245        let ret = match res {
1246            Some(ret) => ret as isize,
1247            None => {
1248                let size = data.len();
1249                if offset > i64::MAX as u64 {
1250                    error!("write error: offset too large: {}", offset);
1251                    return Err(Errno::from(libc::EOVERFLOW));
1252                }
1253                self.check_fd_flags(&handle_data, raw_fd, flags).await?;
1254                let ret = unsafe {
1255                    libc::pwrite(
1256                        raw_fd as c_int,
1257                        data.as_ptr() as *const libc::c_void,
1258                        size as size_t,
1259                        offset as off_t,
1260                    )
1261                };
1262                if ret >= 0 {
1263                    ret
1264                } else {
1265                    let e = io::Error::last_os_error();
1266                    error!("write error: {e:?}");
1267                    error!(
1268                        "pwrite raw_fd={}, pointer={:p}, size={}, offset={}",
1269                        raw_fd,
1270                        data.as_ptr(),
1271                        size,
1272                        offset
1273                    );
1274                    return Err(Errno::from(e.raw_os_error().unwrap_or(-1)));
1275                }
1276            }
1277        };
1278
1279        Ok(ReplyWrite {
1280            written: ret as u32,
1281        })
1282    }
1283
1284    /// get filesystem statistics.
1285    async fn statfs(&self, _req: Request, inode: Inode) -> Result<ReplyStatFs> {
1286        let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
1287        let data = self.inode_map.get(inode).await?;
1288        let file = data.get_file()?;
1289
1290        // Safe because this will only modify `out` and we check the return value.
1291        let statfs: libc::statvfs64 =
1292            match unsafe { libc::fstatvfs64(file.as_raw_fd(), out.as_mut_ptr()) } {
1293                // Safe because the kernel guarantees that `out` has been initialized.
1294                0 => unsafe { out.assume_init() },
1295                _ => return Err(io::Error::last_os_error().into()),
1296            };
1297
1298        Ok(
1299            // Populate the ReplyStatFs structure with the necessary information
1300            ReplyStatFs {
1301                blocks: statfs.f_blocks,
1302                bfree: statfs.f_bfree,
1303                bavail: statfs.f_bavail,
1304                files: statfs.f_files,
1305                ffree: statfs.f_ffree,
1306                bsize: statfs.f_bsize as u32,
1307                namelen: statfs.f_namemax as u32,
1308                frsize: statfs.f_frsize as u32,
1309            },
1310        )
1311    }
1312
1313    /// release an open file. Release is called when there are no more references to an open file:
1314    /// all file descriptors are closed and all memory mappings are unmapped. For every open call
1315    /// there will be exactly one release call. The filesystem may reply with an error, but error
1316    /// values are not returned to `close()` or `munmap()` which triggered the release. `fh` will
1317    /// contain the value set by the open method, or will be undefined if the open method didn't
1318    /// set any value. `flags` will contain the same flags as for open. `flush` means flush the
1319    /// data or not when closing file.
1320    async fn release(
1321        &self,
1322        _req: Request,
1323        inode: Inode,
1324        fh: u64,
1325        _flags: u32,
1326        _lock_owner: u64,
1327        _flush: bool,
1328    ) -> Result<()> {
1329        if self.no_open.load(Ordering::Relaxed) {
1330            Err(enosys().into())
1331        } else {
1332            self.do_release(inode, fh).await.map_err(|e| e.into())
1333        }
1334    }
1335
1336    /// synchronize file contents. If the `datasync` is true, then only the user data should be
1337    /// flushed, not the metadata.
1338    async fn fsync(&self, _req: Request, inode: Inode, fh: u64, datasync: bool) -> Result<()> {
1339        let data = self.get_data(fh, inode, libc::O_RDONLY).await?;
1340        let fd = data.borrow_fd();
1341
1342        // Safe because this doesn't modify any memory and we check the return value.
1343        let res = unsafe {
1344            if datasync {
1345                libc::fdatasync(fd.as_raw_fd())
1346            } else {
1347                libc::fsync(fd.as_raw_fd())
1348            }
1349        };
1350        if res == 0 {
1351            Ok(())
1352        } else {
1353            Err(io::Error::last_os_error().into())
1354        }
1355    }
1356
1357    /// set an extended attribute.
1358    async fn setxattr(
1359        &self,
1360        _req: Request,
1361        inode: Inode,
1362        name: &OsStr,
1363        value: &[u8],
1364        flags: u32,
1365        _position: u32,
1366    ) -> Result<()> {
1367        if !self.cfg.xattr {
1368            return Err(enosys().into());
1369        }
1370        let name = osstr_to_cstr(name).unwrap();
1371        let name = name.as_ref();
1372        let data = self.inode_map.get(inode).await?;
1373        let file = data.get_file()?;
1374        let pathname = CString::new(format!("/proc/self/fd/{}", file.as_raw_fd()))
1375            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1376
1377        // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we
1378        // need to use the {set,get,remove,list}xattr variants.
1379        // Safe because this doesn't modify any memory and we check the return value.
1380        let res = unsafe {
1381            libc::setxattr(
1382                pathname.as_ptr(),
1383                name.as_ptr(),
1384                value.as_ptr() as *const libc::c_void,
1385                value.len(),
1386                flags as libc::c_int,
1387            )
1388        };
1389        if res == 0 {
1390            Ok(())
1391        } else {
1392            Err(io::Error::last_os_error().into())
1393        }
1394    }
1395
1396    /// Get an extended attribute. If `size` is too small, return `Err<ERANGE>`.
1397    /// Otherwise, use [`ReplyXAttr::Data`] to send the attribute data, or
1398    /// return an error.
1399    async fn getxattr(
1400        &self,
1401        _req: Request,
1402        inode: Inode,
1403        name: &OsStr,
1404        size: u32,
1405    ) -> Result<ReplyXAttr> {
1406        if !self.cfg.xattr {
1407            return Err(enosys().into());
1408        }
1409        let name =
1410            osstr_to_cstr(name).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1411        let name = name.as_ref();
1412        let data = self.inode_map.get(inode).await?;
1413        let file = data.get_file()?;
1414        let mut buf = Vec::<u8>::with_capacity(size as usize);
1415        let pathname = CString::new(format!("/proc/self/fd/{}", file.as_raw_fd(),))
1416            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1417
1418        // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we
1419        // need to use the {set,get,remove,list}xattr variants.
1420        // Safe because this will only modify the contents of `buf`.
1421        let res = unsafe {
1422            libc::getxattr(
1423                pathname.as_ptr(),
1424                name.as_ptr(),
1425                buf.as_mut_ptr() as *mut libc::c_void,
1426                size as libc::size_t,
1427            )
1428        };
1429        if res < 0 {
1430            let e = io::Error::last_os_error();
1431            // error!("getxattr error: {e:?}");
1432            return Err(e.into());
1433        }
1434
1435        if size == 0 {
1436            Ok(ReplyXAttr::Size(res as u32))
1437        } else {
1438            // Safe because we trust the value returned by kernel.
1439            unsafe { buf.set_len(res as usize) };
1440            Ok(ReplyXAttr::Data(Bytes::from(buf)))
1441        }
1442    }
1443
1444    /// List extended attribute names.
1445    ///
1446    /// If `size` is too small, return `Err<ERANGE>`.  Otherwise, use
1447    /// [`ReplyXAttr::Data`] to send the attribute list, or return an error.
1448    async fn listxattr(&self, _req: Request, inode: Inode, size: u32) -> Result<ReplyXAttr> {
1449        if !self.cfg.xattr {
1450            return Err(enosys().into());
1451        }
1452
1453        let data = self.inode_map.get(inode).await?;
1454        let file = data.get_file()?;
1455        let mut buf = Vec::<u8>::with_capacity(size as usize);
1456        let pathname = CString::new(format!("/proc/self/fd/{}", file.as_raw_fd()))
1457            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1458
1459        // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we
1460        // need to use the {set,get,remove,list}xattr variants.
1461        // Safe because this will only modify the contents of `buf`.
1462        let res = unsafe {
1463            libc::listxattr(
1464                pathname.as_ptr(),
1465                buf.as_mut_ptr() as *mut libc::c_char,
1466                size as libc::size_t,
1467            )
1468        };
1469        if res < 0 {
1470            let e = io::Error::last_os_error();
1471            // error!("listxattr error: {e:?}");
1472            return Err(e.into());
1473        }
1474
1475        if size == 0 {
1476            Ok(ReplyXAttr::Size(res as u32))
1477        } else {
1478            // Safe because we trust the value returned by kernel.
1479            unsafe { buf.set_len(res as usize) };
1480            Ok(ReplyXAttr::Data(Bytes::from(buf)))
1481        }
1482    }
1483
1484    /// remove an extended attribute.
1485    async fn removexattr(&self, _req: Request, inode: Inode, name: &OsStr) -> Result<()> {
1486        if !self.cfg.xattr {
1487            return Err(enosys().into());
1488        }
1489        let name = osstr_to_cstr(name).unwrap();
1490        let name = name.as_ref();
1491        let data = self.inode_map.get(inode).await?;
1492        let file = data.get_file()?;
1493        let pathname = CString::new(format!("/proc/self/fd/{}", file.as_raw_fd()))
1494            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1495
1496        // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we
1497        // need to use the {set,get,remove,list}xattr variants.
1498        // Safe because this doesn't modify any memory and we check the return value.
1499        let res = unsafe { libc::removexattr(pathname.as_ptr(), name.as_ptr()) };
1500        if res == 0 {
1501            Ok(())
1502        } else {
1503            Err(io::Error::last_os_error().into())
1504        }
1505    }
1506
1507    /// flush method. This is called on each `close()` of the opened file. Since file descriptors
1508    /// can be duplicated (`dup`, `dup2`, `fork`), for one open call there may be many flush calls.
1509    /// Filesystems shouldn't assume that flush will always be called after some writes, or that if
1510    /// will be called at all. `fh` will contain the value set by the open method, or will be
1511    /// undefined if the open method didn't set any value.
1512    ///
1513    /// # Notes:
1514    ///
1515    /// the name of the method is misleading, since (unlike fsync) the filesystem is not forced to
1516    /// flush pending writes. One reason to flush data, is if the filesystem wants to return write
1517    /// errors. If the filesystem supports file locking operations ([`setlk`][Filesystem::setlk],
1518    /// [`getlk`][Filesystem::getlk]) it should remove all locks belonging to `lock_owner`.
1519    async fn flush(&self, _req: Request, inode: Inode, fh: u64, _lock_owner: u64) -> Result<()> {
1520        if self.no_open.load(Ordering::Relaxed) {
1521            return Err(enosys().into());
1522        }
1523
1524        let data = self.handle_map.get(fh, inode).await?;
1525        trace!("flush: data.inode={}", data.inode);
1526
1527        // Since this method is called whenever an fd is closed in the client, we can emulate that
1528        // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
1529        // because this doesn't modify any memory and we check the return values.
1530        unsafe {
1531            let newfd = libc::dup(data.borrow_fd().as_raw_fd());
1532            if newfd < 0 {
1533                return Err(io::Error::last_os_error().into());
1534            }
1535
1536            if libc::close(newfd) < 0 {
1537                Err(io::Error::last_os_error().into())
1538            } else {
1539                Ok(())
1540            }
1541        }
1542        // if self.no_open.load(Ordering::Acquire) {
1543        //         return Err(enosys().into());
1544        //     }
1545
1546        // let data = self.handle_map.get(fh, inode).await?;
1547
1548        // // std flush impl
1549        // unsafe {
1550        //     let fd = data.borrow_fd().as_raw_fd();
1551        //     if libc::fsync(fd) < 0 {
1552        //         let err = io::Error::last_os_error();
1553        //         error!("Failed to fsync file descriptor {}: {}", fd, err);
1554        //         return Err(err.into());
1555        //     }
1556        // }
1557        // Ok(())
1558    }
1559
1560    /// open a directory. Filesystem may store an arbitrary file handle (pointer, index, etc) in
    /// `fh`, and use this in all other directory stream operations
1562    /// ([`readdir`][Filesystem::readdir], [`releasedir`][Filesystem::releasedir],
1563    /// [`fsyncdir`][Filesystem::fsyncdir]). Filesystem may also implement stateless directory
1564    /// I/O and not store anything in `fh`.  A file system need not implement this method if it
1565    /// sets [`MountOptions::no_open_dir_support`][rfuse3::MountOptions::no_open_dir_support] and
1566    /// if the kernel supports `FUSE_NO_OPENDIR_SUPPORT`.
1567    async fn opendir(&self, _req: Request, inode: Inode, flags: u32) -> Result<ReplyOpen> {
1568        if self.no_opendir.load(Ordering::Relaxed) {
1569            info!("fuse: opendir is not supported.");
1570            Err(enosys().into())
1571        } else {
1572            let t = self
1573                .do_open(inode, flags | (libc::O_DIRECTORY as u32))
1574                .await?;
1575            let fd = t.0.unwrap();
1576            Ok(ReplyOpen {
1577                fh: fd,
1578                flags: t.1.bits(),
1579            })
1580        }
1581    }
1582
1583    /// read directory. `offset` is used to track the offset of the directory entries. `fh` will
1584    /// contain the value set by the [`opendir`][Filesystem::opendir] method, or will be
1585    /// undefined if the [`opendir`][Filesystem::opendir] method didn't set any value.
1586    async fn readdir<'a>(
1587        &'a self,
1588        _req: Request,
1589        parent: Inode,
1590        fh: u64,
1591        offset: i64,
1592    ) -> Result<
1593        ReplyDirectory<
1594            impl futures_util::stream::Stream<Item = Result<DirectoryEntry>> + Send + 'a,
1595        >,
1596    > {
1597        if self.no_readdir.load(Ordering::Relaxed) {
1598            return Err(enosys().into());
1599        }
1600        let mut entry_list = Vec::new();
1601        self.do_readdir(parent, fh, offset as u64, &mut entry_list)
1602            .await?;
1603        Ok(ReplyDirectory {
1604            entries: stream::iter(entry_list),
1605        })
1606    }
1607
1608    /// read directory entries, but with their attribute, like [`readdir`][Filesystem::readdir]
1609    /// + [`lookup`][Filesystem::lookup] at the same time.
1610    async fn readdirplus<'a>(
1611        &'a self,
1612        _req: Request,
1613        parent: Inode,
1614        fh: u64,
1615        offset: u64,
1616        _lock_owner: u64,
1617    ) -> Result<
1618        ReplyDirectoryPlus<
1619            impl futures_util::stream::Stream<Item = Result<DirectoryEntryPlus>> + Send + 'a,
1620        >,
1621    > {
1622        if self.no_readdir.load(Ordering::Relaxed) {
1623            return Err(enosys().into());
1624        }
1625        let mut entry_list = Vec::new();
1626        self.do_readdirplus(parent, fh, offset, &mut entry_list)
1627            .await?;
1628        Ok(ReplyDirectoryPlus {
1629            entries: stream::iter(entry_list),
1630        })
1631    }
1632
1633    /// release an open directory. For every [`opendir`][Filesystem::opendir] call there will
1634    /// be exactly one `releasedir` call. `fh` will contain the value set by the
1635    /// [`opendir`][Filesystem::opendir] method, or will be undefined if the
1636    /// [`opendir`][Filesystem::opendir] method didn't set any value.
1637    async fn releasedir(&self, _req: Request, inode: Inode, fh: u64, _flags: u32) -> Result<()> {
1638        if self.no_opendir.load(Ordering::Relaxed) {
1639            info!("fuse: releasedir is not supported.");
1640            Err(io::Error::from_raw_os_error(libc::ENOSYS).into())
1641        } else {
1642            self.do_release(inode, fh).await.map_err(|e| e.into())
1643        }
1644    }
1645
1646    /// synchronize directory contents. If the `datasync` is true, then only the directory contents
1647    /// should be flushed, not the metadata. `fh` will contain the value set by the
1648    /// [`opendir`][Filesystem::opendir] method, or will be undefined if the
1649    /// [`opendir`][Filesystem::opendir] method didn't set any value.
1650    async fn fsyncdir(&self, req: Request, inode: Inode, fh: u64, datasync: bool) -> Result<()> {
1651        self.fsync(req, inode, fh, datasync).await
1652    }
1653
1654    /// check file access permissions. This will be called for the `access()` system call. If the
    /// `default_permissions` mount option is given, this method is not called. This method is
1656    /// not called under Linux kernel versions 2.4.x.
1657    async fn access(&self, req: Request, inode: Inode, mask: u32) -> Result<()> {
1658        let data = self.inode_map.get(inode).await?;
1659        let st = stat_fd(&data.get_file()?, None)?;
1660        let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
1661
1662        let uid = self.cfg.mapping.get_uid(req.uid);
1663        let gid = self.cfg.mapping.get_gid(req.gid);
1664
1665        if mode == libc::F_OK {
1666            // The file exists since we were able to call `stat(2)` on it.
1667            return Ok(());
1668        }
1669
1670        if (mode & libc::R_OK) != 0
1671            && uid != 0
1672            && (st.st_uid != uid || st.st_mode & 0o400 == 0)
1673            && (st.st_gid != gid || st.st_mode & 0o040 == 0)
1674            && st.st_mode & 0o004 == 0
1675        {
1676            return Err(io::Error::from_raw_os_error(libc::EACCES).into());
1677        }
1678
1679        if (mode & libc::W_OK) != 0
1680            && uid != 0
1681            && (st.st_uid != uid || st.st_mode & 0o200 == 0)
1682            && (st.st_gid != gid || st.st_mode & 0o020 == 0)
1683            && st.st_mode & 0o002 == 0
1684        {
1685            return Err(io::Error::from_raw_os_error(libc::EACCES).into());
1686        }
1687
1688        // root can only execute something if it is executable by one of the owner, the group, or
1689        // everyone.
1690        if (mode & libc::X_OK) != 0
1691            && (uid != 0 || st.st_mode & 0o111 == 0)
1692            && (st.st_uid != uid || st.st_mode & 0o100 == 0)
1693            && (st.st_gid != gid || st.st_mode & 0o010 == 0)
1694            && st.st_mode & 0o001 == 0
1695        {
1696            return Err(io::Error::from_raw_os_error(libc::EACCES).into());
1697        }
1698
1699        Ok(())
1700    }
1701
1702    /// create and open a file. If the file does not exist, first create it with the specified
1703    /// mode, and then open it. Open flags (with the exception of `O_NOCTTY`) are available in
1704    /// flags. Filesystem may store an arbitrary file handle (pointer, index, etc) in `fh`, and use
    /// this in all other file operations ([`read`][Filesystem::read],
1706    /// [`write`][Filesystem::write], [`flush`][Filesystem::flush],
1707    /// [`release`][Filesystem::release], [`fsync`][Filesystem::fsync]). There are also some flags
1708    /// (`direct_io`, `keep_cache`) which the filesystem may set, to change the way the file is
1709    /// opened. If this method is not implemented or under Linux kernel versions earlier than
1710    /// 2.6.15, the [`mknod`][Filesystem::mknod] and [`open`][Filesystem::open] methods will be
1711    /// called instead.
1712    ///
1713    /// # Notes:
1714    ///
1715    /// See `fuse_file_info` structure in
1716    /// [fuse_common.h](https://libfuse.github.io/doxygen/include_2fuse__common_8h_source.html) for
1717    /// more details.
1718    async fn create(
1719        &self,
1720        req: Request,
1721        parent: Inode,
1722        name: &OsStr,
1723        mode: u32,
1724        flags: u32,
1725    ) -> Result<ReplyCreated> {
1726        self.do_create_inner(req, parent, name, mode, flags, None, None)
1727            .await
1728    }
1729
    /// handle interrupt. When an operation is interrupted, an interrupt request will be sent to the
    /// fuse server with the unique id of the operation.
    async fn interrupt(&self, _req: Request, _unique: u64) -> Result<()> {
        // The interrupt is acknowledged and otherwise ignored: neither the request nor the
        // unique id of the interrupted operation is inspected.
        Ok(())
    }
1735
    /// forget more than one inode. This is a batch version of [`forget`][Filesystem::forget]
1737    async fn batch_forget(&self, _req: Request, inodes: &[(Inode, u64)]) {
1738        let mut inodes_w = self.inode_map.inodes.write().await;
1739
1740        for i in inodes {
1741            self.forget_one(&mut inodes_w, i.0, i.1).await;
1742        }
1743    }
1744
1745    /// allocate space for an open file. This function ensures that required space is allocated for
1746    /// specified file.
1747    ///
1748    /// # Notes:
1749    ///
1750    /// more information about `fallocate`, please see **`man 2 fallocate`**
1751    async fn fallocate(
1752        &self,
1753        _req: Request,
1754        inode: Inode,
1755        fh: u64,
1756        offset: u64,
1757        length: u64,
1758        mode: u32,
1759    ) -> Result<()> {
1760        // Let the Arc<HandleData> in scope, otherwise fd may get invalid.
1761        let data = self.get_data(fh, inode, libc::O_RDWR).await?;
1762        let fd = data.borrow_fd();
1763
1764        //  if self.seal_size.load().await {
1765        //      let st = stat_fd(&fd, None)?;
1766        //      self.seal_size_check(
1767        //          Opcode::Fallocate,
1768        //          st.st_size as u64,
1769        //          offset,
1770        //          length,
1771        //          mode as i32,
1772        //      )?;
1773        //  }
1774
1775        // Safe because this doesn't modify any memory and we check the return value.
1776        let res = unsafe {
1777            libc::fallocate64(
1778                fd.as_raw_fd(),
1779                mode as libc::c_int,
1780                offset as libc::off64_t,
1781                length as libc::off64_t,
1782            )
1783        };
1784
1785        if res == 0 {
1786            Ok(())
1787        } else {
1788            Err(io::Error::last_os_error().into())
1789        }
1790    }
1791
1792    /// rename a file or directory.
1793    async fn rename(
1794        &self,
1795        _req: Request,
1796        parent: Inode,
1797        name: &OsStr,
1798        new_parent: Inode,
1799        new_name: &OsStr,
1800    ) -> Result<()> {
1801        let oldname = osstr_to_cstr(name).unwrap();
1802        let oldname = oldname.as_ref();
1803        let newname = osstr_to_cstr(new_name).unwrap();
1804        let newname = newname.as_ref();
1805        self.validate_path_component(oldname)?;
1806        self.validate_path_component(newname)?;
1807
1808        // Check if new_name exists and is a whiteout file
1809        let new_parent_data = self.inode_map.get(new_parent).await?;
1810        let new_parent_file = new_parent_data.get_file()?;
1811
1812        // Try to lookup newname to see if it exists
1813        // Check if new_name exists and is a whiteout file
1814        let mut st = std::mem::MaybeUninit::<libc::stat>::uninit();
1815        let res = unsafe {
1816            libc::fstatat(
1817                new_parent_file.as_raw_fd(),
1818                newname.as_ptr(),
1819                st.as_mut_ptr(),
1820                libc::AT_SYMLINK_NOFOLLOW,
1821            )
1822        };
1823
1824        if res == 0 {
1825            // If file exists, check if it's a whiteout file
1826            let st = unsafe { st.assume_init() };
1827            if (st.st_mode & libc::S_IFMT) == libc::S_IFCHR && st.st_rdev == 0 {
1828                // It's a whiteout file, delete it
1829                let unlink_res =
1830                    unsafe { libc::unlinkat(new_parent_file.as_raw_fd(), newname.as_ptr(), 0) };
1831                if unlink_res < 0 {
1832                    return Err(io::Error::last_os_error().into());
1833                }
1834            }
1835        } else {
1836            let err = io::Error::last_os_error();
1837            if err.raw_os_error() != Some(libc::ENOENT) {
1838                return Err(err.into());
1839            }
1840        }
1841
1842        let old_inode = self.inode_map.get(parent).await?;
1843        let new_inode = self.inode_map.get(new_parent).await?;
1844        let old_file = old_inode.get_file()?;
1845        let new_file = new_inode.get_file()?;
1846
1847        //TODO: Switch to libc::renameat2 -> libc::renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
1848        let res = unsafe {
1849            libc::renameat(
1850                old_file.as_raw_fd(),
1851                oldname.as_ptr(),
1852                new_file.as_raw_fd(),
1853                newname.as_ptr(),
1854            )
1855        };
1856
1857        if res == 0 {
1858            Ok(())
1859        } else {
1860            Err(io::Error::last_os_error().into())
1861        }
1862    }
1863
1864    /// rename a file or directory with flags.
1865    async fn rename2(
1866        &self,
1867        _req: Request,
1868        parent: Inode,
1869        name: &OsStr,
1870        new_parent: Inode,
1871        new_name: &OsStr,
1872        flags: u32,
1873    ) -> Result<()> {
1874        let oldname = osstr_to_cstr(name).unwrap();
1875        let oldname = oldname.as_ref();
1876        let newname = osstr_to_cstr(new_name).unwrap();
1877        let newname = newname.as_ref();
1878        self.validate_path_component(oldname)?;
1879        self.validate_path_component(newname)?;
1880
1881        let old_inode = self.inode_map.get(parent).await?;
1882        let new_inode = self.inode_map.get(new_parent).await?;
1883        let old_file = old_inode.get_file()?;
1884        let new_file = new_inode.get_file()?;
1885        //TODO: Switch to libc::renameat2 -> libc::renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
1886        let res = unsafe {
1887            libc::renameat2(
1888                old_file.as_raw_fd(),
1889                oldname.as_ptr(),
1890                new_file.as_raw_fd(),
1891                newname.as_ptr(),
1892                flags,
1893            )
1894        };
1895
1896        if res == 0 {
1897            Ok(())
1898        } else {
1899            Err(io::Error::last_os_error().into())
1900        }
1901    }
1902
1903    /// find next data or hole after the specified offset.
1904    async fn lseek(
1905        &self,
1906        _req: Request,
1907        inode: Inode,
1908        fh: u64,
1909        offset: u64,
1910        whence: u32,
1911    ) -> Result<ReplyLSeek> {
1912        // Let the Arc<HandleData> in scope, otherwise fd may get invalid.
1913        let data = self.handle_map.get(fh, inode).await?;
1914
1915        // Check file type to determine appropriate lseek handling
1916        let st = stat_fd(data.get_file(), None)?;
1917        let is_dir = (st.st_mode & libc::S_IFMT) == libc::S_IFDIR;
1918
1919        if is_dir {
1920            // Directory special handling: support SEEK_SET and SEEK_CUR with bounds checks.
1921            // Acquire the lock to get exclusive access
1922            let (_guard, file) = data.get_file_mut().await;
1923
1924            // Handle directory lseek operations according to POSIX standard
1925            // This enables seekdir/telldir functionality on directories
1926            match whence {
1927                // SEEK_SET: set directory offset to an absolute value
1928                x if x == libc::SEEK_SET as u32 => {
1929                    // Validate offset bounds to prevent overflow
1930                    // Directory offsets should not exceed i64::MAX
1931                    if offset > i64::MAX as u64 {
1932                        return Err(io::Error::from_raw_os_error(libc::EINVAL).into());
1933                    }
1934
1935                    // Perform the seek operation using libc::lseek64
1936                    // This directly manipulates the file descriptor's position
1937                    let res = unsafe {
1938                        libc::lseek64(file.as_raw_fd(), offset as libc::off64_t, libc::SEEK_SET)
1939                    };
1940                    if res < 0 {
1941                        return Err(io::Error::last_os_error().into());
1942                    }
1943                    Ok(ReplyLSeek { offset: res as u64 })
1944                }
1945                // SEEK_CUR: move relative to current directory offset
1946                x if x == libc::SEEK_CUR as u32 => {
1947                    // Get current position using libc::lseek64 with offset 0
1948                    let cur = unsafe { libc::lseek64(file.as_raw_fd(), 0, libc::SEEK_CUR) };
1949                    if cur < 0 {
1950                        return Err(io::Error::last_os_error().into());
1951                    }
1952                    let current = cur as u64;
1953
1954                    // Compute new offset safely to prevent arithmetic overflow
1955                    if let Some(new_offset) = current.checked_add(offset) {
1956                        // Ensure the new offset is within valid bounds
1957                        if new_offset > i64::MAX as u64 {
1958                            return Err(io::Error::from_raw_os_error(libc::EINVAL).into());
1959                        }
1960                        // Set the new offset using libc::lseek64
1961                        let res = unsafe {
1962                            libc::lseek64(
1963                                file.as_raw_fd(),
1964                                new_offset as libc::off64_t,
1965                                libc::SEEK_SET,
1966                            )
1967                        };
1968                        if res < 0 {
1969                            return Err(io::Error::last_os_error().into());
1970                        }
1971                        Ok(ReplyLSeek { offset: new_offset })
1972                    } else {
1973                        Err(io::Error::from_raw_os_error(libc::EINVAL).into())
1974                    }
1975                }
1976                // Other whence values are invalid for directories (e.g., SEEK_END)
1977                _ => Err(io::Error::from_raw_os_error(libc::EINVAL).into()),
1978            }
1979        } else {
1980            // File seek handling for non-directory files
1981            // Acquire the lock to get exclusive access, otherwise it may break do_readdir().
1982            let (_guard, file) = data.get_file_mut().await;
1983
1984            // Safe because this doesn't modify any memory and we check the return value.
1985            // Use 64-bit seek for regular files to match kernel offsets
1986            let res = unsafe {
1987                libc::lseek64(
1988                    file.as_raw_fd(),
1989                    offset as libc::off64_t,
1990                    whence as libc::c_int,
1991                )
1992            };
1993            if res < 0 {
1994                Err(io::Error::last_os_error().into())
1995            } else {
1996                Ok(ReplyLSeek { offset: res as u64 })
1997            }
1998        }
1999    }
2000
2001    /// Copy a range of data from one file to another using the copy_file_range system call.
2002    /// This can improve performance by reducing data copying between userspace and kernel.
2003    #[allow(clippy::too_many_arguments)]
2004    async fn copy_file_range(
2005        &self,
2006        _req: Request,
2007        inode_in: Inode,
2008        fh_in: u64,
2009        offset_in: u64,
2010        inode_out: Inode,
2011        fh_out: u64,
2012        offset_out: u64,
2013        length: u64,
2014        flags: u64,
2015    ) -> Result<ReplyCopyFileRange> {
2016        // Get the handle data for both source and destination files
2017        let data_in = self.handle_map.get(fh_in, inode_in).await?;
2018        let data_out = self.handle_map.get(fh_out, inode_out).await?;
2019
2020        // Get file descriptors
2021        let fd_in = data_in.borrow_fd().as_raw_fd();
2022        let fd_out = data_out.borrow_fd().as_raw_fd();
2023
2024        // Validate and reject unsupported flags
2025        // Linux copy_file_range currently doesn't define any flags (should be 0)
2026        if flags != 0 {
2027            return Err(io::Error::from_raw_os_error(libc::EINVAL).into());
2028        }
2029
2030        // Convert offsets to i64, checking for overflow (offsets > i64::MAX would wrap to negative)
2031        let mut off_in: i64 = offset_in
2032            .try_into()
2033            .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?;
2034        let mut off_out: i64 = offset_out
2035            .try_into()
2036            .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?;
2037
2038        // Convert length to usize, checking for overflow on 32-bit systems
2039        let len: usize = length
2040            .try_into()
2041            .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?;
2042
2043        // SAFETY: copy_file_range reads from fd_in and writes to fd_out. We pass valid
2044        // file descriptors and pointers to offset values. The syscall updates the offset
2045        // pointers to reflect the new positions after the copy, but doesn't modify the
2046        // file descriptor positions themselves (when offsets are non-NULL).
2047        let res = unsafe {
2048            libc::copy_file_range(
2049                fd_in,
2050                &mut off_in as *mut i64, // Pass offset pointer directly
2051                fd_out,
2052                &mut off_out as *mut i64, // Pass offset pointer directly
2053                len,
2054                0, // flags (must be 0, already validated above)
2055            )
2056        };
2057
2058        if res < 0 {
2059            Err(io::Error::last_os_error().into())
2060        } else {
2061            // res is guaranteed >= 0 here, safe to cast to usize then u64
2062            Ok(ReplyCopyFileRange {
2063                copied: res as usize as u64,
2064            })
2065        }
2066    }
2067}
2068
2069/// trim all trailing nul terminators.
2070pub fn bytes_to_cstr(buf: &[u8]) -> Result<&CStr> {
2071    // There might be multiple 0s at the end of buf, find & use the first one and trim other zeros.
2072    match buf.iter().position(|x| *x == 0) {
2073        // Convert to a `CStr` so that we can drop the '\0' byte at the end and make sure
2074        // there are no interior '\0' bytes.
2075        Some(pos) => CStr::from_bytes_with_nul(&buf[0..=pos]).map_err(|_| Errno::from(5)),
2076        None => {
2077            // Invalid input, just call CStr::from_bytes_with_nul() for suitable error code
2078            CStr::from_bytes_with_nul(buf).map_err(|_| Errno::from(5))
2079        }
2080    }
2081}