Skip to main content

libfuse_fs/unionfs/
mod.rs

1// Copyright (C) 2023 Ant Group. All rights reserved.
2//  2024 From [fuse_backend_rs](https://github.com/cloud-hypervisor/fuse-backend-rs)
3// SPDX-License-Identifier: Apache-2.0
4
5#![allow(missing_docs)]
6mod async_io;
7pub mod config;
8mod inode_store;
9pub mod layer;
10mod utils;
11
12//mod tempfile;
13use core::panic;
14use std::collections::HashMap;
15use std::ffi::{OsStr, OsString};
16use std::future::Future;
17use std::io::{Error, Result};
18use std::path::Path;
19
20use config::Config;
21use futures::StreamExt as _;
22use rfuse3::raw::reply::{
23    DirectoryEntry, DirectoryEntryPlus, ReplyAttr, ReplyEntry, ReplyOpen, ReplyStatFs,
24};
25use rfuse3::raw::{Request, Session};
26use std::sync::{Arc, Weak};
27use tracing::debug;
28use tracing::error;
29use tracing::info;
30use tracing::trace;
31
32use rfuse3::{Errno, FileType, MountOptions, mode_from_kind_and_perm};
// The ASCII forward-slash character.
// NOTE(review): presumably the separator used when splitting/joining overlay paths —
// its uses are outside this view; confirm.
33const SLASH_ASCII: char = '/';
34use futures::future::join_all;
35use futures::stream::iter;
36
37use crate::passthrough::{PassthroughArgs, new_passthroughfs_layer};
38use crate::util::convert_stat64_to_file_attr;
39use crate::util::whiteout::{WhiteoutFormat, is_user_creatable_name, oci_whiteout_name};
40use inode_store::InodeStore;
41use layer::Layer;
42use rfuse3::raw::logfs::LoggingFileSystem;
43use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
44
45use tokio::sync::{Mutex, RwLock};
46
// Inode number type used by this module (a plain u64).
47pub type Inode = u64;
// File/directory handle identifier type used by this module (a plain u64).
48pub type Handle = u64;
49
// A filesystem layer behind dynamic dispatch; shared in this module as `Arc<BoxedLayer>`.
50pub(crate) type BoxedLayer = dyn Layer;
51//type BoxedFileSystem = Box<dyn FileSystem<Inode = Inode, Handle = Handle> + Send + Sync>;
// 0x1_0000_0000 == 2^32.
// NOTE(review): presumably the size of the inode-number range reserved per allocation
// batch — the allocator is outside this view; confirm.
52const INODE_ALLOC_BATCH: u64 = 0x1_0000_0000;
53// RealInode represents one inode object in one specific layer.
54// Each RealInode also stands for one looked-up Entry in that layer, which is 'forgotten'
// (its lookup count released) when the RealInode is dropped.
55// Important note: do not impl the Clone trait for it or the refcount will be messed up.
56pub(crate) struct RealInode {
    // Layer this inode lives in.
57    pub layer: Arc<BoxedLayer>,
    // True when `layer` is the upper layer; the mutating methods of RealInode
    // (create_whiteout, mkdir, create, mknod, link, symlink) return EROFS otherwise.
58    pub in_upper_layer: bool,
    // Inode number inside `layer`; 0 is treated as "no such inode" by stat64().
59    pub inode: u64,
60    // The file is whiteout-ed in this layer, so it must be hidden.
61    pub whiteout: bool,
62    // The directory is opaque, so all entries from layers below it must be hidden.
63    pub opaque: bool,
    // Cached attributes; `None` when they were not (or could not be) fetched, in which
    // case users fall back to calling stat64() on demand.
64    pub stat: Option<ReplyAttr>,
65}
66
67// OverlayInode is the merged view of one path across all layers. It can be operated on by
// multiple threads, so every mutable part is behind a lock or an atomic.
68// #[derive(Default)]
69pub(crate) struct OverlayInode {
70    // Child table, mapping entry 'name' to its 'OverlayInode'.
71    pub childrens: Mutex<HashMap<String, Arc<OverlayInode>>>,
    // Parent directory, held weakly; create_upper_dir() reports an error when the
    // upgrade fails.
72    pub parent: Mutex<Weak<OverlayInode>>,
73    // Backend inodes from all layers; scan_childrens() walks them from upper to lower.
74    pub real_inodes: Mutex<Vec<Arc<RealInode>>>,
75    // Inode number of this overlay node.
76    pub inode: u64,
    // Path of this node; children are built as "<path>/<name>" in scan_childrens().
77    pub path: RwLock<String>,
    // Entry name of this node inside its parent directory.
78    pub name: RwLock<String>,
    // Lookup count; new_from_real_inode() starts it at 1.
79    pub lookups: AtomicU64,
80    // Node is whiteout-ed (deleted from the merged view).
81    pub whiteout: AtomicBool,
82    // Directory is loaded.
    // NOTE(review): presumably set once the children have been scanned — the writer is
    // outside this view; confirm.
83    pub loaded: AtomicBool,
84}
85
/// Cache policy selector; defaults to [`CachePolicy::Auto`].
///
/// NOTE(review): how each variant is interpreted is decided by code outside this
/// view (presumably the FUSE open/attr caching options) — confirm before relying
/// on a specific meaning.
///
/// The extra derives make the value loggable (`Debug`), cheap to pass around
/// (`Clone`/`Copy`) and comparable (`PartialEq`/`Eq`); all are backward-compatible.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum CachePolicy {
    Never,
    #[default]
    Auto,
    Always,
}
// Top-level state of the union/overlay filesystem: its layers, the inode store and the
// table of open handles.
93pub struct OverlayFs {
    // Configuration this filesystem was created with.
94    config: Config,
    // Lower layers.
    // NOTE(review): presumably read-only and ordered from topmost to bottommost — confirm.
95    lower_layers: Vec<Arc<BoxedLayer>>,
    // Optional upper layer; `None` means there is no upper layer configured.
96    upper_layer: Option<Arc<BoxedLayer>>,
97    // All inodes in FS.
98    inodes: RwLock<InodeStore>,
99    // Open file handles, keyed by handle number.
100    handles: Mutex<HashMap<u64, Arc<HandleData>>>,
    // Counter for handle numbers.
    // NOTE(review): presumably the next value to hand out as a key of `handles` — confirm.
101    next_handle: AtomicU64,
    // NOTE(review): the five flags below look like FUSE capabilities negotiated at init
    // time; their writers and readers are outside this view — confirm.
102    writeback: AtomicBool,
103    no_open: AtomicBool,
104    no_opendir: AtomicBool,
105    killpriv_v2: AtomicBool,
106    perfile_dax: AtomicBool,
    // NOTE(review): presumably the inode number of the filesystem root — confirm.
107    root_inodes: u64,
108}
109
110// This is a wrapper of one opened inode in a specific layer. It can't impl the Clone trait.
111struct RealHandle {
    // Layer the handle was opened on.
112    layer: Arc<BoxedLayer>,
    // True when `layer` is the upper layer.
113    in_upper_layer: bool,
    // Inode number inside `layer`.
114    inode: u64,
    // Handle number in `layer`.
    // NOTE(review): atomic presumably so it can be replaced in place (e.g. after a
    // copy-up) — confirm against the code that stores into it.
115    handle: AtomicU64,
116}
117
// Per-open-handle state kept by the overlay filesystem (stored in `OverlayFs::handles`).
118struct HandleData {
    // Overlay inode this handle refers to.
119    node: Arc<OverlayInode>,
120    //offset: libc::off_t,
    // Underlying layer handle, if there is one.
121    real_handle: Option<RealHandle>,
122    // Cached directory entries, kept so that readdir offsets stay stable.
123    // The snapshot contains all necessary info to avoid re-accessing the childrens map.
124    dir_snapshot: Mutex<Option<Vec<DirectoryEntryPlus>>>,
125}
126
127// RealInode is a wrapper of one inode in specific layer.
128// All layer operations returning Entry should be wrapped in RealInode implementation
129// so that we can increase the refcount(lookup count) of each inode and decrease it after Drop.
130// Important: do not impl 'Copy' trait for it or refcount will be messed up.
131impl RealInode {
132    async fn new(
133        layer: Arc<BoxedLayer>,
134        in_upper_layer: bool,
135        inode: u64,
136        whiteout: bool,
137        opaque: bool,
138    ) -> Self {
139        let mut ri = RealInode {
140            layer,
141            in_upper_layer,
142            inode,
143            whiteout,
144            opaque,
145            stat: None,
146        };
147        match ri.stat64_ignore_enoent(&Request::default()).await {
148            Ok(v) => {
149                ri.stat = v;
150            }
151            Err(e) => {
152                error!("stat64 failed during RealInode creation: {e}");
153            }
154        }
155        ri
156    }
157
158    async fn stat64(&self, req: &Request) -> Result<ReplyAttr> {
159        let layer = self.layer.as_ref();
160        if self.inode == 0 {
161            return Err(Error::from_raw_os_error(libc::ENOENT));
162        }
163        // trace!("stat64: trying to getattr req: {:?}", req);
164        layer
165            .getattr(*req, self.inode, None, 0)
166            .await
167            .map_err(|e| e.into())
168    }
169
170    async fn stat64_ignore_enoent(&self, req: &Request) -> Result<Option<ReplyAttr>> {
171        match self.stat64(req).await {
172            Ok(v1) => Ok(Some(v1)),
173            Err(e) => match e.raw_os_error() {
174                Some(raw_error) => {
175                    if raw_error == libc::ENOENT
176                        || raw_error == libc::ENAMETOOLONG
177                        || raw_error == libc::ESTALE
178                    {
179                        return Ok(None);
180                    }
181                    Err(e)
182                }
183                None => Err(e),
184            },
185        }
186    }
187
188    // Do real lookup action in specific layer, this call will increase Entry refcount which must be released later.
189    async fn lookup_child_ignore_enoent(
190        &self,
191        ctx: Request,
192        name: &str,
193    ) -> Result<Option<ReplyEntry>> {
194        let cname = OsStr::new(name);
195        // Real inode must have a layer.
196        let layer = self.layer.as_ref();
197        match layer.lookup(ctx, self.inode, cname).await {
198            Ok(v) => {
199                // Negative entry also indicates missing entry.
200                if v.attr.ino == 0 {
201                    return Ok(None);
202                }
203                Ok(Some(v))
204            }
205            Err(e) => {
206                let ioerror: std::io::Error = e.into();
207                if let Some(raw_error) = ioerror.raw_os_error()
208                    && (raw_error == libc::ENOENT || raw_error == libc::ENAMETOOLONG)
209                {
210                    return Ok(None);
211                }
212
213                Err(e.into())
214            }
215        }
216    }
217
218    // Find child inode in same layer under this directory(Self).
219    // Return None if not found.
220    async fn lookup_child(&self, ctx: Request, name: &str) -> Result<Option<RealInode>> {
221        if self.whiteout {
222            return Ok(None);
223        }
224
225        let layer = self.layer.as_ref();
226
227        // Find child Entry with <name> under directory with inode <self.inode>.
228        match self.lookup_child_ignore_enoent(ctx, name).await? {
229            Some(v) => {
230                // The Entry must be forgotten in each layer, which will be done automatically by Drop operation.
231                let (whiteout, opaque) = if v.attr.kind == FileType::Directory {
232                    (false, layer.is_opaque(ctx, v.attr.ino).await?)
233                } else {
234                    let is_wh = match layer.whiteout_format() {
235                        // CharDev: the looked-up entry's attr (char dev 0/0) is the marker.
236                        WhiteoutFormat::CharDev => layer.is_whiteout(ctx, v.attr.ino).await?,
237                        // OCI: the entry itself is the real file; the marker is a
238                        // sibling `.wh.<name>` in the same directory.
239                        WhiteoutFormat::OciWhiteout => {
240                            let wh_name = oci_whiteout_name(OsStr::new(name));
241                            match layer.lookup(ctx, self.inode, &wh_name).await {
242                                Ok(marker) if marker.attr.ino != 0 => {
243                                    layer.forget(ctx, marker.attr.ino, 1).await;
244                                    true
245                                }
246                                Ok(_) => false,
247                                Err(e) => {
248                                    let ie: std::io::Error = e.into();
249                                    if ie.raw_os_error() == Some(libc::ENOENT) {
250                                        false
251                                    } else {
252                                        return Err(ie);
253                                    }
254                                }
255                            }
256                        }
257                    };
258                    (is_wh, false)
259                };
260
261                Ok(Some(RealInode {
262                    layer: self.layer.clone(),
263                    in_upper_layer: self.in_upper_layer,
264                    inode: v.attr.ino,
265                    whiteout,
266                    opaque,
267                    stat: Some(ReplyAttr {
268                        ttl: v.ttl,
269                        attr: v.attr,
270                    }),
271                }))
272            }
273            None => Ok(None),
274        }
275    }
276
277    // Read directory entries from specific RealInode, error out if it's not directory.
278    async fn readdir(&self, ctx: Request) -> Result<HashMap<String, RealInode>> {
279        // Deleted inode should not be read.
280        if self.whiteout {
281            return Err(Error::from_raw_os_error(libc::ENOENT));
282        }
283        // trace!("readdir: before stat");
284        let stat = match self.stat.clone() {
285            Some(v) => v,
286            None => self.stat64(&ctx).await?,
287        };
288
289        // Must be directory.
290        if stat.attr.kind != FileType::Directory {
291            return Err(Error::from_raw_os_error(libc::ENOTDIR));
292        }
293
294        // Open the directory and load each entry.
295        let opendir_res = self
296            .layer
297            .opendir(ctx, self.inode, libc::O_RDONLY as u32)
298            .await;
299        // trace!("readdir: after opendir");
300        let handle = match opendir_res {
301            Ok(handle) => handle,
302
303            // opendir may not be supported if no_opendir is set, so we can ignore this error.
304            Err(e) => {
305                let ioerror: std::io::Error = e.into();
306                match ioerror.raw_os_error() {
307                    Some(raw_error) if raw_error == libc::ENOSYS => {
308                        // We can still call readdir with inode if opendir is not supported in this layer.
309                        ReplyOpen { fh: 0, flags: 0 }
310                    }
311                    Some(_) => {
312                        return Err(e.into());
313                    }
314                    None => {
315                        return Err(e.into());
316                    }
317                }
318            }
319        };
320
321        let child_names = self.layer.readdir(ctx, self.inode, handle.fh, 0).await?;
322        // Non-zero handle indicates successful 'open', we should 'release' it.
323        if handle.fh > 0 {
324            self.layer
325                .releasedir(ctx, self.inode, handle.fh, handle.flags)
326                .await?
327            //DIFF
328        }
329
330        // Lookup all child and construct "RealInode"s.
331        let child_real_inodes = Arc::new(Mutex::new(HashMap::new()));
332        // trace!("readdir: before iter childrens");
333        let oci_mode = matches!(self.layer.whiteout_format(), WhiteoutFormat::OciWhiteout);
334        let a_map = child_names.entries.map(|entery| async {
335            match entery {
336                Ok(dire) => {
337                    let dname = dire
338                        .name
339                        .into_string()
340                        .map_err(|_| Errno::from(libc::EINVAL))?;
341                    if dname == "." || dname == ".." {
342                        return Ok(());
343                    }
344                    if oci_mode {
345                        if crate::util::whiteout::is_oci_opaque_marker(std::ffi::OsStr::new(&dname))
346                        {
347                            return Ok(());
348                        }
349                        // Translate `.wh.<base>` into a whiteout entry under
350                        // `<base>` so union merge drops lower-layer matches.
351                        if let Some(base) =
352                            crate::util::whiteout::oci_whiteout_target(std::ffi::OsStr::new(&dname))
353                        {
354                            let base_str = base.to_string_lossy().into_owned();
355                            let marker = self
356                                .layer
357                                .lookup(ctx, self.inode, std::ffi::OsStr::new(&dname))
358                                .await?;
359                            let real = RealInode {
360                                layer: self.layer.clone(),
361                                in_upper_layer: self.in_upper_layer,
362                                inode: marker.attr.ino,
363                                whiteout: true,
364                                opaque: false,
365                                stat: Some(ReplyAttr {
366                                    ttl: marker.ttl,
367                                    attr: marker.attr,
368                                }),
369                            };
370                            child_real_inodes.lock().await.insert(base_str, real);
371                            return Ok(());
372                        }
373                        if dname.starts_with(crate::util::whiteout::OCI_WHITEOUT_PREFIX) {
374                            return Ok(());
375                        }
376                    }
377                    if let Some(child) = self.lookup_child(ctx, &dname).await? {
378                        child_real_inodes.lock().await.insert(dname, child);
379                    }
380                    Ok(())
381                }
382                Err(err) => Err(err),
383            }
384        });
385        for result in join_all(a_map.collect::<Vec<_>>().await).await {
386            result?;
387        }
388        // Now into_inner func is safety.
389        let re = Arc::try_unwrap(child_real_inodes)
390            .map_err(|_| Errno::new_not_exist())?
391            .into_inner();
392        // trace!("readdir: return");
393        Ok(re)
394    }
395
396    async fn create_whiteout(&self, ctx: Request, name: &str) -> Result<RealInode> {
397        if !self.in_upper_layer {
398            return Err(Error::from_raw_os_error(libc::EROFS));
399        }
400
401        // from &str to &OsStr
402        let name_osstr = OsStr::new(name);
403        let entry = self
404            .layer
405            .create_whiteout(ctx, self.inode, name_osstr)
406            .await?;
407
408        // Wrap whiteout to RealInode.
409        Ok(RealInode {
410            layer: self.layer.clone(),
411            in_upper_layer: true,
412            inode: entry.attr.ino,
413            whiteout: true,
414            opaque: false,
415            stat: Some(ReplyAttr {
416                ttl: entry.ttl,
417                attr: entry.attr,
418            }),
419        })
420    }
421
422    async fn mkdir(&self, ctx: Request, name: &str, mode: u32, umask: u32) -> Result<RealInode> {
423        if !self.in_upper_layer {
424            return Err(Error::from_raw_os_error(libc::EROFS));
425        }
426
427        let name_osstr = OsStr::new(name);
428        let entry = self
429            .layer
430            .mkdir(ctx, self.inode, name_osstr, mode, umask)
431            .await?;
432
433        // update node's first_layer
434        Ok(RealInode {
435            layer: self.layer.clone(),
436            in_upper_layer: true,
437            inode: entry.attr.ino,
438            whiteout: false,
439            opaque: false,
440            stat: Some(ReplyAttr {
441                ttl: entry.ttl,
442                attr: entry.attr,
443            }),
444        })
445    }
446
447    async fn create(
448        &self,
449        ctx: Request,
450        name: &str,
451        mode: u32,
452        flags: u32,
453    ) -> Result<(RealInode, Option<u64>)> {
454        if !self.in_upper_layer {
455            return Err(Error::from_raw_os_error(libc::EROFS));
456        }
457        let name = OsStr::new(name);
458        let create_rep = self
459            .layer
460            .create(ctx, self.inode, name, mode, flags)
461            .await?;
462
463        Ok((
464            RealInode {
465                layer: self.layer.clone(),
466                in_upper_layer: true,
467                inode: create_rep.attr.ino,
468                whiteout: false,
469                opaque: false,
470                stat: Some(ReplyAttr {
471                    ttl: create_rep.ttl,
472                    attr: create_rep.attr,
473                }),
474            },
475            Some(create_rep.fh),
476        ))
477    }
478
479    async fn mknod(
480        &self,
481        ctx: Request,
482        name: &str,
483        mode: u32,
484        rdev: u32,
485        _umask: u32,
486    ) -> Result<RealInode> {
487        if !self.in_upper_layer {
488            return Err(Error::from_raw_os_error(libc::EROFS));
489        }
490        let name = OsStr::new(name);
491        let rep = self.layer.mknod(ctx, self.inode, name, mode, rdev).await?;
492        Ok(RealInode {
493            layer: self.layer.clone(),
494            in_upper_layer: true,
495            inode: rep.attr.ino,
496            whiteout: false,
497            opaque: false,
498            stat: Some(ReplyAttr {
499                ttl: rep.ttl,
500                attr: rep.attr,
501            }),
502        })
503    }
504
505    async fn link(&self, ctx: Request, ino: u64, name: &str) -> Result<RealInode> {
506        if !self.in_upper_layer {
507            return Err(Error::from_raw_os_error(libc::EROFS));
508        }
509        let name = OsStr::new(name);
510        let entry = self.layer.link(ctx, ino, self.inode, name).await?;
511
512        let opaque = if utils::is_dir(&entry.attr.kind) {
513            self.layer.is_opaque(ctx, entry.attr.ino).await?
514        } else {
515            false
516        };
517        Ok(RealInode {
518            layer: self.layer.clone(),
519            in_upper_layer: true,
520            inode: entry.attr.ino,
521            whiteout: false,
522            opaque,
523            stat: Some(ReplyAttr {
524                ttl: entry.ttl,
525                attr: entry.attr,
526            }),
527        })
528    }
529
530    // Create a symlink in self directory.
531    async fn symlink(&self, ctx: Request, link_name: &str, filename: &str) -> Result<RealInode> {
532        if !self.in_upper_layer {
533            return Err(Error::from_raw_os_error(libc::EROFS));
534        }
535        let link_name = OsStr::new(link_name);
536        let filename = OsStr::new(filename);
537        let entry = self
538            .layer
539            .symlink(ctx, self.inode, filename, link_name)
540            .await?;
541
542        Ok(RealInode {
543            layer: self.layer.clone(),
544            in_upper_layer: true,
545            inode: entry.attr.ino,
546            whiteout: false,
547            opaque: false,
548            stat: Some(ReplyAttr {
549                ttl: entry.ttl,
550                attr: entry.attr,
551            }),
552        })
553    }
554}
555
556impl Drop for RealInode {
557    fn drop(&mut self) {
558        let layer = Arc::clone(&self.layer);
559        let inode = self.inode;
560        tokio::spawn(async move {
561            let ctx = Request::default();
562            layer.forget(ctx, inode, 1).await;
563        });
564    }
565}
566
567impl OverlayInode {
568    pub fn new() -> Self {
569        Self {
570            childrens: Mutex::new(HashMap::new()),
571            parent: Mutex::new(Weak::new()),
572            real_inodes: Mutex::new(vec![]),
573            inode: 0,
574            path: RwLock::new(String::new()),
575            name: RwLock::new(String::new()),
576            lookups: AtomicU64::new(0),
577            whiteout: AtomicBool::new(false),
578            loaded: AtomicBool::new(false),
579        }
580    }
581    // Allocate new OverlayInode based on one RealInode,
582    // inode number is always 0 since only OverlayFs has global unique inode allocator.
583    pub async fn new_from_real_inode(
584        name: &str,
585        ino: u64,
586        path: String,
587        real_inode: RealInode,
588    ) -> Self {
589        let mut new = OverlayInode::new();
590        new.inode = ino;
591        new.path = path.into();
592        new.name = name.to_string().into();
593        new.whiteout.store(real_inode.whiteout, Ordering::Relaxed);
594        new.lookups = AtomicU64::new(1);
595        new.real_inodes = Mutex::new(vec![real_inode.into()]);
596        new
597    }
598
599    pub async fn new_from_real_inodes(
600        name: &str,
601        ino: u64,
602        path: String,
603        real_inodes: Vec<RealInode>,
604    ) -> Result<Self> {
605        if real_inodes.is_empty() {
606            error!("BUG: new_from_real_inodes() called with empty real_inodes");
607            return Err(Error::from_raw_os_error(libc::EINVAL));
608        }
609
610        let mut first = true;
611        let mut new = Self::new();
612        for ri in real_inodes {
613            let whiteout = ri.whiteout;
614            let opaque = ri.opaque;
615            let stat = match &ri.stat {
616                Some(v) => v.clone(),
617                None => ri.stat64(&Request::default()).await?,
618            };
619
620            if first {
621                first = false;
622                new = Self::new_from_real_inode(name, ino, path.clone(), ri).await;
623
624                // This is whiteout, no need to check lower layers.
625                if whiteout {
626                    break;
627                }
628
629                // A non-directory file shadows all lower layers as default.
630                if !utils::is_dir(&stat.attr.kind) {
631                    break;
632                }
633
634                // Opaque directory shadows all lower layers.
635                if opaque {
636                    break;
637                }
638            } else {
639                // This is whiteout, no need to record this, break directly.
640                if ri.whiteout {
641                    break;
642                }
643
644                // Only directory have multiple real inodes, so if this is non-first real-inode
645                // and it's not directory, it should indicates some invalid layout. @weizhang555
646                if !utils::is_dir(&stat.attr.kind) {
647                    error!("invalid layout: non-directory has multiple real inodes");
648                    break;
649                }
650
651                // Valid directory.
652                new.real_inodes.lock().await.push(ri.into());
653                // Opaque directory shadows all lower layers.
654                if opaque {
655                    break;
656                }
657            }
658        }
659        Ok(new)
660    }
661
662    pub async fn stat64(&self, ctx: Request) -> Result<ReplyAttr> {
663        // try layers in order or just take stat from first layer?
664        for l in self.real_inodes.lock().await.iter() {
665            if let Some(v) = l.stat64_ignore_enoent(&ctx).await? {
666                return Ok(v);
667            }
668        }
669
670        // not in any layer
671        Err(Error::from_raw_os_error(libc::ENOENT))
672    }
673
674    pub async fn is_dir(&self, ctx: Request) -> Result<bool> {
675        let st = self.stat64(ctx).await?;
676        Ok(utils::is_dir(&st.attr.kind))
677    }
678
679    pub async fn count_entries_and_whiteout(&self, ctx: Request) -> Result<(u64, u64)> {
680        let mut count = 0;
681        let mut whiteouts = 0;
682
683        let st = self.stat64(ctx).await?;
684
685        // must be directory
686        if !utils::is_dir(&st.attr.kind) {
687            return Err(Error::from_raw_os_error(libc::ENOTDIR));
688        }
689
690        for (_, child) in self.childrens.lock().await.iter() {
691            if child.whiteout.load(Ordering::Relaxed) {
692                whiteouts += 1;
693            } else {
694                count += 1;
695            }
696        }
697        Ok((count, whiteouts))
698    }
699
700    pub async fn open(
701        &self,
702        ctx: Request,
703        flags: u32,
704        _fuse_flags: u32,
705    ) -> Result<(Arc<BoxedLayer>, ReplyOpen)> {
706        let (layer, _, inode) = self.first_layer_inode().await;
707        let ro = layer.as_ref().open(ctx, inode, flags).await?;
708        Ok((layer, ro))
709    }
710
711    // Self is directory, fill all childrens.
712    pub async fn scan_childrens(self: &Arc<Self>, ctx: Request) -> Result<Vec<OverlayInode>> {
713        let st = self.stat64(ctx).await?;
714        if !utils::is_dir(&st.attr.kind) {
715            return Err(Error::from_raw_os_error(libc::ENOTDIR));
716        }
717
718        let mut all_layer_inodes: HashMap<String, Vec<RealInode>> = HashMap::new();
719        // read out directories from each layer
720        // Scan from upper layer to lower layer.
721        for ri in self.real_inodes.lock().await.iter() {
722            if ri.whiteout {
723                // Node is deleted from some upper layer, skip it.
724                debug!("directory is whiteout");
725                break;
726            }
727
728            let stat = match &ri.stat {
729                Some(v) => v.clone(),
730                None => ri.stat64(&ctx).await?,
731            };
732
733            if !utils::is_dir(&stat.attr.kind) {
734                debug!("{} is not a directory", self.path.read().await);
735                // not directory
736                break;
737            }
738
739            // Read all entries from one layer.
740            let entries: HashMap<String, RealInode> = ri.readdir(ctx).await?;
741
742            // Merge entries from one layer to all_layer_inodes.
743            for (name, inode) in entries {
744                match all_layer_inodes.get_mut(&name) {
745                    Some(v) => {
746                        // Append additional RealInode to the end of vector.
747                        v.push(inode)
748                    }
749                    None => {
750                        all_layer_inodes.insert(name, vec![inode]);
751                    }
752                }
753            }
754
755            // if opaque, stop here
756            if ri.opaque {
757                debug!("directory {} is opaque", self.path.read().await);
758                break;
759            }
760        }
761
762        // Construct OverlayInode for each entry.
763        let mut childrens = vec![];
764        for (name, real_inodes) in all_layer_inodes {
765            // Inode numbers are not allocated yet.
766            let path = format!("{}/{}", self.path.read().await, name);
767            let new = Self::new_from_real_inodes(name.as_str(), 0, path, real_inodes).await?;
768            childrens.push(new);
769        }
770
771        Ok(childrens)
772    }
773
774    /// Create a new directory in upper layer for node, node must be directory.
775    ///
776    /// Recursively ensures a directory path exists in the upper layer.
777    ///
778    /// This function is a critical part of the copy-up process. When a file or directory
779    /// needs to be copied up, this function is called on its parent to ensure the entire
780    /// directory hierarchy exists in the upper layer first. It works recursively:
781    /// 1. If the current directory is already in the upper layer, it does nothing.
782    /// 2. If not, it first calls itself on its own parent directory.
783    /// 3. Once the parent is guaranteed to be in the upper layer, it creates the current
784    ///    directory within the parent's upper-layer representation.
785    ///
786    /// Crucially, it preserves the original directory's ownership (UID/GID) and permissions
787    /// by using [`getattr_with_mapping`][crate::unionfs::layer::Layer::getattr_with_mapping] and
788    /// [`mkdir_with_context`][crate::unionfs::layer::Layer::mkdir_with_context] with [`OperationContext`][crate::context::OperationContext].
789    pub async fn create_upper_dir(
790        self: Arc<Self>,
791        ctx: Request,
792        mode_umask: Option<(u32, u32)>,
793    ) -> Result<()> {
794        // To preserve original ownership, we must get the raw, unmapped host attributes.
795        // We achieve this by calling `do_getattr_helper`, which is specifically designed
796        // to bypass the ID mapping logic. This is safe and does not affect other
797        // functionalities because `do_getattr_helper` and the standard `stat64()` call
798        // both rely on the same underlying `stat` system call; they only differ in
799        // whether the resulting `uid` and `gid` are mapped.
800        let (self_layer, _, self_inode) = self.first_layer_inode().await;
801        let re = self_layer
802            .getattr_with_mapping(self_inode, None, false)
803            .await?;
804        let st = ReplyAttr {
805            ttl: re.1,
806            attr: convert_stat64_to_file_attr(re.0),
807        };
808        if !utils::is_dir(&st.attr.kind) {
809            return Err(Error::from_raw_os_error(libc::ENOTDIR));
810        }
811
812        // If node already has upper layer, we can just return here.
813        if self.in_upper_layer().await {
814            return Ok(());
815        }
816
817        // not in upper layer, check parent.
818        let pnode = if let Some(n) = self.parent.lock().await.upgrade() {
819            Arc::clone(&n)
820        } else {
821            return Err(Error::other("no parent?"));
822        };
823
824        if !pnode.in_upper_layer().await {
825            Box::pin(pnode.clone().create_upper_dir(ctx, None)).await?; // recursive call
826        }
827        let child: Arc<Mutex<Option<RealInode>>> = Arc::new(Mutex::new(None));
828        let c_name = self.name.read().await.clone();
829        let _ = pnode
830            .handle_upper_inode_locked(&mut |parent_upper_inode: Option<Arc<RealInode>>| async {
831                match parent_upper_inode {
832                    Some(parent_ri) => {
833                        let ri = match mode_umask {
834                            // We manually unfold the `mkdir` logic here instead of calling the `mkdir` method directly.
835                            // This is necessary to preserve the original directory's UID and GID during the copy-up process.
836                            Some((mode, umask)) => {
837                                if !parent_ri.in_upper_layer {
838                                    return Err(Error::from_raw_os_error(libc::EROFS));
839                                }
840                                let name_osstr = OsStr::new(&c_name);
841                                let op_ctx = crate::context::OperationContext::with_credentials(
842                                    ctx,
843                                    st.attr.uid,
844                                    st.attr.gid,
845                                );
846                                let entry = parent_ri
847                                    .layer
848                                    .mkdir_with_context(
849                                        op_ctx,
850                                        parent_ri.inode,
851                                        name_osstr,
852                                        mode,
853                                        umask,
854                                    )
855                                    .await?;
856                                RealInode {
857                                    layer: parent_ri.layer.clone(),
858                                    in_upper_layer: true,
859                                    inode: entry.attr.ino,
860                                    whiteout: false,
861                                    opaque: false,
862                                    stat: Some(ReplyAttr {
863                                        ttl: entry.ttl,
864                                        attr: entry.attr,
865                                    }),
866                                }
867                            }
868                            None => {
869                                if !parent_ri.in_upper_layer {
870                                    return Err(Error::from_raw_os_error(libc::EROFS));
871                                }
872                                let name_osstr = OsStr::new(&c_name);
873                                let op_ctx = crate::context::OperationContext::with_credentials(
874                                    ctx,
875                                    st.attr.uid,
876                                    st.attr.gid,
877                                );
878                                let entry = parent_ri
879                                    .layer
880                                    .mkdir_with_context(
881                                        op_ctx,
882                                        parent_ri.inode,
883                                        name_osstr,
884                                        mode_from_kind_and_perm(st.attr.kind, st.attr.perm),
885                                        0,
886                                    )
887                                    .await?;
888                                RealInode {
889                                    layer: parent_ri.layer.clone(),
890                                    in_upper_layer: true,
891                                    inode: entry.attr.ino,
892                                    whiteout: false,
893                                    opaque: false,
894                                    stat: Some(ReplyAttr {
895                                        ttl: entry.ttl,
896                                        attr: entry.attr,
897                                    }),
898                                }
899                            }
900                        };
901                        // create directory here
902                        child.lock().await.replace(ri);
903                    }
904                    None => {
905                        error!(
906                            "BUG: parent {} has no upper inode after create_upper_dir",
907                            pnode.inode
908                        );
909                        return Err(Error::from_raw_os_error(libc::EINVAL));
910                    }
911                }
912                Ok(false)
913            })
914            .await?;
915
916        if let Some(ri) = child.lock().await.take() {
917            // Push the new real inode to the front of vector.
918            self.add_upper_inode(ri, false).await;
919        }
920
921        Ok(())
922    }
923
924    // Add new upper RealInode to OverlayInode, clear all lower RealInodes if 'clear_lowers' is true.
925    async fn add_upper_inode(self: &Arc<Self>, ri: RealInode, clear_lowers: bool) {
926        let mut inodes = self.real_inodes.lock().await;
927        // Update self according to upper attribute.
928        self.whiteout.store(ri.whiteout, Ordering::Relaxed);
929
930        // Push the new real inode to the front of vector.
931        let mut new = vec![Arc::new(ri)];
932        // Drain lower RealInodes.
933        let lowers = inodes.drain(..).collect::<Vec<Arc<RealInode>>>();
934        if !clear_lowers {
935            // If not clear lowers, append them to the end of vector.
936            new.extend(lowers);
937        }
938        inodes.extend(new);
939    }
940
941    // return the uppder layer fs.
942    pub async fn in_upper_layer(&self) -> bool {
943        let all_inodes = self.real_inodes.lock().await;
944        let first = all_inodes.first();
945        match first {
946            Some(v) => v.in_upper_layer,
947            None => false,
948        }
949    }
950
951    pub async fn upper_layer_only(&self) -> bool {
952        let real_inodes = self.real_inodes.lock().await;
953        let first = real_inodes.first();
954        match first {
955            Some(v) => {
956                if !v.in_upper_layer {
957                    false
958                } else {
959                    real_inodes.len() == 1
960                }
961            }
962            None => false,
963        }
964    }
965
966    pub async fn first_layer_inode(&self) -> (Arc<BoxedLayer>, bool, u64) {
967        let all_inodes = self.real_inodes.lock().await;
968        let first = all_inodes.first();
969        match first {
970            Some(v) => (v.layer.clone(), v.in_upper_layer, v.inode),
971            None => panic!("BUG: dangling OverlayInode"),
972        }
973    }
974
975    pub async fn child(&self, name: &str) -> Option<Arc<OverlayInode>> {
976        self.childrens.lock().await.get(name).cloned()
977    }
978
979    pub async fn remove_child(&self, name: &str) -> Option<Arc<OverlayInode>> {
980        self.childrens.lock().await.remove(name)
981    }
982
983    pub async fn insert_child(&self, name: &str, node: Arc<OverlayInode>) {
984        self.childrens.lock().await.insert(name.to_string(), node);
985    }
986
987    /// Handles operations on the upper layer inode of an `OverlayInode` in a thread-safe manner.
988    ///
989    /// This function locks the `real_inodes` field of the `OverlayInode` and retrieves the first
990    /// real inode (if any). If the first inode exists and belongs to the upper layer (`in_upper_layer` is true),
991    /// the provided callback `f` is invoked with the inode wrapped in `Some`. Otherwise, `f` is invoked with `None`.
992    ///
993    /// # Arguments
994    /// * `f`: A closure that takes an `Option<RealInode>` and returns a future. The future resolves to a `Result<bool>`.
995    ///
996    /// # Returns
997    /// * `Ok(bool)`: The result of invoking the callback `f`.
998    /// * `Err(Erron)`: An error is returned if:
999    ///   - There are no backend inodes (`real_inodes` is empty), indicating a dangling `OverlayInode`.
1000    ///   - The callback `f` itself returns an error.
1001    ///
1002    /// # Behavior
1003    /// 1. Locks the `real_inodes` field to ensure thread safety.
1004    /// 2. Checks if the first inode exists:
1005    ///    - If it exists and is in the upper layer, invokes `f(Some(inode))`.
1006    ///    - If it exists but is not in the upper layer, invokes `f(None)`.
1007    /// 3. If no inodes exist, returns an error indicating a dangling `OverlayInode`.
1008    ///
1009    /// # Example Use Case
1010    /// This function is typically used to perform operations on the upper layer inode of an `OverlayInode`,
1011    /// such as creating, modifying, or deleting files/directories in the overlay filesystem's upper layer.
1012    pub async fn handle_upper_inode_locked<F, Fut>(&self, f: F) -> Result<bool>
1013    where
1014        // Can pass a &RealInode (or None) to f for any lifetime 'a
1015        F: FnOnce(Option<Arc<RealInode>>) -> Fut,
1016        // f returns a Future that must live at least as long as 'a
1017        Fut: Future<Output = Result<bool>>,
1018    {
1019        let all_inodes = self.real_inodes.lock().await;
1020        let first = all_inodes.first();
1021        match first {
1022            Some(v) => {
1023                if v.in_upper_layer {
1024                    f(Some(v.clone())).await
1025                } else {
1026                    f(None).await
1027                }
1028            }
1029            None => Err(Error::other(format!(
1030                "BUG: dangling OverlayInode {} without any backend inode",
1031                self.inode
1032            ))),
1033        }
1034    }
1035}
1036#[allow(unused)]
1037fn entry_type_from_mode(mode: libc::mode_t) -> u8 {
1038    match mode & libc::S_IFMT {
1039        libc::S_IFBLK => libc::DT_BLK,
1040        libc::S_IFCHR => libc::DT_CHR,
1041        libc::S_IFDIR => libc::DT_DIR,
1042        libc::S_IFIFO => libc::DT_FIFO,
1043        libc::S_IFLNK => libc::DT_LNK,
1044        libc::S_IFREG => libc::DT_REG,
1045        libc::S_IFSOCK => libc::DT_SOCK,
1046        _ => libc::DT_UNKNOWN,
1047    }
1048}
1049impl OverlayFs {
1050    pub fn new(
1051        upper: Option<Arc<BoxedLayer>>,
1052        lowers: Vec<Arc<BoxedLayer>>,
1053        params: Config,
1054        root_inode: u64,
1055    ) -> Result<Self> {
1056        // load root inode
1057        Ok(OverlayFs {
1058            config: params,
1059            lower_layers: lowers,
1060            upper_layer: upper,
1061            inodes: RwLock::new(InodeStore::new()),
1062            handles: Mutex::new(HashMap::new()),
1063            next_handle: AtomicU64::new(1),
1064            writeback: AtomicBool::new(false),
1065            no_open: AtomicBool::new(false),
1066            no_opendir: AtomicBool::new(false),
1067            killpriv_v2: AtomicBool::new(false),
1068            perfile_dax: AtomicBool::new(false),
1069            root_inodes: root_inode,
1070        })
1071    }
1072
1073    pub fn root_inode(&self) -> Inode {
1074        self.root_inodes
1075    }
1076
1077    async fn alloc_inode(&self, path: &str) -> Result<u64> {
1078        self.inodes.write().await.alloc_inode(path)
1079    }
1080
1081    fn check_user_creatable_name(&self, name: &OsStr) -> Result<()> {
1082        let format = self
1083            .upper_layer
1084            .as_ref()
1085            .map(|layer| layer.whiteout_format())
1086            .unwrap_or_default();
1087        if is_user_creatable_name(format, name) {
1088            Ok(())
1089        } else {
1090            Err(Error::from_raw_os_error(libc::EINVAL))
1091        }
1092    }
1093
1094    /// Add a file layer and stack and merge the previous file layers.
1095    pub async fn push_layer(&mut self, layer: Arc<BoxedLayer>) -> Result<()> {
1096        let upper = self.upper_layer.take();
1097        if let Some(upper) = upper {
1098            self.lower_layers.push(upper);
1099        }
1100        self.upper_layer = Some(layer);
1101        // TODO: merge previous file layers. need optimization
1102        self.import().await?;
1103        Ok(())
1104    }
1105
1106    pub async fn import(&self) -> Result<()> {
1107        let mut root = OverlayInode::new();
1108        root.inode = self.root_inode();
1109        root.path = String::from("").into();
1110        root.name = String::from("").into();
1111        root.lookups = AtomicU64::new(2);
1112        root.real_inodes = Mutex::new(vec![]);
1113        let ctx = Request::default();
1114
1115        // Update upper inode
1116        if let Some(layer) = self.upper_layer.as_ref() {
1117            let ino = layer.root_inode();
1118            let real = RealInode::new(
1119                layer.clone(),
1120                true,
1121                ino,
1122                false,
1123                layer.is_opaque(ctx, ino).await?,
1124            )
1125            .await;
1126            root.real_inodes.lock().await.push(real.into());
1127        }
1128
1129        // Update lower inodes.
1130        for layer in self.lower_layers.iter() {
1131            let ino = layer.root_inode();
1132            let real: RealInode = RealInode::new(
1133                layer.clone(),
1134                false,
1135                ino,
1136                false,
1137                layer.is_opaque(ctx, ino).await?,
1138            )
1139            .await;
1140            root.real_inodes.lock().await.push(real.into());
1141        }
1142        let root_node = Arc::new(root);
1143
1144        // insert root inode into hash
1145        self.insert_inode(self.root_inode(), Arc::clone(&root_node))
1146            .await;
1147
1148        info!("loading root directory");
1149        self.load_directory(ctx, &root_node).await?;
1150        info!("loaded root directory");
1151
1152        Ok(())
1153    }
1154
1155    async fn root_node(&self) -> Arc<OverlayInode> {
1156        // Root node must exist.
1157        self.get_active_inode(self.root_inode()).await.unwrap()
1158    }
1159
1160    async fn insert_inode(&self, inode: u64, node: Arc<OverlayInode>) {
1161        self.inodes.write().await.insert_inode(inode, node).await;
1162    }
1163
1164    async fn get_active_inode(&self, inode: u64) -> Option<Arc<OverlayInode>> {
1165        self.inodes.read().await.get_inode(inode)
1166    }
1167
1168    // Get inode which is active or deleted.
1169    async fn get_all_inode(&self, inode: u64) -> Option<Arc<OverlayInode>> {
1170        let inode_store = self.inodes.read().await;
1171        match inode_store.get_inode(inode) {
1172            Some(n) => Some(n),
1173            None => inode_store.get_deleted_inode(inode),
1174        }
1175    }
1176
1177    // Return the inode only if it's permanently deleted from both self.inodes and self.deleted_inodes.
1178    async fn remove_inode(
1179        &self,
1180        inode: u64,
1181        path_removed: Option<String>,
1182    ) -> Option<Arc<OverlayInode>> {
1183        self.inodes
1184            .write()
1185            .await
1186            .remove_inode(inode, path_removed)
1187            .await
1188    }
1189
1190    // Lookup child OverlayInode with <name> under <parent> directory.
1191    // If name is empty, return parent itself.
1192    // Parent dir will be loaded, but returned OverlayInode won't.
1193    async fn lookup_node(
1194        &self,
1195        ctx: Request,
1196        parent: Inode,
1197        name: &str,
1198    ) -> Result<Arc<OverlayInode>> {
1199        if name.contains(SLASH_ASCII) {
1200            return Err(Error::from_raw_os_error(libc::EINVAL));
1201        }
1202
1203        // Parent inode is expected to be loaded before this function is called.
1204        // TODO: Is this correct?
1205        let pnode = match self.get_active_inode(parent).await {
1206            Some(v) => v,
1207            None => {
1208                match self.get_all_inode(parent).await {
1209                    Some(v) => {
1210                        trace!(
1211                            "overlayfs:mod.rs:1031:lookup_node: parent inode {parent} is deleted"
1212                        );
1213                        v
1214                    }
1215                    None => {
1216                        trace!(
1217                            "overlayfs:mod.rs:1034:lookup_node: parent inode {parent} not found"
1218                        );
1219                        // Parent inode is not found, return ENOENT.
1220                        return Err(Error::from_raw_os_error(libc::ENOENT));
1221                    }
1222                }
1223            }
1224        };
1225
1226        // Parent is whiteout-ed, return ENOENT.
1227        if pnode.whiteout.load(Ordering::Relaxed) {
1228            return Err(Error::from_raw_os_error(libc::ENOENT));
1229        }
1230
1231        let st = pnode.stat64(ctx).await?;
1232        if utils::is_dir(&st.attr.kind) && !pnode.loaded.load(Ordering::Relaxed) {
1233            // Parent is expected to be directory, load it first.
1234            self.load_directory(ctx, &pnode).await?;
1235        }
1236
1237        // Current file or dir.
1238        if name.eq(".")  
1239            // Root directory has no parent.
1240            || (parent == self.root_inode() && name.eq("..")) 
1241            // Special convention: empty name indicates current dir.
1242            || name.is_empty()
1243        {
1244            return Ok(Arc::clone(&pnode));
1245        }
1246
1247        match pnode.child(name).await {
1248            // Child is found.
1249            Some(v) => Ok(v),
1250            None => {
1251                trace!("lookup_node: child {name} not found");
1252                Err(Error::from_raw_os_error(libc::ENOENT))
1253            }
1254        }
1255    }
1256
1257    async fn lookup_node_ignore_enoent(
1258        &self,
1259        ctx: Request,
1260        parent: u64,
1261        name: &str,
1262    ) -> Result<Option<Arc<OverlayInode>>> {
1263        match self.lookup_node(ctx, parent, name).await {
1264            Ok(n) => Ok(Some(Arc::clone(&n))),
1265            Err(e) => {
1266                if let Some(raw_error) = e.raw_os_error()
1267                    && raw_error == libc::ENOENT
1268                {
1269                    return Ok(None);
1270                }
1271                Err(e)
1272            }
1273        }
1274    }
1275
1276    // Load entries of the directory from all layers, if node is not directory, return directly.
1277    async fn load_directory(&self, ctx: Request, node: &Arc<OverlayInode>) -> Result<()> {
1278        if node.loaded.load(Ordering::Relaxed) {
1279            return Ok(());
1280        }
1281
1282        // We got all childrens without inode.
1283        // info!("before scan childrens, ctx: {:?}, node: {:?}", ctx, node.inode);
1284        let childrens = node.scan_childrens(ctx).await?;
1285        // info!("scanned children");
1286
1287        // =============== Start Lock Area ===================
1288        // Lock OverlayFs inodes.
1289        let mut inode_store = self.inodes.write().await;
1290        // Lock the OverlayInode and its childrens.
1291        let mut node_children = node.childrens.lock().await;
1292
1293        // Check again in case another 'load_directory' function call gets locks and want to do duplicated work.
1294        if node.loaded.load(Ordering::Relaxed) {
1295            return Ok(());
1296        }
1297
1298        // Now we have two locks' protection, Fs inodes lock and OverlayInode's childrens lock.
1299        // info!("before iter childrens");
1300        for mut child in childrens.into_iter() {
1301            // Allocate inode for each child.
1302            let ino = inode_store.alloc_inode(&child.path.read().await)?;
1303
1304            let name = child.name.read().await.clone();
1305            child.inode = ino;
1306            // Create bi-directional link between parent and child.
1307            child.parent = Mutex::new(Arc::downgrade(node));
1308
1309            let arc_child = Arc::new(child);
1310            node_children.insert(name, arc_child.clone());
1311            // Record overlay inode in whole OverlayFs.
1312            inode_store.insert_inode(ino, arc_child).await;
1313        }
1314        // info!("after iter childrens");
1315
1316        node.loaded.store(true, Ordering::Relaxed);
1317
1318        Ok(())
1319    }
1320
1321    async fn forget_one(&self, inode: Inode, count: u64) {
1322        if inode == self.root_inode() || inode == 0 {
1323            return;
1324        }
1325
1326        let v = match self.get_all_inode(inode).await {
1327            Some(n) => n,
1328            None => {
1329                trace!("forget unknown inode: {inode}");
1330                return;
1331            }
1332        };
1333
1334        // Use fetch_update to atomically update lookups in a loop until it succeeds
1335        v.lookups
1336            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
1337                // If count is larger than current lookups, return 0
1338                // Otherwise subtract count from current lookups
1339                if current < count {
1340                    Some(0)
1341                } else {
1342                    Some(current - count)
1343                }
1344            })
1345            .expect("fetch_update failed");
1346
1347        let lookups = v.lookups.load(Ordering::Relaxed);
1348        trace!(
1349            "forget inode: {}, name {}, lookups: {}",
1350            inode,
1351            v.name.read().await,
1352            lookups
1353        );
1354        if lookups == 0 {
1355            debug!(
1356                "inode is forgotten: {}, name {}",
1357                inode,
1358                v.name.read().await
1359            );
1360            let _ = self.remove_inode(inode, None).await;
1361            let parent = v.parent.lock().await;
1362
1363            if let Some(p) = parent.upgrade() {
1364                // remove it from hashmap
1365                p.remove_child(&v.name.read().await).await;
1366            }
1367        }
1368    }
1369
1370    async fn do_lookup(&self, ctx: Request, parent: Inode, name: &str) -> Result<ReplyEntry> {
1371        let node = self.lookup_node(ctx, parent, name).await?;
1372        debug!("do_lookup: {name:?}, found");
1373
1374        if node.whiteout.load(Ordering::Relaxed) {
1375            eprintln!("Error: node.whiteout.load() called.");
1376            return Err(Error::from_raw_os_error(libc::ENOENT));
1377        }
1378
1379        let mut st = node.stat64(ctx).await?;
1380        st.attr.ino = node.inode;
1381        if utils::is_dir(&st.attr.kind) && !node.loaded.load(Ordering::Relaxed) {
1382            self.load_directory(ctx, &node).await?;
1383        }
1384
1385        // FIXME: can forget happen between found and increase reference counter?
1386        let tmp = node.lookups.fetch_add(1, Ordering::Relaxed);
1387        trace!("lookup count: {}", tmp + 1);
1388        Ok(ReplyEntry {
1389            ttl: st.ttl,
1390            attr: st.attr,
1391            generation: 0,
1392        })
1393    }
1394
1395    async fn do_statvfs(&self, ctx: Request, inode: Inode) -> Result<ReplyStatFs> {
1396        match self.get_active_inode(inode).await {
1397            Some(ovi) => {
1398                let all_inodes = ovi.real_inodes.lock().await;
1399                let real_inode = all_inodes
1400                    .first()
1401                    .ok_or(Error::other("backend inode not found"))?;
1402                Ok(real_inode.layer.statfs(ctx, real_inode.inode).await?)
1403            }
1404            None => Err(Error::from_raw_os_error(libc::ENOENT)),
1405        }
1406    }
1407
1408    #[allow(clippy::too_many_arguments)]
1409    async fn do_readdir<'a>(
1410        &self,
1411        ctx: Request,
1412        inode: Inode,
1413        handle: u64,
1414        offset: u64,
1415    ) -> Result<
1416        impl futures_util::stream::Stream<Item = std::result::Result<DirectoryEntry, Errno>> + Send + 'a,
1417    > {
1418        let snapshot = self.get_or_create_dir_snapshot(ctx, inode, handle).await?;
1419
1420        let entries: Vec<std::result::Result<DirectoryEntry, Errno>> =
1421            if offset < snapshot.len() as u64 {
1422                snapshot
1423                    .iter()
1424                    .skip(offset as usize)
1425                    .map(|entry| {
1426                        Ok(DirectoryEntry {
1427                            inode: entry.inode,
1428                            kind: entry.kind,
1429                            name: entry.name.clone(),
1430                            offset: entry.offset,
1431                        })
1432                    })
1433                    .collect()
1434            } else {
1435                vec![]
1436            };
1437
1438        Ok(iter(entries))
1439    }
1440
1441    #[allow(clippy::too_many_arguments)]
1442    async fn do_readdirplus<'a>(
1443        &self,
1444        ctx: Request,
1445        inode: Inode,
1446        handle: u64,
1447        offset: u64,
1448    ) -> Result<
1449        impl futures_util::stream::Stream<Item = std::result::Result<DirectoryEntryPlus, Errno>>
1450        + Send
1451        + 'a,
1452    > {
1453        let snapshot = self.get_or_create_dir_snapshot(ctx, inode, handle).await?;
1454
1455        let mut entries = Vec::new();
1456        if offset < snapshot.len() as u64 {
1457            for entry in snapshot.iter().skip(offset as usize) {
1458                // Increment lookup count for readdirplus as we are handing out a reference to the kernel.
1459                // We must do this here, not in snapshot creation, and we must NOT decrement it in HandleData drop.
1460                // The kernel will send a FORGET request when it's done with the entry.
1461                if let Some(node) = self.get_all_inode(entry.inode).await {
1462                    node.lookups.fetch_add(1, Ordering::Relaxed);
1463                }
1464                entries.push(Ok(entry.clone()));
1465            }
1466        }
1467
1468        Ok(iter(entries))
1469    }
1470
    /// Returns the directory listing snapshot for `handle`, building and caching
    /// it in the handle's `dir_snapshot` on first use.
    ///
    /// When `handle` is unknown or belongs to a different inode, the directory is
    /// resolved from `inode` directly and the snapshot is built for this call only.
    ///
    /// The snapshot contains "." (offset 1), ".." (offset 2) and then every child
    /// that is not a whiteout, numbered consecutively from offset 3. All reported
    /// inode numbers are overlay inode numbers.
    ///
    /// # Errors
    /// * `ENOTDIR` if the fallback path resolves `inode` to a non-directory.
    /// * Any error from `lookup_node`, `load_directory` or `stat64`.
    async fn get_or_create_dir_snapshot(
        &self,
        ctx: Request,
        inode: Inode,
        handle: u64,
    ) -> Result<Vec<DirectoryEntryPlus>> {
        // NOTE(review): the guard returned by `self.handles.lock()` is a temporary of
        // the match scrutinee, so it looks like it stays held while the fallback arm
        // runs `lookup_node`/`stat64` — confirm this is intended.
        let handle_data = match self.handles.lock().await.get(&handle) {
            Some(hd) if hd.node.inode == inode => hd.clone(),
            _ => {
                // Fallback for cases without a valid handle (e.g. no-opendir)
                let node = self.lookup_node(ctx, inode, ".").await?;
                let st = node.stat64(ctx).await?;
                if !utils::is_dir(&st.attr.kind) {
                    return Err(Error::from_raw_os_error(libc::ENOTDIR));
                }
                // Create a temporary HandleData for this call only.
                Arc::new(HandleData {
                    node,
                    real_handle: None,
                    dir_snapshot: Mutex::new(None),
                })
            }
        };

        // Optimistic check: reuse an already installed snapshot.
        if let Some(snapshot) = handle_data.dir_snapshot.lock().await.as_ref() {
            return Ok(snapshot.clone());
        }

        // Snapshot doesn't exist, create it. The snapshot lock is not held while
        // the entries are collected; the race is resolved at the end.
        let ovl_inode = &handle_data.node;
        self.load_directory(ctx, ovl_inode).await?;

        let mut entries = Vec::new();

        // 1. Add "." entry
        let mut st_self = ovl_inode.stat64(ctx).await?;
        // Report the overlay inode number rather than the one from the stat result.
        st_self.attr.ino = ovl_inode.inode;
        entries.push(DirectoryEntryPlus {
            inode: ovl_inode.inode,
            generation: 0,
            kind: st_self.attr.kind,
            name: ".".into(),
            offset: 1,
            attr: st_self.attr,
            entry_ttl: st_self.ttl,
            attr_ttl: st_self.ttl,
        });

        // 2. Add ".." entry. When the parent reference can no longer be upgraded,
        // the root node is used instead.
        let parent_node = match ovl_inode.parent.lock().await.upgrade() {
            Some(node) => node,
            None => self.root_node().await,
        };
        let mut st_parent = parent_node.stat64(ctx).await?;
        st_parent.attr.ino = parent_node.inode;
        entries.push(DirectoryEntryPlus {
            inode: parent_node.inode,
            generation: 0,
            kind: st_parent.attr.kind,
            name: "..".into(),
            offset: 2,
            attr: st_parent.attr,
            entry_ttl: st_parent.ttl,
            attr_ttl: st_parent.ttl,
        });

        // 3. Add children entries. The children lock is held for the whole loop.
        let children = ovl_inode.childrens.lock().await;
        for (name, child) in children.iter() {
            // Whiteout children are hidden from the listing.
            if child.whiteout.load(Ordering::Relaxed) {
                continue;
            }
            let mut st_child = child.stat64(ctx).await?;
            st_child.attr.ino = child.inode;
            entries.push(DirectoryEntryPlus {
                inode: child.inode,
                generation: 0,
                kind: st_child.attr.kind,
                name: name.clone().into(),
                // Evaluated before the push: the entry's 1-based position in the snapshot.
                offset: (entries.len() + 1) as i64,
                attr: st_child.attr,
                entry_ttl: st_child.ttl,
                attr_ttl: st_child.ttl,
            });
        }
        drop(children);

        let mut snapshot_guard = handle_data.dir_snapshot.lock().await;
        if snapshot_guard.is_none() {
            // We won the race, install our prepared snapshot.
            *snapshot_guard = Some(entries.clone());
            Ok(entries)
        } else {
            // Another thread won the race while we were preparing.
            // Discard our work and use the existing snapshot.
            Ok(snapshot_guard.as_ref().unwrap().clone())
        }
    }
1570
    /// Creates the directory `name` under `parent_node` in the upper layer and
    /// registers the new overlay inode in the inode store and under its parent.
    ///
    /// If a whiteout with the same name exists, it is replaced: an upper-layer
    /// whiteout is deleted first, and the new directory is marked opaque when the
    /// whiteout node is not backed solely by the upper layer.
    ///
    /// # Errors
    /// * `EROFS` if there is no upper layer.
    /// * `ENOENT` if the parent is a whiteout.
    /// * `EINVAL` if `name` is rejected by `check_user_creatable_name`, or if the
    ///   parent has no upper inode after the copy-up.
    /// * `EEXIST` if a non-whiteout node named `name` already exists.
    /// * Any error from lookup, copy-up, inode allocation, `mkdir` or `set_opaque`.
    async fn do_mkdir(
        &self,
        ctx: Request,
        parent_node: Arc<OverlayInode>,
        name: &str,
        mode: u32,
        umask: u32,
    ) -> Result<()> {
        if self.upper_layer.is_none() {
            return Err(Error::from_raw_os_error(libc::EROFS));
        }

        // Parent node was deleted.
        if parent_node.whiteout.load(Ordering::Relaxed) {
            return Err(Error::from_raw_os_error(libc::ENOENT));
        }
        self.check_user_creatable_name(OsStr::new(name))?;

        let mut delete_whiteout = false;
        let mut set_opaque = false;
        if let Some(n) = self
            .lookup_node_ignore_enoent(ctx, parent_node.inode, name)
            .await?
        {
            // Node with same name exists, let's check if it's whiteout.
            if !n.whiteout.load(Ordering::Relaxed) {
                return Err(Error::from_raw_os_error(libc::EEXIST));
            }

            // Only a whiteout whose first real inode is in the upper layer is deleted below.
            if n.in_upper_layer().await {
                delete_whiteout = true;
            }

            // Set opaque if child dir has lower layers.
            if !n.upper_layer_only().await {
                set_opaque = true;
            }
        }

        // Copy parent node up if necessary.
        // NOTE(review): `copy_node_up` is presumably what guarantees the parent has an
        // upper inode for the closure below — confirm against its definition.
        let pnode = self.copy_node_up(ctx, parent_node).await?;

        let path = format!("{}/{}", pnode.path.read().await, name);
        let path_ref = &path;
        // Filled in by the closure with the new overlay inode.
        let new_node = Arc::new(Mutex::new(None));
        pnode
            .handle_upper_inode_locked(&mut |parent_real_inode: Option<Arc<RealInode>>| async {
                let parent_real_inode = match parent_real_inode {
                    Some(inode) => inode,
                    None => {
                        error!("BUG: parent doesn't have upper inode after copied up");
                        return Err(Error::from_raw_os_error(libc::EINVAL));
                    }
                };
                let osstr = OsStr::new(name);
                if delete_whiteout {
                    // The result is discarded: a failure to delete the whiteout is ignored here.
                    let _ = parent_real_inode
                        .layer
                        .delete_whiteout(ctx, parent_real_inode.inode, osstr)
                        .await;
                }

                // Allocate inode number. This happens before the directory is created.
                let ino = self.alloc_inode(path_ref).await?;
                let child_dir = parent_real_inode.mkdir(ctx, name, mode, umask).await?;
                // Set opaque if child dir has lower layers.
                if set_opaque {
                    parent_real_inode
                        .layer
                        .set_opaque(ctx, child_dir.inode)
                        .await?;
                }
                let ovi =
                    OverlayInode::new_from_real_inode(name, ino, path_ref.clone(), child_dir).await;
                new_node.lock().await.replace(ovi);
                Ok(false)
            })
            .await?;

        // new_node is always 'Some': every path through the closure that returns Ok
        // stores the node first, and an Err has already been propagated above.
        let nn = new_node.lock().await.take();
        let arc_node = Arc::new(nn.unwrap());
        self.insert_inode(arc_node.inode, arc_node.clone()).await;
        pnode.insert_child(name, arc_node).await;
        Ok(())
    }
1657
1658    async fn do_mknod(
1659        &self,
1660        ctx: Request,
1661        parent_node: &Arc<OverlayInode>,
1662        name: &str,
1663        mode: u32,
1664        rdev: u32,
1665        umask: u32,
1666    ) -> Result<()> {
1667        if self.upper_layer.is_none() {
1668            return Err(Error::from_raw_os_error(libc::EROFS));
1669        }
1670
1671        // Parent node was deleted.
1672        if parent_node.whiteout.load(Ordering::Relaxed) {
1673            return Err(Error::from_raw_os_error(libc::ENOENT));
1674        }
1675        self.check_user_creatable_name(OsStr::new(name))?;
1676
1677        match self
1678            .lookup_node_ignore_enoent(ctx, parent_node.inode, name)
1679            .await?
1680        {
1681            Some(n) => {
1682                // Node with same name exists, let's check if it's whiteout.
1683                if !n.whiteout.load(Ordering::Relaxed) {
1684                    return Err(Error::from_raw_os_error(libc::EEXIST));
1685                }
1686
1687                // Copy parent node up if necessary.
1688                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
1689                pnode
1690                    .handle_upper_inode_locked(
1691                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
1692                            let parent_real_inode = match parent_real_inode {
1693                                Some(inode) => inode,
1694                                None => {
1695                                    error!("BUG: parent doesn't have upper inode after copied up");
1696                                    return Err(Error::from_raw_os_error(libc::EINVAL));
1697                                }
1698                            };
1699                            let osstr = OsStr::new(name);
1700                            if n.in_upper_layer().await {
1701                                let _ = parent_real_inode
1702                                    .layer
1703                                    .delete_whiteout(ctx, parent_real_inode.inode, osstr)
1704                                    .await;
1705                            }
1706
1707                            let child_ri = parent_real_inode
1708                                .mknod(ctx, name, mode, rdev, umask)
1709                                .await?;
1710
1711                            // Replace existing real inodes with new one.
1712                            n.add_upper_inode(child_ri, true).await;
1713                            Ok(false)
1714                        },
1715                    )
1716                    .await?;
1717            }
1718            None => {
1719                // Copy parent node up if necessary.
1720                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
1721                let new_node = Arc::new(Mutex::new(None));
1722                let path = format!("{}/{}", pnode.path.read().await, name);
1723                pnode
1724                    .handle_upper_inode_locked(
1725                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
1726                            let parent_real_inode = match parent_real_inode {
1727                                Some(inode) => inode,
1728                                None => {
1729                                    error!("BUG: parent doesn't have upper inode after copied up");
1730                                    return Err(Error::from_raw_os_error(libc::EINVAL));
1731                                }
1732                            };
1733
1734                            // Allocate inode number.
1735                            let ino = self.alloc_inode(&path).await?;
1736                            let child_ri = parent_real_inode
1737                                .mknod(ctx, name, mode, rdev, umask)
1738                                .await?;
1739                            let ovi = OverlayInode::new_from_real_inode(
1740                                name,
1741                                ino,
1742                                path.clone(),
1743                                child_ri,
1744                            )
1745                            .await;
1746
1747                            new_node.lock().await.replace(ovi);
1748                            Ok(false)
1749                        },
1750                    )
1751                    .await?;
1752
1753                let nn = new_node.lock().await.take();
1754                let arc_node = Arc::new(nn.unwrap());
1755                self.insert_inode(arc_node.inode, arc_node.clone()).await;
1756                pnode.insert_child(name, arc_node).await;
1757            }
1758        }
1759
1760        Ok(())
1761    }
1762
1763    async fn do_create(
1764        &self,
1765        ctx: Request,
1766        parent_node: &Arc<OverlayInode>,
1767        name: &OsStr,
1768        mode: u32,
1769        flags: u32,
1770    ) -> Result<Option<u64>> {
1771        let name_str = name
1772            .to_str()
1773            .ok_or_else(|| Error::from_raw_os_error(libc::EINVAL))?;
1774        let upper = self
1775            .upper_layer
1776            .as_ref()
1777            .cloned()
1778            .ok_or_else(|| Error::from_raw_os_error(libc::EROFS))?;
1779
1780        // Parent node was deleted.
1781        if parent_node.whiteout.load(Ordering::Relaxed) {
1782            return Err(Error::from_raw_os_error(libc::ENOENT));
1783        }
1784        self.check_user_creatable_name(name)?;
1785
1786        let handle: Arc<Mutex<Option<u64>>> = Arc::new(Mutex::new(None));
1787        let real_ino: Arc<Mutex<Option<u64>>> = Arc::new(Mutex::new(None));
1788        let new_ovi = match self
1789            .lookup_node_ignore_enoent(ctx, parent_node.inode, name_str)
1790            .await?
1791        {
1792            Some(n) => {
1793                // Node with same name exists, let's check if it's whiteout.
1794                if !n.whiteout.load(Ordering::Relaxed) {
1795                    return Err(Error::from_raw_os_error(libc::EEXIST));
1796                }
1797
1798                // Copy parent node up if necessary.
1799                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
1800                pnode
1801                    .handle_upper_inode_locked(
1802                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
1803                            let parent_real_inode = match parent_real_inode {
1804                                Some(inode) => inode,
1805                                None => {
1806                                    error!("BUG: parent doesn't have upper inode after copied up");
1807                                    return Err(Error::from_raw_os_error(libc::EINVAL));
1808                                }
1809                            };
1810
1811                            if n.in_upper_layer().await {
1812                                let _ = parent_real_inode
1813                                    .layer
1814                                    .delete_whiteout(ctx, parent_real_inode.inode, name)
1815                                    .await;
1816                            }
1817
1818                            let (child_ri, hd) =
1819                                parent_real_inode.create(ctx, name_str, mode, flags).await?;
1820                            real_ino.lock().await.replace(child_ri.inode);
1821                            handle.lock().await.replace(hd.unwrap());
1822
1823                            // Replace existing real inodes with new one.
1824                            n.add_upper_inode(child_ri, true).await;
1825                            Ok(false)
1826                        },
1827                    )
1828                    .await?;
1829                n.clone()
1830            }
1831            None => {
1832                // Copy parent node up if necessary.
1833                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
1834                let new_node = Arc::new(Mutex::new(None));
1835                let path = format!("{}/{}", pnode.path.read().await, name_str);
1836                pnode
1837                    .handle_upper_inode_locked(
1838                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
1839                            let parent_real_inode = match parent_real_inode {
1840                                Some(inode) => inode,
1841                                None => {
1842                                    error!("BUG: parent doesn't have upper inode after copied up");
1843                                    return Err(Error::from_raw_os_error(libc::EINVAL));
1844                                }
1845                            };
1846
1847                            let (child_ri, hd) =
1848                                parent_real_inode.create(ctx, name_str, mode, flags).await?;
1849                            real_ino.lock().await.replace(child_ri.inode);
1850                            handle.lock().await.replace(hd.unwrap());
1851                            // Allocate inode number.
1852                            let ino = self.alloc_inode(&path).await?;
1853                            let ovi = OverlayInode::new_from_real_inode(
1854                                name_str,
1855                                ino,
1856                                path.clone(),
1857                                child_ri,
1858                            )
1859                            .await;
1860
1861                            new_node.lock().await.replace(ovi);
1862                            Ok(false)
1863                        },
1864                    )
1865                    .await?;
1866
1867                // new_node is always 'Some'
1868                let nn = new_node.lock().await.take();
1869                let arc_node = Arc::new(nn.unwrap());
1870                self.insert_inode(arc_node.inode, arc_node.clone()).await;
1871                pnode.insert_child(name_str, arc_node.clone()).await;
1872                arc_node
1873            }
1874        };
1875
1876        let final_handle = match *handle.lock().await {
1877            Some(hd) => {
1878                if self.no_open.load(Ordering::Relaxed) {
1879                    None
1880                } else {
1881                    let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1882                    let handle_data = HandleData {
1883                        node: new_ovi,
1884                        real_handle: Some(RealHandle {
1885                            layer: upper.clone(),
1886                            in_upper_layer: true,
1887                            inode: real_ino.lock().await.unwrap(),
1888                            handle: AtomicU64::new(hd),
1889                        }),
1890                        dir_snapshot: Mutex::new(None),
1891                    };
1892                    self.handles
1893                        .lock()
1894                        .await
1895                        .insert(handle, Arc::new(handle_data));
1896                    Some(handle)
1897                }
1898            }
1899            None => None,
1900        };
1901        Ok(final_handle)
1902    }
1903
1904    async fn do_rename(
1905        &self,
1906        req: Request,
1907        parent: Inode,
1908        name: &OsStr,
1909        new_parent: Inode,
1910        new_name: &OsStr,
1911    ) -> Result<()> {
1912        let name_str = name
1913            .to_str()
1914            .ok_or_else(|| Error::from_raw_os_error(libc::EINVAL))?;
1915        let new_name_str = new_name
1916            .to_str()
1917            .ok_or_else(|| Error::from_raw_os_error(libc::EINVAL))?;
1918        self.check_user_creatable_name(new_name)?;
1919
1920        let parent_node = self.lookup_node(req, parent, "").await?;
1921        let new_parent_node = self.lookup_node(req, new_parent, "").await?;
1922        let src_node = self.lookup_node(req, parent, name_str).await?;
1923        let dest_node_opt = self
1924            .lookup_node_ignore_enoent(req, new_parent, new_name_str)
1925            .await?;
1926        // trace!("parent_node: {}, new_parent_node: {}, src_node: {}, dest_node_opt: {:?}", parent_node.inode, new_parent_node.inode, src_node.inode, dest_node_opt.as_ref().map(|n| n.inode));
1927
1928        if let Some(dest_node) = &dest_node_opt {
1929            let src_is_dir = src_node.is_dir(req).await?;
1930            let dest_is_dir = dest_node.is_dir(req).await?;
1931            if src_is_dir != dest_is_dir {
1932                return Err(Error::from_raw_os_error(libc::EISDIR));
1933            }
1934            if dest_is_dir {
1935                self.copy_directory_up(req, dest_node.clone()).await?;
1936                let (count, _) = dest_node.count_entries_and_whiteout(req).await?;
1937                if count > 0 {
1938                    return Err(Error::from_raw_os_error(libc::ENOTEMPTY));
1939                }
1940            }
1941        }
1942
1943        let pnode = self.copy_node_up(req, parent_node).await?;
1944        let new_pnode = self.copy_node_up(req, new_parent_node).await?;
1945        let s_node = self.copy_node_up(req, src_node).await?;
1946
1947        let need_whiteout = !s_node.upper_layer_only().await;
1948
1949        let (p_layer, _, p_inode) = pnode.first_layer_inode().await;
1950        let (new_p_layer, _, new_p_inode) = new_pnode.first_layer_inode().await;
1951        assert!(Arc::ptr_eq(&p_layer, &new_p_layer));
1952
1953        p_layer
1954            .rename(req, p_inode, name, new_p_inode, new_name)
1955            .await?;
1956
1957        // Handle the replaced destination node (if any).
1958        if let Some(dest_node) = dest_node_opt {
1959            let path = dest_node.path.read().await.clone();
1960            self.remove_inode(dest_node.inode, Some(path)).await;
1961        }
1962
1963        // Update the moved source node's state.
1964
1965        // Remove from old parent.
1966        pnode.remove_child(name_str).await;
1967        self.remove_inode(s_node.inode, s_node.path.read().await.clone().into())
1968            .await;
1969        let new_path = format!("{}/{}", new_pnode.path.read().await, new_name_str);
1970        *s_node.path.write().await = new_path;
1971        *s_node.name.write().await = new_name_str.to_string();
1972        *s_node.parent.lock().await = Arc::downgrade(&new_pnode);
1973        new_pnode.insert_child(new_name_str, s_node.clone()).await;
1974        self.insert_inode(s_node.inode, s_node).await;
1975
1976        // Create whiteout at the old location if necessary.
1977        if need_whiteout {
1978            p_layer.create_whiteout(req, p_inode, name).await?;
1979        }
1980
1981        Ok(())
1982    }
1983
1984    async fn do_link(
1985        &self,
1986        ctx: Request,
1987        src_node: &Arc<OverlayInode>,
1988        new_parent: &Arc<OverlayInode>,
1989        name: &str,
1990    ) -> Result<()> {
1991        if self.upper_layer.is_none() {
1992            return Err(Error::from_raw_os_error(libc::EROFS));
1993        }
1994
1995        // Node is whiteout.
1996        if src_node.whiteout.load(Ordering::Relaxed) || new_parent.whiteout.load(Ordering::Relaxed)
1997        {
1998            return Err(Error::from_raw_os_error(libc::ENOENT));
1999        }
2000        self.check_user_creatable_name(OsStr::new(name))?;
2001
2002        let st = src_node.stat64(ctx).await?;
2003        if utils::is_dir(&st.attr.kind) {
2004            // Directory can't be hardlinked.
2005            return Err(Error::from_raw_os_error(libc::EPERM));
2006        }
2007
2008        let src_node = self.copy_node_up(ctx, Arc::clone(src_node)).await?;
2009        let new_parent = self.copy_node_up(ctx, Arc::clone(new_parent)).await?;
2010        let src_ino = src_node.first_layer_inode().await.2;
2011
2012        if let Some(existing_node) = self
2013            .lookup_node_ignore_enoent(ctx, new_parent.inode, name)
2014            .await?
2015        {
2016            // If it's not a whiteout, it's an error
2017            if !existing_node.whiteout.load(Ordering::Relaxed) {
2018                return Err(Error::from_raw_os_error(libc::EEXIST));
2019            }
2020            // If it is a whiteout, we will overwrite it.
2021            // First, remove the physical whiteout file in the upper layer.
2022            new_parent
2023                .handle_upper_inode_locked(&mut |parent_real_inode: Option<Arc<RealInode>>| async {
2024                    let parent_ri = parent_real_inode.ok_or_else(|| {
2025                        error!("BUG: parent doesn't have upper inode after copied up");
2026                        Error::from_raw_os_error(libc::EINVAL)
2027                    })?;
2028                    // Only delete if the whiteout is in the upper layer
2029                    if existing_node.in_upper_layer().await {
2030                        let _ = parent_ri
2031                            .layer
2032                            .delete_whiteout(ctx, parent_ri.inode, OsStr::new(name))
2033                            .await;
2034                    }
2035                    Ok(false)
2036                })
2037                .await?;
2038        }
2039
2040        new_parent
2041            .handle_upper_inode_locked(&mut |parent_real_inode: Option<Arc<RealInode>>| async {
2042                let parent_real_inode = match parent_real_inode {
2043                    Some(inode) => inode,
2044                    None => {
2045                        error!("BUG: parent doesn't have upper inode after copied up");
2046                        return Err(Error::from_raw_os_error(libc::EINVAL));
2047                    }
2048                };
2049
2050                parent_real_inode.link(ctx, src_ino, name).await?;
2051
2052                Ok(false)
2053            })
2054            .await?;
2055
2056        self.insert_inode(src_node.inode, src_node.clone()).await;
2057        new_parent.insert_child(name, src_node).await;
2058
2059        Ok(())
2060    }
2061
2062    async fn do_symlink(
2063        &self,
2064        ctx: Request,
2065        linkname: &str,
2066        parent_node: &Arc<OverlayInode>,
2067        name: &str,
2068    ) -> Result<()> {
2069        let name_os = OsStr::new(name);
2070        if self.upper_layer.is_none() {
2071            return Err(Error::from_raw_os_error(libc::EROFS));
2072        }
2073
2074        // parent was deleted.
2075        if parent_node.whiteout.load(Ordering::Relaxed) {
2076            return Err(Error::from_raw_os_error(libc::ENOENT));
2077        }
2078        self.check_user_creatable_name(name_os)?;
2079
2080        match self
2081            .lookup_node_ignore_enoent(ctx, parent_node.inode, name)
2082            .await?
2083        {
2084            Some(n) => {
2085                // Node with same name exists, let's check if it's whiteout.
2086                if !n.whiteout.load(Ordering::Relaxed) {
2087                    return Err(Error::from_raw_os_error(libc::EEXIST));
2088                }
2089
2090                // Copy parent node up if necessary.
2091                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
2092                pnode
2093                    .handle_upper_inode_locked(
2094                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
2095                            let parent_real_inode = match parent_real_inode {
2096                                Some(inode) => inode,
2097                                None => {
2098                                    error!("BUG: parent doesn't have upper inode after copied up");
2099                                    return Err(Error::from_raw_os_error(libc::EINVAL));
2100                                }
2101                            };
2102
2103                            if n.in_upper_layer().await {
2104                                let _ = parent_real_inode
2105                                    .layer
2106                                    .delete_whiteout(ctx, parent_real_inode.inode, name_os)
2107                                    .await;
2108                            }
2109
2110                            let child_ri = parent_real_inode.symlink(ctx, linkname, name).await?;
2111
2112                            // Replace existing real inodes with new one.
2113                            n.add_upper_inode(child_ri, true).await;
2114                            Ok(false)
2115                        },
2116                    )
2117                    .await?;
2118            }
2119            None => {
2120                // Copy parent node up if necessary.
2121                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
2122                let new_node: Arc<Mutex<Option<OverlayInode>>> = Arc::new(Mutex::new(None));
2123                let path = format!("{}/{}", pnode.path.read().await, name);
2124                pnode
2125                    .handle_upper_inode_locked(
2126                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
2127                            let parent_real_inode = match parent_real_inode {
2128                                Some(inode) => inode,
2129                                None => {
2130                                    error!("BUG: parent doesn't have upper inode after copied up");
2131                                    return Err(Error::from_raw_os_error(libc::EINVAL));
2132                                }
2133                            };
2134
2135                            // Allocate inode number.
2136                            let ino = self.alloc_inode(&path).await?;
2137                            let child_ri = parent_real_inode.symlink(ctx, linkname, name).await?;
2138                            let ovi = OverlayInode::new_from_real_inode(
2139                                name,
2140                                ino,
2141                                path.clone(),
2142                                child_ri,
2143                            )
2144                            .await;
2145
2146                            new_node.lock().await.replace(ovi);
2147                            Ok(false)
2148                        },
2149                    )
2150                    .await?;
2151
2152                // new_node is always 'Some'
2153                let arc_node = Arc::new(new_node.lock().await.take().unwrap());
2154                self.insert_inode(arc_node.inode, arc_node.clone()).await;
2155                pnode.insert_child(name, arc_node).await;
2156            }
2157        }
2158
2159        Ok(())
2160    }
2161
2162    /// Copies a symbolic link from a lower layer to the upper layer.
2163    ///
2164    /// This function is a part of the copy-up process, triggered when a symlink that
2165    /// only exists in a lower layer is modified. It reads the link target and attributes
2166    /// from the lower layer and creates an identical symlink in the upper layer, crucially
2167    /// preserving the original host UID and GID.
2168    async fn copy_symlink_up(
2169        &self,
2170        ctx: Request,
2171        node: Arc<OverlayInode>,
2172    ) -> Result<Arc<OverlayInode>> {
2173        if node.in_upper_layer().await {
2174            return Ok(node);
2175        }
2176
2177        let parent_node = if let Some(ref n) = node.parent.lock().await.upgrade() {
2178            Arc::clone(n)
2179        } else {
2180            return Err(Error::other("no parent?"));
2181        };
2182
2183        // To preserve original ownership, we must get the raw, unmapped host attributes.
2184        // We achieve this by calling `do_getattr_helper`, which is specifically designed
2185        // to bypass the ID mapping logic. This is safe and does not affect other
2186        // functionalities because `do_getattr_helper` and the standard `stat64()` call
2187        // both rely on the same underlying `stat` system call; they only differ in
2188        // whether the resulting `uid` and `gid` are mapped.
2189        let (self_layer, _, self_inode) = node.first_layer_inode().await;
2190        let re = self_layer
2191            .getattr_with_mapping(self_inode, None, false)
2192            .await?;
2193        let st = ReplyAttr {
2194            ttl: re.1,
2195            attr: convert_stat64_to_file_attr(re.0),
2196        };
2197
2198        if !parent_node.in_upper_layer().await {
2199            parent_node.clone().create_upper_dir(ctx, None).await?;
2200        }
2201
2202        // Read the linkname from lower layer.
2203        let reply_data = self_layer.readlink(ctx, self_inode).await?;
2204        // Convert path to &str.
2205        let path = std::str::from_utf8(&reply_data.data)
2206            .map_err(|_| Error::from_raw_os_error(libc::EINVAL))?;
2207
2208        let new_upper_real: Arc<Mutex<Option<RealInode>>> = Arc::new(Mutex::new(None));
2209        parent_node
2210            .handle_upper_inode_locked(&mut |parent_upper_inode: Option<Arc<RealInode>>| async {
2211                // We already create upper dir for parent_node above.
2212                let parent_real_inode =
2213                    parent_upper_inode.ok_or_else(|| Error::from_raw_os_error(libc::EROFS))?;
2214                // We manually unfold the `symlink` logic here instead of calling the `symlink` method directly.
2215                // This is necessary to preserve the original file's UID and GID during the copy-up process.
2216                if !parent_real_inode.in_upper_layer {
2217                    return Err(Error::from_raw_os_error(libc::EROFS));
2218                }
2219                let link_name = OsStr::new(path);
2220                let filename = node.name.read().await;
2221                let filename = OsStr::new(filename.as_str());
2222                let op_ctx = crate::context::OperationContext::with_credentials(
2223                    ctx,
2224                    st.attr.uid,
2225                    st.attr.gid,
2226                );
2227                let entry = parent_real_inode
2228                    .layer
2229                    .symlink_with_context(op_ctx, parent_real_inode.inode, filename, link_name)
2230                    .await?;
2231                let ri = RealInode {
2232                    layer: parent_real_inode.layer.clone(),
2233                    in_upper_layer: true,
2234                    inode: entry.attr.ino,
2235                    whiteout: false,
2236                    opaque: false,
2237                    stat: Some(ReplyAttr {
2238                        ttl: entry.ttl,
2239                        attr: entry.attr,
2240                    }),
2241                };
2242                new_upper_real.lock().await.replace(ri);
2243                Ok(false)
2244            })
2245            .await?;
2246
2247        if let Some(real_inode) = new_upper_real.lock().await.take() {
2248            // update upper_inode and first_inode()
2249            node.add_upper_inode(real_inode, true).await;
2250        }
2251
2252        Ok(node)
2253    }
2254
2255    /// macOS-only twin of `overlayfs::OverlayFs::try_macos_apfs_clone_up`.
2256    /// See that method for the full reasoning; this duplicates it because
2257    /// the unionfs layer trait carries `async_trait` bounds and a different
2258    /// `RealInode` shape, so they can't be folded into one helper without
2259    /// further refactoring.
2260    #[cfg(target_os = "macos")]
2261    async fn try_macos_apfs_clone_up_unionfs(
2262        &self,
2263        lower_layer: &Arc<BoxedLayer>,
2264        lower_inode: Inode,
2265        parent_node: &Arc<OverlayInode>,
2266        node: &Arc<OverlayInode>,
2267    ) -> Result<Option<Arc<OverlayInode>>> {
2268        use std::ffi::CString;
2269        use std::os::unix::ffi::OsStrExt;
2270
2271        let Some(src_path) = lower_layer.host_path_of(lower_inode).await else {
2272            return Ok(None);
2273        };
2274
2275        let parent_layer_inode = std::sync::Arc::new(tokio::sync::Mutex::new(None));
2276        {
2277            let pli = parent_layer_inode.clone();
2278            parent_node
2279                .handle_upper_inode_locked(&mut |parent_upper: Option<Arc<RealInode>>| async {
2280                    if let Some(p) = parent_upper {
2281                        *pli.lock().await = Some((p.layer.clone(), p.inode));
2282                    }
2283                    Ok(false)
2284                })
2285                .await?;
2286        }
2287        let Some((upper_layer, upper_parent_inode)) = parent_layer_inode.lock().await.clone()
2288        else {
2289            return Ok(None);
2290        };
2291
2292        let Some(dst_dir_path) = upper_layer.host_path_of(upper_parent_inode).await else {
2293            return Ok(None);
2294        };
2295
2296        let name_owned = node.name.read().await.clone();
2297        let dst_full = dst_dir_path.join(&name_owned);
2298
2299        let src_c = CString::new(src_path.as_os_str().as_bytes())
2300            .map_err(|_| Error::from_raw_os_error(libc::EINVAL))?;
2301        let dst_c = CString::new(dst_full.as_os_str().as_bytes())
2302            .map_err(|_| Error::from_raw_os_error(libc::EINVAL))?;
2303
2304        match crate::passthrough::util::try_apfs_clonefile(&src_c, &dst_c) {
2305            Ok(true) => {}
2306            Ok(false) => return Ok(None),
2307            Err(e) => {
2308                if e.raw_os_error() != Some(libc::ENOTSUP) && e.raw_os_error() != Some(libc::EXDEV)
2309                {
2310                    return Err(e);
2311                }
2312                return Ok(None);
2313            }
2314        }
2315
2316        let entry = upper_layer
2317            .lookup(
2318                Request::default(),
2319                upper_parent_inode,
2320                OsStr::new(&name_owned),
2321            )
2322            .await?;
2323        let real = RealInode {
2324            layer: upper_layer,
2325            in_upper_layer: true,
2326            inode: entry.attr.ino,
2327            whiteout: false,
2328            opaque: false,
2329            stat: Some(ReplyAttr {
2330                ttl: entry.ttl,
2331                attr: entry.attr,
2332            }),
2333        };
2334        node.add_upper_inode(real, true).await;
2335        Ok(Some(Arc::clone(node)))
2336    }
2337
2338    /// Copies a regular file and its contents from a lower layer to the upper layer.
2339    ///
2340    /// This function is a core part of the copy-up process, triggered when a regular file
2341    /// that only exists in a lower layer is written to. It creates an empty file in the
2342    /// upper layer with the original file's attributes (mode, UID, GID), and then copies
2343    /// the entire content from the lower layer file to the new upper layer file.
2344    async fn copy_regfile_up(
2345        &self,
2346        ctx: Request,
2347        node: Arc<OverlayInode>,
2348    ) -> Result<Arc<OverlayInode>> {
2349        if node.in_upper_layer().await {
2350            return Ok(node);
2351        }
2352
2353        let parent_node = if let Some(ref n) = node.parent.lock().await.upgrade() {
2354            Arc::clone(n)
2355        } else {
2356            return Err(Error::other("no parent?"));
2357        };
2358
2359        // To preserve original ownership, we must get the raw, unmapped host attributes.
2360        // We achieve this by calling `do_getattr_helper`, which is specifically designed
2361        // to bypass the ID mapping logic. This is safe and does not affect other
2362        // functionalities because `do_getattr_helper` and the standard `stat64()` call
2363        // both rely on the same underlying `stat` system call; they only differ in
2364        // whether the resulting `uid` and `gid` are mapped.
2365        let (lower_layer, _, lower_inode) = node.first_layer_inode().await;
2366        let re = lower_layer
2367            .getattr_with_mapping(lower_inode, None, false)
2368            .await?;
2369        let st = ReplyAttr {
2370            ttl: re.1,
2371            attr: convert_stat64_to_file_attr(re.0),
2372        };
2373        trace!(
2374            "copy_regfile_up: node {} in lower layer's inode {}",
2375            node.inode, lower_inode
2376        );
2377
2378        if !parent_node.in_upper_layer().await {
2379            parent_node.clone().create_upper_dir(ctx, None).await?;
2380        }
2381
2382        // === macOS APFS reflink fast path =================================
2383        // Mirrors the overlayfs path: if both layers expose a host-fs path,
2384        // try `clonefile(2)` and skip the create + read/write loop. Falls
2385        // back silently on EXDEV/ENOTSUP or layers that have no host path.
2386        #[cfg(target_os = "macos")]
2387        if let Some(node) = self
2388            .try_macos_apfs_clone_up_unionfs(&lower_layer, lower_inode, &parent_node, &node)
2389            .await?
2390        {
2391            return Ok(node);
2392        }
2393
2394        // create the file in upper layer using information from lower layer
2395
2396        let flags = libc::O_WRONLY;
2397        let mode = mode_from_kind_and_perm(st.attr.kind, st.attr.perm);
2398
2399        let upper_handle = Arc::new(Mutex::new(0));
2400        let upper_real_inode = Arc::new(Mutex::new(None));
2401        parent_node
2402            .handle_upper_inode_locked(&mut |parent_upper_inode: Option<Arc<RealInode>>| async {
2403                // We already create upper dir for parent_node.
2404                let parent_real_inode = parent_upper_inode.ok_or_else(|| {
2405                    error!("parent {} has no upper inode", parent_node.inode);
2406                    Error::from_raw_os_error(libc::EINVAL)
2407                })?;
2408                // We manually unfold the `create` logic here instead of calling the `create` method directly.
2409                // This is necessary to preserve the original file's UID and GID during the copy-up process.
2410                if !parent_real_inode.in_upper_layer {
2411                    return Err(Error::from_raw_os_error(libc::EROFS));
2412                }
2413                let name = node.name.read().await;
2414                let name = OsStr::new(name.as_str());
2415                let op_ctx = crate::context::OperationContext::with_credentials(
2416                    ctx,
2417                    st.attr.uid,
2418                    st.attr.gid,
2419                );
2420                let create_rep = parent_real_inode
2421                    .layer
2422                    .create_with_context(
2423                        op_ctx,
2424                        parent_real_inode.inode,
2425                        name,
2426                        mode,
2427                        flags.try_into().unwrap(),
2428                    )
2429                    .await?;
2430
2431                let (inode, h) = (
2432                    RealInode {
2433                        layer: parent_real_inode.layer.clone(),
2434                        in_upper_layer: true,
2435                        inode: create_rep.attr.ino,
2436                        whiteout: false,
2437                        opaque: false,
2438                        stat: Some(ReplyAttr {
2439                            ttl: create_rep.ttl,
2440                            attr: create_rep.attr,
2441                        }),
2442                    },
2443                    Some(create_rep.fh),
2444                );
2445                trace!(
2446                    "copy_regfile_up: created upper file {name:?} with inode {}",
2447                    inode.inode
2448                );
2449                *upper_handle.lock().await = h.unwrap_or(0);
2450                upper_real_inode.lock().await.replace(inode);
2451                Ok(false)
2452            })
2453            .await?;
2454
2455        let rep = lower_layer
2456            .open(ctx, lower_inode, libc::O_RDONLY as u32)
2457            .await?;
2458
2459        let lower_handle = rep.fh;
2460
2461        // need to use work directory and then rename file to
2462        // final destination for atomic reasons.. not deal with it for now,
2463        // use stupid copy at present.
2464        // FIXME: this need a lot of work here, ntimes, xattr, etc.
2465
2466        // Copy from lower real inode to upper real inode.
2467        // TODO: use sendfile here.
2468
2469        let u_handle = *upper_handle.lock().await;
2470        let ri = upper_real_inode.lock().await.take();
2471        if let Some(ri) = ri {
2472            let mut offset: usize = 0;
2473            let size = 4 * 1024 * 1024;
2474
2475            loop {
2476                let ret = lower_layer
2477                    .read(ctx, lower_inode, lower_handle, offset as u64, size)
2478                    .await?;
2479
2480                let len = ret.data.len();
2481                if len == 0 {
2482                    break;
2483                }
2484
2485                let ret = ri
2486                    .layer
2487                    .write(ctx, ri.inode, u_handle, offset as u64, &ret.data, 0, 0)
2488                    .await?;
2489
2490                assert_eq!(ret.written as usize, len);
2491                offset += ret.written as usize;
2492            }
2493
2494            if let Err(e) = ri.layer.release(ctx, ri.inode, u_handle, 0, 0, true).await {
2495                let e: std::io::Error = e.into();
2496                // Ignore ENOSYS.
2497                if e.raw_os_error() != Some(libc::ENOSYS) {
2498                    return Err(e);
2499                }
2500            }
2501            node.add_upper_inode(ri, true).await;
2502        } else {
2503            error!("BUG: upper real inode is None after copy up");
2504        }
2505
2506        lower_layer
2507            .release(ctx, lower_inode, lower_handle, 0, 0, true)
2508            .await?;
2509
2510        Ok(Arc::clone(&node))
2511    }
2512
2513    /// Copies the specified node to the upper layer of the filesystem
2514    ///
2515    /// Performs different operations based on the node type:
2516    /// - **Directory**: Creates a corresponding directory in the upper layer
2517    /// - **Symbolic link**: Recursively copies to the upper layer
2518    /// - **Regular file**: Copies file content to the upper layer
2519    ///
2520    /// # Parameters
2521    /// * `ctx`: FUSE request context
2522    /// * `node`: Reference to the node to be copied
2523    ///
2524    /// # Returns
2525    /// Returns a reference to the upper-layer node on success, or an error on failure
2526    async fn copy_node_up(
2527        &self,
2528        ctx: Request,
2529        node: Arc<OverlayInode>,
2530    ) -> Result<Arc<OverlayInode>> {
2531        if node.in_upper_layer().await {
2532            return Ok(node);
2533        }
2534
2535        let st = node.stat64(ctx).await?;
2536        match st.attr.kind {
2537            FileType::Directory => {
2538                node.clone().create_upper_dir(ctx, None).await?;
2539                Ok(node)
2540            }
2541            FileType::Symlink => {
2542                // For symlink.
2543                self.copy_symlink_up(ctx, node).await
2544            }
2545            FileType::RegularFile => {
2546                // For regular file.
2547                self.copy_regfile_up(ctx, node).await
2548            }
2549            _ => {
2550                // For other file types. return error.
2551                Err(Error::from_raw_os_error(libc::EINVAL))
2552            }
2553        }
2554    }
2555
2556    /// recursively copy directory and all its contents to upper layer
2557    async fn copy_directory_up(
2558        &self,
2559        ctx: Request,
2560        node: Arc<OverlayInode>,
2561    ) -> Result<Arc<OverlayInode>> {
2562        // Ensure the directory itself is copied up first
2563        self.copy_node_up(ctx, node.clone()).await?;
2564
2565        // load directory to cache
2566        self.load_directory(ctx, &node).await?;
2567
2568        // go through all children
2569        let children = node.childrens.lock().await.clone();
2570        for (_name, child) in children.iter() {
2571            if _name == "." || _name == ".." {
2572                continue;
2573            }
2574            // jump over whiteout
2575            if child.whiteout.load(Ordering::Relaxed) {
2576                continue;
2577            }
2578            let st = child.stat64(ctx).await?;
2579            if !child.in_upper_layer().await {
2580                match st.attr.kind {
2581                    FileType::Directory => {
2582                        // recursively copy subdirectory
2583                        Box::pin(self.copy_directory_up(ctx, child.clone())).await?;
2584                    }
2585                    FileType::Symlink | FileType::RegularFile => {
2586                        // copy node up symlink or regular file
2587                        Box::pin(self.copy_node_up(ctx, child.clone())).await?;
2588                    }
2589                    _ => {
2590                        // other file types are ignored
2591                    }
2592                }
2593            } else if utils::is_dir(&st.attr.kind) {
2594                // If it is already in the upper layer, but the directory is not loaded,
2595                // ensure that its contents are also copied up recursively.
2596                Box::pin(self.copy_directory_up(ctx, child.clone())).await?;
2597            }
2598        }
2599
2600        Ok(node)
2601    }
2602
2603    async fn do_rm(&self, ctx: Request, parent: u64, name: &OsStr, dir: bool) -> Result<()> {
2604        // 1. Read-only mount guard
2605        if self.upper_layer.is_none() {
2606            return Err(Error::from_raw_os_error(libc::EROFS));
2607        }
2608
2609        // 2. Locate the parent Overlay Inode.
2610        // Find parent Overlay Inode.
2611        let pnode = self.lookup_node(ctx, parent, "").await?;
2612        if pnode.whiteout.load(Ordering::Relaxed) {
2613            return Err(Error::from_raw_os_error(libc::ENOENT));
2614        }
2615        let to_name = name
2616            .to_str()
2617            .ok_or_else(|| Error::from_raw_os_error(libc::EINVAL))?;
2618
2619        // 3. Locate the child Overlay Inode for the given name
2620        // Find the Overlay Inode for child with <name>.
2621        let node = self.lookup_node(ctx, parent, to_name).await?;
2622        if node.whiteout.load(Ordering::Relaxed) {
2623            // already deleted.
2624            return Err(Error::from_raw_os_error(libc::ENOENT));
2625        }
2626
2627        // 4. If removing a directory, ensure it is empty of real entries
2628        if dir {
2629            self.load_directory(ctx, &node).await?;
2630            let (count, whiteouts) = node.count_entries_and_whiteout(ctx).await?;
2631            trace!("entries: {count}, whiteouts: {whiteouts}\n");
2632            if count > 0 {
2633                return Err(Error::from_raw_os_error(libc::ENOTEMPTY));
2634            }
2635
2636            // Delete all whiteouts.
2637            if whiteouts > 0 && node.in_upper_layer().await {
2638                self.empty_node_directory(ctx, Arc::clone(&node)).await?;
2639            }
2640
2641            trace!("whiteouts deleted!\n");
2642        }
2643
2644        // 5. Decide whether we need to create a whiteout entry
2645        // We'll filp this off if upper-layer unlink suffices or parent is opaque
2646        let need_whiteout = AtomicBool::new(true);
2647        let pnode = self.copy_node_up(ctx, Arc::clone(&pnode)).await?;
2648
2649        if node.upper_layer_only().await {
2650            need_whiteout.store(false, Ordering::Relaxed);
2651        }
2652
2653        let mut df = |parent_upper_inode: Option<Arc<RealInode>>| async {
2654            let parent_real_inode = parent_upper_inode.ok_or_else(|| {
2655                error!(
2656                    "BUG: parent {} has no upper inode after copy up",
2657                    pnode.inode
2658                );
2659                Error::from_raw_os_error(libc::EINVAL)
2660            })?;
2661
2662            // Parent is opaque, it shadows everything in lower layers so no need to create extra whiteouts.
2663            if parent_real_inode.opaque {
2664                need_whiteout.store(false, Ordering::Relaxed);
2665            }
2666            if dir {
2667                parent_real_inode
2668                    .layer
2669                    .rmdir(ctx, parent_real_inode.inode, name)
2670                    .await?;
2671            } else {
2672                parent_real_inode
2673                    .layer
2674                    .unlink(ctx, parent_real_inode.inode, name)
2675                    .await?;
2676            }
2677
2678            Ok(false)
2679        };
2680
2681        // 6. Perform the unlink/rmdir operation and memory cleanup
2682        if node.in_upper_layer().await {
2683            pnode.handle_upper_inode_locked(&mut df).await?;
2684        }
2685        pnode.remove_child(to_name).await;
2686        let path = node.path.read().await.clone();
2687        self.remove_inode(node.inode, Some(path)).await;
2688
2689        // 7. If needed, create a entry in the upper layer to mask lower-layer files
2690        if need_whiteout.load(Ordering::Relaxed) {
2691            trace!("do_rm: creating whiteout\n");
2692            // pnode is copied up, so it has upper layer.
2693            pnode
2694                .handle_upper_inode_locked(
2695                    &mut |parent_upper_inode: Option<Arc<RealInode>>| async {
2696                        let parent_real_inode = parent_upper_inode.ok_or_else(|| {
2697                            error!(
2698                                "BUG: parent {} has no upper inode after copy up",
2699                                pnode.inode
2700                            );
2701                            Error::from_raw_os_error(libc::EINVAL)
2702                        })?;
2703
2704                        let child_ri = parent_real_inode.create_whiteout(ctx, to_name).await?; //FIXME..............
2705                        let path = format!("{}/{}", pnode.path.read().await, to_name);
2706                        let ino: u64 = self.alloc_inode(&path).await?;
2707                        let ovi = Arc::new(
2708                            OverlayInode::new_from_real_inode(to_name, ino, path.clone(), child_ri)
2709                                .await,
2710                        );
2711
2712                        self.insert_inode(ino, ovi.clone()).await;
2713                        pnode.insert_child(to_name, ovi.clone()).await;
2714                        Ok(false)
2715                    },
2716                )
2717                .await?;
2718        }
2719
2720        Ok(())
2721    }
2722
2723    async fn do_fsync(
2724        &self,
2725        ctx: Request,
2726        inode: Inode,
2727        datasync: bool,
2728        handle: Handle,
2729        syncdir: bool,
2730    ) -> Result<()> {
2731        // Use O_RDONLY flags which indicates no copy up.
2732        let data = self
2733            .get_data(ctx, Some(handle), inode, libc::O_RDONLY as u32)
2734            .await?;
2735
2736        trace!("do_fsync: got data for handle: {handle}, inode:{inode}");
2737
2738        match data.real_handle {
2739            // FIXME: need to test if inode matches corresponding handle?
2740            None => {
2741                trace!("do_fsync: no real handle found for handle: {handle}, inode:{inode}");
2742                Err(Error::from_raw_os_error(libc::ENOENT))
2743            }
2744            Some(ref rh) => {
2745                let real_handle = rh.handle.load(Ordering::Relaxed);
2746                // TODO: check if it's in upper layer? @weizhang555
2747                if syncdir {
2748                    trace!(
2749                        "do_fsync: layer.fsyncdir called for handle: {}, inode:{}; rh.inode: {}, real_handle: {}",
2750                        handle, inode, rh.inode, real_handle
2751                    );
2752                    rh.layer
2753                        .fsyncdir(ctx, rh.inode, real_handle, datasync)
2754                        .await
2755                        .map_err(|e| e.into())
2756                } else {
2757                    rh.layer
2758                        .fsync(ctx, rh.inode, real_handle, datasync)
2759                        .await
2760                        .map_err(|e| e.into())
2761                }
2762            }
2763        }
2764    }
2765
2766    // Delete everything in the directory only on upper layer, ignore lower layers.
2767    async fn empty_node_directory(&self, ctx: Request, node: Arc<OverlayInode>) -> Result<()> {
2768        let st = node.stat64(ctx).await?;
2769        if !utils::is_dir(&st.attr.kind) {
2770            // This function can only be called on directories.
2771            return Err(Error::from_raw_os_error(libc::ENOTDIR));
2772        }
2773
2774        let (layer, in_upper, inode) = node.first_layer_inode().await;
2775        if !in_upper {
2776            return Ok(());
2777        }
2778
2779        // Copy node.childrens Hashmap to Vector, the Vector is also used as temp storage,
2780        // Without this, Rust won't allow us to remove them from node.childrens.
2781        let iter = node
2782            .childrens
2783            .lock()
2784            .await
2785            .values()
2786            .cloned()
2787            .collect::<Vec<_>>();
2788
2789        for child in iter {
2790            // We only care about upper layer, ignore lower layers.
2791            if child.in_upper_layer().await {
2792                let child_name = child.name.read().await.clone();
2793                let child_name_os = OsStr::new(&child_name);
2794                if child.whiteout.load(Ordering::Relaxed) {
2795                    layer.delete_whiteout(ctx, inode, child_name_os).await?
2796                } else {
2797                    let s = child.stat64(ctx).await?;
2798                    let cname: &OsStr = OsStr::new(&child_name_os);
2799                    if utils::is_dir(&s.attr.kind) {
2800                        let (count, whiteouts) = child.count_entries_and_whiteout(ctx).await?;
2801                        if count + whiteouts > 0 {
2802                            let cb = child.clone();
2803                            Box::pin(async move { self.empty_node_directory(ctx, cb).await })
2804                                .await?;
2805                        }
2806                        layer.rmdir(ctx, inode, cname).await?
2807                    } else {
2808                        layer.unlink(ctx, inode, cname).await?;
2809                    }
2810                }
2811
2812                let cpath = child.path.read().await.clone();
2813                // delete the child
2814                self.remove_inode(child.inode, Some(cpath)).await;
2815                node.remove_child(&child_name).await;
2816            }
2817        }
2818
2819        Ok(())
2820    }
2821
2822    async fn find_real_info_from_handle(
2823        &self,
2824        handle: Handle,
2825    ) -> Result<(Arc<BoxedLayer>, Inode, Handle)> {
2826        match self.handles.lock().await.get(&handle) {
2827            Some(h) => match h.real_handle {
2828                Some(ref rhd) => {
2829                    trace!(
2830                        "find_real_info_from_handle: layer in upper: {}",
2831                        rhd.in_upper_layer
2832                    );
2833                    Ok((
2834                        rhd.layer.clone(),
2835                        rhd.inode,
2836                        rhd.handle.load(Ordering::Relaxed),
2837                    ))
2838                }
2839                None => Err(Error::from_raw_os_error(libc::ENOENT)),
2840            },
2841
2842            None => Err(Error::from_raw_os_error(libc::ENOENT)),
2843        }
2844    }
2845
2846    async fn find_real_inode(&self, inode: Inode) -> Result<(Arc<BoxedLayer>, Inode)> {
2847        if let Some(n) = self.get_active_inode(inode).await {
2848            let (first_layer, _, first_inode) = n.first_layer_inode().await;
2849            return Ok((first_layer, first_inode));
2850        } else if let Some(n) = self.get_all_inode(inode).await {
2851            trace!("find_real_inode: found inode by get_all_inode: {}", n.inode);
2852            let (first_layer, _, first_inode) = n.first_layer_inode().await;
2853            return Ok((first_layer, first_inode));
2854        }
2855
2856        Err(Error::from_raw_os_error(libc::ENOENT))
2857    }
2858
2859    async fn get_data(
2860        &self,
2861        ctx: Request,
2862        handle: Option<Handle>,
2863        inode: Inode,
2864        flags: u32,
2865    ) -> Result<Arc<HandleData>> {
2866        let no_open = self.no_open.load(Ordering::Relaxed);
2867        if !no_open {
2868            if let Some(h) = handle
2869                && let Some(v) = self.handles.lock().await.get(&h)
2870                && v.node.inode == inode
2871            {
2872                // trace!("get_data: found handle");
2873                return Ok(Arc::clone(v));
2874            }
2875        } else {
2876            let readonly: bool = flags
2877                & (libc::O_APPEND | libc::O_CREAT | libc::O_TRUNC | libc::O_RDWR | libc::O_WRONLY)
2878                    as u32
2879                == 0;
2880
2881            // lookup node
2882            let node = self.lookup_node(ctx, inode, "").await?;
2883
2884            // whiteout node
2885            if node.whiteout.load(Ordering::Relaxed) {
2886                return Err(Error::from_raw_os_error(libc::ENOENT));
2887            }
2888
2889            if !readonly {
2890                // Check if upper layer exists, return EROFS is not exists.
2891                self.upper_layer
2892                    .as_ref()
2893                    .cloned()
2894                    .ok_or_else(|| Error::from_raw_os_error(libc::EROFS))?;
2895                // copy up to upper layer
2896                self.copy_node_up(ctx, Arc::clone(&node)).await?;
2897            }
2898
2899            let (layer, in_upper_layer, inode) = node.first_layer_inode().await;
2900            let handle_data = HandleData {
2901                node: Arc::clone(&node),
2902                real_handle: Some(RealHandle {
2903                    layer,
2904                    in_upper_layer,
2905                    inode,
2906                    handle: AtomicU64::new(0),
2907                }),
2908                dir_snapshot: Mutex::new(None),
2909            };
2910            return Ok(Arc::new(handle_data));
2911        }
2912
2913        Err(Error::from_raw_os_error(libc::ENOENT))
2914    }
2915
2916    // extend or init the inodes number to one overlay if the current number is done.
2917    pub async fn extend_inode_alloc(&self, key: u64) {
2918        let next_inode = key * INODE_ALLOC_BATCH;
2919        let limit_inode = next_inode + INODE_ALLOC_BATCH - 1;
2920        self.inodes
2921            .write()
2922            .await
2923            .extend_inode_number(next_inode, limit_inode);
2924    }
2925}
2926
/// Bundles the parameters accepted by [`mount_fs`] when mounting an overlay filesystem.
#[derive(Debug, Clone)]
pub struct OverlayArgs<
    P: AsRef<Path>,
    Q: AsRef<Path>,
    R: AsRef<Path>,
    M: AsRef<str>,
    N: Into<String>,
    I: IntoIterator<Item = R>,
> {
    /// Path the filesystem is mounted on.
    pub mountpoint: P,
    /// Directory backing the upper layer.
    pub upperdir: Q,
    /// Directories backing the lower layers, in iteration order.
    pub lowerdir: I,
    /// Selects the privileged mount path when `true`, the unprivileged one otherwise.
    pub privileged: bool,
    /// Optional ID-mapping string handed to every layer.
    pub mapping: Option<M>,
    /// Optional filesystem name set in the mount options.
    pub name: Option<N>,
    /// Whether other users may access the mount (`allow_other` mount option).
    pub allow_other: bool,
}
2946
2947/// Mounts the filesystem using the given parameters and returns the mount handle.
2948///
2949/// # Parameters
2950/// - `mountpoint`: Path to the mount point.
2951/// - `upperdir`: Path to the upper directory.
2952/// - `lowerdir`: Paths to the lower directories.
2953/// - `privileged`: If true, use privileged mount; otherwise, unprivileged mount.
2954/// - `mapping`: Optional user/group ID mapping for unprivileged mounts.
2955/// - `name`: Optional name for the filesystem.
2956/// - `allow_other`: If true, allows other users to access the filesystem.
2957///
2958/// # Returns
2959/// A mount handle on success.
2960pub async fn mount_fs<P, Q, R, M, N, I>(
2961    args: OverlayArgs<P, Q, R, M, N, I>,
2962) -> rfuse3::raw::MountHandle
2963where
2964    P: AsRef<Path>,
2965    Q: AsRef<Path>,
2966    R: AsRef<Path>,
2967    M: AsRef<str>,
2968    N: Into<String>,
2969    I: IntoIterator<Item = R>,
2970{
2971    // Create lower layers
2972    let mut lower_layers: Vec<Arc<BoxedLayer>> = Vec::new();
2973    for lower in args.lowerdir {
2974        let layer = new_passthroughfs_layer(PassthroughArgs {
2975            root_dir: lower,
2976            mapping: args.mapping.as_ref().map(|m| m.as_ref()),
2977        })
2978        .await
2979        .expect("Failed to create lower filesystem layer");
2980        lower_layers.push(Arc::new(layer) as Arc<BoxedLayer>);
2981    }
2982    // Create upper layer
2983    let upper_layer: Arc<BoxedLayer> = Arc::new(
2984        new_passthroughfs_layer(PassthroughArgs {
2985            root_dir: args.upperdir,
2986            mapping: args.mapping.as_ref().map(|m| m.as_ref()),
2987        })
2988        .await
2989        .expect("Failed to create upper filesystem layer"),
2990    );
2991
2992    // Configure overlay filesystem
2993    let config = Config {
2994        mountpoint: args.mountpoint.as_ref().to_path_buf(),
2995        do_import: true,
2996        ..Default::default()
2997    };
2998    let overlayfs = OverlayFs::new(Some(upper_layer), lower_layers, config, 1)
2999        .expect("Failed to initialize OverlayFs");
3000    let logfs = LoggingFileSystem::new(overlayfs);
3001
3002    let mount_path: OsString = OsString::from(args.mountpoint.as_ref().as_os_str());
3003
3004    // Obtain the current user's uid and gid
3005    let uid = unsafe { libc::getuid() };
3006    let gid = unsafe { libc::getgid() };
3007
3008    let mut mount_options = MountOptions::default();
3009    #[cfg(target_os = "linux")]
3010    mount_options.force_readdir_plus(true);
3011
3012    mount_options
3013        .uid(uid)
3014        .gid(gid)
3015        .allow_other(args.allow_other);
3016    if let Some(name) = args.name {
3017        mount_options.fs_name(name);
3018    }
3019
3020    // Mount filesystem based on privilege flag and return the mount handle
3021    if !args.privileged {
3022        debug!("Mounting with unprivileged mode");
3023        Session::new(mount_options)
3024            .mount_with_unprivileged(logfs, mount_path)
3025            .await
3026            .expect("Unprivileged mount failed")
3027    } else {
3028        debug!("Mounting with privileged mode");
3029        Session::new(mount_options)
3030            .mount(logfs, mount_path)
3031            .await
3032            .expect("Privileged mount failed")
3033    }
3034}