Skip to main content

libfuse_fs/overlayfs/
mod.rs

1// Copyright (C) 2023 Ant Group. All rights reserved.
2//  2024 From [fuse_backend_rs](https://github.com/cloud-hypervisor/fuse-backend-rs)
3// SPDX-License-Identifier: Apache-2.0
4
5#![allow(missing_docs)]
6mod async_io;
7pub mod config;
8mod inode_store;
9mod layer;
10mod utils;
11
12//mod tempfile;
13use core::panic;
14use std::collections::HashMap;
15use std::ffi::{OsStr, OsString};
16use std::future::Future;
17use std::io::{Error, Result};
18use std::path::Path;
19
20use config::Config;
21use futures::StreamExt as _;
22use rfuse3::raw::reply::{
23    DirectoryEntry, DirectoryEntryPlus, ReplyAttr, ReplyEntry, ReplyOpen, ReplyStatFs,
24};
25use rfuse3::raw::{Filesystem, Request, Session};
26use std::sync::{Arc, Weak};
27use tracing::debug;
28use tracing::error;
29use tracing::info;
30use tracing::trace;
31
32use rfuse3::{Errno, FileType, MountOptions, mode_from_kind_and_perm};
33const SLASH_ASCII: char = '/';
34use futures::future::join_all;
35use futures::stream::iter;
36
37use crate::passthrough::{PassthroughArgs, PassthroughFs, new_passthroughfs_layer};
38use crate::util::convert_stat64_to_file_attr;
39use crate::util::whiteout::{WhiteoutFormat, is_user_creatable_name, oci_whiteout_name};
40use inode_store::InodeStore;
41use layer::Layer;
42use rfuse3::raw::logfs::LoggingFileSystem;
43use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
44
45use tokio::sync::{Mutex, RwLock};
46
/// Inode number type used by this module.
pub type Inode = u64;
/// File handle number type used by this module.
pub type Handle = u64;

/// Concrete layer type; every layer is currently a `PassthroughFs`.
type BoxedLayer = PassthroughFs;
//type BoxedFileSystem = Box<dyn FileSystem<Inode = Inode, Handle = Handle> + Send + Sync>;
// NOTE(review): presumably the size of the inode-number range handed out per
// allocation batch — the allocator is not visible here, confirm before relying on it.
const INODE_ALLOC_BATCH: u64 = 0x1_0000_0000;
/// One inode object in one specific layer.
///
/// Each `RealInode` maps to one layer entry, which is 'forgotten' when the value
/// is dropped (see the `Drop` impl).
/// Important note: do not impl the `Clone` trait for it or the refcount will be messed up.
pub(crate) struct RealInode {
    /// Layer this inode belongs to.
    pub layer: Arc<PassthroughFs>,
    /// Whether `layer` is the upper layer; the mutating helpers
    /// (`mkdir`, `create`, `create_whiteout`, ...) return `EROFS` when this is false.
    pub in_upper_layer: bool,
    /// Inode number inside `layer`; `stat64` reports `ENOENT` for 0.
    pub inode: u64,
    /// File is whiteouted, we need to hide it.
    pub whiteout: bool,
    /// Directory is opaque, we need to hide all entries inside it.
    pub opaque: bool,
    /// Attributes cached when the value was created, if they were available.
    pub stat: Option<ReplyAttr>,
}
66
/// One node of the merged overlay tree.
///
/// OverlayInode must be protected by lock, it can be operated by multiple threads;
/// every mutable part is therefore wrapped in a lock or an atomic.
// #[derive(Default)]
pub(crate) struct OverlayInode {
    /// Inode hash table, map from 'name' to child 'OverlayInode'.
    pub childrens: Mutex<HashMap<String, Arc<OverlayInode>>>,
    /// Weak reference to the parent node; empty for a freshly created node.
    pub parent: Mutex<Weak<OverlayInode>>,
    /// Backend inodes from all layers.
    pub real_inodes: Mutex<Vec<Arc<RealInode>>>,
    /// Inode number.
    pub inode: u64,
    /// Path of this node; children are built as `<parent path>/<name>`.
    pub path: RwLock<String>,
    /// Entry name of this node.
    pub name: RwLock<String>,
    /// Lookup counter; starts at 1 for nodes built from a `RealInode`.
    pub lookups: AtomicU64,
    /// Node is whiteout-ed.
    pub whiteout: AtomicBool,
    /// Directory is loaded.
    pub loaded: AtomicBool,
}
85
/// Cache policy selector; `Auto` is the default.
///
/// The type is a plain field-less enum, so it derives the usual value-type traits
/// (`Debug`, `Clone`, `Copy`, `PartialEq`, `Eq`) to make it loggable, comparable
/// and cheap to pass around.
// NOTE(review): the exact caching semantics of each variant are decided by the
// code that consumes this enum, which is not visible here.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum CachePolicy {
    /// Never cache.
    Never,
    /// Let the implementation decide (default).
    #[default]
    Auto,
    /// Always cache.
    Always,
}
/// Top-level overlay filesystem state: configuration, layers, inode table and
/// open file handles.
pub struct OverlayFs {
    // Configuration of this filesystem instance.
    config: Config,
    // Lower layers.
    // NOTE(review): the ordering of this vector is not visible here — confirm.
    lower_layers: Vec<Arc<PassthroughFs>>,
    // Upper layer, if there is one.
    upper_layer: Option<Arc<PassthroughFs>>,
    // All inodes in FS.
    inodes: RwLock<InodeStore>,
    // Open file handles.
    handles: Mutex<HashMap<u64, Arc<HandleData>>>,
    // NOTE(review): presumably the next handle number to hand out — confirm.
    next_handle: AtomicU64,
    // NOTE(review): the flags below look like negotiated FUSE feature switches;
    // their exact meaning is defined where they are read, not here — confirm.
    writeback: AtomicBool,
    no_open: AtomicBool,
    no_opendir: AtomicBool,
    killpriv_v2: AtomicBool,
    perfile_dax: AtomicBool,
    // NOTE(review): presumably the inode number of the overlay root — confirm.
    root_inodes: u64,
}
109
// This is a wrapper of one open handle on an inode in a specific layer.
// It can't impl Clone trait.
struct RealHandle {
    // Layer the handle was opened in.
    layer: Arc<PassthroughFs>,
    // Whether `layer` is the upper layer.
    in_upper_layer: bool,
    // Inode number inside `layer`.
    inode: u64,
    // Handle number; atomic so it can be changed through a shared reference.
    handle: AtomicU64,
}
117
// Per-open-handle bookkeeping kept by the overlay.
struct HandleData {
    // Overlay node this handle refers to.
    node: Arc<OverlayInode>,
    //offset: libc::off_t,
    // Handle in the backing layer, when there is one.
    real_handle: Option<RealHandle>,
    // Cache the directory entries for stable readdir offsets.
    // The snapshot contains all necessary info to avoid re-accessing childrens map.
    dir_snapshot: Mutex<Option<Vec<DirectoryEntryPlus>>>,
}
126
127// RealInode is a wrapper of one inode in specific layer.
128// All layer operations returning Entry should be wrapped in RealInode implementation
129// so that we can increase the refcount(lookup count) of each inode and decrease it after Drop.
130// Important: do not impl 'Copy' trait for it or refcount will be messed up.
131impl RealInode {
132    async fn new(
133        layer: Arc<PassthroughFs>,
134        in_upper_layer: bool,
135        inode: u64,
136        whiteout: bool,
137        opaque: bool,
138    ) -> Self {
139        let mut ri = RealInode {
140            layer,
141            in_upper_layer,
142            inode,
143            whiteout,
144            opaque,
145            stat: None,
146        };
147        match ri.stat64_ignore_enoent(&Request::default()).await {
148            Ok(v) => {
149                ri.stat = v;
150            }
151            Err(e) => {
152                error!("stat64 failed during RealInode creation: {e}");
153            }
154        }
155        ri
156    }
157
158    async fn stat64(&self, req: &Request) -> Result<ReplyAttr> {
159        let layer = self.layer.as_ref();
160        if self.inode == 0 {
161            return Err(Error::from_raw_os_error(libc::ENOENT));
162        }
163        // trace!("stat64: trying to getattr req: {:?}", req);
164        layer
165            .getattr(*req, self.inode, None, 0)
166            .await
167            .map_err(|e| e.into())
168    }
169
170    async fn stat64_ignore_enoent(&self, req: &Request) -> Result<Option<ReplyAttr>> {
171        match self.stat64(req).await {
172            Ok(v1) => Ok(Some(v1)),
173            Err(e) => match e.raw_os_error() {
174                Some(raw_error) => {
175                    if raw_error == libc::ENOENT
176                        || raw_error == libc::ENAMETOOLONG
177                        || raw_error == libc::ESTALE
178                    {
179                        return Ok(None);
180                    }
181                    Err(e)
182                }
183                None => Err(e),
184            },
185        }
186    }
187
188    // Do real lookup action in specific layer, this call will increase Entry refcount which must be released later.
189    async fn lookup_child_ignore_enoent(
190        &self,
191        ctx: Request,
192        name: &str,
193    ) -> Result<Option<ReplyEntry>> {
194        let cname = OsStr::new(name);
195        // Real inode must have a layer.
196        let layer = self.layer.as_ref();
197        match layer.lookup(ctx, self.inode, cname).await {
198            Ok(v) => {
199                // Negative entry also indicates missing entry.
200                if v.attr.ino == 0 {
201                    return Ok(None);
202                }
203                Ok(Some(v))
204            }
205            Err(e) => {
206                let ioerror: std::io::Error = e.into();
207                if let Some(raw_error) = ioerror.raw_os_error()
208                    && (raw_error == libc::ENOENT || raw_error == libc::ENAMETOOLONG)
209                {
210                    return Ok(None);
211                }
212
213                Err(e.into())
214            }
215        }
216    }
217
218    // Find child inode in same layer under this directory(Self).
219    // Return None if not found.
220    async fn lookup_child(&self, ctx: Request, name: &str) -> Result<Option<RealInode>> {
221        if self.whiteout {
222            return Ok(None);
223        }
224
225        let layer = self.layer.as_ref();
226
227        // Find child Entry with <name> under directory with inode <self.inode>.
228        match self.lookup_child_ignore_enoent(ctx, name).await? {
229            Some(v) => {
230                // The Entry must be forgotten in each layer, which will be done automatically by Drop operation.
231                let (whiteout, opaque) = if v.attr.kind == FileType::Directory {
232                    (false, layer.is_opaque(ctx, v.attr.ino).await?)
233                } else {
234                    let is_wh = match layer.whiteout_format() {
235                        WhiteoutFormat::CharDev => layer.is_whiteout(ctx, v.attr.ino).await?,
236                        WhiteoutFormat::OciWhiteout => {
237                            // OCI: marker is sibling `.wh.<name>` in same dir.
238                            let wh_name = oci_whiteout_name(std::ffi::OsStr::new(name));
239                            match layer.lookup(ctx, self.inode, &wh_name).await {
240                                Ok(marker) if marker.attr.ino != 0 => {
241                                    layer.forget(ctx, marker.attr.ino, 1).await;
242                                    true
243                                }
244                                Ok(_) => false,
245                                Err(e) => {
246                                    let ie: std::io::Error = e.into();
247                                    if ie.raw_os_error() == Some(libc::ENOENT) {
248                                        false
249                                    } else {
250                                        return Err(ie);
251                                    }
252                                }
253                            }
254                        }
255                    };
256                    (is_wh, false)
257                };
258
259                Ok(Some(RealInode {
260                    layer: self.layer.clone(),
261                    in_upper_layer: self.in_upper_layer,
262                    inode: v.attr.ino,
263                    whiteout,
264                    opaque,
265                    stat: Some(ReplyAttr {
266                        ttl: v.ttl,
267                        attr: v.attr,
268                    }),
269                }))
270            }
271            None => Ok(None),
272        }
273    }
274
275    // Read directory entries from specific RealInode, error out if it's not directory.
276    async fn readdir(&self, ctx: Request) -> Result<HashMap<String, RealInode>> {
277        // Deleted inode should not be read.
278        if self.whiteout {
279            return Err(Error::from_raw_os_error(libc::ENOENT));
280        }
281        // trace!("readdir: before stat");
282        let stat = match self.stat.clone() {
283            Some(v) => v,
284            None => self.stat64(&ctx).await?,
285        };
286
287        // Must be directory.
288        if stat.attr.kind != FileType::Directory {
289            return Err(Error::from_raw_os_error(libc::ENOTDIR));
290        }
291
292        // Open the directory and load each entry.
293        let opendir_res = self
294            .layer
295            .opendir(ctx, self.inode, libc::O_RDONLY as u32)
296            .await;
297        // trace!("readdir: after opendir");
298        let handle = match opendir_res {
299            Ok(handle) => handle,
300
301            // opendir may not be supported if no_opendir is set, so we can ignore this error.
302            Err(e) => {
303                let ioerror: std::io::Error = e.into();
304                match ioerror.raw_os_error() {
305                    Some(raw_error) if raw_error == libc::ENOSYS => {
306                        // We can still call readdir with inode if opendir is not supported in this layer.
307                        ReplyOpen { fh: 0, flags: 0 }
308                    }
309                    Some(_) => {
310                        return Err(e.into());
311                    }
312                    None => {
313                        return Err(e.into());
314                    }
315                }
316            }
317        };
318
319        let child_names = self.layer.readdir(ctx, self.inode, handle.fh, 0).await?;
320        // Non-zero handle indicates successful 'open', we should 'release' it.
321        if handle.fh > 0 {
322            self.layer
323                .releasedir(ctx, self.inode, handle.fh, handle.flags)
324                .await?
325            //DIFF
326        }
327
328        // Lookup all child and construct "RealInode"s.
329        let child_real_inodes = Arc::new(Mutex::new(HashMap::new()));
330        let oci_mode = matches!(self.layer.whiteout_format(), WhiteoutFormat::OciWhiteout);
331        let a_map = child_names.entries.map(|entery| async {
332            match entery {
333                Ok(dire) => {
334                    let dname = dire
335                        .name
336                        .into_string()
337                        .map_err(|_| Errno::from(libc::EINVAL))?;
338                    if dname == "." || dname == ".." {
339                        return Ok(());
340                    }
341                    if oci_mode {
342                        // Opaque-dir marker: bookkeeping only, never surfaced.
343                        if crate::util::whiteout::is_oci_opaque_marker(std::ffi::OsStr::new(&dname))
344                        {
345                            return Ok(());
346                        }
347                        // .wh.<base> marker: hide the marker name itself, but
348                        // register a whiteout entry under <base> so that the
349                        // union-merge step correctly drops the lower-layer
350                        // entry (otherwise lower's `<base>` would leak through
351                        // any time the upper layer is re-scanned from disk).
352                        if let Some(base) =
353                            crate::util::whiteout::oci_whiteout_target(std::ffi::OsStr::new(&dname))
354                        {
355                            let base_str = base.to_string_lossy().into_owned();
356                            // Look up the marker so we have its inode (needed
357                            // for forget/unlink later) and its attrs.
358                            let marker = self
359                                .layer
360                                .lookup(ctx, self.inode, std::ffi::OsStr::new(&dname))
361                                .await?;
362                            let real = RealInode {
363                                layer: self.layer.clone(),
364                                in_upper_layer: self.in_upper_layer,
365                                inode: marker.attr.ino,
366                                whiteout: true,
367                                opaque: false,
368                                stat: Some(ReplyAttr {
369                                    ttl: marker.ttl,
370                                    attr: marker.attr,
371                                }),
372                            };
373                            child_real_inodes.lock().await.insert(base_str, real);
374                            return Ok(());
375                        }
376                        // Unknown ".wh.*" form (defensive): skip rather than
377                        // surface — same conservative choice as before.
378                        if dname.starts_with(crate::util::whiteout::OCI_WHITEOUT_PREFIX) {
379                            return Ok(());
380                        }
381                    }
382                    if let Some(child) = self.lookup_child(ctx, &dname).await? {
383                        child_real_inodes.lock().await.insert(dname, child);
384                    }
385                    Ok(())
386                }
387                Err(err) => Err(err),
388            }
389        });
390        for result in join_all(a_map.collect::<Vec<_>>().await).await {
391            result?;
392        }
393        // Now into_inner func is safety.
394        let re = Arc::try_unwrap(child_real_inodes)
395            .map_err(|_| Errno::new_not_exist())?
396            .into_inner();
397        // trace!("readdir: return");
398        Ok(re)
399    }
400
401    async fn create_whiteout(&self, ctx: Request, name: &str) -> Result<RealInode> {
402        if !self.in_upper_layer {
403            return Err(Error::from_raw_os_error(libc::EROFS));
404        }
405
406        // from &str to &OsStr
407        let name_osstr = OsStr::new(name);
408        let entry = self
409            .layer
410            .create_whiteout(ctx, self.inode, name_osstr)
411            .await?;
412
413        // Wrap whiteout to RealInode.
414        Ok(RealInode {
415            layer: self.layer.clone(),
416            in_upper_layer: true,
417            inode: entry.attr.ino,
418            whiteout: true,
419            opaque: false,
420            stat: Some(ReplyAttr {
421                ttl: entry.ttl,
422                attr: entry.attr,
423            }),
424        })
425    }
426
427    async fn mkdir(&self, ctx: Request, name: &str, mode: u32, umask: u32) -> Result<RealInode> {
428        if !self.in_upper_layer {
429            return Err(Error::from_raw_os_error(libc::EROFS));
430        }
431
432        let name_osstr = OsStr::new(name);
433        let entry = self
434            .layer
435            .mkdir(ctx, self.inode, name_osstr, mode, umask)
436            .await?;
437
438        // update node's first_layer
439        Ok(RealInode {
440            layer: self.layer.clone(),
441            in_upper_layer: true,
442            inode: entry.attr.ino,
443            whiteout: false,
444            opaque: false,
445            stat: Some(ReplyAttr {
446                ttl: entry.ttl,
447                attr: entry.attr,
448            }),
449        })
450    }
451
452    async fn create(
453        &self,
454        ctx: Request,
455        name: &str,
456        mode: u32,
457        flags: u32,
458    ) -> Result<(RealInode, Option<u64>)> {
459        if !self.in_upper_layer {
460            return Err(Error::from_raw_os_error(libc::EROFS));
461        }
462        let name = OsStr::new(name);
463        let create_rep = self
464            .layer
465            .create(ctx, self.inode, name, mode, flags)
466            .await?;
467
468        Ok((
469            RealInode {
470                layer: self.layer.clone(),
471                in_upper_layer: true,
472                inode: create_rep.attr.ino,
473                whiteout: false,
474                opaque: false,
475                stat: Some(ReplyAttr {
476                    ttl: create_rep.ttl,
477                    attr: create_rep.attr,
478                }),
479            },
480            Some(create_rep.fh),
481        ))
482    }
483
484    async fn mknod(
485        &self,
486        ctx: Request,
487        name: &str,
488        mode: u32,
489        rdev: u32,
490        _umask: u32,
491    ) -> Result<RealInode> {
492        if !self.in_upper_layer {
493            return Err(Error::from_raw_os_error(libc::EROFS));
494        }
495        let name = OsStr::new(name);
496        let rep = self.layer.mknod(ctx, self.inode, name, mode, rdev).await?;
497        Ok(RealInode {
498            layer: self.layer.clone(),
499            in_upper_layer: true,
500            inode: rep.attr.ino,
501            whiteout: false,
502            opaque: false,
503            stat: Some(ReplyAttr {
504                ttl: rep.ttl,
505                attr: rep.attr,
506            }),
507        })
508    }
509
510    async fn link(&self, ctx: Request, ino: u64, name: &str) -> Result<RealInode> {
511        if !self.in_upper_layer {
512            return Err(Error::from_raw_os_error(libc::EROFS));
513        }
514        let name = OsStr::new(name);
515        let entry = self.layer.link(ctx, ino, self.inode, name).await?;
516
517        let opaque = if utils::is_dir(&entry.attr.kind) {
518            self.layer.is_opaque(ctx, entry.attr.ino).await?
519        } else {
520            false
521        };
522        Ok(RealInode {
523            layer: self.layer.clone(),
524            in_upper_layer: true,
525            inode: entry.attr.ino,
526            whiteout: false,
527            opaque,
528            stat: Some(ReplyAttr {
529                ttl: entry.ttl,
530                attr: entry.attr,
531            }),
532        })
533    }
534
535    // Create a symlink in self directory.
536    async fn symlink(&self, ctx: Request, link_name: &str, filename: &str) -> Result<RealInode> {
537        if !self.in_upper_layer {
538            return Err(Error::from_raw_os_error(libc::EROFS));
539        }
540        let link_name = OsStr::new(link_name);
541        let filename = OsStr::new(filename);
542        let entry = self
543            .layer
544            .symlink(ctx, self.inode, filename, link_name)
545            .await?;
546
547        Ok(RealInode {
548            layer: self.layer.clone(),
549            in_upper_layer: true,
550            inode: entry.attr.ino,
551            whiteout: false,
552            opaque: false,
553            stat: Some(ReplyAttr {
554                ttl: entry.ttl,
555                attr: entry.attr,
556            }),
557        })
558    }
559}
560
561impl Drop for RealInode {
562    fn drop(&mut self) {
563        let layer = Arc::clone(&self.layer);
564        let inode = self.inode;
565        tokio::spawn(async move {
566            let ctx = Request::default();
567            layer.forget(ctx, inode, 1).await;
568        });
569    }
570}
571
572impl OverlayInode {
573    pub fn new() -> Self {
574        Self {
575            childrens: Mutex::new(HashMap::new()),
576            parent: Mutex::new(Weak::new()),
577            real_inodes: Mutex::new(vec![]),
578            inode: 0,
579            path: RwLock::new(String::new()),
580            name: RwLock::new(String::new()),
581            lookups: AtomicU64::new(0),
582            whiteout: AtomicBool::new(false),
583            loaded: AtomicBool::new(false),
584        }
585    }
586    // Allocate new OverlayInode based on one RealInode,
587    // inode number is always 0 since only OverlayFs has global unique inode allocator.
588    pub async fn new_from_real_inode(
589        name: &str,
590        ino: u64,
591        path: String,
592        real_inode: RealInode,
593    ) -> Self {
594        let mut new = OverlayInode::new();
595        new.inode = ino;
596        new.path = path.into();
597        new.name = name.to_string().into();
598        new.whiteout.store(real_inode.whiteout, Ordering::Relaxed);
599        new.lookups = AtomicU64::new(1);
600        new.real_inodes = Mutex::new(vec![real_inode.into()]);
601        new
602    }
603
604    pub async fn new_from_real_inodes(
605        name: &str,
606        ino: u64,
607        path: String,
608        real_inodes: Vec<RealInode>,
609    ) -> Result<Self> {
610        if real_inodes.is_empty() {
611            error!("BUG: new_from_real_inodes() called with empty real_inodes");
612            return Err(Error::from_raw_os_error(libc::EINVAL));
613        }
614
615        let mut first = true;
616        let mut new = Self::new();
617        for ri in real_inodes {
618            let whiteout = ri.whiteout;
619            let opaque = ri.opaque;
620            let stat = match &ri.stat {
621                Some(v) => v.clone(),
622                None => ri.stat64(&Request::default()).await?,
623            };
624
625            if first {
626                first = false;
627                new = Self::new_from_real_inode(name, ino, path.clone(), ri).await;
628
629                // This is whiteout, no need to check lower layers.
630                if whiteout {
631                    break;
632                }
633
634                // A non-directory file shadows all lower layers as default.
635                if !utils::is_dir(&stat.attr.kind) {
636                    break;
637                }
638
639                // Opaque directory shadows all lower layers.
640                if opaque {
641                    break;
642                }
643            } else {
644                // This is whiteout, no need to record this, break directly.
645                if ri.whiteout {
646                    break;
647                }
648
649                // Only directory have multiple real inodes, so if this is non-first real-inode
650                // and it's not directory, it should indicates some invalid layout. @weizhang555
651                if !utils::is_dir(&stat.attr.kind) {
652                    error!("invalid layout: non-directory has multiple real inodes");
653                    break;
654                }
655
656                // Valid directory.
657                new.real_inodes.lock().await.push(ri.into());
658                // Opaque directory shadows all lower layers.
659                if opaque {
660                    break;
661                }
662            }
663        }
664        Ok(new)
665    }
666
667    pub async fn stat64(&self, ctx: Request) -> Result<ReplyAttr> {
668        // try layers in order or just take stat from first layer?
669        for l in self.real_inodes.lock().await.iter() {
670            if let Some(v) = l.stat64_ignore_enoent(&ctx).await? {
671                return Ok(v);
672            }
673        }
674
675        // not in any layer
676        Err(Error::from_raw_os_error(libc::ENOENT))
677    }
678
679    pub async fn is_dir(&self, ctx: Request) -> Result<bool> {
680        let st = self.stat64(ctx).await?;
681        Ok(utils::is_dir(&st.attr.kind))
682    }
683
684    pub async fn count_entries_and_whiteout(&self, ctx: Request) -> Result<(u64, u64)> {
685        let mut count = 0;
686        let mut whiteouts = 0;
687
688        let st = self.stat64(ctx).await?;
689
690        // must be directory
691        if !utils::is_dir(&st.attr.kind) {
692            return Err(Error::from_raw_os_error(libc::ENOTDIR));
693        }
694
695        for (_, child) in self.childrens.lock().await.iter() {
696            if child.whiteout.load(Ordering::Relaxed) {
697                whiteouts += 1;
698            } else {
699                count += 1;
700            }
701        }
702        Ok((count, whiteouts))
703    }
704
705    pub async fn open(
706        &self,
707        ctx: Request,
708        flags: u32,
709        _fuse_flags: u32,
710    ) -> Result<(Arc<BoxedLayer>, ReplyOpen)> {
711        let (layer, _, inode) = self.first_layer_inode().await;
712        let ro = layer.as_ref().open(ctx, inode, flags).await?;
713        Ok((layer, ro))
714    }
715
716    // Self is directory, fill all childrens.
717    pub async fn scan_childrens(self: &Arc<Self>, ctx: Request) -> Result<Vec<OverlayInode>> {
718        let st = self.stat64(ctx).await?;
719        if !utils::is_dir(&st.attr.kind) {
720            return Err(Error::from_raw_os_error(libc::ENOTDIR));
721        }
722
723        let mut all_layer_inodes: HashMap<String, Vec<RealInode>> = HashMap::new();
724        // read out directories from each layer
725        // Scan from upper layer to lower layer.
726        for ri in self.real_inodes.lock().await.iter() {
727            if ri.whiteout {
728                // Node is deleted from some upper layer, skip it.
729                debug!("directory is whiteout");
730                break;
731            }
732
733            let stat = match &ri.stat {
734                Some(v) => v.clone(),
735                None => ri.stat64(&ctx).await?,
736            };
737
738            if !utils::is_dir(&stat.attr.kind) {
739                debug!("{} is not a directory", self.path.read().await);
740                // not directory
741                break;
742            }
743
744            // Read all entries from one layer.
745            let entries: HashMap<String, RealInode> = ri.readdir(ctx).await?;
746
747            // Merge entries from one layer to all_layer_inodes.
748            for (name, inode) in entries {
749                match all_layer_inodes.get_mut(&name) {
750                    Some(v) => {
751                        // Append additional RealInode to the end of vector.
752                        v.push(inode)
753                    }
754                    None => {
755                        all_layer_inodes.insert(name, vec![inode]);
756                    }
757                }
758            }
759
760            // if opaque, stop here
761            if ri.opaque {
762                debug!("directory {} is opaque", self.path.read().await);
763                break;
764            }
765        }
766
767        // Construct OverlayInode for each entry.
768        let mut childrens = vec![];
769        for (name, real_inodes) in all_layer_inodes {
770            // Inode numbers are not allocated yet.
771            let path = format!("{}/{}", self.path.read().await, name);
772            let new = Self::new_from_real_inodes(name.as_str(), 0, path, real_inodes).await?;
773            childrens.push(new);
774        }
775
776        Ok(childrens)
777    }
778
779    /// Create a new directory in upper layer for node, node must be directory.
780    ///
781    /// Recursively ensures a directory path exists in the upper layer.
782    ///
783    /// This function is a critical part of the copy-up process. When a file or directory
784    /// needs to be copied up, this function is called on its parent to ensure the entire
785    /// directory hierarchy exists in the upper layer first. It works recursively:
786    /// 1. If the current directory is already in the upper layer, it does nothing.
787    /// 2. If not, it first calls itself on its own parent directory.
788    /// 3. Once the parent is guaranteed to be in the upper layer, it creates the current
789    ///    directory within the parent's upper-layer representation.
790    ///
791    /// Crucially, it preserves the original directory's ownership (UID/GID) and permissions
792    /// by using the [`do_getattr_helper`][crate::passthrough::PassthroughFs::do_getattr_helper] and
793    /// [`do_mkdir_helper`][crate::passthrough::PassthroughFs::do_mkdir_helper] functions.
794    pub async fn create_upper_dir(
795        self: Arc<Self>,
796        ctx: Request,
797        mode_umask: Option<(u32, u32)>,
798    ) -> Result<()> {
799        // To preserve original ownership, we must get the raw, unmapped host attributes.
800        // We achieve this by calling `do_getattr_helper`, which is specifically designed
801        // to bypass the ID mapping logic. This is safe and does not affect other
802        // functionalities because `do_getattr_helper` and the standard `stat64()` call
803        // both rely on the same underlying `stat` system call; they only differ in
804        // whether the resulting `uid` and `gid` are mapped.
805        let (self_layer, _, self_inode) = self.first_layer_inode().await;
806        let re = self_layer.do_getattr_helper(self_inode, None).await?;
807        let st = ReplyAttr {
808            ttl: re.1,
809            attr: convert_stat64_to_file_attr(re.0),
810        };
811        if !utils::is_dir(&st.attr.kind) {
812            return Err(Error::from_raw_os_error(libc::ENOTDIR));
813        }
814
815        // If node already has upper layer, we can just return here.
816        if self.in_upper_layer().await {
817            return Ok(());
818        }
819
820        // not in upper layer, check parent.
821        let pnode = if let Some(n) = self.parent.lock().await.upgrade() {
822            Arc::clone(&n)
823        } else {
824            return Err(Error::other("no parent?"));
825        };
826
827        if !pnode.in_upper_layer().await {
828            Box::pin(pnode.clone().create_upper_dir(ctx, None)).await?; // recursive call
829        }
830        let child: Arc<Mutex<Option<RealInode>>> = Arc::new(Mutex::new(None));
831        let c_name = self.name.read().await.clone();
832        let _ = pnode
833            .handle_upper_inode_locked(&mut |parent_upper_inode: Option<Arc<RealInode>>| async {
834                match parent_upper_inode {
835                    Some(parent_ri) => {
836                        let ri = match mode_umask {
837                            // We manually unfold the `mkdir` logic here instead of calling the `mkdir` method directly.
838                            // This is necessary to preserve the original directory's UID and GID during the copy-up process.
839                            Some((mode, umask)) => {
840                                if !parent_ri.in_upper_layer {
841                                    return Err(Error::from_raw_os_error(libc::EROFS));
842                                }
843                                let name_osstr = OsStr::new(&c_name);
844                                let entry = parent_ri
845                                    .layer
846                                    .do_mkdir_helper(
847                                        ctx,
848                                        parent_ri.inode,
849                                        name_osstr,
850                                        mode,
851                                        umask,
852                                        st.attr.uid,
853                                        st.attr.gid,
854                                    )
855                                    .await?;
856                                RealInode {
857                                    layer: parent_ri.layer.clone(),
858                                    in_upper_layer: true,
859                                    inode: entry.attr.ino,
860                                    whiteout: false,
861                                    opaque: false,
862                                    stat: Some(ReplyAttr {
863                                        ttl: entry.ttl,
864                                        attr: entry.attr,
865                                    }),
866                                }
867                            }
868                            None => {
869                                if !parent_ri.in_upper_layer {
870                                    return Err(Error::from_raw_os_error(libc::EROFS));
871                                }
872                                let name_osstr = OsStr::new(&c_name);
873                                let entry = parent_ri
874                                    .layer
875                                    .do_mkdir_helper(
876                                        ctx,
877                                        parent_ri.inode,
878                                        name_osstr,
879                                        mode_from_kind_and_perm(st.attr.kind, st.attr.perm),
880                                        0,
881                                        st.attr.uid,
882                                        st.attr.gid,
883                                    )
884                                    .await?;
885                                RealInode {
886                                    layer: parent_ri.layer.clone(),
887                                    in_upper_layer: true,
888                                    inode: entry.attr.ino,
889                                    whiteout: false,
890                                    opaque: false,
891                                    stat: Some(ReplyAttr {
892                                        ttl: entry.ttl,
893                                        attr: entry.attr,
894                                    }),
895                                }
896                            }
897                        };
898                        // create directory here
899                        child.lock().await.replace(ri);
900                    }
901                    None => {
902                        error!(
903                            "BUG: parent {} has no upper inode after create_upper_dir",
904                            pnode.inode
905                        );
906                        return Err(Error::from_raw_os_error(libc::EINVAL));
907                    }
908                }
909                Ok(false)
910            })
911            .await?;
912
913        if let Some(ri) = child.lock().await.take() {
914            // Push the new real inode to the front of vector.
915            self.add_upper_inode(ri, false).await;
916        }
917
918        Ok(())
919    }
920
921    // Add new upper RealInode to OverlayInode, clear all lower RealInodes if 'clear_lowers' is true.
922    async fn add_upper_inode(self: &Arc<Self>, ri: RealInode, clear_lowers: bool) {
923        let mut inodes = self.real_inodes.lock().await;
924        // Update self according to upper attribute.
925        self.whiteout.store(ri.whiteout, Ordering::Relaxed);
926
927        // Push the new real inode to the front of vector.
928        let mut new = vec![Arc::new(ri)];
929        // Drain lower RealInodes.
930        let lowers = inodes.drain(..).collect::<Vec<Arc<RealInode>>>();
931        if !clear_lowers {
932            // If not clear lowers, append them to the end of vector.
933            new.extend(lowers);
934        }
935        inodes.extend(new);
936    }
937
938    // return the uppder layer fs.
939    pub async fn in_upper_layer(&self) -> bool {
940        let all_inodes = self.real_inodes.lock().await;
941        let first = all_inodes.first();
942        match first {
943            Some(v) => v.in_upper_layer,
944            None => false,
945        }
946    }
947
948    pub async fn upper_layer_only(&self) -> bool {
949        let real_inodes = self.real_inodes.lock().await;
950        let first = real_inodes.first();
951        match first {
952            Some(v) => {
953                if !v.in_upper_layer {
954                    false
955                } else {
956                    real_inodes.len() == 1
957                }
958            }
959            None => false,
960        }
961    }
962
963    pub async fn first_layer_inode(&self) -> (Arc<BoxedLayer>, bool, u64) {
964        let all_inodes = self.real_inodes.lock().await;
965        let first = all_inodes.first();
966        match first {
967            Some(v) => (v.layer.clone(), v.in_upper_layer, v.inode),
968            None => panic!("BUG: dangling OverlayInode"),
969        }
970    }
971
972    pub async fn child(&self, name: &str) -> Option<Arc<OverlayInode>> {
973        self.childrens.lock().await.get(name).cloned()
974    }
975
976    pub async fn remove_child(&self, name: &str) -> Option<Arc<OverlayInode>> {
977        self.childrens.lock().await.remove(name)
978    }
979
980    pub async fn insert_child(&self, name: &str, node: Arc<OverlayInode>) {
981        self.childrens.lock().await.insert(name.to_string(), node);
982    }
983
984    /// Handles operations on the upper layer inode of an `OverlayInode` in a thread-safe manner.
985    ///
986    /// This function locks the `real_inodes` field of the `OverlayInode` and retrieves the first
987    /// real inode (if any). If the first inode exists and belongs to the upper layer (`in_upper_layer` is true),
988    /// the provided callback `f` is invoked with the inode wrapped in `Some`. Otherwise, `f` is invoked with `None`.
989    ///
990    /// # Arguments
991    /// * `f`: A closure that takes an `Option<RealInode>` and returns a future. The future resolves to a `Result<bool>`.
992    ///
993    /// # Returns
994    /// * `Ok(bool)`: The result of invoking the callback `f`.
995    /// * `Err(Erron)`: An error is returned if:
996    ///   - There are no backend inodes (`real_inodes` is empty), indicating a dangling `OverlayInode`.
997    ///   - The callback `f` itself returns an error.
998    ///
999    /// # Behavior
1000    /// 1. Locks the `real_inodes` field to ensure thread safety.
1001    /// 2. Checks if the first inode exists:
1002    ///    - If it exists and is in the upper layer, invokes `f(Some(inode))`.
1003    ///    - If it exists but is not in the upper layer, invokes `f(None)`.
1004    /// 3. If no inodes exist, returns an error indicating a dangling `OverlayInode`.
1005    ///
1006    /// # Example Use Case
1007    /// This function is typically used to perform operations on the upper layer inode of an `OverlayInode`,
1008    /// such as creating, modifying, or deleting files/directories in the overlay filesystem's upper layer.
1009    pub async fn handle_upper_inode_locked<F, Fut>(&self, f: F) -> Result<bool>
1010    where
1011        // Can pass a &RealInode (or None) to f for any lifetime 'a
1012        F: FnOnce(Option<Arc<RealInode>>) -> Fut,
1013        // f returns a Future that must live at least as long as 'a
1014        Fut: Future<Output = Result<bool>>,
1015    {
1016        let all_inodes = self.real_inodes.lock().await;
1017        let first = all_inodes.first();
1018        match first {
1019            Some(v) => {
1020                if v.in_upper_layer {
1021                    f(Some(v.clone())).await
1022                } else {
1023                    f(None).await
1024                }
1025            }
1026            None => Err(Error::other(format!(
1027                "BUG: dangling OverlayInode {} without any backend inode",
1028                self.inode
1029            ))),
1030        }
1031    }
1032}
1033#[allow(unused)]
1034fn entry_type_from_mode(mode: libc::mode_t) -> u8 {
1035    match mode & libc::S_IFMT {
1036        libc::S_IFBLK => libc::DT_BLK,
1037        libc::S_IFCHR => libc::DT_CHR,
1038        libc::S_IFDIR => libc::DT_DIR,
1039        libc::S_IFIFO => libc::DT_FIFO,
1040        libc::S_IFLNK => libc::DT_LNK,
1041        libc::S_IFREG => libc::DT_REG,
1042        libc::S_IFSOCK => libc::DT_SOCK,
1043        _ => libc::DT_UNKNOWN,
1044    }
1045}
1046impl OverlayFs {
1047    pub fn new(
1048        upper: Option<Arc<BoxedLayer>>,
1049        lowers: Vec<Arc<BoxedLayer>>,
1050        params: Config,
1051        root_inode: u64,
1052    ) -> Result<Self> {
1053        Ok(OverlayFs {
1054            config: params,
1055            lower_layers: lowers,
1056            upper_layer: upper,
1057            inodes: RwLock::new(InodeStore::new()),
1058            handles: Mutex::new(HashMap::new()),
1059            next_handle: AtomicU64::new(1),
1060            writeback: AtomicBool::new(false),
1061            no_open: AtomicBool::new(false),
1062            no_opendir: AtomicBool::new(false),
1063            killpriv_v2: AtomicBool::new(false),
1064            perfile_dax: AtomicBool::new(false),
1065            root_inodes: root_inode,
1066        })
1067    }
1068
1069    pub fn root_inode(&self) -> Inode {
1070        self.root_inodes
1071    }
1072
1073    async fn alloc_inode(&self, path: &str) -> Result<u64> {
1074        self.inodes.write().await.alloc_inode(path)
1075    }
1076
1077    fn check_user_creatable_name(&self, name: &OsStr) -> Result<()> {
1078        let format = self
1079            .upper_layer
1080            .as_ref()
1081            .map(|layer| layer.whiteout_format())
1082            .unwrap_or_default();
1083        if is_user_creatable_name(format, name) {
1084            Ok(())
1085        } else {
1086            Err(Error::from_raw_os_error(libc::EINVAL))
1087        }
1088    }
1089
1090    /// Add a file layer and stack and merge the previous file layers.
1091    pub async fn push_layer(&mut self, layer: Arc<BoxedLayer>) -> Result<()> {
1092        let upper = self.upper_layer.take();
1093        if let Some(upper) = upper {
1094            self.lower_layers.push(upper);
1095        }
1096        self.upper_layer = Some(layer);
1097        // TODO: merge previous file layers. need optimization
1098        self.import().await?;
1099        Ok(())
1100    }
1101
1102    pub async fn import(&self) -> Result<()> {
1103        let mut root = OverlayInode::new();
1104        root.inode = self.root_inode();
1105        root.path = String::from("").into();
1106        root.name = String::from("").into();
1107        root.lookups = AtomicU64::new(2);
1108        root.real_inodes = Mutex::new(vec![]);
1109        let ctx = Request::default();
1110
1111        // Update upper inode
1112        if let Some(layer) = self.upper_layer.as_ref() {
1113            let ino = layer.root_inode();
1114            let real = RealInode::new(
1115                layer.clone(),
1116                true,
1117                ino,
1118                false,
1119                layer.is_opaque(ctx, ino).await?,
1120            )
1121            .await;
1122            root.real_inodes.lock().await.push(real.into());
1123        }
1124
1125        // Update lower inodes.
1126        for layer in self.lower_layers.iter() {
1127            let ino = layer.root_inode();
1128            let real: RealInode = RealInode::new(
1129                layer.clone(),
1130                false,
1131                ino,
1132                false,
1133                layer.is_opaque(ctx, ino).await?,
1134            )
1135            .await;
1136            root.real_inodes.lock().await.push(real.into());
1137        }
1138        let root_node = Arc::new(root);
1139
1140        // insert root inode into hash
1141        self.insert_inode(self.root_inode(), Arc::clone(&root_node))
1142            .await;
1143
1144        info!("loading root directory");
1145        self.load_directory(ctx, &root_node).await?;
1146        info!("loaded root directory");
1147
1148        Ok(())
1149    }
1150
1151    async fn root_node(&self) -> Arc<OverlayInode> {
1152        // Root node must exist.
1153        self.get_active_inode(self.root_inode()).await.unwrap()
1154    }
1155
1156    async fn insert_inode(&self, inode: u64, node: Arc<OverlayInode>) {
1157        self.inodes.write().await.insert_inode(inode, node).await;
1158    }
1159
1160    async fn get_active_inode(&self, inode: u64) -> Option<Arc<OverlayInode>> {
1161        self.inodes.read().await.get_inode(inode)
1162    }
1163
1164    // Get inode which is active or deleted.
1165    async fn get_all_inode(&self, inode: u64) -> Option<Arc<OverlayInode>> {
1166        let inode_store = self.inodes.read().await;
1167        match inode_store.get_inode(inode) {
1168            Some(n) => Some(n),
1169            None => inode_store.get_deleted_inode(inode),
1170        }
1171    }
1172
1173    // Return the inode only if it's permanently deleted from both self.inodes and self.deleted_inodes.
1174    async fn remove_inode(
1175        &self,
1176        inode: u64,
1177        path_removed: Option<String>,
1178    ) -> Option<Arc<OverlayInode>> {
1179        self.inodes
1180            .write()
1181            .await
1182            .remove_inode(inode, path_removed)
1183            .await
1184    }
1185
1186    // Lookup child OverlayInode with <name> under <parent> directory.
1187    // If name is empty, return parent itself.
1188    // Parent dir will be loaded, but returned OverlayInode won't.
1189    async fn lookup_node(
1190        &self,
1191        ctx: Request,
1192        parent: Inode,
1193        name: &str,
1194    ) -> Result<Arc<OverlayInode>> {
1195        if name.contains(SLASH_ASCII) {
1196            return Err(Error::from_raw_os_error(libc::EINVAL));
1197        }
1198
1199        // Parent inode is expected to be loaded before this function is called.
1200        // TODO: Is this correct?
1201        let pnode = match self.get_active_inode(parent).await {
1202            Some(v) => v,
1203            None => {
1204                match self.get_all_inode(parent).await {
1205                    Some(v) => {
1206                        trace!(
1207                            "overlayfs:mod.rs:1031:lookup_node: parent inode {parent} is deleted"
1208                        );
1209                        v
1210                    }
1211                    None => {
1212                        trace!(
1213                            "overlayfs:mod.rs:1034:lookup_node: parent inode {parent} not found"
1214                        );
1215                        // Parent inode is not found, return ENOENT.
1216                        return Err(Error::from_raw_os_error(libc::ENOENT));
1217                    }
1218                }
1219            }
1220        };
1221
1222        // Parent is whiteout-ed, return ENOENT.
1223        if pnode.whiteout.load(Ordering::Relaxed) {
1224            return Err(Error::from_raw_os_error(libc::ENOENT));
1225        }
1226
1227        let st = pnode.stat64(ctx).await?;
1228        if utils::is_dir(&st.attr.kind) && !pnode.loaded.load(Ordering::Relaxed) {
1229            // Parent is expected to be directory, load it first.
1230            self.load_directory(ctx, &pnode).await?;
1231        }
1232
1233        // Current file or dir.
1234        if name.eq(".")  
1235            // Root directory has no parent.
1236            || (parent == self.root_inode() && name.eq("..")) 
1237            // Special convention: empty name indicates current dir.
1238            || name.is_empty()
1239        {
1240            return Ok(Arc::clone(&pnode));
1241        }
1242
1243        match pnode.child(name).await {
1244            // Child is found.
1245            Some(v) => Ok(v),
1246            None => {
1247                trace!("lookup_node: child {name} not found");
1248                Err(Error::from_raw_os_error(libc::ENOENT))
1249            }
1250        }
1251    }
1252
1253    async fn lookup_node_ignore_enoent(
1254        &self,
1255        ctx: Request,
1256        parent: u64,
1257        name: &str,
1258    ) -> Result<Option<Arc<OverlayInode>>> {
1259        match self.lookup_node(ctx, parent, name).await {
1260            Ok(n) => Ok(Some(Arc::clone(&n))),
1261            Err(e) => {
1262                if let Some(raw_error) = e.raw_os_error()
1263                    && raw_error == libc::ENOENT
1264                {
1265                    return Ok(None);
1266                }
1267                Err(e)
1268            }
1269        }
1270    }
1271
1272    // Load entries of the directory from all layers, if node is not directory, return directly.
1273    async fn load_directory(&self, ctx: Request, node: &Arc<OverlayInode>) -> Result<()> {
1274        if node.loaded.load(Ordering::Relaxed) {
1275            return Ok(());
1276        }
1277
1278        // We got all childrens without inode.
1279        // info!("before scan childrens, ctx: {:?}, node: {:?}", ctx, node.inode);
1280        let childrens = node.scan_childrens(ctx).await?;
1281        // info!("scanned children");
1282
1283        // =============== Start Lock Area ===================
1284        // Lock OverlayFs inodes.
1285        let mut inode_store = self.inodes.write().await;
1286        // Lock the OverlayInode and its childrens.
1287        let mut node_children = node.childrens.lock().await;
1288
1289        // Check again in case another 'load_directory' function call gets locks and want to do duplicated work.
1290        if node.loaded.load(Ordering::Relaxed) {
1291            return Ok(());
1292        }
1293
1294        // Now we have two locks' protection, Fs inodes lock and OverlayInode's childrens lock.
1295        // info!("before iter childrens");
1296        for mut child in childrens.into_iter() {
1297            // Allocate inode for each child.
1298            let ino = inode_store.alloc_inode(&child.path.read().await)?;
1299
1300            let name = child.name.read().await.clone();
1301            child.inode = ino;
1302            // Create bi-directional link between parent and child.
1303            child.parent = Mutex::new(Arc::downgrade(node));
1304
1305            let arc_child = Arc::new(child);
1306            node_children.insert(name, arc_child.clone());
1307            // Record overlay inode in whole OverlayFs.
1308            inode_store.insert_inode(ino, arc_child).await;
1309        }
1310        // info!("after iter childrens");
1311
1312        node.loaded.store(true, Ordering::Relaxed);
1313
1314        Ok(())
1315    }
1316
1317    async fn forget_one(&self, inode: Inode, count: u64) {
1318        if inode == self.root_inode() || inode == 0 {
1319            return;
1320        }
1321
1322        let v = match self.get_all_inode(inode).await {
1323            Some(n) => n,
1324            None => {
1325                trace!("forget unknown inode: {inode}");
1326                return;
1327            }
1328        };
1329
1330        // Use fetch_update to atomically update lookups in a loop until it succeeds
1331        v.lookups
1332            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
1333                // If count is larger than current lookups, return 0
1334                // Otherwise subtract count from current lookups
1335                if current < count {
1336                    Some(0)
1337                } else {
1338                    Some(current - count)
1339                }
1340            })
1341            .expect("fetch_update failed");
1342
1343        let lookups = v.lookups.load(Ordering::Relaxed);
1344        trace!(
1345            "forget inode: {}, name {}, lookups: {}",
1346            inode,
1347            v.name.read().await,
1348            lookups
1349        );
1350        if lookups == 0 {
1351            debug!(
1352                "inode is forgotten: {}, name {}",
1353                inode,
1354                v.name.read().await
1355            );
1356            let _ = self.remove_inode(inode, None).await;
1357            let parent = v.parent.lock().await;
1358
1359            if let Some(p) = parent.upgrade() {
1360                // remove it from hashmap
1361                p.remove_child(&v.name.read().await).await;
1362            }
1363        }
1364    }
1365
1366    async fn do_lookup(&self, ctx: Request, parent: Inode, name: &str) -> Result<ReplyEntry> {
1367        let node = self.lookup_node(ctx, parent, name).await?;
1368        debug!("do_lookup: {name:?}, found");
1369
1370        if node.whiteout.load(Ordering::Relaxed) {
1371            eprintln!("Error: node.whiteout.load() called.");
1372            return Err(Error::from_raw_os_error(libc::ENOENT));
1373        }
1374
1375        let mut st = node.stat64(ctx).await?;
1376        st.attr.ino = node.inode;
1377        if utils::is_dir(&st.attr.kind) && !node.loaded.load(Ordering::Relaxed) {
1378            self.load_directory(ctx, &node).await?;
1379        }
1380
1381        // FIXME: can forget happen between found and increase reference counter?
1382        let tmp = node.lookups.fetch_add(1, Ordering::Relaxed);
1383        trace!("lookup count: {}", tmp + 1);
1384        Ok(ReplyEntry {
1385            ttl: st.ttl,
1386            attr: st.attr,
1387            generation: 0,
1388        })
1389    }
1390
1391    async fn do_statvfs(&self, ctx: Request, inode: Inode) -> Result<ReplyStatFs> {
1392        match self.get_active_inode(inode).await {
1393            Some(ovi) => {
1394                let all_inodes = ovi.real_inodes.lock().await;
1395                let real_inode = all_inodes
1396                    .first()
1397                    .ok_or(Error::other("backend inode not found"))?;
1398                Ok(real_inode.layer.statfs(ctx, real_inode.inode).await?)
1399            }
1400            None => Err(Error::from_raw_os_error(libc::ENOENT)),
1401        }
1402    }
1403
1404    #[allow(clippy::too_many_arguments)]
1405    async fn do_readdir<'a>(
1406        &self,
1407        ctx: Request,
1408        inode: Inode,
1409        handle: u64,
1410        offset: u64,
1411    ) -> Result<
1412        impl futures_util::stream::Stream<Item = std::result::Result<DirectoryEntry, Errno>> + Send + 'a,
1413    > {
1414        let snapshot = self.get_or_create_dir_snapshot(ctx, inode, handle).await?;
1415
1416        let entries: Vec<std::result::Result<DirectoryEntry, Errno>> =
1417            if offset < snapshot.len() as u64 {
1418                snapshot
1419                    .iter()
1420                    .skip(offset as usize)
1421                    .map(|entry| {
1422                        Ok(DirectoryEntry {
1423                            inode: entry.inode,
1424                            kind: entry.kind,
1425                            name: entry.name.clone(),
1426                            offset: entry.offset,
1427                        })
1428                    })
1429                    .collect()
1430            } else {
1431                vec![]
1432            };
1433
1434        Ok(iter(entries))
1435    }
1436
1437    #[allow(clippy::too_many_arguments)]
1438    async fn do_readdirplus<'a>(
1439        &self,
1440        ctx: Request,
1441        inode: Inode,
1442        handle: u64,
1443        offset: u64,
1444    ) -> Result<
1445        impl futures_util::stream::Stream<Item = std::result::Result<DirectoryEntryPlus, Errno>>
1446        + Send
1447        + 'a,
1448    > {
1449        let snapshot = self.get_or_create_dir_snapshot(ctx, inode, handle).await?;
1450
1451        let mut entries = Vec::new();
1452        if offset < snapshot.len() as u64 {
1453            for entry in snapshot.iter().skip(offset as usize) {
1454                // Increment lookup count for readdirplus as we are handing out a reference to the kernel.
1455                // We must do this here, not in snapshot creation, and we must NOT decrement it in HandleData drop.
1456                // The kernel will send a FORGET request when it's done with the entry.
1457                if let Some(node) = self.get_all_inode(entry.inode).await {
1458                    node.lookups.fetch_add(1, Ordering::Relaxed);
1459                }
1460                entries.push(Ok(entry.clone()));
1461            }
1462        }
1463
1464        Ok(iter(entries))
1465    }
1466
1467    async fn get_or_create_dir_snapshot(
1468        &self,
1469        ctx: Request,
1470        inode: Inode,
1471        handle: u64,
1472    ) -> Result<Vec<DirectoryEntryPlus>> {
1473        let handle_data = match self.handles.lock().await.get(&handle) {
1474            Some(hd) if hd.node.inode == inode => hd.clone(),
1475            _ => {
1476                // Fallback for cases without a valid handle (e.g. no-opendir)
1477                let node = self.lookup_node(ctx, inode, ".").await?;
1478                let st = node.stat64(ctx).await?;
1479                if !utils::is_dir(&st.attr.kind) {
1480                    return Err(Error::from_raw_os_error(libc::ENOTDIR));
1481                }
1482                // Create a temporary HandleData for this call only.
1483                Arc::new(HandleData {
1484                    node,
1485                    real_handle: None,
1486                    dir_snapshot: Mutex::new(None),
1487                })
1488            }
1489        };
1490
1491        // Optimistic check
1492        if let Some(snapshot) = handle_data.dir_snapshot.lock().await.as_ref() {
1493            return Ok(snapshot.clone());
1494        }
1495
1496        // Snapshot doesn't exist, create it.
1497        let ovl_inode = &handle_data.node;
1498        self.load_directory(ctx, ovl_inode).await?;
1499
1500        let mut entries = Vec::new();
1501
1502        // 1. Add "." entry
1503        let mut st_self = ovl_inode.stat64(ctx).await?;
1504        st_self.attr.ino = ovl_inode.inode;
1505        entries.push(DirectoryEntryPlus {
1506            inode: ovl_inode.inode,
1507            generation: 0,
1508            kind: st_self.attr.kind,
1509            name: ".".into(),
1510            offset: 1,
1511            attr: st_self.attr,
1512            entry_ttl: st_self.ttl,
1513            attr_ttl: st_self.ttl,
1514        });
1515
1516        // 2. Add ".." entry
1517        let parent_node = match ovl_inode.parent.lock().await.upgrade() {
1518            Some(node) => node,
1519            None => self.root_node().await,
1520        };
1521        let mut st_parent = parent_node.stat64(ctx).await?;
1522        st_parent.attr.ino = parent_node.inode;
1523        entries.push(DirectoryEntryPlus {
1524            inode: parent_node.inode,
1525            generation: 0,
1526            kind: st_parent.attr.kind,
1527            name: "..".into(),
1528            offset: 2,
1529            attr: st_parent.attr,
1530            entry_ttl: st_parent.ttl,
1531            attr_ttl: st_parent.ttl,
1532        });
1533
1534        // 3. Add children entries
1535        let children = ovl_inode.childrens.lock().await;
1536        for (name, child) in children.iter() {
1537            if child.whiteout.load(Ordering::Relaxed) {
1538                continue;
1539            }
1540            let mut st_child = child.stat64(ctx).await?;
1541            st_child.attr.ino = child.inode;
1542            entries.push(DirectoryEntryPlus {
1543                inode: child.inode,
1544                generation: 0,
1545                kind: st_child.attr.kind,
1546                name: name.clone().into(),
1547                offset: (entries.len() + 1) as i64,
1548                attr: st_child.attr,
1549                entry_ttl: st_child.ttl,
1550                attr_ttl: st_child.ttl,
1551            });
1552        }
1553        drop(children);
1554
1555        let mut snapshot_guard = handle_data.dir_snapshot.lock().await;
1556        if snapshot_guard.is_none() {
1557            // We won the race, install our prepared snapshot.
1558            *snapshot_guard = Some(entries.clone());
1559            Ok(entries)
1560        } else {
1561            // Another thread won the race while we were preparing.
1562            // Discard our work and use the existing snapshot.
1563            Ok(snapshot_guard.as_ref().unwrap().clone())
1564        }
1565    }
1566
1567    async fn do_mkdir(
1568        &self,
1569        ctx: Request,
1570        parent_node: Arc<OverlayInode>,
1571        name: &str,
1572        mode: u32,
1573        umask: u32,
1574    ) -> Result<()> {
1575        if self.upper_layer.is_none() {
1576            return Err(Error::from_raw_os_error(libc::EROFS));
1577        }
1578
1579        // Parent node was deleted.
1580        if parent_node.whiteout.load(Ordering::Relaxed) {
1581            return Err(Error::from_raw_os_error(libc::ENOENT));
1582        }
1583        self.check_user_creatable_name(OsStr::new(name))?;
1584
1585        let mut delete_whiteout = false;
1586        let mut set_opaque = false;
1587        if let Some(n) = self
1588            .lookup_node_ignore_enoent(ctx, parent_node.inode, name)
1589            .await?
1590        {
1591            // Node with same name exists, let's check if it's whiteout.
1592            if !n.whiteout.load(Ordering::Relaxed) {
1593                return Err(Error::from_raw_os_error(libc::EEXIST));
1594            }
1595
1596            if n.in_upper_layer().await {
1597                delete_whiteout = true;
1598            }
1599
1600            // Set opaque if child dir has lower layers.
1601            if !n.upper_layer_only().await {
1602                set_opaque = true;
1603            }
1604        }
1605
1606        // Copy parent node up if necessary.
1607        let pnode = self.copy_node_up(ctx, parent_node).await?;
1608
1609        let path = format!("{}/{}", pnode.path.read().await, name);
1610        let path_ref = &path;
1611        let new_node = Arc::new(Mutex::new(None));
1612        pnode
1613            .handle_upper_inode_locked(&mut |parent_real_inode: Option<Arc<RealInode>>| async {
1614                let parent_real_inode = match parent_real_inode {
1615                    Some(inode) => inode,
1616                    None => {
1617                        error!("BUG: parent doesn't have upper inode after copied up");
1618                        return Err(Error::from_raw_os_error(libc::EINVAL));
1619                    }
1620                };
1621                let osstr = OsStr::new(name);
1622                if delete_whiteout {
1623                    let _ = parent_real_inode
1624                        .layer
1625                        .delete_whiteout(ctx, parent_real_inode.inode, osstr)
1626                        .await;
1627                }
1628
1629                // Allocate inode number.
1630                let ino = self.alloc_inode(path_ref).await?;
1631                let child_dir = parent_real_inode.mkdir(ctx, name, mode, umask).await?;
1632                // Set opaque if child dir has lower layers.
1633                if set_opaque {
1634                    parent_real_inode
1635                        .layer
1636                        .set_opaque(ctx, child_dir.inode)
1637                        .await?;
1638                }
1639                let ovi =
1640                    OverlayInode::new_from_real_inode(name, ino, path_ref.clone(), child_dir).await;
1641                new_node.lock().await.replace(ovi);
1642                Ok(false)
1643            })
1644            .await?;
1645
1646        // new_node is always 'Some'
1647        let nn = new_node.lock().await.take();
1648        let arc_node = Arc::new(nn.unwrap());
1649        self.insert_inode(arc_node.inode, arc_node.clone()).await;
1650        pnode.insert_child(name, arc_node).await;
1651        Ok(())
1652    }
1653
1654    async fn do_mknod(
1655        &self,
1656        ctx: Request,
1657        parent_node: &Arc<OverlayInode>,
1658        name: &str,
1659        mode: u32,
1660        rdev: u32,
1661        umask: u32,
1662    ) -> Result<()> {
1663        if self.upper_layer.is_none() {
1664            return Err(Error::from_raw_os_error(libc::EROFS));
1665        }
1666
1667        // Parent node was deleted.
1668        if parent_node.whiteout.load(Ordering::Relaxed) {
1669            return Err(Error::from_raw_os_error(libc::ENOENT));
1670        }
1671        self.check_user_creatable_name(OsStr::new(name))?;
1672
1673        match self
1674            .lookup_node_ignore_enoent(ctx, parent_node.inode, name)
1675            .await?
1676        {
1677            Some(n) => {
1678                // Node with same name exists, let's check if it's whiteout.
1679                if !n.whiteout.load(Ordering::Relaxed) {
1680                    return Err(Error::from_raw_os_error(libc::EEXIST));
1681                }
1682
1683                // Copy parent node up if necessary.
1684                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
1685                pnode
1686                    .handle_upper_inode_locked(
1687                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
1688                            let parent_real_inode = match parent_real_inode {
1689                                Some(inode) => inode,
1690                                None => {
1691                                    error!("BUG: parent doesn't have upper inode after copied up");
1692                                    return Err(Error::from_raw_os_error(libc::EINVAL));
1693                                }
1694                            };
1695                            let osstr = OsStr::new(name);
1696                            if n.in_upper_layer().await {
1697                                let _ = parent_real_inode
1698                                    .layer
1699                                    .delete_whiteout(ctx, parent_real_inode.inode, osstr)
1700                                    .await;
1701                            }
1702
1703                            let child_ri = parent_real_inode
1704                                .mknod(ctx, name, mode, rdev, umask)
1705                                .await?;
1706
1707                            // Replace existing real inodes with new one.
1708                            n.add_upper_inode(child_ri, true).await;
1709                            Ok(false)
1710                        },
1711                    )
1712                    .await?;
1713            }
1714            None => {
1715                // Copy parent node up if necessary.
1716                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
1717                let new_node = Arc::new(Mutex::new(None));
1718                let path = format!("{}/{}", pnode.path.read().await, name);
1719                pnode
1720                    .handle_upper_inode_locked(
1721                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
1722                            let parent_real_inode = match parent_real_inode {
1723                                Some(inode) => inode,
1724                                None => {
1725                                    error!("BUG: parent doesn't have upper inode after copied up");
1726                                    return Err(Error::from_raw_os_error(libc::EINVAL));
1727                                }
1728                            };
1729
1730                            // Allocate inode number.
1731                            let ino = self.alloc_inode(&path).await?;
1732                            let child_ri = parent_real_inode
1733                                .mknod(ctx, name, mode, rdev, umask)
1734                                .await?;
1735                            let ovi = OverlayInode::new_from_real_inode(
1736                                name,
1737                                ino,
1738                                path.clone(),
1739                                child_ri,
1740                            )
1741                            .await;
1742
1743                            new_node.lock().await.replace(ovi);
1744                            Ok(false)
1745                        },
1746                    )
1747                    .await?;
1748
1749                let nn = new_node.lock().await.take();
1750                let arc_node = Arc::new(nn.unwrap());
1751                self.insert_inode(arc_node.inode, arc_node.clone()).await;
1752                pnode.insert_child(name, arc_node).await;
1753            }
1754        }
1755
1756        Ok(())
1757    }
1758
1759    async fn do_create(
1760        &self,
1761        ctx: Request,
1762        parent_node: &Arc<OverlayInode>,
1763        name: &OsStr,
1764        mode: u32,
1765        flags: u32,
1766    ) -> Result<Option<u64>> {
1767        let name_str = name
1768            .to_str()
1769            .ok_or_else(|| Error::from_raw_os_error(libc::EINVAL))?;
1770        let upper = self
1771            .upper_layer
1772            .as_ref()
1773            .cloned()
1774            .ok_or_else(|| Error::from_raw_os_error(libc::EROFS))?;
1775
1776        // Parent node was deleted.
1777        if parent_node.whiteout.load(Ordering::Relaxed) {
1778            return Err(Error::from_raw_os_error(libc::ENOENT));
1779        }
1780        self.check_user_creatable_name(name)?;
1781
1782        let handle: Arc<Mutex<Option<u64>>> = Arc::new(Mutex::new(None));
1783        let real_ino: Arc<Mutex<Option<u64>>> = Arc::new(Mutex::new(None));
1784        let new_ovi = match self
1785            .lookup_node_ignore_enoent(ctx, parent_node.inode, name_str)
1786            .await?
1787        {
1788            Some(n) => {
1789                // Node with same name exists, let's check if it's whiteout.
1790                if !n.whiteout.load(Ordering::Relaxed) {
1791                    return Err(Error::from_raw_os_error(libc::EEXIST));
1792                }
1793
1794                // Copy parent node up if necessary.
1795                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
1796                pnode
1797                    .handle_upper_inode_locked(
1798                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
1799                            let parent_real_inode = match parent_real_inode {
1800                                Some(inode) => inode,
1801                                None => {
1802                                    error!("BUG: parent doesn't have upper inode after copied up");
1803                                    return Err(Error::from_raw_os_error(libc::EINVAL));
1804                                }
1805                            };
1806
1807                            if n.in_upper_layer().await {
1808                                let _ = parent_real_inode
1809                                    .layer
1810                                    .delete_whiteout(ctx, parent_real_inode.inode, name)
1811                                    .await;
1812                            }
1813
1814                            let (child_ri, hd) =
1815                                parent_real_inode.create(ctx, name_str, mode, flags).await?;
1816                            real_ino.lock().await.replace(child_ri.inode);
1817                            handle.lock().await.replace(hd.unwrap());
1818
1819                            // Replace existing real inodes with new one.
1820                            n.add_upper_inode(child_ri, true).await;
1821                            Ok(false)
1822                        },
1823                    )
1824                    .await?;
1825                n.clone()
1826            }
1827            None => {
1828                // Copy parent node up if necessary.
1829                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
1830                let new_node = Arc::new(Mutex::new(None));
1831                let path = format!("{}/{}", pnode.path.read().await, name_str);
1832                pnode
1833                    .handle_upper_inode_locked(
1834                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
1835                            let parent_real_inode = match parent_real_inode {
1836                                Some(inode) => inode,
1837                                None => {
1838                                    error!("BUG: parent doesn't have upper inode after copied up");
1839                                    return Err(Error::from_raw_os_error(libc::EINVAL));
1840                                }
1841                            };
1842
1843                            let (child_ri, hd) =
1844                                parent_real_inode.create(ctx, name_str, mode, flags).await?;
1845                            real_ino.lock().await.replace(child_ri.inode);
1846                            handle.lock().await.replace(hd.unwrap());
1847                            // Allocate inode number.
1848                            let ino = self.alloc_inode(&path).await?;
1849                            let ovi = OverlayInode::new_from_real_inode(
1850                                name_str,
1851                                ino,
1852                                path.clone(),
1853                                child_ri,
1854                            )
1855                            .await;
1856
1857                            new_node.lock().await.replace(ovi);
1858                            Ok(false)
1859                        },
1860                    )
1861                    .await?;
1862
1863                // new_node is always 'Some'
1864                let nn = new_node.lock().await.take();
1865                let arc_node = Arc::new(nn.unwrap());
1866                self.insert_inode(arc_node.inode, arc_node.clone()).await;
1867                pnode.insert_child(name_str, arc_node.clone()).await;
1868                arc_node
1869            }
1870        };
1871
1872        let final_handle = match *handle.lock().await {
1873            Some(hd) => {
1874                if self.no_open.load(Ordering::Relaxed) {
1875                    None
1876                } else {
1877                    let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1878                    let handle_data = HandleData {
1879                        node: new_ovi,
1880                        real_handle: Some(RealHandle {
1881                            layer: upper.clone(),
1882                            in_upper_layer: true,
1883                            inode: real_ino.lock().await.unwrap(),
1884                            handle: AtomicU64::new(hd),
1885                        }),
1886                        dir_snapshot: Mutex::new(None),
1887                    };
1888                    self.handles
1889                        .lock()
1890                        .await
1891                        .insert(handle, Arc::new(handle_data));
1892                    Some(handle)
1893                }
1894            }
1895            None => None,
1896        };
1897        Ok(final_handle)
1898    }
1899
1900    async fn do_rename(
1901        &self,
1902        req: Request,
1903        parent: Inode,
1904        name: &OsStr,
1905        new_parent: Inode,
1906        new_name: &OsStr,
1907    ) -> Result<()> {
1908        let name_str = name
1909            .to_str()
1910            .ok_or_else(|| Error::from_raw_os_error(libc::EINVAL))?;
1911        let new_name_str = new_name
1912            .to_str()
1913            .ok_or_else(|| Error::from_raw_os_error(libc::EINVAL))?;
1914        self.check_user_creatable_name(new_name)?;
1915
1916        let parent_node = self.lookup_node(req, parent, "").await?;
1917        let new_parent_node = self.lookup_node(req, new_parent, "").await?;
1918        let src_node = self.lookup_node(req, parent, name_str).await?;
1919        let dest_node_opt = self
1920            .lookup_node_ignore_enoent(req, new_parent, new_name_str)
1921            .await?;
1922        // trace!("parent_node: {}, new_parent_node: {}, src_node: {}, dest_node_opt: {:?}", parent_node.inode, new_parent_node.inode, src_node.inode, dest_node_opt.as_ref().map(|n| n.inode));
1923
1924        if let Some(dest_node) = &dest_node_opt {
1925            let src_is_dir = src_node.is_dir(req).await?;
1926            let dest_is_dir = dest_node.is_dir(req).await?;
1927            if src_is_dir != dest_is_dir {
1928                return Err(Error::from_raw_os_error(libc::EISDIR));
1929            }
1930            if dest_is_dir {
1931                self.copy_directory_up(req, dest_node.clone()).await?;
1932                let (count, _) = dest_node.count_entries_and_whiteout(req).await?;
1933                if count > 0 {
1934                    return Err(Error::from_raw_os_error(libc::ENOTEMPTY));
1935                }
1936            }
1937        }
1938
1939        let pnode = self.copy_node_up(req, parent_node).await?;
1940        let new_pnode = self.copy_node_up(req, new_parent_node).await?;
1941        let s_node = self.copy_node_up(req, src_node).await?;
1942
1943        let need_whiteout = !s_node.upper_layer_only().await;
1944
1945        let (p_layer, _, p_inode) = pnode.first_layer_inode().await;
1946        let (new_p_layer, _, new_p_inode) = new_pnode.first_layer_inode().await;
1947        assert!(Arc::ptr_eq(&p_layer, &new_p_layer));
1948
1949        p_layer
1950            .rename(req, p_inode, name, new_p_inode, new_name)
1951            .await?;
1952
1953        // Handle the replaced destination node (if any).
1954        if let Some(dest_node) = dest_node_opt {
1955            let path = dest_node.path.read().await.clone();
1956            self.remove_inode(dest_node.inode, Some(path)).await;
1957        }
1958
1959        // Update the moved source node's state.
1960
1961        // Remove from old parent.
1962        pnode.remove_child(name_str).await;
1963        self.remove_inode(s_node.inode, s_node.path.read().await.clone().into())
1964            .await;
1965        let new_path = format!("{}/{}", new_pnode.path.read().await, new_name_str);
1966        *s_node.path.write().await = new_path;
1967        *s_node.name.write().await = new_name_str.to_string();
1968        *s_node.parent.lock().await = Arc::downgrade(&new_pnode);
1969        new_pnode.insert_child(new_name_str, s_node.clone()).await;
1970        self.insert_inode(s_node.inode, s_node).await;
1971
1972        // Create whiteout at the old location if necessary.
1973        if need_whiteout {
1974            p_layer.create_whiteout(req, p_inode, name).await?;
1975        }
1976
1977        Ok(())
1978    }
1979
1980    async fn do_link(
1981        &self,
1982        ctx: Request,
1983        src_node: &Arc<OverlayInode>,
1984        new_parent: &Arc<OverlayInode>,
1985        name: &str,
1986    ) -> Result<()> {
1987        if self.upper_layer.is_none() {
1988            return Err(Error::from_raw_os_error(libc::EROFS));
1989        }
1990
1991        // Node is whiteout.
1992        if src_node.whiteout.load(Ordering::Relaxed) || new_parent.whiteout.load(Ordering::Relaxed)
1993        {
1994            return Err(Error::from_raw_os_error(libc::ENOENT));
1995        }
1996        self.check_user_creatable_name(OsStr::new(name))?;
1997
1998        let st = src_node.stat64(ctx).await?;
1999        if utils::is_dir(&st.attr.kind) {
2000            // Directory can't be hardlinked.
2001            return Err(Error::from_raw_os_error(libc::EPERM));
2002        }
2003
2004        let src_node = self.copy_node_up(ctx, Arc::clone(src_node)).await?;
2005        let new_parent = self.copy_node_up(ctx, Arc::clone(new_parent)).await?;
2006        let src_ino = src_node.first_layer_inode().await.2;
2007
2008        if let Some(existing_node) = self
2009            .lookup_node_ignore_enoent(ctx, new_parent.inode, name)
2010            .await?
2011        {
2012            // If it's not a whiteout, it's an error
2013            if !existing_node.whiteout.load(Ordering::Relaxed) {
2014                return Err(Error::from_raw_os_error(libc::EEXIST));
2015            }
2016            // If it is a whiteout, we will overwrite it.
2017            // First, remove the physical whiteout file in the upper layer.
2018            new_parent
2019                .handle_upper_inode_locked(&mut |parent_real_inode: Option<Arc<RealInode>>| async {
2020                    let parent_ri = parent_real_inode.ok_or_else(|| {
2021                        error!("BUG: parent doesn't have upper inode after copied up");
2022                        Error::from_raw_os_error(libc::EINVAL)
2023                    })?;
2024                    // Only delete if the whiteout is in the upper layer
2025                    if existing_node.in_upper_layer().await {
2026                        let _ = parent_ri
2027                            .layer
2028                            .delete_whiteout(ctx, parent_ri.inode, OsStr::new(name))
2029                            .await;
2030                    }
2031                    Ok(false)
2032                })
2033                .await?;
2034        }
2035
2036        new_parent
2037            .handle_upper_inode_locked(&mut |parent_real_inode: Option<Arc<RealInode>>| async {
2038                let parent_real_inode = match parent_real_inode {
2039                    Some(inode) => inode,
2040                    None => {
2041                        error!("BUG: parent doesn't have upper inode after copied up");
2042                        return Err(Error::from_raw_os_error(libc::EINVAL));
2043                    }
2044                };
2045
2046                parent_real_inode.link(ctx, src_ino, name).await?;
2047
2048                Ok(false)
2049            })
2050            .await?;
2051
2052        self.insert_inode(src_node.inode, src_node.clone()).await;
2053        new_parent.insert_child(name, src_node).await;
2054
2055        Ok(())
2056    }
2057
2058    async fn do_symlink(
2059        &self,
2060        ctx: Request,
2061        linkname: &str,
2062        parent_node: &Arc<OverlayInode>,
2063        name: &str,
2064    ) -> Result<()> {
2065        let name_os = OsStr::new(name);
2066        if self.upper_layer.is_none() {
2067            return Err(Error::from_raw_os_error(libc::EROFS));
2068        }
2069
2070        // parent was deleted.
2071        if parent_node.whiteout.load(Ordering::Relaxed) {
2072            return Err(Error::from_raw_os_error(libc::ENOENT));
2073        }
2074        self.check_user_creatable_name(name_os)?;
2075
2076        match self
2077            .lookup_node_ignore_enoent(ctx, parent_node.inode, name)
2078            .await?
2079        {
2080            Some(n) => {
2081                // Node with same name exists, let's check if it's whiteout.
2082                if !n.whiteout.load(Ordering::Relaxed) {
2083                    return Err(Error::from_raw_os_error(libc::EEXIST));
2084                }
2085
2086                // Copy parent node up if necessary.
2087                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
2088                pnode
2089                    .handle_upper_inode_locked(
2090                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
2091                            let parent_real_inode = match parent_real_inode {
2092                                Some(inode) => inode,
2093                                None => {
2094                                    error!("BUG: parent doesn't have upper inode after copied up");
2095                                    return Err(Error::from_raw_os_error(libc::EINVAL));
2096                                }
2097                            };
2098
2099                            if n.in_upper_layer().await {
2100                                let _ = parent_real_inode
2101                                    .layer
2102                                    .delete_whiteout(ctx, parent_real_inode.inode, name_os)
2103                                    .await;
2104                            }
2105
2106                            let child_ri = parent_real_inode.symlink(ctx, linkname, name).await?;
2107
2108                            // Replace existing real inodes with new one.
2109                            n.add_upper_inode(child_ri, true).await;
2110                            Ok(false)
2111                        },
2112                    )
2113                    .await?;
2114            }
2115            None => {
2116                // Copy parent node up if necessary.
2117                let pnode = self.copy_node_up(ctx, Arc::clone(parent_node)).await?;
2118                let new_node: Arc<Mutex<Option<OverlayInode>>> = Arc::new(Mutex::new(None));
2119                let path = format!("{}/{}", pnode.path.read().await, name);
2120                pnode
2121                    .handle_upper_inode_locked(
2122                        &mut |parent_real_inode: Option<Arc<RealInode>>| async {
2123                            let parent_real_inode = match parent_real_inode {
2124                                Some(inode) => inode,
2125                                None => {
2126                                    error!("BUG: parent doesn't have upper inode after copied up");
2127                                    return Err(Error::from_raw_os_error(libc::EINVAL));
2128                                }
2129                            };
2130
2131                            // Allocate inode number.
2132                            let ino = self.alloc_inode(&path).await?;
2133                            let child_ri = parent_real_inode.symlink(ctx, linkname, name).await?;
2134                            let ovi = OverlayInode::new_from_real_inode(
2135                                name,
2136                                ino,
2137                                path.clone(),
2138                                child_ri,
2139                            )
2140                            .await;
2141
2142                            new_node.lock().await.replace(ovi);
2143                            Ok(false)
2144                        },
2145                    )
2146                    .await?;
2147
2148                // new_node is always 'Some'
2149                let arc_node = Arc::new(new_node.lock().await.take().unwrap());
2150                self.insert_inode(arc_node.inode, arc_node.clone()).await;
2151                pnode.insert_child(name, arc_node).await;
2152            }
2153        }
2154
2155        Ok(())
2156    }
2157
2158    /// Copies a symbolic link from a lower layer to the upper layer.
2159    ///
2160    /// This function is a part of the copy-up process, triggered when a symlink that
2161    /// only exists in a lower layer is modified. It reads the link target and attributes
2162    /// from the lower layer and creates an identical symlink in the upper layer, crucially
2163    /// preserving the original host UID and GID.
2164    async fn copy_symlink_up(
2165        &self,
2166        ctx: Request,
2167        node: Arc<OverlayInode>,
2168    ) -> Result<Arc<OverlayInode>> {
2169        if node.in_upper_layer().await {
2170            return Ok(node);
2171        }
2172
2173        let parent_node = if let Some(ref n) = node.parent.lock().await.upgrade() {
2174            Arc::clone(n)
2175        } else {
2176            return Err(Error::other("no parent?"));
2177        };
2178
2179        // To preserve original ownership, we must get the raw, unmapped host attributes.
2180        // We achieve this by calling `do_getattr_helper`, which is specifically designed
2181        // to bypass the ID mapping logic. This is safe and does not affect other
2182        // functionalities because `do_getattr_helper` and the standard `stat64()` call
2183        // both rely on the same underlying `stat` system call; they only differ in
2184        // whether the resulting `uid` and `gid` are mapped.
2185        let (self_layer, _, self_inode) = node.first_layer_inode().await;
2186        let re = self_layer.do_getattr_helper(self_inode, None).await?;
2187        let st = ReplyAttr {
2188            ttl: re.1,
2189            attr: convert_stat64_to_file_attr(re.0),
2190        };
2191
2192        if !parent_node.in_upper_layer().await {
2193            parent_node.clone().create_upper_dir(ctx, None).await?;
2194        }
2195
2196        // Read the linkname from lower layer.
2197        let reply_data = self_layer.readlink(ctx, self_inode).await?;
2198        // Convert path to &str.
2199        let path = std::str::from_utf8(&reply_data.data)
2200            .map_err(|_| Error::from_raw_os_error(libc::EINVAL))?;
2201
2202        let new_upper_real: Arc<Mutex<Option<RealInode>>> = Arc::new(Mutex::new(None));
2203        parent_node
2204            .handle_upper_inode_locked(&mut |parent_upper_inode: Option<Arc<RealInode>>| async {
2205                // We already create upper dir for parent_node above.
2206                let parent_real_inode =
2207                    parent_upper_inode.ok_or_else(|| Error::from_raw_os_error(libc::EROFS))?;
2208                // We manually unfold the `symlink` logic here instead of calling the `symlink` method directly.
2209                // This is necessary to preserve the original file's UID and GID during the copy-up process.
2210                if !parent_real_inode.in_upper_layer {
2211                    return Err(Error::from_raw_os_error(libc::EROFS));
2212                }
2213                let link_name = OsStr::new(path);
2214                let filename = node.name.read().await;
2215                let filename = OsStr::new(filename.as_str());
2216                let entry = parent_real_inode
2217                    .layer
2218                    .do_symlink_helper(
2219                        ctx,
2220                        parent_real_inode.inode,
2221                        filename,
2222                        link_name,
2223                        st.attr.uid,
2224                        st.attr.gid,
2225                    )
2226                    .await?;
2227                let ri = RealInode {
2228                    layer: parent_real_inode.layer.clone(),
2229                    in_upper_layer: true,
2230                    inode: entry.attr.ino,
2231                    whiteout: false,
2232                    opaque: false,
2233                    stat: Some(ReplyAttr {
2234                        ttl: entry.ttl,
2235                        attr: entry.attr,
2236                    }),
2237                };
2238                new_upper_real.lock().await.replace(ri);
2239                Ok(false)
2240            })
2241            .await?;
2242
2243        if let Some(real_inode) = new_upper_real.lock().await.take() {
2244            // update upper_inode and first_inode()
2245            node.add_upper_inode(real_inode, true).await;
2246        }
2247
2248        Ok(node)
2249    }
2250
2251    /// macOS-only: try `clonefile(2)` between two passthrough layers.
2252    ///
2253    /// Returns `Ok(Some(node))` when the clone landed and the upper inode
2254    /// was attached to `node`; `Ok(None)` when one of the layers has no
2255    /// host-fs path or the kernel rejected the clone (cross-volume,
2256    /// non-APFS). Errors propagate the underlying `io::Error`.
2257    ///
2258    /// This is structurally an optimization shortcut; the slow read/write
2259    /// path remains the source of truth, so any failure here just falls
2260    /// through.
2261    #[cfg(target_os = "macos")]
2262    async fn try_macos_apfs_clone_up(
2263        &self,
2264        lower_layer: &Arc<BoxedLayer>,
2265        lower_inode: Inode,
2266        parent_node: &Arc<OverlayInode>,
2267        node: &Arc<OverlayInode>,
2268    ) -> Result<Option<Arc<OverlayInode>>> {
2269        use std::ffi::CString;
2270        use std::os::unix::ffi::OsStrExt;
2271
2272        let Some(src_path) = lower_layer.host_path_of(lower_inode).await else {
2273            return Ok(None);
2274        };
2275
2276        // Capture upper-parent layer + inode under the parent's
2277        // `handle_upper_inode_locked`. We don't yet know the upper layer's
2278        // host_path_of result.
2279        let parent_layer_inode = std::sync::Arc::new(tokio::sync::Mutex::new(None));
2280        {
2281            let pli = parent_layer_inode.clone();
2282            parent_node
2283                .handle_upper_inode_locked(&mut |parent_upper: Option<Arc<RealInode>>| async {
2284                    if let Some(p) = parent_upper {
2285                        *pli.lock().await = Some((p.layer.clone(), p.inode));
2286                    }
2287                    Ok(false)
2288                })
2289                .await?;
2290        }
2291        let Some((upper_layer, upper_parent_inode)) = parent_layer_inode.lock().await.clone()
2292        else {
2293            return Ok(None);
2294        };
2295
2296        let Some(dst_dir_path) = upper_layer.host_path_of(upper_parent_inode).await else {
2297            return Ok(None);
2298        };
2299
2300        let name_owned = node.name.read().await.clone();
2301        let dst_full = dst_dir_path.join(&name_owned);
2302
2303        let src_c = CString::new(src_path.as_os_str().as_bytes())
2304            .map_err(|_| Error::from_raw_os_error(libc::EINVAL))?;
2305        let dst_c = CString::new(dst_full.as_os_str().as_bytes())
2306            .map_err(|_| Error::from_raw_os_error(libc::EINVAL))?;
2307
2308        match crate::passthrough::util::try_apfs_clonefile(&src_c, &dst_c) {
2309            Ok(true) => {}
2310            Ok(false) => return Ok(None),
2311            Err(e) => {
2312                // EEXIST is unexpected here (caller path is "no upper exists
2313                // yet"). Surface real errors instead of silently falling
2314                // through, so we don't mask data-loss bugs.
2315                if e.raw_os_error() != Some(libc::ENOTSUP) && e.raw_os_error() != Some(libc::EXDEV)
2316                {
2317                    return Err(e);
2318                }
2319                return Ok(None);
2320            }
2321        }
2322
2323        // Clone landed. Now look it up through the upper layer so the
2324        // RealInode tracks the new entry and node.add_upper_inode wires it
2325        // into the OverlayInode tree.
2326        let entry = upper_layer
2327            .lookup(
2328                Request::default(),
2329                upper_parent_inode,
2330                OsStr::new(&name_owned),
2331            )
2332            .await?;
2333        let real = RealInode {
2334            layer: upper_layer,
2335            in_upper_layer: true,
2336            inode: entry.attr.ino,
2337            whiteout: false,
2338            opaque: false,
2339            stat: Some(ReplyAttr {
2340                ttl: entry.ttl,
2341                attr: entry.attr,
2342            }),
2343        };
2344        node.add_upper_inode(real, true).await;
2345        Ok(Some(Arc::clone(node)))
2346    }
2347
2348    /// Copies a regular file and its contents from a lower layer to the upper layer.
2349    ///
2350    /// This function is a core part of the copy-up process, triggered when a regular file
2351    /// that only exists in a lower layer is written to. It creates an empty file in the
2352    /// upper layer with the original file's attributes (mode, UID, GID), and then copies
2353    /// the entire content from the lower layer file to the new upper layer file.
2354    async fn copy_regfile_up(
2355        &self,
2356        ctx: Request,
2357        node: Arc<OverlayInode>,
2358    ) -> Result<Arc<OverlayInode>> {
2359        if node.in_upper_layer().await {
2360            return Ok(node);
2361        }
2362
2363        let parent_node = if let Some(ref n) = node.parent.lock().await.upgrade() {
2364            Arc::clone(n)
2365        } else {
2366            return Err(Error::other("no parent?"));
2367        };
2368
2369        // To preserve original ownership, we must get the raw, unmapped host attributes.
2370        // We achieve this by calling `do_getattr_helper`, which is specifically designed
2371        // to bypass the ID mapping logic. This is safe and does not affect other
2372        // functionalities because `do_getattr_helper` and the standard `stat64()` call
2373        // both rely on the same underlying `stat` system call; they only differ in
2374        // whether the resulting `uid` and `gid` are mapped.
2375        let (lower_layer, _, lower_inode) = node.first_layer_inode().await;
2376        let re = lower_layer.do_getattr_helper(lower_inode, None).await?;
2377        let st = ReplyAttr {
2378            ttl: re.1,
2379            attr: convert_stat64_to_file_attr(re.0),
2380        };
2381        trace!(
2382            "copy_regfile_up: node {} in lower layer's inode {}",
2383            node.inode, lower_inode
2384        );
2385
2386        if !parent_node.in_upper_layer().await {
2387            parent_node.clone().create_upper_dir(ctx, None).await?;
2388        }
2389
2390        // === macOS APFS reflink fast path =================================
2391        // When both source and destination are passthrough on the same APFS
2392        // volume, `clonefile(2)` produces an O(1) copy-on-write copy. The
2393        // result is byte-identical to the lower file with all metadata
2394        // (mode, uid, gid, xattrs, mtime) preserved, so we can skip the
2395        // create + read/write loop entirely.
2396        //
2397        // Falls back silently to the slow path on:
2398        //   * Layers without a host-fs path (memory FS, etc.)
2399        //   * Cross-volume copy (EXDEV)
2400        //   * Filesystems that don't support clones (ENOTSUP)
2401        #[cfg(target_os = "macos")]
2402        if let Some(node) = self
2403            .try_macos_apfs_clone_up(&lower_layer, lower_inode, &parent_node, &node)
2404            .await?
2405        {
2406            return Ok(node);
2407        }
2408
2409        // create the file in upper layer using information from lower layer
2410
2411        let flags = libc::O_WRONLY;
2412        let mode = mode_from_kind_and_perm(st.attr.kind, st.attr.perm);
2413
2414        let upper_handle = Arc::new(Mutex::new(0));
2415        let upper_real_inode = Arc::new(Mutex::new(None));
2416        parent_node
2417            .handle_upper_inode_locked(&mut |parent_upper_inode: Option<Arc<RealInode>>| async {
2418                // We already create upper dir for parent_node.
2419                let parent_real_inode = parent_upper_inode.ok_or_else(|| {
2420                    error!("parent {} has no upper inode", parent_node.inode);
2421                    Error::from_raw_os_error(libc::EINVAL)
2422                })?;
2423                // We manually unfold the `create` logic here instead of calling the `create` method directly.
2424                // This is necessary to preserve the original file's UID and GID during the copy-up process.
2425                if !parent_real_inode.in_upper_layer {
2426                    return Err(Error::from_raw_os_error(libc::EROFS));
2427                }
2428                let name = node.name.read().await;
2429                let name = OsStr::new(name.as_str());
2430                let create_rep = parent_real_inode
2431                    .layer
2432                    .do_create_helper(
2433                        ctx,
2434                        parent_real_inode.inode,
2435                        name,
2436                        mode,
2437                        flags.try_into().unwrap(),
2438                        st.attr.uid,
2439                        st.attr.gid,
2440                    )
2441                    .await?;
2442
2443                let (inode, h) = (
2444                    RealInode {
2445                        layer: parent_real_inode.layer.clone(),
2446                        in_upper_layer: true,
2447                        inode: create_rep.attr.ino,
2448                        whiteout: false,
2449                        opaque: false,
2450                        stat: Some(ReplyAttr {
2451                            ttl: create_rep.ttl,
2452                            attr: create_rep.attr,
2453                        }),
2454                    },
2455                    Some(create_rep.fh),
2456                );
2457                trace!(
2458                    "copy_regfile_up: created upper file {name:?} with inode {}",
2459                    inode.inode
2460                );
2461                *upper_handle.lock().await = h.unwrap_or(0);
2462                upper_real_inode.lock().await.replace(inode);
2463                Ok(false)
2464            })
2465            .await?;
2466
2467        let rep = lower_layer
2468            .open(ctx, lower_inode, libc::O_RDONLY as u32)
2469            .await?;
2470
2471        let lower_handle = rep.fh;
2472
2473        // need to use work directory and then rename file to
2474        // final destination for atomic reasons.. not deal with it for now,
2475        // use stupid copy at present.
2476        // FIXME: this need a lot of work here, ntimes, xattr, etc.
2477
2478        // Copy from lower real inode to upper real inode.
2479        // TODO: use sendfile here.
2480
2481        let u_handle = *upper_handle.lock().await;
2482        let ri = upper_real_inode.lock().await.take();
2483        if let Some(ri) = ri {
2484            let mut offset: usize = 0;
2485            let size = 4 * 1024 * 1024;
2486
2487            loop {
2488                let ret = lower_layer
2489                    .read(ctx, lower_inode, lower_handle, offset as u64, size)
2490                    .await?;
2491
2492                let len = ret.data.len();
2493                if len == 0 {
2494                    break;
2495                }
2496
2497                let ret = ri
2498                    .layer
2499                    .write(ctx, ri.inode, u_handle, offset as u64, &ret.data, 0, 0)
2500                    .await?;
2501
2502                assert_eq!(ret.written as usize, len);
2503                offset += ret.written as usize;
2504            }
2505
2506            if let Err(e) = ri.layer.release(ctx, ri.inode, u_handle, 0, 0, true).await {
2507                let e: std::io::Error = e.into();
2508                // Ignore ENOSYS.
2509                if e.raw_os_error() != Some(libc::ENOSYS) {
2510                    return Err(e);
2511                }
2512            }
2513            node.add_upper_inode(ri, true).await;
2514        } else {
2515            error!("BUG: upper real inode is None after copy up");
2516        }
2517
2518        lower_layer
2519            .release(ctx, lower_inode, lower_handle, 0, 0, true)
2520            .await?;
2521
2522        Ok(Arc::clone(&node))
2523    }
2524
2525    /// Copies the specified node to the upper layer of the filesystem
2526    ///
2527    /// Performs different operations based on the node type:
2528    /// - **Directory**: Creates a corresponding directory in the upper layer
2529    /// - **Symbolic link**: Recursively copies to the upper layer
2530    /// - **Regular file**: Copies file content to the upper layer
2531    ///
2532    /// # Parameters
2533    /// * `ctx`: FUSE request context
2534    /// * `node`: Reference to the node to be copied
2535    ///
2536    /// # Returns
2537    /// Returns a reference to the upper-layer node on success, or an error on failure
2538    async fn copy_node_up(
2539        &self,
2540        ctx: Request,
2541        node: Arc<OverlayInode>,
2542    ) -> Result<Arc<OverlayInode>> {
2543        if node.in_upper_layer().await {
2544            return Ok(node);
2545        }
2546
2547        let st = node.stat64(ctx).await?;
2548        match st.attr.kind {
2549            FileType::Directory => {
2550                node.clone().create_upper_dir(ctx, None).await?;
2551                Ok(node)
2552            }
2553            FileType::Symlink => {
2554                // For symlink.
2555                self.copy_symlink_up(ctx, node).await
2556            }
2557            FileType::RegularFile => {
2558                // For regular file.
2559                self.copy_regfile_up(ctx, node).await
2560            }
2561            _ => {
2562                // For other file types. return error.
2563                Err(Error::from_raw_os_error(libc::EINVAL))
2564            }
2565        }
2566    }
2567
2568    /// recursively copy directory and all its contents to upper layer
2569    async fn copy_directory_up(
2570        &self,
2571        ctx: Request,
2572        node: Arc<OverlayInode>,
2573    ) -> Result<Arc<OverlayInode>> {
2574        // Ensure the directory itself is copied up first
2575        self.copy_node_up(ctx, node.clone()).await?;
2576
2577        // load directory to cache
2578        self.load_directory(ctx, &node).await?;
2579
2580        // go through all children
2581        let children = node.childrens.lock().await.clone();
2582        for (_name, child) in children.iter() {
2583            if _name == "." || _name == ".." {
2584                continue;
2585            }
2586            // jump over whiteout
2587            if child.whiteout.load(Ordering::Relaxed) {
2588                continue;
2589            }
2590            let st = child.stat64(ctx).await?;
2591            if !child.in_upper_layer().await {
2592                match st.attr.kind {
2593                    FileType::Directory => {
2594                        // recursively copy subdirectory
2595                        Box::pin(self.copy_directory_up(ctx, child.clone())).await?;
2596                    }
2597                    FileType::Symlink | FileType::RegularFile => {
2598                        // copy node up symlink or regular file
2599                        Box::pin(self.copy_node_up(ctx, child.clone())).await?;
2600                    }
2601                    _ => {
2602                        // other file types are ignored
2603                    }
2604                }
2605            } else if utils::is_dir(&st.attr.kind) {
2606                // If it is already in the upper layer, but the directory is not loaded,
2607                // ensure that its contents are also copied up recursively.
2608                Box::pin(self.copy_directory_up(ctx, child.clone())).await?;
2609            }
2610        }
2611
2612        Ok(node)
2613    }
2614
    /// Removes the entry `name` from the directory `parent`.
    ///
    /// `dir` selects directory removal (`rmdir`) versus file removal (`unlink`).
    /// When the entry is present in the upper layer it is removed there; a
    /// whiteout is then created in the parent's upper directory unless the entry
    /// was upper-only or the parent is opaque.
    ///
    /// # Errors
    /// * `EROFS` when there is no upper layer.
    /// * `ENOENT` when the parent or the child is a whiteout.
    /// * `EINVAL` when `name` is not valid UTF-8, or the parent has no upper inode
    ///   after copy-up.
    /// * `ENOTEMPTY` when `dir` is set and the directory still has entries.
    /// * Any error propagated from lookups, copy-up, or the layer operations.
    async fn do_rm(&self, ctx: Request, parent: u64, name: &OsStr, dir: bool) -> Result<()> {
        // 1. Read-only mount guard
        if self.upper_layer.is_none() {
            return Err(Error::from_raw_os_error(libc::EROFS));
        }

        // 2. Locate the parent Overlay Inode; a whited-out parent no longer exists.
        let pnode = self.lookup_node(ctx, parent, "").await?;
        if pnode.whiteout.load(Ordering::Relaxed) {
            return Err(Error::from_raw_os_error(libc::ENOENT));
        }
        // The in-memory child map is keyed by `&str`, so the name must be UTF-8.
        let to_name = name
            .to_str()
            .ok_or_else(|| Error::from_raw_os_error(libc::EINVAL))?;

        // 3. Locate the child Overlay Inode for the given name
        let node = self.lookup_node(ctx, parent, to_name).await?;
        if node.whiteout.load(Ordering::Relaxed) {
            // already deleted.
            return Err(Error::from_raw_os_error(libc::ENOENT));
        }

        // 4. If removing a directory, ensure it is empty of real entries
        if dir {
            self.load_directory(ctx, &node).await?;
            let (count, whiteouts) = node.count_entries_and_whiteout(ctx).await?;
            trace!("entries: {count}, whiteouts: {whiteouts}\n");
            if count > 0 {
                return Err(Error::from_raw_os_error(libc::ENOTEMPTY));
            }

            // Delete all whiteouts.
            if whiteouts > 0 && node.in_upper_layer().await {
                self.empty_node_directory(ctx, Arc::clone(&node)).await?;
            }

            trace!("whiteouts deleted!\n");
        }

        // 5. Decide whether we need to create a whiteout entry
        // We'll flip this off if upper-layer unlink suffices or parent is opaque
        let need_whiteout = AtomicBool::new(true);
        // The parent must exist in the upper layer for the removal and any whiteout.
        let pnode = self.copy_node_up(ctx, Arc::clone(&pnode)).await?;

        if node.upper_layer_only().await {
            need_whiteout.store(false, Ordering::Relaxed);
        }

        // Removes the entry from the parent's upper directory. Only invoked below
        // when the child itself is in the upper layer.
        let mut df = |parent_upper_inode: Option<Arc<RealInode>>| async {
            let parent_real_inode = parent_upper_inode.ok_or_else(|| {
                error!(
                    "BUG: parent {} has no upper inode after copy up",
                    pnode.inode
                );
                Error::from_raw_os_error(libc::EINVAL)
            })?;

            // Parent is opaque, it shadows everything in lower layers so no need to create extra whiteouts.
            if parent_real_inode.opaque {
                need_whiteout.store(false, Ordering::Relaxed);
            }
            if dir {
                parent_real_inode
                    .layer
                    .rmdir(ctx, parent_real_inode.inode, name)
                    .await?;
            } else {
                parent_real_inode
                    .layer
                    .unlink(ctx, parent_real_inode.inode, name)
                    .await?;
            }

            Ok(false)
        };

        // 6. Perform the unlink/rmdir operation and memory cleanup
        if node.in_upper_layer().await {
            pnode.handle_upper_inode_locked(&mut df).await?;
        }
        // Drop the child from the parent's map and from the inode store.
        pnode.remove_child(to_name).await;
        let path = node.path.read().await.clone();
        self.remove_inode(node.inode, Some(path)).await;

        // 7. If needed, create an entry in the upper layer to mask lower-layer files
        if need_whiteout.load(Ordering::Relaxed) {
            trace!("do_rm: creating whiteout\n");
            // pnode is copied up, so it has upper layer.
            pnode
                .handle_upper_inode_locked(
                    &mut |parent_upper_inode: Option<Arc<RealInode>>| async {
                        let parent_real_inode = parent_upper_inode.ok_or_else(|| {
                            error!(
                                "BUG: parent {} has no upper inode after copy up",
                                pnode.inode
                            );
                            Error::from_raw_os_error(libc::EINVAL)
                        })?;

                        // FIXME: marker left by the original author; the concern is not stated.
                        let child_ri = parent_real_inode.create_whiteout(ctx, to_name).await?;
                        // Register the whiteout as a fresh overlay inode under the parent.
                        let path = format!("{}/{}", pnode.path.read().await, to_name);
                        let ino: u64 = self.alloc_inode(&path).await?;
                        let ovi = Arc::new(
                            OverlayInode::new_from_real_inode(to_name, ino, path.clone(), child_ri)
                                .await,
                        );

                        self.insert_inode(ino, ovi.clone()).await;
                        pnode.insert_child(to_name, ovi.clone()).await;
                        Ok(false)
                    },
                )
                .await?;
        }

        Ok(())
    }
2734
2735    async fn do_fsync(
2736        &self,
2737        ctx: Request,
2738        inode: Inode,
2739        datasync: bool,
2740        handle: Handle,
2741        syncdir: bool,
2742    ) -> Result<()> {
2743        // Use O_RDONLY flags which indicates no copy up.
2744        let data = self
2745            .get_data(ctx, Some(handle), inode, libc::O_RDONLY as u32)
2746            .await?;
2747
2748        trace!("do_fsync: got data for handle: {handle}, inode:{inode}");
2749
2750        match data.real_handle {
2751            // FIXME: need to test if inode matches corresponding handle?
2752            None => {
2753                trace!("do_fsync: no real handle found for handle: {handle}, inode:{inode}");
2754                Err(Error::from_raw_os_error(libc::ENOENT))
2755            }
2756            Some(ref rh) => {
2757                let real_handle = rh.handle.load(Ordering::Relaxed);
2758                // TODO: check if it's in upper layer? @weizhang555
2759                if syncdir {
2760                    trace!(
2761                        "do_fsync: layer.fsyncdir called for handle: {}, inode:{}; rh.inode: {}, real_handle: {}",
2762                        handle, inode, rh.inode, real_handle
2763                    );
2764                    rh.layer
2765                        .fsyncdir(ctx, rh.inode, real_handle, datasync)
2766                        .await
2767                        .map_err(|e| e.into())
2768                } else {
2769                    rh.layer
2770                        .fsync(ctx, rh.inode, real_handle, datasync)
2771                        .await
2772                        .map_err(|e| e.into())
2773                }
2774            }
2775        }
2776    }
2777
2778    // Delete everything in the directory only on upper layer, ignore lower layers.
2779    async fn empty_node_directory(&self, ctx: Request, node: Arc<OverlayInode>) -> Result<()> {
2780        let st = node.stat64(ctx).await?;
2781        if !utils::is_dir(&st.attr.kind) {
2782            // This function can only be called on directories.
2783            return Err(Error::from_raw_os_error(libc::ENOTDIR));
2784        }
2785
2786        let (layer, in_upper, inode) = node.first_layer_inode().await;
2787        if !in_upper {
2788            return Ok(());
2789        }
2790
2791        // Copy node.childrens Hashmap to Vector, the Vector is also used as temp storage,
2792        // Without this, Rust won't allow us to remove them from node.childrens.
2793        let iter = node
2794            .childrens
2795            .lock()
2796            .await
2797            .values()
2798            .cloned()
2799            .collect::<Vec<_>>();
2800
2801        for child in iter {
2802            // We only care about upper layer, ignore lower layers.
2803            if child.in_upper_layer().await {
2804                let child_name = child.name.read().await.clone();
2805                let child_name_os = OsStr::new(&child_name);
2806                if child.whiteout.load(Ordering::Relaxed) {
2807                    layer.delete_whiteout(ctx, inode, child_name_os).await?
2808                } else {
2809                    let s = child.stat64(ctx).await?;
2810                    let cname: &OsStr = OsStr::new(&child_name_os);
2811                    if utils::is_dir(&s.attr.kind) {
2812                        let (count, whiteouts) = child.count_entries_and_whiteout(ctx).await?;
2813                        if count + whiteouts > 0 {
2814                            let cb = child.clone();
2815                            Box::pin(async move { self.empty_node_directory(ctx, cb).await })
2816                                .await?;
2817                        }
2818                        layer.rmdir(ctx, inode, cname).await?
2819                    } else {
2820                        layer.unlink(ctx, inode, cname).await?;
2821                    }
2822                }
2823
2824                let cpath = child.path.read().await.clone();
2825                // delete the child
2826                self.remove_inode(child.inode, Some(cpath)).await;
2827                node.remove_child(&child_name).await;
2828            }
2829        }
2830
2831        Ok(())
2832    }
2833
2834    async fn find_real_info_from_handle(
2835        &self,
2836        handle: Handle,
2837    ) -> Result<(Arc<BoxedLayer>, Inode, Handle)> {
2838        match self.handles.lock().await.get(&handle) {
2839            Some(h) => match h.real_handle {
2840                Some(ref rhd) => {
2841                    trace!(
2842                        "find_real_info_from_handle: layer in upper: {}",
2843                        rhd.in_upper_layer
2844                    );
2845                    Ok((
2846                        rhd.layer.clone(),
2847                        rhd.inode,
2848                        rhd.handle.load(Ordering::Relaxed),
2849                    ))
2850                }
2851                None => Err(Error::from_raw_os_error(libc::ENOENT)),
2852            },
2853
2854            None => Err(Error::from_raw_os_error(libc::ENOENT)),
2855        }
2856    }
2857
2858    async fn find_real_inode(&self, inode: Inode) -> Result<(Arc<BoxedLayer>, Inode)> {
2859        if let Some(n) = self.get_active_inode(inode).await {
2860            let (first_layer, _, first_inode) = n.first_layer_inode().await;
2861            return Ok((first_layer, first_inode));
2862        } else if let Some(n) = self.get_all_inode(inode).await {
2863            trace!("find_real_inode: found inode by get_all_inode: {}", n.inode);
2864            let (first_layer, _, first_inode) = n.first_layer_inode().await;
2865            return Ok((first_layer, first_inode));
2866        }
2867
2868        Err(Error::from_raw_os_error(libc::ENOENT))
2869    }
2870
2871    async fn get_data(
2872        &self,
2873        ctx: Request,
2874        handle: Option<Handle>,
2875        inode: Inode,
2876        flags: u32,
2877    ) -> Result<Arc<HandleData>> {
2878        let no_open = self.no_open.load(Ordering::Relaxed);
2879        if !no_open {
2880            if let Some(h) = handle
2881                && let Some(v) = self.handles.lock().await.get(&h)
2882                && v.node.inode == inode
2883            {
2884                // trace!("get_data: found handle");
2885                return Ok(Arc::clone(v));
2886            }
2887        } else {
2888            let readonly: bool = flags
2889                & (libc::O_APPEND | libc::O_CREAT | libc::O_TRUNC | libc::O_RDWR | libc::O_WRONLY)
2890                    as u32
2891                == 0;
2892
2893            // lookup node
2894            let node = self.lookup_node(ctx, inode, "").await?;
2895
2896            // whiteout node
2897            if node.whiteout.load(Ordering::Relaxed) {
2898                return Err(Error::from_raw_os_error(libc::ENOENT));
2899            }
2900
2901            if !readonly {
2902                // Check if upper layer exists, return EROFS is not exists.
2903                self.upper_layer
2904                    .as_ref()
2905                    .cloned()
2906                    .ok_or_else(|| Error::from_raw_os_error(libc::EROFS))?;
2907                // copy up to upper layer
2908                self.copy_node_up(ctx, Arc::clone(&node)).await?;
2909            }
2910
2911            let (layer, in_upper_layer, inode) = node.first_layer_inode().await;
2912            let handle_data = HandleData {
2913                node: Arc::clone(&node),
2914                real_handle: Some(RealHandle {
2915                    layer,
2916                    in_upper_layer,
2917                    inode,
2918                    handle: AtomicU64::new(0),
2919                }),
2920                dir_snapshot: Mutex::new(None),
2921            };
2922            return Ok(Arc::new(handle_data));
2923        }
2924
2925        Err(Error::from_raw_os_error(libc::ENOENT))
2926    }
2927
2928    // extend or init the inodes number to one overlay if the current number is done.
2929    pub async fn extend_inode_alloc(&self, key: u64) {
2930        let next_inode = key * INODE_ALLOC_BATCH;
2931        let limit_inode = next_inode + INODE_ALLOC_BATCH - 1;
2932        self.inodes
2933            .write()
2934            .await
2935            .extend_inode_number(next_inode, limit_inode);
2936    }
2937}
2938
/// Wraps the parameters for mounting an overlay filesystem with [`mount_fs`].
///
/// The type parameters let callers pass any path-like or string-like values
/// without converting them first.
#[derive(Debug, Clone)]
pub struct OverlayArgs<P, Q, R, M, N, I>
where
    P: AsRef<Path>,
    Q: AsRef<Path>,
    R: AsRef<Path>,
    M: AsRef<str>,
    N: Into<String>,
    I: IntoIterator<Item = R>,
{
    /// Path at which the overlay filesystem is mounted.
    pub mountpoint: P,
    /// Root directory of the upper layer.
    pub upperdir: Q,
    /// Root directories of the lower layers; one layer is created per item, in
    /// iteration order.
    pub lowerdir: I,
    /// `true` selects the privileged mount call, `false` the unprivileged one.
    pub privileged: bool,
    /// Optional mapping string handed to every passthrough layer.
    pub mapping: Option<M>,
    /// Optional filesystem name set on the mount options.
    pub name: Option<N>,
    /// Forwarded to the mount options' `allow_other` setting.
    pub allow_other: bool,
}
2958
2959/// Mounts the filesystem using the given parameters and returns the mount handle.
2960///
2961/// # Parameters
2962/// - `mountpoint`: Path to the mount point.
2963/// - `upperdir`: Path to the upper directory.
2964/// - `lowerdir`: Paths to the lower directories.
2965/// - `privileged`: If true, use privileged mount; otherwise, unprivileged mount.
2966/// - `mapping`: Optional user/group ID mapping for unprivileged mounts.
2967/// - `name`: Optional name for the filesystem.
2968/// - `allow_other`: If true, allows other users to access the filesystem.
2969///
2970/// # Returns
2971/// A mount handle on success.
2972pub async fn mount_fs<P, Q, R, M, N, I>(
2973    args: OverlayArgs<P, Q, R, M, N, I>,
2974) -> rfuse3::raw::MountHandle
2975where
2976    P: AsRef<Path>,
2977    Q: AsRef<Path>,
2978    R: AsRef<Path>,
2979    M: AsRef<str>,
2980    N: Into<String>,
2981    I: IntoIterator<Item = R>,
2982{
2983    // Create lower layers
2984    let mut lower_layers = Vec::new();
2985    for lower in args.lowerdir {
2986        let layer = new_passthroughfs_layer(PassthroughArgs {
2987            root_dir: lower,
2988            mapping: args.mapping.as_ref().map(|m| m.as_ref()),
2989        })
2990        .await
2991        .expect("Failed to create lower filesystem layer");
2992        lower_layers.push(Arc::new(layer));
2993    }
2994    // Create upper layer
2995    let upper_layer = Arc::new(
2996        new_passthroughfs_layer(PassthroughArgs {
2997            root_dir: args.upperdir,
2998            mapping: args.mapping.as_ref().map(|m| m.as_ref()),
2999        })
3000        .await
3001        .expect("Failed to create upper filesystem layer"),
3002    );
3003
3004    // Configure overlay filesystem
3005    let config = Config {
3006        mountpoint: args.mountpoint.as_ref().to_path_buf(),
3007        do_import: true,
3008        ..Default::default()
3009    };
3010    let overlayfs = OverlayFs::new(Some(upper_layer), lower_layers, config, 1)
3011        .expect("Failed to initialize OverlayFs");
3012    let logfs = LoggingFileSystem::new(overlayfs);
3013
3014    let mount_path: OsString = OsString::from(args.mountpoint.as_ref().as_os_str());
3015
3016    // Obtain the current user's uid and gid
3017    let uid = unsafe { libc::getuid() };
3018    let gid = unsafe { libc::getgid() };
3019
3020    let mut mount_options = MountOptions::default();
3021    #[cfg(target_os = "linux")]
3022    mount_options.force_readdir_plus(true);
3023
3024    mount_options
3025        .uid(uid)
3026        .gid(gid)
3027        .allow_other(args.allow_other);
3028    if let Some(name) = args.name {
3029        mount_options.fs_name(name);
3030    }
3031
3032    // Mount filesystem based on privilege flag and return the mount handle
3033    if !args.privileged {
3034        debug!("Mounting with unprivileged mode");
3035        Session::new(mount_options)
3036            .mount_with_unprivileged(logfs, mount_path)
3037            .await
3038            .expect("Unprivileged mount failed")
3039    } else {
3040        debug!("Mounting with privileged mode");
3041        Session::new(mount_options)
3042            .mount(logfs, mount_path)
3043            .await
3044            .expect("Privileged mount failed")
3045    }
3046}