Skip to main content

secure_exec_vfs_core/posix/
overlay_fs.rs

1use super::vfs::{
2    normalize_path, MemoryFileSystem, VfsError, VfsResult, VirtualDirEntry, VirtualFileSystem,
3    VirtualStat, VirtualUtimeSpec,
4};
5use base64::Engine;
6use std::collections::BTreeSet;
7
/// Upper bound used both for symlink-resolution recursion (exceeding it yields
/// `ELOOP`) and for copy-up snapshot traversal depth (exceeding it yields `EINVAL`).
const MAX_SNAPSHOT_DEPTH: usize = 1024;
/// Root of the reserved namespace in which the overlay keeps its own bookkeeping.
const OVERLAY_METADATA_ROOT: &str = "/.secure-exec-overlay";
/// Directory holding one marker file per whited-out path.
const OVERLAY_WHITEOUT_DIR: &str = "/.secure-exec-overlay/whiteouts";
/// Directory holding one marker file per opaque directory.
const OVERLAY_OPAQUE_DIR: &str = "/.secure-exec-overlay/opaque";
12
/// Selects how [`OverlayFileSystem::new`] sets up the writable layer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OverlayMode {
    /// A fresh in-memory upper layer is created and writes are allowed.
    Ephemeral,
    /// No upper layer is created and writes start out locked.
    ReadOnly,
}
18
/// A merged view over a stack of in-memory lower layers and an optional
/// in-memory upper layer.
#[derive(Debug)]
pub struct OverlayFileSystem {
    /// Lower layers, searched in index order; the first layer that has an
    /// entry for a path wins. Never empty after construction.
    lowers: Vec<MemoryFileSystem>,
    /// Upper layer, which also stores the whiteout/opaque marker files. `None`
    /// for overlays built in `OverlayMode::ReadOnly`.
    upper: Option<MemoryFileSystem>,
    /// When set, `writable_upper` refuses access with `EROFS` even if an upper
    /// layer exists.
    writes_locked: bool,
}
25
/// The two kinds of marker file kept under the overlay metadata root.
#[derive(Debug, Clone, Copy)]
enum OverlayMarkerKind {
    /// Marker stored under `OVERLAY_WHITEOUT_DIR`; a path with this marker is
    /// reported as not found by the merged view.
    Whiteout,
    /// Marker stored under `OVERLAY_OPAQUE_DIR`.
    // NOTE(review): presumably hides lower-layer children of the directory —
    // confirm against the directory-listing code outside this section.
    Opaque,
}
31
/// Payload of a snapshotted entry, by entry type.
#[derive(Debug)]
enum OverlaySnapshotKind {
    /// A directory; carries no payload.
    Directory,
    /// A regular file. NOTE(review): the bytes are presumably the file's
    /// contents — confirm where snapshots are built.
    File(Vec<u8>),
    /// A symbolic link. NOTE(review): the string is presumably the link
    /// target — confirm where snapshots are built.
    Symlink(String),
}
38
/// One captured entry: where it lives, its stat record, and its typed payload.
// NOTE(review): the snapshot producers/consumers are outside this section;
// field semantics below are inferred from the types only.
#[derive(Debug)]
struct OverlaySnapshotEntry {
    /// Path of the captured entry.
    path: String,
    /// Stat record captured alongside the entry.
    stat: VirtualStat,
    /// Entry type together with its type-specific payload.
    kind: OverlaySnapshotKind,
}
45
/// Records every upper-layer mutation performed while staging a rename so that a
/// failure between staging and the (successful) upper rename can be rolled back
/// without orphaning staged inodes / `path_index` entries. Unlike
/// `remove_snapshot_entries` (which finalizes a *successful* move and therefore
/// whites out lower-backed source paths), this captures only the entries and
/// markers that staging itself newly created, so they can be removed verbatim
/// and a still-present lower source is never hidden on the error path.
#[derive(Debug, Default)]
struct StagedRollback {
    /// Upper paths newly created by staging, in creation order (a parent is
    /// always recorded before any child created underneath it). The `bool` is
    /// the `is_dir` flag and mirrors the removal split used by
    /// `remove_snapshot_entries`.
    created_paths: Vec<(String, bool)>,
    /// Overlay markers newly set during staging / metadata copy, recorded so the
    /// marker files (themselves upper inodes) are cleared on rollback.
    created_markers: Vec<(OverlayMarkerKind, String)>,
}
63
64impl StagedRollback {
65    fn record_path(&mut self, path: &str, is_dir: bool) {
66        self.created_paths.push((String::from(path), is_dir));
67    }
68
69    fn record_marker(&mut self, kind: OverlayMarkerKind, path: &str) {
70        self.created_markers.push((kind, String::from(path)));
71    }
72}
73
/// Running totals used when checking copy-up work against configured limits.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
struct OverlayCopyUpUsage {
    /// Accumulated byte count (saturating).
    total_bytes: u64,
    /// Accumulated inode count (saturating).
    inode_count: usize,
}
79
80impl OverlayFileSystem {
81    pub fn new(lowers: Vec<MemoryFileSystem>, mode: OverlayMode) -> Self {
82        let mut effective_lowers = lowers;
83        if effective_lowers.is_empty() {
84            effective_lowers.push(MemoryFileSystem::new());
85        }
86
87        let mut upper = match mode {
88            OverlayMode::Ephemeral => Some(MemoryFileSystem::new()),
89            OverlayMode::ReadOnly => None,
90        };
91        if let Some(upper_filesystem) = upper.as_mut() {
92            sync_upper_root_metadata(upper_filesystem, &effective_lowers);
93        }
94
95        Self {
96            lowers: effective_lowers,
97            upper,
98            writes_locked: matches!(mode, OverlayMode::ReadOnly),
99        }
100    }
101
102    pub fn with_upper(lowers: Vec<MemoryFileSystem>, upper: MemoryFileSystem) -> Self {
103        let mut effective_lowers = lowers;
104        if effective_lowers.is_empty() {
105            effective_lowers.push(MemoryFileSystem::new());
106        }
107
108        Self {
109            lowers: effective_lowers,
110            upper: Some(upper),
111            writes_locked: false,
112        }
113    }
114
115    pub fn lock_writes(&mut self) {
116        self.writes_locked = true;
117    }
118
119    fn normalized(path: &str) -> String {
120        normalize_path(path)
121    }
122
123    fn parent_path(path: &str) -> String {
124        let normalized = Self::normalized(path);
125        if normalized == "/" {
126            return String::from("/");
127        }
128
129        match normalized.rsplit_once('/') {
130            Some(("", _)) | None => String::from("/"),
131            Some((parent, _)) => String::from(parent),
132        }
133    }
134
135    fn basename(path: &str) -> String {
136        let normalized = Self::normalized(path);
137        if normalized == "/" {
138            return String::from("/");
139        }
140        normalized
141            .rsplit('/')
142            .find(|component| !component.is_empty())
143            .unwrap_or("")
144            .to_owned()
145    }
146
147    fn validate_destination_parent(&mut self, path: &str) -> VfsResult<()> {
148        let parent = Self::parent_path(path);
149        let resolved_parent = self.resolve_merged_path(&parent, true, 0)?;
150        let stat = self.merged_lstat(&resolved_parent)?;
151        if !stat.is_directory {
152            return Err(Self::not_directory(&parent));
153        }
154        Ok(())
155    }
156
157    fn resolved_destination_path(&self, path: &str) -> VfsResult<String> {
158        let parent = Self::parent_path(path);
159        let resolved_parent = self.resolve_merged_path(&parent, true, 0)?;
160        Ok(Self::join_path(&resolved_parent, &Self::basename(path)))
161    }
162
163    fn resolve_merged_path(
164        &self,
165        path: &str,
166        follow_final_symlink: bool,
167        depth: usize,
168    ) -> VfsResult<String> {
169        if depth > MAX_SNAPSHOT_DEPTH {
170            return Err(VfsError::new(
171                "ELOOP",
172                format!("too many symbolic links while resolving '{path}'"),
173            ));
174        }
175
176        let normalized = Self::normalized(path);
177        if normalized == "/" {
178            return Ok(normalized);
179        }
180
181        let components: Vec<&str> = normalized
182            .split('/')
183            .filter(|component| !component.is_empty())
184            .collect();
185        let mut current = String::from("/");
186
187        for (index, component) in components.iter().enumerate() {
188            let candidate = Self::join_path(&current, component);
189            let is_final = index + 1 == components.len();
190            let should_follow = !is_final || follow_final_symlink;
191
192            if should_follow {
193                if let Ok(stat) = self.merged_lstat(&candidate) {
194                    if stat.is_symbolic_link {
195                        let target = self.read_link_inner(&candidate)?;
196                        let target_path = if target.starts_with('/') {
197                            Self::normalized(&target)
198                        } else {
199                            Self::normalized(&Self::join_path(
200                                &Self::parent_path(&candidate),
201                                &target,
202                            ))
203                        };
204                        let remainder = components[index + 1..].join("/");
205                        let next_path = if remainder.is_empty() {
206                            target_path
207                        } else {
208                            Self::normalized(&Self::join_path(&target_path, &remainder))
209                        };
210                        return self.resolve_merged_path(
211                            &next_path,
212                            follow_final_symlink,
213                            depth + 1,
214                        );
215                    }
216
217                    if !is_final && !stat.is_directory {
218                        return Err(Self::not_directory(&candidate));
219                    }
220                }
221            } else if let Ok(stat) = self.merged_lstat(&candidate) {
222                if !is_final && !stat.is_directory {
223                    return Err(Self::not_directory(&candidate));
224                }
225            }
226
227            current = candidate;
228        }
229
230        Ok(current)
231    }
232
233    fn destination_parent_copy_up_paths(&self, path: &str) -> VfsResult<Vec<String>> {
234        let parent = Self::parent_path(path);
235        let mut paths = Vec::new();
236        let mut seen = BTreeSet::new();
237        self.collect_destination_parent_copy_up_paths(&parent, &mut paths, &mut seen, 0)?;
238        Ok(paths)
239    }
240
    /// Walks the components of `parent` and appends to `paths` every component
    /// that is visible in the merged view but has no entry in the upper layer.
    ///
    /// `seen` de-duplicates across recursive calls, so a path is pushed at most
    /// once. When a component is a symlink, the walk restarts on the link target
    /// joined with the remaining components, with `depth` incremented.
    ///
    /// # Errors
    ///
    /// * `ELOOP` when `depth` exceeds `MAX_SNAPSHOT_DEPTH`.
    /// * Any error from `merged_lstat` (every component must exist in the merged
    ///   view) or from reading a symlink target.
    fn collect_destination_parent_copy_up_paths(
        &self,
        parent: &str,
        paths: &mut Vec<String>,
        seen: &mut BTreeSet<String>,
        depth: usize,
    ) -> VfsResult<()> {
        if depth > MAX_SNAPSHOT_DEPTH {
            return Err(VfsError::new(
                "ELOOP",
                format!("too many symbolic links while resolving '{parent}'"),
            ));
        }

        let normalized = Self::normalized(parent);
        if normalized == "/" {
            // The root never needs to be listed.
            return Ok(());
        }

        let components: Vec<&str> = normalized
            .split('/')
            .filter(|component| !component.is_empty())
            .collect();
        let mut current = String::from("/");
        for (index, component) in components.iter().enumerate() {
            current = Self::join_path(&current, component);
            // Unlike `resolve_merged_path`, a missing component is an error here.
            let stat = self.merged_lstat(&current)?;

            if stat.is_symbolic_link {
                // The link itself is listed when the upper layer lacks it.
                if !self.has_entry_in_upper(&current) && seen.insert(current.clone()) {
                    paths.push(current.clone());
                }

                let target = self.read_link_inner(&current)?;
                // Relative targets are interpreted against the link's parent.
                let target_path = if target.starts_with('/') {
                    Self::normalized(&target)
                } else {
                    Self::normalized(&Self::join_path(&Self::parent_path(&current), &target))
                };
                // Continue on the target plus the components not yet visited.
                let remainder = components[index + 1..].join("/");
                let next_parent = if remainder.is_empty() {
                    target_path
                } else {
                    Self::normalized(&Self::join_path(&target_path, &remainder))
                };
                return self.collect_destination_parent_copy_up_paths(
                    &next_parent,
                    paths,
                    seen,
                    depth + 1,
                );
            }

            // Non-symlink component: listed only if some lower layer has it and
            // the upper layer does not.
            if self.find_lower_by_entry(&current).is_some()
                && !self.has_entry_in_upper(&current)
                && seen.insert(current.clone())
            {
                paths.push(current.clone());
            }
        }

        Ok(())
    }
304
305    fn encode_marker_path(path: &str) -> String {
306        base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(path)
307    }
308
309    fn marker_directory(kind: OverlayMarkerKind) -> &'static str {
310        match kind {
311            OverlayMarkerKind::Whiteout => OVERLAY_WHITEOUT_DIR,
312            OverlayMarkerKind::Opaque => OVERLAY_OPAQUE_DIR,
313        }
314    }
315
316    fn marker_path(kind: OverlayMarkerKind, path: &str) -> String {
317        format!(
318            "{}/{}",
319            Self::marker_directory(kind),
320            Self::encode_marker_path(&Self::normalized(path))
321        )
322    }
323
324    fn is_internal_metadata_path(path: &str) -> bool {
325        let normalized = Self::normalized(path);
326        normalized == OVERLAY_METADATA_ROOT
327            || normalized.starts_with(&(String::from(OVERLAY_METADATA_ROOT) + "/"))
328    }
329
330    /// Returns true if `path`, or the location it resolves to through symlinks,
331    /// lands in the reserved overlay metadata namespace.
332    ///
333    /// The lexical [`is_internal_metadata_path`] check alone is bypassable: the
334    /// underlying `MemoryFileSystem` follows symlinks, so a guest-created symlink
335    /// whose resolved target enters `/.secure-exec-overlay` (directly, or via a
336    /// symlink to an ancestor such as `/`) would slip past a purely lexical guard
337    /// and let the guest read or tamper with whiteout/opaque markers (e.g.
338    /// resurrecting a deleted lower-layer file). Resolving before the check
339    /// closes that hole while leaving ordinary symlinks unaffected.
340    fn touches_internal_metadata(&self, path: &str) -> bool {
341        if Self::is_internal_metadata_path(path) {
342            return true;
343        }
344        if let Ok(resolved) = self.resolve_merged_path(path, true, 0) {
345            if Self::is_internal_metadata_path(&resolved) {
346                return true;
347            }
348        }
349        if let Ok(resolved) = self.resolved_destination_path(path) {
350            if Self::is_internal_metadata_path(&resolved) {
351                return true;
352            }
353        }
354        false
355    }
356
357    fn hidden_root_entry_name() -> &'static str {
358        ".secure-exec-overlay"
359    }
360
361    fn should_hide_directory_entry(path: &str, entry: &str) -> bool {
362        let normalized = Self::normalized(path);
363        normalized == "/" && entry == Self::hidden_root_entry_name()
364    }
365
366    fn should_ignore_raw_directory_entry(
367        upper: Option<&MemoryFileSystem>,
368        path: &str,
369        entry: &str,
370    ) -> bool {
371        if entry == "." || entry == ".." || Self::should_hide_directory_entry(path, entry) {
372            return true;
373        }
374
375        let entry_path = Self::join_path(path, entry);
376        Self::marker_exists_in_upper(upper, OverlayMarkerKind::Whiteout, &entry_path)
377    }
378
379    fn check_copy_up_usage_limits(
380        usage: &OverlayCopyUpUsage,
381        max_bytes: Option<u64>,
382        max_inodes: Option<usize>,
383    ) -> VfsResult<()> {
384        if let Some(limit) = max_bytes {
385            if usage.total_bytes > limit {
386                return Err(VfsError::new(
387                    "ENOSPC",
388                    format!(
389                        "overlay rename copy-up bytes {} exceed configured limit {}",
390                        usage.total_bytes, limit
391                    ),
392                ));
393            }
394        }
395
396        if let Some(limit) = max_inodes {
397            if usage.inode_count > limit {
398                return Err(VfsError::new(
399                    "ENOSPC",
400                    format!(
401                        "overlay rename copy-up inodes {} exceed configured limit {}",
402                        usage.inode_count, limit
403                    ),
404                ));
405            }
406        }
407
408        Ok(())
409    }
410
411    fn add_copy_up_usage(
412        usage: &mut OverlayCopyUpUsage,
413        bytes: u64,
414        inodes: usize,
415        max_bytes: Option<u64>,
416        max_inodes: Option<usize>,
417    ) -> VfsResult<()> {
418        usage.total_bytes = usage.total_bytes.saturating_add(bytes);
419        usage.inode_count = usage.inode_count.saturating_add(inodes);
420        Self::check_copy_up_usage_limits(usage, max_bytes, max_inodes)
421    }
422
423    fn remaining_inode_budget(
424        usage: &OverlayCopyUpUsage,
425        max_inodes: Option<usize>,
426    ) -> Option<usize> {
427        max_inodes.map(|limit| limit.saturating_sub(usage.inode_count))
428    }
429
430    fn copy_up_directory_entries_limited(
431        &mut self,
432        path: &str,
433        max_entries: Option<usize>,
434    ) -> VfsResult<Vec<String>> {
435        let Some(max_entries) = max_entries else {
436            return self.read_dir(path);
437        };
438
439        match self.read_dir_limited(path, max_entries) {
440            Ok(entries) => Ok(entries),
441            Err(error) if error.code() == "ENOMEM" => Err(VfsError::new(
442                "ENOSPC",
443                format!("overlay rename copy-up directory '{path}' exceeds configured inode limit"),
444            )),
445            Err(error) => Err(error),
446        }
447    }
448
449    fn directory_has_visible_entries_limited(&mut self, path: &str) -> VfsResult<bool> {
450        match self.read_dir_limited(path, 1) {
451            Ok(entries) => Ok(!entries.is_empty()),
452            Err(error) if error.code() == "ENOMEM" => Ok(true),
453            Err(error) => Err(error),
454        }
455    }
456
457    fn memory_subtree_usage_limited(
458        filesystem: &mut MemoryFileSystem,
459        path: &str,
460        max_bytes: Option<u64>,
461        max_inodes: Option<usize>,
462    ) -> VfsResult<OverlayCopyUpUsage> {
463        let mut usage = OverlayCopyUpUsage::default();
464        let mut visited = BTreeSet::new();
465        let mut pending = vec![Self::normalized(path)];
466        while let Some(current_path) = pending.pop() {
467            let stat = filesystem.lstat(&current_path)?;
468            if visited.insert(stat.ino) {
469                let bytes = if stat.is_directory && !stat.is_symbolic_link {
470                    0
471                } else {
472                    stat.size
473                };
474                Self::add_copy_up_usage(&mut usage, bytes, 1, max_bytes, max_inodes)?;
475            }
476
477            if stat.is_directory && !stat.is_symbolic_link {
478                let remaining = Self::remaining_inode_budget(&usage, max_inodes);
479                let children = if let Some(max_entries) = remaining {
480                    filesystem.read_dir_limited(&current_path, max_entries)?
481                } else {
482                    filesystem.read_dir(&current_path)?
483                };
484                for entry in children.into_iter().rev() {
485                    if matches!(entry.as_str(), "." | "..") {
486                        continue;
487                    }
488                    if Self::should_hide_directory_entry(&current_path, &entry) {
489                        continue;
490                    }
491                    pending.push(Self::join_path(&current_path, &entry));
492                }
493            }
494        }
495
496        Ok(usage)
497    }
498
499    fn memory_subtree_released_usage(
500        filesystem: &mut MemoryFileSystem,
501        path: &str,
502    ) -> VfsResult<OverlayCopyUpUsage> {
503        let mut usage = OverlayCopyUpUsage::default();
504        let mut visited = BTreeSet::new();
505        let mut pending = vec![Self::normalized(path)];
506        while let Some(current_path) = pending.pop() {
507            let stat = filesystem.lstat(&current_path)?;
508            if visited.insert(stat.ino) {
509                let subtree_links = filesystem.link_count_in_subtree(stat.ino, path) as u64;
510                if stat.is_directory || stat.nlink <= subtree_links {
511                    let bytes = if stat.is_directory && !stat.is_symbolic_link {
512                        0
513                    } else {
514                        stat.size
515                    };
516                    Self::add_copy_up_usage(&mut usage, bytes, 1, None, None)?;
517                }
518            }
519
520            if stat.is_directory && !stat.is_symbolic_link {
521                for entry in filesystem.read_dir(&current_path)?.into_iter().rev() {
522                    if matches!(entry.as_str(), "." | "..") {
523                        continue;
524                    }
525                    if Self::should_hide_directory_entry(&current_path, &entry) {
526                        continue;
527                    }
528                    pending.push(Self::join_path(&current_path, &entry));
529                }
530            }
531        }
532
533        Ok(usage)
534    }
535
536    fn upper_usage_limited(
537        &mut self,
538        max_bytes: Option<u64>,
539        max_inodes: Option<usize>,
540    ) -> VfsResult<OverlayCopyUpUsage> {
541        let Some(upper) = self.upper.as_mut() else {
542            return Ok(OverlayCopyUpUsage::default());
543        };
544
545        Self::memory_subtree_usage_limited(upper, "/", max_bytes, max_inodes)
546    }
547
548    fn upper_subtree_released_usage(&mut self, path: &str) -> VfsResult<OverlayCopyUpUsage> {
549        let Some(upper) = self.upper.as_mut() else {
550            return Ok(OverlayCopyUpUsage::default());
551        };
552
553        if !upper.exists(path) {
554            return Ok(OverlayCopyUpUsage::default());
555        }
556
557        Self::memory_subtree_released_usage(upper, path)
558    }
559
560    fn collect_copy_up_usage_limited(
561        &mut self,
562        path: &str,
563        usage: &mut OverlayCopyUpUsage,
564        max_bytes: Option<u64>,
565        max_inodes: Option<usize>,
566    ) -> VfsResult<()> {
567        let mut pending = vec![(Self::normalized(path), 0usize)];
568        while let Some((current_path, depth)) = pending.pop() {
569            if depth > MAX_SNAPSHOT_DEPTH {
570                return Err(VfsError::new(
571                    "EINVAL",
572                    format!("overlay snapshot depth limit exceeded at '{current_path}'"),
573                ));
574            }
575
576            let stat = self.merged_lstat(&current_path)?;
577            if !self.has_entry_in_upper(&current_path) {
578                let bytes = if stat.is_symbolic_link {
579                    self.read_link_inner(&current_path)?.len() as u64
580                } else if stat.is_directory {
581                    0
582                } else {
583                    stat.size
584                };
585                Self::add_copy_up_usage(usage, bytes, 1, max_bytes, max_inodes)?;
586            }
587
588            if stat.is_directory && !stat.is_symbolic_link {
589                let children = self.copy_up_directory_entries_limited(&current_path, max_inodes)?;
590                for entry in children.into_iter().rev() {
591                    pending.push((Self::join_path(&current_path, &entry), depth + 1));
592                }
593            }
594        }
595
596        Ok(())
597    }
598
599    fn collect_single_copy_up_usage_limited(
600        &mut self,
601        path: &str,
602        usage: &mut OverlayCopyUpUsage,
603        max_bytes: Option<u64>,
604        max_inodes: Option<usize>,
605    ) -> VfsResult<()> {
606        if self.has_entry_in_upper(path) {
607            return Ok(());
608        }
609
610        let stat = self.merged_lstat(path)?;
611        let bytes = if stat.is_symbolic_link {
612            self.read_link_inner(path)?.len() as u64
613        } else if stat.is_directory {
614            0
615        } else {
616            stat.size
617        };
618        Self::add_copy_up_usage(usage, bytes, 1, max_bytes, max_inodes)
619    }
620
    /// Pre-flight check for a rename from `old_path` to `new_path`: validates
    /// the request and verifies that the upper layer, after the copy-ups the
    /// rename would need, stays within `max_bytes` / `max_inodes`.
    ///
    /// This method only measures; it does not itself copy anything up.
    ///
    /// # Errors
    ///
    /// * Permission denied when either path is lexically inside the overlay
    ///   metadata namespace, or when `old_path` is the root.
    /// * `EROFS` when writes are locked.
    /// * `EINVAL` when a directory would be moved into its own descendant.
    /// * The `not_empty` error when the destination is a non-empty directory.
    /// * `ENOSPC` when a limit would be exceeded.
    /// * Any lookup/resolution error for the source or the destination parent.
    pub fn check_rename_copy_up_limits(
        &mut self,
        old_path: &str,
        new_path: &str,
        max_bytes: Option<u64>,
        max_inodes: Option<usize>,
    ) -> VfsResult<()> {
        let old_normalized = Self::normalized(old_path);
        let new_normalized = Self::normalized(new_path);
        // NOTE(review): this guard is lexical only; symlink-resolved access to
        // the metadata namespace is not rejected here.
        if Self::is_internal_metadata_path(&old_normalized)
            || Self::is_internal_metadata_path(&new_normalized)
        {
            return Err(VfsError::permission_denied("rename", old_path));
        }

        if old_normalized == "/" {
            return Err(VfsError::permission_denied("rename", old_path));
        }

        // Renaming a path onto itself needs no copy-up.
        if old_normalized == new_normalized {
            return Ok(());
        }

        // The source must exist in the merged view (the final symlink is not followed).
        let source_stat = self.merged_lstat(old_path)?;
        if self.writes_locked {
            // Called only for its error: surfaces the read-only failure.
            self.writable_upper(&old_normalized)?;
        }
        self.validate_destination_parent(&new_normalized)?;
        let resolved_new_normalized = self.resolved_destination_path(&new_normalized)?;

        // Same check again, now with symlinks in the destination parent resolved.
        if old_normalized == resolved_new_normalized {
            return Ok(());
        }

        if source_stat.is_directory
            && resolved_new_normalized.starts_with(&(old_normalized.clone() + "/"))
        {
            return Err(VfsError::new(
                "EINVAL",
                format!(
                    "cannot move '{}' into its own descendant '{}'",
                    old_path, new_path
                ),
            ));
        }

        let destination_parent_copy_up_paths =
            self.destination_parent_copy_up_paths(&new_normalized)?;

        // A missing destination is fine; an existing real directory must be empty.
        if let Ok(destination_stat) = self.merged_lstat(&resolved_new_normalized) {
            if destination_stat.is_directory
                && !destination_stat.is_symbolic_link
                && self.directory_has_visible_entries_limited(&resolved_new_normalized)?
            {
                return Err(Self::not_empty(&resolved_new_normalized));
            }
        }

        // Baseline: current upper usage, measured without limits so that the
        // space freed by replacing the destination can be subtracted first.
        let mut usage = self.upper_usage_limited(None, None)?;
        if self.has_entry_in_upper(&resolved_new_normalized) {
            let destination_usage = self.upper_subtree_released_usage(&resolved_new_normalized)?;
            usage.total_bytes = usage
                .total_bytes
                .saturating_sub(destination_usage.total_bytes);
            usage.inode_count = usage
                .inode_count
                .saturating_sub(destination_usage.inode_count);
        }
        Self::check_copy_up_usage_limits(&usage, max_bytes, max_inodes)?;
        // Add the destination ancestors that are missing from the upper layer...
        for path in destination_parent_copy_up_paths {
            self.collect_single_copy_up_usage_limited(&path, &mut usage, max_bytes, max_inodes)?;
        }
        // ...and the whole source subtree.
        self.collect_copy_up_usage_limited(&old_normalized, &mut usage, max_bytes, max_inodes)?;

        Self::check_copy_up_usage_limits(&usage, max_bytes, max_inodes)
    }
697
698    fn marker_exists(&self, kind: OverlayMarkerKind, path: &str) -> bool {
699        Self::marker_exists_in_upper(self.upper.as_ref(), kind, path)
700    }
701
702    fn marker_exists_in_upper(
703        upper: Option<&MemoryFileSystem>,
704        kind: OverlayMarkerKind,
705        path: &str,
706    ) -> bool {
707        upper.is_some_and(|filesystem| filesystem.exists(&Self::marker_path(kind, path)))
708    }
709
710    fn is_whited_out(&self, path: &str) -> bool {
711        self.marker_exists(OverlayMarkerKind::Whiteout, path)
712    }
713
714    fn ensure_metadata_directories_in_upper(&mut self, path: &str) -> VfsResult<()> {
715        let upper = self.writable_upper(path)?;
716        upper.mkdir(OVERLAY_METADATA_ROOT, true)?;
717        upper.mkdir(OVERLAY_WHITEOUT_DIR, true)?;
718        upper.mkdir(OVERLAY_OPAQUE_DIR, true)?;
719        Ok(())
720    }
721
722    fn set_marker(&mut self, kind: OverlayMarkerKind, path: &str, present: bool) -> VfsResult<()> {
723        let marker_path = Self::marker_path(kind, path);
724        if present {
725            self.ensure_metadata_directories_in_upper(path)?;
726            self.writable_upper(path)?
727                .write_file(&marker_path, Self::normalized(path).into_bytes())?;
728            return Ok(());
729        }
730
731        if self
732            .upper
733            .as_ref()
734            .is_some_and(|upper| upper.exists(&marker_path))
735        {
736            self.writable_upper(path)?.remove_file(&marker_path)?;
737        }
738        Ok(())
739    }
740
741    fn add_whiteout(&mut self, path: &str) -> VfsResult<()> {
742        self.set_marker(OverlayMarkerKind::Whiteout, path, true)
743    }
744
745    fn remove_whiteout(&mut self, path: &str) -> VfsResult<()> {
746        self.set_marker(OverlayMarkerKind::Whiteout, path, false)
747    }
748
749    fn mark_opaque_directory(&mut self, path: &str) -> VfsResult<()> {
750        self.set_marker(OverlayMarkerKind::Opaque, path, true)
751    }
752
753    fn clear_opaque_directory(&mut self, path: &str) -> VfsResult<()> {
754        self.set_marker(OverlayMarkerKind::Opaque, path, false)
755    }
756
757    fn clear_path_metadata(&mut self, path: &str) -> VfsResult<()> {
758        self.remove_whiteout(path)?;
759        self.clear_opaque_directory(path)
760    }
761
762    fn join_path(base: &str, name: &str) -> String {
763        if base == "/" {
764            format!("/{name}")
765        } else {
766            format!("{base}/{name}")
767        }
768    }
769
770    fn rebase_path(path: &str, old_root: &str, new_root: &str) -> String {
771        if path == old_root {
772            return String::from(new_root);
773        }
774
775        format!("{new_root}{}", &path[old_root.len()..])
776    }
777
778    fn read_only_error(path: &str) -> VfsError {
779        VfsError::new("EROFS", format!("read-only filesystem: {path}"))
780    }
781
782    fn entry_not_found(path: &str) -> VfsError {
783        VfsError::new("ENOENT", format!("no such file: {path}"))
784    }
785
786    fn directory_not_found(path: &str) -> VfsError {
787        VfsError::new("ENOENT", format!("no such directory: {path}"))
788    }
789
790    fn already_exists(path: &str) -> VfsError {
791        VfsError::new("EEXIST", format!("file exists: {path}"))
792    }
793
794    fn not_directory(path: &str) -> VfsError {
795        VfsError::new("ENOTDIR", format!("not a directory: {path}"))
796    }
797
798    fn writable_upper(&mut self, path: &str) -> VfsResult<&mut MemoryFileSystem> {
799        if self.writes_locked {
800            return Err(Self::read_only_error(path));
801        }
802        self.upper
803            .as_mut()
804            .ok_or_else(|| Self::read_only_error(path))
805    }
806
807    fn path_exists_in_filesystem(filesystem: &MemoryFileSystem, path: &str) -> bool {
808        filesystem.exists(path)
809    }
810
811    fn has_entry_in_filesystem(filesystem: &MemoryFileSystem, path: &str) -> bool {
812        filesystem.lstat(path).is_ok()
813    }
814
815    fn exists_in_upper(&self, path: &str) -> bool {
816        self.upper
817            .as_ref()
818            .is_some_and(|upper| Self::path_exists_in_filesystem(upper, path))
819    }
820
821    fn has_entry_in_upper(&self, path: &str) -> bool {
822        self.upper
823            .as_ref()
824            .is_some_and(|upper| Self::has_entry_in_filesystem(upper, path))
825    }
826
827    fn find_lower_by_exists(&self, path: &str) -> Option<usize> {
828        self.lowers
829            .iter()
830            .position(|lower| Self::path_exists_in_filesystem(lower, path))
831    }
832
833    fn find_lower_by_entry(&self, path: &str) -> Option<(usize, VirtualStat)> {
834        self.lowers
835            .iter()
836            .enumerate()
837            .find_map(|(index, lower)| lower.lstat(path).ok().map(|stat| (index, stat)))
838    }
839
840    fn merged_lstat(&self, path: &str) -> VfsResult<VirtualStat> {
841        if Self::is_internal_metadata_path(path) {
842            return Err(Self::entry_not_found(path));
843        }
844        if self.is_whited_out(path) {
845            return Err(Self::entry_not_found(path));
846        }
847        if self.has_entry_in_upper(path) {
848            return self
849                .upper
850                .as_ref()
851                .expect("upper must exist when entry exists")
852                .lstat(path);
853        }
854        self.find_lower_by_entry(path)
855            .map(|(_, stat)| stat)
856            .ok_or_else(|| Self::entry_not_found(path))
857    }
858
859    /// `read_link` body without the resolving metadata guard, for use by the
860    /// internal symlink-resolution helpers (`resolve_merged_path` and friends).
861    /// The public `read_link` wraps this with `touches_internal_metadata`;
862    /// resolution must not call back into that wrapper or it would recurse on a
863    /// symlink that points at itself's resolution path.
864    fn read_link_inner(&self, path: &str) -> VfsResult<String> {
865        if Self::is_internal_metadata_path(path) {
866            return Err(Self::entry_not_found(path));
867        }
868        if self.is_whited_out(path) {
869            return Err(Self::entry_not_found(path));
870        }
871        if self.has_entry_in_upper(path) {
872            return self
873                .upper
874                .as_ref()
875                .expect("upper must exist when path exists")
876                .read_link(path);
877        }
878        let Some((index, _)) = self.find_lower_by_entry(path) else {
879            return Err(Self::entry_not_found(path));
880        };
881        self.lowers[index].read_link(path)
882    }
883
884    fn ensure_ancestor_directories_in_upper(&mut self, path: &str) -> VfsResult<()> {
885        if Self::is_internal_metadata_path(path) {
886            return Err(VfsError::permission_denied("mkdir", path));
887        }
888        let normalized = Self::normalized(path);
889        let parts = normalized
890            .split('/')
891            .filter(|part| !part.is_empty())
892            .collect::<Vec<_>>();
893
894        let mut current = String::new();
895        for part in parts.iter().take(parts.len().saturating_sub(1)) {
896            current.push('/');
897            current.push_str(part);
898
899            if self.exists_in_upper(&current) {
900                continue;
901            }
902
903            if let Some(index) = self.find_lower_by_exists(&current) {
904                let stat = self.lowers[index].stat(&current)?;
905                if !stat.is_directory {
906                    return Err(Self::not_directory(&current));
907                }
908
909                let upper = self.writable_upper(&current)?;
910                upper.mkdir(&current, false)?;
911                upper.chmod(&current, stat.mode)?;
912                upper.chown(&current, stat.uid, stat.gid)?;
913                continue;
914            }
915
916            let upper = self.writable_upper(&current)?;
917            upper.mkdir(&current, false)?;
918        }
919
920        Ok(())
921    }
922
923    /// Ancestor-materialization variant used by rename staging. It mirrors
924    /// `ensure_ancestor_directories_in_upper` but records every ancestor it newly
925    /// creates into `rollback`, so a later staging failure can remove them and
926    /// avoid orphaning the freshly created directory inodes.
927    fn ensure_ancestor_directories_in_upper_recording(
928        &mut self,
929        path: &str,
930        rollback: &mut StagedRollback,
931    ) -> VfsResult<()> {
932        if Self::is_internal_metadata_path(path) {
933            return Err(VfsError::permission_denied("mkdir", path));
934        }
935        let normalized = Self::normalized(path);
936        let parts = normalized
937            .split('/')
938            .filter(|part| !part.is_empty())
939            .collect::<Vec<_>>();
940
941        let mut current = String::new();
942        for part in parts.iter().take(parts.len().saturating_sub(1)) {
943            current.push('/');
944            current.push_str(part);
945
946            if self.exists_in_upper(&current) {
947                continue;
948            }
949
950            if let Some(index) = self.find_lower_by_exists(&current) {
951                let stat = self.lowers[index].stat(&current)?;
952                if !stat.is_directory {
953                    return Err(Self::not_directory(&current));
954                }
955
956                let upper = self.writable_upper(&current)?;
957                upper.mkdir(&current, false)?;
958                upper.chmod(&current, stat.mode)?;
959                upper.chown(&current, stat.uid, stat.gid)?;
960                rollback.record_path(&current, true);
961                continue;
962            }
963
964            let upper = self.writable_upper(&current)?;
965            upper.mkdir(&current, false)?;
966            rollback.record_path(&current, true);
967        }
968
969        Ok(())
970    }
971
972    fn copy_up_path(&mut self, path: &str) -> VfsResult<()> {
973        if self.has_entry_in_upper(path) {
974            return Ok(());
975        }
976
977        self.ensure_ancestor_directories_in_upper(path)?;
978
979        let (lower_index, stat) = self
980            .find_lower_by_entry(path)
981            .ok_or_else(|| Self::entry_not_found(path))?;
982
983        if stat.is_symbolic_link {
984            let target = self.lowers[lower_index].read_link(path)?;
985            let upper = self.writable_upper(path)?;
986            upper.symlink(&target, path)?;
987            return Ok(());
988        }
989
990        if stat.is_directory {
991            let upper = self.writable_upper(path)?;
992            upper.mkdir(path, false)?;
993            upper.chmod(path, stat.mode)?;
994            upper.chown(path, stat.uid, stat.gid)?;
995            self.mark_opaque_directory(path)?;
996            return Ok(());
997        }
998
999        let data = self.lowers[lower_index].read_file(path)?;
1000        let upper = self.writable_upper(path)?;
1001        upper.write_file(path, data)?;
1002        upper.chmod(path, stat.mode)?;
1003        upper.chown(path, stat.uid, stat.gid)?;
1004        Ok(())
1005    }
1006
1007    fn materialize_destination_parent_in_upper(&mut self, path: &str) -> VfsResult<()> {
1008        if self.has_entry_in_upper(path) {
1009            return Ok(());
1010        }
1011
1012        if self
1013            .merged_lstat(path)
1014            .is_ok_and(|stat| stat.is_symbolic_link)
1015        {
1016            return self.copy_up_path(path);
1017        }
1018
1019        self.ensure_ancestor_directories_in_upper(path)?;
1020        let stat = self.merged_lstat(path)?;
1021        if !stat.is_directory || stat.is_symbolic_link {
1022            return Err(Self::not_directory(path));
1023        }
1024
1025        let upper = self.writable_upper(path)?;
1026        upper.create_dir(path)?;
1027        upper.chmod(path, stat.mode)?;
1028        upper.chown(path, stat.uid, stat.gid)?;
1029        Ok(())
1030    }
1031
1032    fn path_exists_in_merged_view(&self, path: &str) -> bool {
1033        if self.is_whited_out(path) {
1034            return false;
1035        }
1036        if self.has_entry_in_upper(path) {
1037            return true;
1038        }
1039        self.find_lower_by_entry(path).is_some()
1040    }
1041
1042    fn not_empty(path: &str) -> VfsError {
1043        VfsError::new("ENOTEMPTY", format!("directory not empty, rmdir '{path}'"))
1044    }
1045
1046    fn collect_snapshot_entries(
1047        &mut self,
1048        path: &str,
1049        entries: &mut Vec<OverlaySnapshotEntry>,
1050    ) -> VfsResult<()> {
1051        let mut pending = vec![(Self::normalized(path), 0usize)];
1052        while let Some((current_path, depth)) = pending.pop() {
1053            if depth > MAX_SNAPSHOT_DEPTH {
1054                return Err(VfsError::new(
1055                    "EINVAL",
1056                    format!("overlay snapshot depth limit exceeded at '{current_path}'"),
1057                ));
1058            }
1059
1060            let stat = self.merged_lstat(&current_path)?;
1061
1062            if stat.is_symbolic_link {
1063                entries.push(OverlaySnapshotEntry {
1064                    path: current_path.clone(),
1065                    stat,
1066                    kind: OverlaySnapshotKind::Symlink(self.read_link_inner(&current_path)?),
1067                });
1068                continue;
1069            }
1070
1071            if stat.is_directory {
1072                entries.push(OverlaySnapshotEntry {
1073                    path: current_path.clone(),
1074                    stat,
1075                    kind: OverlaySnapshotKind::Directory,
1076                });
1077
1078                let children = self.read_dir_with_types_inner(&current_path)?;
1079                for entry in children.into_iter().rev() {
1080                    pending.push((Self::join_path(&current_path, &entry.name), depth + 1));
1081                }
1082                continue;
1083            }
1084
1085            entries.push(OverlaySnapshotEntry {
1086                path: current_path.clone(),
1087                stat,
1088                kind: OverlaySnapshotKind::File(self.read_file(&current_path)?),
1089            });
1090        }
1091        Ok(())
1092    }
1093
1094    fn remove_snapshot_entries(&mut self, entries: &[OverlaySnapshotEntry]) -> VfsResult<()> {
1095        for entry in entries.iter().rev() {
1096            if self.has_entry_in_upper(&entry.path) {
1097                match entry.kind {
1098                    OverlaySnapshotKind::Directory => {
1099                        self.writable_upper(&entry.path)?.remove_dir(&entry.path)?;
1100                    }
1101                    OverlaySnapshotKind::File(_) | OverlaySnapshotKind::Symlink(_) => {
1102                        self.writable_upper(&entry.path)?.remove_file(&entry.path)?;
1103                    }
1104                }
1105            }
1106
1107            if self.find_lower_by_entry(&entry.path).is_some() {
1108                self.clear_opaque_directory(&entry.path)?;
1109                self.add_whiteout(&entry.path)?;
1110            } else {
1111                self.clear_path_metadata(&entry.path)?;
1112            }
1113        }
1114
1115        Ok(())
1116    }
1117
1118    fn directory_has_raw_children(&mut self, path: &str) -> VfsResult<bool> {
1119        let normalized = Self::normalized(path);
1120        let mut directory_exists = false;
1121
1122        if let Some(upper) = self.upper.as_mut() {
1123            if let Ok(entries) = upper.read_dir(&normalized) {
1124                directory_exists = true;
1125                if entries.into_iter().any(|entry| {
1126                    !Self::should_ignore_raw_directory_entry(Some(&*upper), &normalized, &entry)
1127                }) {
1128                    return Ok(true);
1129                }
1130            }
1131        }
1132
1133        let upper = self.upper.as_ref();
1134        for lower in self.lowers.iter_mut().rev() {
1135            if let Ok(entries) = lower.read_dir(&normalized) {
1136                directory_exists = true;
1137                if entries.into_iter().any(|entry| {
1138                    !Self::should_ignore_raw_directory_entry(upper, &normalized, &entry)
1139                }) {
1140                    return Ok(true);
1141                }
1142            }
1143        }
1144
1145        if !directory_exists {
1146            return Err(Self::directory_not_found(path));
1147        }
1148
1149        Ok(false)
1150    }
1151
1152    fn read_dir_with_types_inner(&mut self, path: &str) -> VfsResult<Vec<VirtualDirEntry>> {
1153        if self.is_whited_out(path) {
1154            return Err(Self::directory_not_found(path));
1155        }
1156
1157        let normalized = Self::normalized(path);
1158        let mut directory_exists = false;
1159        let mut entries = Vec::<VirtualDirEntry>::new();
1160        let mut seen = BTreeSet::<String>::new();
1161        let upper = self.upper.as_ref();
1162        let include_lowers = !Self::marker_exists_in_upper(upper, OverlayMarkerKind::Opaque, path);
1163
1164        if include_lowers {
1165            for lower in self.lowers.iter_mut().rev() {
1166                if let Ok(lower_entries) = lower.read_dir_with_types(path) {
1167                    directory_exists = true;
1168                    for entry in lower_entries {
1169                        if entry.name == "."
1170                            || entry.name == ".."
1171                            || Self::should_hide_directory_entry(path, &entry.name)
1172                        {
1173                            continue;
1174                        }
1175                        let child_path = if normalized == "/" {
1176                            format!("/{}", entry.name)
1177                        } else {
1178                            format!("{normalized}/{}", entry.name)
1179                        };
1180                        if Self::marker_exists_in_upper(
1181                            upper,
1182                            OverlayMarkerKind::Whiteout,
1183                            &child_path,
1184                        ) || seen.contains(&entry.name)
1185                        {
1186                            continue;
1187                        }
1188                        seen.insert(entry.name.clone());
1189                        entries.push(entry);
1190                    }
1191                }
1192            }
1193        }
1194
1195        if let Some(upper) = self.upper.as_mut() {
1196            if let Ok(upper_entries) = upper.read_dir_with_types(path) {
1197                directory_exists = true;
1198                for entry in upper_entries {
1199                    if entry.name == "."
1200                        || entry.name == ".."
1201                        || Self::should_hide_directory_entry(path, &entry.name)
1202                    {
1203                        continue;
1204                    }
1205                    if let Some(index) = entries
1206                        .iter()
1207                        .position(|existing| existing.name == entry.name)
1208                    {
1209                        entries[index] = entry;
1210                    } else {
1211                        seen.insert(entry.name.clone());
1212                        entries.push(entry);
1213                    }
1214                }
1215            }
1216        }
1217
1218        if !directory_exists {
1219            return Err(Self::directory_not_found(path));
1220        }
1221
1222        Ok(entries)
1223    }
1224
1225    fn marker_paths_in_upper(&mut self, kind: OverlayMarkerKind) -> VfsResult<Vec<String>> {
1226        let Some(upper) = self.upper.as_mut() else {
1227            return Ok(Vec::new());
1228        };
1229
1230        let marker_dir = Self::marker_directory(kind);
1231        let entries = match upper.read_dir(marker_dir) {
1232            Ok(entries) => entries,
1233            Err(error) if error.code() == "ENOENT" => return Ok(Vec::new()),
1234            Err(error) => return Err(error),
1235        };
1236
1237        let mut marker_paths = Vec::new();
1238        for entry in entries {
1239            if entry == "." || entry == ".." {
1240                continue;
1241            }
1242
1243            let marker_file = Self::join_path(marker_dir, &entry);
1244            let marker_path =
1245                String::from_utf8(upper.read_file(&marker_file).map_err(|_| {
1246                    VfsError::io(format!("invalid overlay marker '{marker_file}'"))
1247                })?)
1248                .map_err(|_| VfsError::io(format!("invalid overlay marker '{marker_file}'")))?;
1249            marker_paths.push(Self::normalized(&marker_path));
1250        }
1251
1252        Ok(marker_paths)
1253    }
1254
1255    fn path_in_subtree(path: &str, root: &str) -> bool {
1256        path == root || path.starts_with(&(String::from(root) + "/"))
1257    }
1258
1259    fn clear_subtree_metadata(&mut self, path: &str) -> VfsResult<()> {
1260        let normalized = Self::normalized(path);
1261        for kind in [OverlayMarkerKind::Whiteout, OverlayMarkerKind::Opaque] {
1262            for marker_path in self.marker_paths_in_upper(kind)? {
1263                if Self::path_in_subtree(&marker_path, &normalized) {
1264                    self.set_marker(kind, &marker_path, false)?;
1265                }
1266            }
1267        }
1268        Ok(())
1269    }
1270
1271    fn copy_subtree_metadata(
1272        &mut self,
1273        old_root: &str,
1274        new_root: &str,
1275        rollback: &mut StagedRollback,
1276    ) -> VfsResult<()> {
1277        let old_normalized = Self::normalized(old_root);
1278        let new_normalized = Self::normalized(new_root);
1279
1280        for kind in [OverlayMarkerKind::Whiteout, OverlayMarkerKind::Opaque] {
1281            for marker_path in self.marker_paths_in_upper(kind)? {
1282                if Self::path_in_subtree(&marker_path, &old_normalized) {
1283                    let destination =
1284                        Self::rebase_path(&marker_path, &old_normalized, &new_normalized);
1285                    if !self.marker_exists(kind, &destination) {
1286                        rollback.record_marker(kind, &destination);
1287                    }
1288                    self.set_marker(kind, &destination, true)?;
1289                }
1290            }
1291        }
1292
1293        Ok(())
1294    }
1295
1296    fn stage_snapshot_entries_in_upper(
1297        &mut self,
1298        entries: &[OverlaySnapshotEntry],
1299        rollback: &mut StagedRollback,
1300    ) -> VfsResult<()> {
1301        for entry in entries {
1302            match &entry.kind {
1303                OverlaySnapshotKind::Directory => {
1304                    if !self.has_entry_in_upper(&entry.path) {
1305                        self.ensure_ancestor_directories_in_upper_recording(&entry.path, rollback)?;
1306                        self.writable_upper(&entry.path)?.create_dir(&entry.path)?;
1307                        rollback.record_path(&entry.path, true);
1308                    }
1309                    self.writable_upper(&entry.path)?
1310                        .chmod(&entry.path, entry.stat.mode)?;
1311                    self.writable_upper(&entry.path)?.chown(
1312                        &entry.path,
1313                        entry.stat.uid,
1314                        entry.stat.gid,
1315                    )?;
1316                    if !self.marker_exists(OverlayMarkerKind::Opaque, &entry.path) {
1317                        rollback.record_marker(OverlayMarkerKind::Opaque, &entry.path);
1318                    }
1319                    self.mark_opaque_directory(&entry.path)?;
1320                }
1321                OverlaySnapshotKind::File(data) => {
1322                    if self.has_entry_in_upper(&entry.path) {
1323                        continue;
1324                    }
1325                    self.ensure_ancestor_directories_in_upper_recording(&entry.path, rollback)?;
1326                    self.writable_upper(&entry.path)?
1327                        .write_file(&entry.path, data.clone())?;
1328                    rollback.record_path(&entry.path, false);
1329                    self.writable_upper(&entry.path)?
1330                        .chmod(&entry.path, entry.stat.mode)?;
1331                    self.writable_upper(&entry.path)?.chown(
1332                        &entry.path,
1333                        entry.stat.uid,
1334                        entry.stat.gid,
1335                    )?;
1336                }
1337                OverlaySnapshotKind::Symlink(target) => {
1338                    if self.has_entry_in_upper(&entry.path) {
1339                        continue;
1340                    }
1341                    self.ensure_ancestor_directories_in_upper_recording(&entry.path, rollback)?;
1342                    self.writable_upper(&entry.path)?
1343                        .symlink(target, &entry.path)?;
1344                    rollback.record_path(&entry.path, false);
1345                }
1346            }
1347        }
1348
1349        Ok(())
1350    }
1351
1352    /// Best-effort undo of `stage_snapshot_entries_in_upper` /
1353    /// `copy_subtree_metadata` for the error path. Removes only the upper entries
1354    /// and markers that staging itself created — in reverse creation order so
1355    /// children are removed before their parents — and never adds a whiteout, so
1356    /// a still-present lower source remains visible after a failed rename.
1357    fn rollback_staged_entries(&mut self, rollback: &StagedRollback) {
1358        for (kind, marker_path) in &rollback.created_markers {
1359            let _ = self.set_marker(*kind, marker_path, false);
1360        }
1361
1362        for (path, is_dir) in rollback.created_paths.iter().rev() {
1363            let Some(upper) = self.upper.as_mut() else {
1364                return;
1365            };
1366            if *is_dir {
1367                let _ = upper.remove_dir(path);
1368            } else {
1369                let _ = upper.remove_file(path);
1370            }
1371        }
1372    }
1373}
1374
1375fn sync_upper_root_metadata(upper: &mut MemoryFileSystem, lowers: &[MemoryFileSystem]) {
1376    let Some(root_stat) = lowers.iter().find_map(|lower| lower.lstat("/").ok()) else {
1377        return;
1378    };
1379
1380    upper
1381        .chmod("/", root_stat.mode)
1382        .expect("overlay upper root should exist");
1383    upper
1384        .chown("/", root_stat.uid, root_stat.gid)
1385        .expect("overlay upper root should exist");
1386}
1387
1388impl VirtualFileSystem for OverlayFileSystem {
1389    fn read_file(&mut self, path: &str) -> VfsResult<Vec<u8>> {
1390        if self.touches_internal_metadata(path) {
1391            return Err(Self::entry_not_found(path));
1392        }
1393        if self.is_whited_out(path) {
1394            return Err(Self::entry_not_found(path));
1395        }
1396        if self.exists_in_upper(path) {
1397            return self
1398                .upper
1399                .as_mut()
1400                .expect("upper must exist when path exists")
1401                .read_file(path);
1402        }
1403        let Some(index) = self.find_lower_by_exists(path) else {
1404            return Err(Self::entry_not_found(path));
1405        };
1406        self.lowers[index].read_file(path)
1407    }
1408
1409    fn read_dir(&mut self, path: &str) -> VfsResult<Vec<String>> {
1410        if self.touches_internal_metadata(path) {
1411            return Err(Self::directory_not_found(path));
1412        }
1413        if self.is_whited_out(path) {
1414            return Err(Self::directory_not_found(path));
1415        }
1416
1417        let normalized = Self::normalized(path);
1418        let mut directory_exists = false;
1419        let mut entries = BTreeSet::new();
1420        let upper = self.upper.as_ref();
1421        let include_lowers = !Self::marker_exists_in_upper(upper, OverlayMarkerKind::Opaque, path);
1422
1423        if include_lowers {
1424            for lower in self.lowers.iter_mut().rev() {
1425                if let Ok(lower_entries) = lower.read_dir(path) {
1426                    directory_exists = true;
1427                    for entry in lower_entries {
1428                        if entry == "."
1429                            || entry == ".."
1430                            || Self::should_hide_directory_entry(path, &entry)
1431                        {
1432                            continue;
1433                        }
1434                        let child_path = if normalized == "/" {
1435                            format!("/{entry}")
1436                        } else {
1437                            format!("{normalized}/{entry}")
1438                        };
1439                        if !Self::marker_exists_in_upper(
1440                            upper,
1441                            OverlayMarkerKind::Whiteout,
1442                            &child_path,
1443                        ) {
1444                            entries.insert(entry);
1445                        }
1446                    }
1447                }
1448            }
1449        }
1450
1451        if let Some(upper) = self.upper.as_mut() {
1452            if let Ok(upper_entries) = upper.read_dir(path) {
1453                directory_exists = true;
1454                for entry in upper_entries {
1455                    if entry == "."
1456                        || entry == ".."
1457                        || Self::should_hide_directory_entry(path, &entry)
1458                    {
1459                        continue;
1460                    }
1461                    entries.insert(entry);
1462                }
1463            }
1464        }
1465
1466        if !directory_exists {
1467            return Err(Self::directory_not_found(path));
1468        }
1469
1470        Ok(entries.into_iter().collect())
1471    }
1472
1473    fn read_dir_limited(&mut self, path: &str, max_entries: usize) -> VfsResult<Vec<String>> {
1474        if self.touches_internal_metadata(path) {
1475            return Err(Self::directory_not_found(path));
1476        }
1477        if self.is_whited_out(path) {
1478            return Err(Self::directory_not_found(path));
1479        }
1480
1481        let normalized = Self::normalized(path);
1482        let mut directory_exists = false;
1483        let mut entries = BTreeSet::new();
1484        let upper = self.upper.as_ref();
1485        let include_lowers = !Self::marker_exists_in_upper(upper, OverlayMarkerKind::Opaque, path);
1486
1487        if include_lowers {
1488            for lower in self.lowers.iter_mut().rev() {
1489                let lower_entries = match lower.read_dir_filtered_limited(
1490                    path,
1491                    max_entries.saturating_sub(entries.len()),
1492                    |entry| {
1493                        if entry == "."
1494                            || entry == ".."
1495                            || Self::should_hide_directory_entry(path, entry)
1496                        {
1497                            return false;
1498                        }
1499                        let child_path = if normalized == "/" {
1500                            format!("/{entry}")
1501                        } else {
1502                            format!("{normalized}/{entry}")
1503                        };
1504                        !Self::marker_exists_in_upper(
1505                            upper,
1506                            OverlayMarkerKind::Whiteout,
1507                            &child_path,
1508                        ) && !entries.contains(entry)
1509                    },
1510                ) {
1511                    Ok(entries) => entries,
1512                    Err(error) if error.code() == "ENOENT" || error.code() == "ENOTDIR" => {
1513                        continue;
1514                    }
1515                    Err(error) => return Err(error),
1516                };
1517                directory_exists = true;
1518                for entry in lower_entries {
1519                    entries.insert(entry);
1520                    if entries.len() > max_entries {
1521                        return Err(VfsError::new(
1522                            "ENOMEM",
1523                            format!(
1524                                "directory listing for '{path}' exceeds configured limit of {max_entries} entries"
1525                            ),
1526                        ));
1527                    }
1528                }
1529            }
1530        }
1531
1532        if let Some(upper) = self.upper.as_mut() {
1533            let upper_entries = match upper.read_dir_filtered_limited(
1534                path,
1535                max_entries.saturating_sub(entries.len()),
1536                |entry| {
1537                    entry != "."
1538                        && entry != ".."
1539                        && !Self::should_hide_directory_entry(path, entry)
1540                        && !entries.contains(entry)
1541                },
1542            ) {
1543                Ok(entries) => entries,
1544                Err(error) if error.code() == "ENOENT" => Vec::new(),
1545                Err(error) => return Err(error),
1546            };
1547            directory_exists = directory_exists || upper.exists(path);
1548            for entry in upper_entries {
1549                if entry == "." || entry == ".." || Self::should_hide_directory_entry(path, &entry)
1550                {
1551                    continue;
1552                }
1553                entries.insert(entry);
1554                if entries.len() > max_entries {
1555                    return Err(VfsError::new(
1556                        "ENOMEM",
1557                        format!(
1558                            "directory listing for '{path}' exceeds configured limit of {max_entries} entries"
1559                        ),
1560                    ));
1561                }
1562            }
1563        }
1564
1565        if !directory_exists {
1566            return Err(Self::directory_not_found(path));
1567        }
1568
1569        Ok(entries.into_iter().collect())
1570    }
1571
1572    fn read_dir_with_types(&mut self, path: &str) -> VfsResult<Vec<VirtualDirEntry>> {
1573        if self.touches_internal_metadata(path) {
1574            return Err(Self::directory_not_found(path));
1575        }
1576        self.read_dir_with_types_inner(path)
1577    }
1578
1579    fn write_file(&mut self, path: &str, content: impl Into<Vec<u8>>) -> VfsResult<()> {
1580        if self.touches_internal_metadata(path) {
1581            return Err(VfsError::permission_denied("open", path));
1582        }
1583        self.clear_path_metadata(path)?;
1584        if self.find_lower_by_entry(path).is_some() {
1585            self.copy_up_path(path)?;
1586        } else {
1587            self.ensure_ancestor_directories_in_upper(path)?;
1588        }
1589        self.writable_upper(path)?.write_file(path, content.into())
1590    }
1591
1592    fn create_file_exclusive(&mut self, path: &str, content: impl Into<Vec<u8>>) -> VfsResult<()> {
1593        if self.touches_internal_metadata(path) {
1594            return Err(VfsError::permission_denied("open", path));
1595        }
1596        self.clear_path_metadata(path)?;
1597        if self.path_exists_in_merged_view(path) {
1598            return Err(Self::already_exists(path));
1599        }
1600        self.ensure_ancestor_directories_in_upper(path)?;
1601        self.writable_upper(path)?
1602            .create_file_exclusive(path, content.into())
1603    }
1604
1605    fn append_file(&mut self, path: &str, content: impl Into<Vec<u8>>) -> VfsResult<u64> {
1606        if self.touches_internal_metadata(path) {
1607            return Err(VfsError::permission_denied("open", path));
1608        }
1609        self.clear_path_metadata(path)?;
1610        if self.find_lower_by_entry(path).is_some() {
1611            self.copy_up_path(path)?;
1612        } else {
1613            self.ensure_ancestor_directories_in_upper(path)?;
1614        }
1615        self.writable_upper(path)?.append_file(path, content.into())
1616    }
1617
1618    fn create_dir(&mut self, path: &str) -> VfsResult<()> {
1619        if self.touches_internal_metadata(path) {
1620            return Err(VfsError::permission_denied("mkdir", path));
1621        }
1622        self.clear_path_metadata(path)?;
1623        if self.path_exists_in_merged_view(path) {
1624            return Err(Self::already_exists(path));
1625        }
1626        self.ensure_ancestor_directories_in_upper(path)?;
1627        self.writable_upper(path)?.create_dir(path)
1628    }
1629
1630    fn mkdir(&mut self, path: &str, recursive: bool) -> VfsResult<()> {
1631        if self.touches_internal_metadata(path) {
1632            return Err(VfsError::permission_denied("mkdir", path));
1633        }
1634        self.clear_path_metadata(path)?;
1635        if self.path_exists_in_merged_view(path) {
1636            let stat = self.merged_lstat(path)?;
1637            if recursive && stat.is_directory && !stat.is_symbolic_link {
1638                return Ok(());
1639            }
1640            return Err(Self::already_exists(path));
1641        }
1642        self.ensure_ancestor_directories_in_upper(path)?;
1643        self.writable_upper(path)?.mkdir(path, recursive)
1644    }
1645
1646    fn exists(&self, path: &str) -> bool {
1647        if self.touches_internal_metadata(path) {
1648            return false;
1649        }
1650        self.path_exists_in_merged_view(path)
1651    }
1652
1653    fn stat(&mut self, path: &str) -> VfsResult<VirtualStat> {
1654        if self.touches_internal_metadata(path) {
1655            return Err(Self::entry_not_found(path));
1656        }
1657        if self.is_whited_out(path) {
1658            return Err(Self::entry_not_found(path));
1659        }
1660        if self.exists_in_upper(path) {
1661            return self
1662                .upper
1663                .as_mut()
1664                .expect("upper must exist when path exists")
1665                .stat(path);
1666        }
1667        let Some(index) = self.find_lower_by_exists(path) else {
1668            return Err(Self::entry_not_found(path));
1669        };
1670        self.lowers[index].stat(path)
1671    }
1672
1673    fn remove_file(&mut self, path: &str) -> VfsResult<()> {
1674        if self.touches_internal_metadata(path) {
1675            return Err(VfsError::permission_denied("unlink", path));
1676        }
1677        if self.is_whited_out(path) {
1678            return Err(Self::entry_not_found(path));
1679        }
1680        let lower_exists = self.find_lower_by_exists(path).is_some();
1681        let upper_exists = self.exists_in_upper(path);
1682        if !lower_exists && !upper_exists {
1683            return Err(Self::entry_not_found(path));
1684        }
1685        if upper_exists {
1686            self.writable_upper(path)?.remove_file(path)?;
1687        } else {
1688            self.writable_upper(path)?;
1689        }
1690        self.clear_opaque_directory(path)?;
1691        self.add_whiteout(path)?;
1692        Ok(())
1693    }
1694
1695    fn remove_dir(&mut self, path: &str) -> VfsResult<()> {
1696        let normalized = Self::normalized(path);
1697        if self.touches_internal_metadata(&normalized) {
1698            return Err(VfsError::permission_denied("rmdir", path));
1699        }
1700        if normalized == "/" {
1701            return Err(VfsError::permission_denied("rmdir", path));
1702        }
1703
1704        let stat = match self.merged_lstat(path) {
1705            Ok(stat) => stat,
1706            Err(error) if error.code() == "ENOENT" => return Err(Self::directory_not_found(path)),
1707            Err(error) => return Err(error),
1708        };
1709
1710        if !stat.is_directory || stat.is_symbolic_link {
1711            return Err(Self::not_directory(path));
1712        }
1713
1714        if self.directory_has_raw_children(path)? {
1715            return Err(Self::not_empty(path));
1716        }
1717
1718        let lower_exists = self.find_lower_by_entry(path).is_some();
1719        let upper_exists = self.has_entry_in_upper(path);
1720        if upper_exists {
1721            self.writable_upper(path)?.remove_dir(&normalized)?;
1722        } else {
1723            self.writable_upper(path)?;
1724        }
1725        if lower_exists {
1726            self.clear_opaque_directory(path)?;
1727            self.add_whiteout(path)?;
1728        } else {
1729            self.clear_path_metadata(path)?;
1730        }
1731        Ok(())
1732    }
1733
1734    fn rename(&mut self, old_path: &str, new_path: &str) -> VfsResult<()> {
1735        let old_normalized = Self::normalized(old_path);
1736        let new_normalized = Self::normalized(new_path);
1737        if self.touches_internal_metadata(&old_normalized)
1738            || self.touches_internal_metadata(&new_normalized)
1739        {
1740            return Err(VfsError::permission_denied("rename", old_path));
1741        }
1742
1743        if old_normalized == "/" {
1744            return Err(VfsError::permission_denied("rename", old_path));
1745        }
1746
1747        if old_normalized == new_normalized {
1748            return Ok(());
1749        }
1750
1751        let source_stat = self.merged_lstat(old_path)?;
1752        self.validate_destination_parent(&new_normalized)?;
1753        let resolved_new_normalized = self.resolved_destination_path(&new_normalized)?;
1754
1755        if old_normalized == resolved_new_normalized {
1756            return Ok(());
1757        }
1758
1759        if source_stat.is_directory
1760            && resolved_new_normalized.starts_with(&(old_normalized.clone() + "/"))
1761        {
1762            return Err(VfsError::new(
1763                "EINVAL",
1764                format!(
1765                    "cannot move '{}' into its own descendant '{}'",
1766                    old_path, new_path
1767                ),
1768            ));
1769        }
1770
1771        for path in self.destination_parent_copy_up_paths(&new_normalized)? {
1772            self.materialize_destination_parent_in_upper(&path)?;
1773        }
1774
1775        let mut snapshot_entries = Vec::new();
1776        self.collect_snapshot_entries(&old_normalized, &mut snapshot_entries)?;
1777
1778        if let Ok(destination_stat) = self.merged_lstat(&resolved_new_normalized) {
1779            if destination_stat.is_directory
1780                && !destination_stat.is_symbolic_link
1781                && self.directory_has_visible_entries_limited(&resolved_new_normalized)?
1782            {
1783                return Err(Self::not_empty(&resolved_new_normalized));
1784            }
1785
1786            if self.has_entry_in_upper(&resolved_new_normalized) {
1787                if destination_stat.is_directory && !destination_stat.is_symbolic_link {
1788                    self.writable_upper(&resolved_new_normalized)?
1789                        .remove_dir(&resolved_new_normalized)?;
1790                } else {
1791                    self.writable_upper(&resolved_new_normalized)?
1792                        .remove_file(&resolved_new_normalized)?;
1793                }
1794            }
1795            self.clear_subtree_metadata(&resolved_new_normalized)?;
1796        }
1797
1798        // Stage the source subtree into the upper, copy its overlay metadata, and
1799        // move it to the destination. Any failure between staging and a successful
1800        // rename must not orphan the staged inodes / `path_index` entries, so the
1801        // upper mutations are recorded and rolled back on the error path (the
1802        // success path still finalizes via `remove_snapshot_entries`). A bare `?`
1803        // here would leave the copied-up entries stranded until VM Drop.
1804        let mut rollback = StagedRollback::default();
1805        let staged_result = (|| -> VfsResult<()> {
1806            self.stage_snapshot_entries_in_upper(&snapshot_entries, &mut rollback)?;
1807            self.copy_subtree_metadata(&old_normalized, &resolved_new_normalized, &mut rollback)?;
1808            self.writable_upper(&old_normalized)?
1809                .rename(&old_normalized, &resolved_new_normalized)?;
1810            Ok(())
1811        })();
1812
1813        if let Err(error) = staged_result {
1814            self.rollback_staged_entries(&rollback);
1815            return Err(error);
1816        }
1817
1818        self.remove_snapshot_entries(&snapshot_entries)
1819    }
1820
1821    fn realpath(&self, path: &str) -> VfsResult<String> {
1822        if self.touches_internal_metadata(path) {
1823            return Err(Self::entry_not_found(path));
1824        }
1825        if self.is_whited_out(path) {
1826            return Err(Self::entry_not_found(path));
1827        }
1828        if self.exists_in_upper(path) {
1829            return self
1830                .upper
1831                .as_ref()
1832                .expect("upper must exist when path exists")
1833                .realpath(path);
1834        }
1835        let Some(index) = self.find_lower_by_exists(path) else {
1836            return Err(Self::entry_not_found(path));
1837        };
1838        self.lowers[index].realpath(path)
1839    }
1840
1841    fn symlink(&mut self, target: &str, link_path: &str) -> VfsResult<()> {
1842        if self.touches_internal_metadata(link_path) {
1843            return Err(VfsError::permission_denied("symlink", link_path));
1844        }
1845        self.clear_path_metadata(link_path)?;
1846        self.ensure_ancestor_directories_in_upper(link_path)?;
1847        self.writable_upper(link_path)?.symlink(target, link_path)
1848    }
1849
1850    fn read_link(&self, path: &str) -> VfsResult<String> {
1851        if self.touches_internal_metadata(path) {
1852            return Err(Self::entry_not_found(path));
1853        }
1854        if self.is_whited_out(path) {
1855            return Err(Self::entry_not_found(path));
1856        }
1857        if self.has_entry_in_upper(path) {
1858            return self
1859                .upper
1860                .as_ref()
1861                .expect("upper must exist when path exists")
1862                .read_link(path);
1863        }
1864        let Some((index, _)) = self.find_lower_by_entry(path) else {
1865            return Err(Self::entry_not_found(path));
1866        };
1867        self.lowers[index].read_link(path)
1868    }
1869
1870    fn lstat(&self, path: &str) -> VfsResult<VirtualStat> {
1871        if self.touches_internal_metadata(path) {
1872            return Err(Self::entry_not_found(path));
1873        }
1874        if self.is_whited_out(path) {
1875            return Err(Self::entry_not_found(path));
1876        }
1877        if self.has_entry_in_upper(path) {
1878            return self
1879                .upper
1880                .as_ref()
1881                .expect("upper must exist when path exists")
1882                .lstat(path);
1883        }
1884        self.find_lower_by_entry(path)
1885            .map(|(_, stat)| stat)
1886            .ok_or_else(|| Self::entry_not_found(path))
1887    }
1888
1889    fn link(&mut self, old_path: &str, new_path: &str) -> VfsResult<()> {
1890        if self.touches_internal_metadata(old_path) || self.touches_internal_metadata(new_path) {
1891            return Err(VfsError::permission_denied("link", new_path));
1892        }
1893        self.clear_path_metadata(new_path)?;
1894        self.copy_up_path(old_path)?;
1895        self.ensure_ancestor_directories_in_upper(new_path)?;
1896        self.writable_upper(new_path)?.link(old_path, new_path)
1897    }
1898
1899    fn chmod(&mut self, path: &str, mode: u32) -> VfsResult<()> {
1900        if self.touches_internal_metadata(path) {
1901            return Err(VfsError::permission_denied("chmod", path));
1902        }
1903        if self.is_whited_out(path) {
1904            return Err(Self::entry_not_found(path));
1905        }
1906        if !self.exists_in_upper(path) {
1907            self.copy_up_path(path)?;
1908        }
1909        self.writable_upper(path)?.chmod(path, mode)
1910    }
1911
1912    fn chown(&mut self, path: &str, uid: u32, gid: u32) -> VfsResult<()> {
1913        if self.touches_internal_metadata(path) {
1914            return Err(VfsError::permission_denied("chown", path));
1915        }
1916        if self.is_whited_out(path) {
1917            return Err(Self::entry_not_found(path));
1918        }
1919        if !self.exists_in_upper(path) {
1920            self.copy_up_path(path)?;
1921        }
1922        self.writable_upper(path)?.chown(path, uid, gid)
1923    }
1924
1925    fn utimes(&mut self, path: &str, atime_ms: u64, mtime_ms: u64) -> VfsResult<()> {
1926        if self.touches_internal_metadata(path) {
1927            return Err(VfsError::permission_denied("utime", path));
1928        }
1929        if self.is_whited_out(path) {
1930            return Err(Self::entry_not_found(path));
1931        }
1932        if !self.exists_in_upper(path) {
1933            self.copy_up_path(path)?;
1934        }
1935        self.writable_upper(path)?.utimes(path, atime_ms, mtime_ms)
1936    }
1937
1938    fn utimes_spec(
1939        &mut self,
1940        path: &str,
1941        atime: VirtualUtimeSpec,
1942        mtime: VirtualUtimeSpec,
1943        follow_symlinks: bool,
1944    ) -> VfsResult<()> {
1945        if self.touches_internal_metadata(path) {
1946            return Err(VfsError::permission_denied("utime", path));
1947        }
1948        if self.is_whited_out(path) {
1949            return Err(Self::entry_not_found(path));
1950        }
1951        if !self.exists_in_upper(path) {
1952            self.copy_up_path(path)?;
1953        }
1954        self.writable_upper(path)?
1955            .utimes_spec(path, atime, mtime, follow_symlinks)
1956    }
1957
1958    fn truncate(&mut self, path: &str, length: u64) -> VfsResult<()> {
1959        if self.touches_internal_metadata(path) {
1960            return Err(VfsError::permission_denied("truncate", path));
1961        }
1962        if self.is_whited_out(path) {
1963            return Err(Self::entry_not_found(path));
1964        }
1965        if !self.exists_in_upper(path) {
1966            self.copy_up_path(path)?;
1967        }
1968        self.writable_upper(path)?.truncate(path, length)
1969    }
1970
1971    fn pread(&mut self, path: &str, offset: u64, length: usize) -> VfsResult<Vec<u8>> {
1972        if self.touches_internal_metadata(path) {
1973            return Err(Self::entry_not_found(path));
1974        }
1975        if self.is_whited_out(path) {
1976            return Err(Self::entry_not_found(path));
1977        }
1978        if self.exists_in_upper(path) {
1979            return self
1980                .upper
1981                .as_mut()
1982                .expect("upper must exist when path exists")
1983                .pread(path, offset, length);
1984        }
1985        let Some(index) = self.find_lower_by_exists(path) else {
1986            return Err(Self::entry_not_found(path));
1987        };
1988        self.lowers[index].pread(path, offset, length)
1989    }
1990}
1991
#[cfg(test)]
mod tests {
    use super::{OverlayFileSystem, OverlayMode, OVERLAY_WHITEOUT_DIR};
    use crate::posix::vfs::{MemoryFileSystem, VfsResult, VirtualFileSystem};

    /// Regression: when a rename fails after the source subtree has been staged
    /// into the upper (a corrupt overlay marker is planted to force the failure),
    /// the staged copy must be rolled back out of the upper, and the rollback
    /// must not whiteout the lower-backed source.
    #[test]
    fn rename_rolls_back_staged_entries_when_metadata_copy_fails() {
        let mut base = MemoryFileSystem::new();
        base.write_file("/src.txt", b"payload".to_vec())
            .expect("seed lower-only source file");

        let mut fs = OverlayFileSystem::with_upper(vec![base], MemoryFileSystem::new());

        // Plant a marker file that is not valid UTF-8 directly in the upper so
        // the rename is forced to fail.
        let corrupt_marker = format!("{}/corrupt", OVERLAY_WHITEOUT_DIR);
        let writable = fs.upper.as_mut().expect("ephemeral overlay has an upper");
        writable
            .mkdir(OVERLAY_WHITEOUT_DIR, true)
            .expect("create whiteout marker directory");
        writable
            .write_file(&corrupt_marker, vec![0xff, 0xfe])
            .expect("plant corrupt (non-UTF-8) marker");

        // The rename itself must report the failure.
        let outcome = fs.rename("/src.txt", "/dst.txt");
        assert!(
            outcome.is_err(),
            "rename should fail when overlay metadata copy aborts"
        );

        // Nothing may be left behind in the upper at the source or destination.
        let upper_after = fs.upper.as_ref().expect("overlay upper");
        let source_left_in_upper = upper_after.exists("/src.txt");
        let destination_left_in_upper = upper_after.exists("/dst.txt");
        assert!(
            !source_left_in_upper,
            "staged source copy must be removed from the upper on the error path"
        );
        assert!(
            !destination_left_in_upper,
            "no destination entry should have been staged in the upper"
        );

        // The merged view is unchanged: the source is still there and the
        // destination was never created.
        assert!(
            fs.exists("/src.txt"),
            "lower-backed source must remain visible after a failed rename"
        );
        assert!(
            !fs.exists("/dst.txt"),
            "destination must not exist after a failed rename"
        );
    }

    /// Guest-created symlinks that point at (or above) the reserved metadata
    /// root must not expose whiteout markers or allow them to be removed.
    #[test]
    fn symlink_into_metadata_namespace_cannot_read_or_resurrect_whiteouts() {
        let mut base = MemoryFileSystem::new();
        base.mkdir("/data", true).expect("create lower directory");
        base.write_file("/data/secret.txt", b"secret".to_vec())
            .expect("seed lower file");

        let mut fs = OverlayFileSystem::with_upper(vec![base], MemoryFileSystem::new());

        // Deleting the lower-layer file hides it from the merged view.
        fs.remove_file("/data/secret.txt")
            .expect("whiteout lower file");
        assert!(!fs.exists("/data/secret.txt"));

        // Creating a symlink that targets the metadata directory is permitted...
        fs.symlink("/.secure-exec-overlay/whiteouts", "/escape")
            .expect("creating the symlink itself is allowed");

        // ...but listing through it is not.
        let listing_via_symlink = fs.read_dir("/escape");
        assert!(
            listing_via_symlink.is_err(),
            "listing the metadata namespace via a symlink must be denied"
        );

        // Nor is removing anything through it, so the deletion stays in force.
        let tamper_attempt = fs.remove_file("/escape/anything");
        assert!(
            tamper_attempt.is_err(),
            "tampering with metadata via a symlink must be denied"
        );
        assert!(
            !fs.exists("/data/secret.txt"),
            "deleted lower-layer file must stay deleted"
        );

        // A symlink to an ancestor of the metadata root (here `/`) is covered too.
        fs.symlink("/", "/rootlink")
            .expect("symlink to root is allowed");
        let listing_via_ancestor = fs.read_dir("/rootlink/.secure-exec-overlay/whiteouts");
        assert!(
            listing_via_ancestor.is_err(),
            "metadata must be unreachable via an ancestor symlink too"
        );
    }

    /// A whiteout recorded in the upper keeps hiding the lower file after the
    /// overlay is rebuilt from the same upper and an identical lower.
    #[test]
    fn whiteouts_persist_when_overlay_reopens_with_same_upper() {
        let mut base = MemoryFileSystem::new();
        base.mkdir("/data", true).expect("create lower directory");
        base.write_file("/data/base.txt", b"base".to_vec())
            .expect("seed lower file");
        let base_snapshot = base.snapshot();

        let first_lower = MemoryFileSystem::from_snapshot(base_snapshot.clone());
        let mut first = OverlayFileSystem::with_upper(vec![first_lower], MemoryFileSystem::new());
        first
            .remove_file("/data/base.txt")
            .expect("whiteout lower file");

        // Re-open: carry the upper over and pair it with a fresh lower.
        let carried_upper = first.upper.take().expect("overlay upper");
        let second_lower = MemoryFileSystem::from_snapshot(base_snapshot);
        let mut reopened = OverlayFileSystem::with_upper(vec![second_lower], carried_upper);

        assert!(!reopened.exists("/data/base.txt"));
        let listing = reopened.read_dir("/data").expect("read merged directory");
        assert_eq!(listing, Vec::<String>::new());
    }

    /// Copying a lower directory up (via `chmod`) leaves it listing as empty,
    /// and the overlay's metadata directory never shows up in the root listing.
    #[test]
    fn copied_up_directories_become_opaque_and_hide_overlay_metadata() {
        let mut base = MemoryFileSystem::new();
        base.mkdir("/data", true).expect("create lower directory");
        base.write_file("/data/base.txt", b"base".to_vec())
            .expect("seed lower file");

        let mut fs = OverlayFileSystem::new(vec![base], OverlayMode::Ephemeral);
        fs.chmod("/data", 0o700).expect("copy up lower directory");

        let data_listing = fs.read_dir("/data").expect("read opaque directory");
        assert_eq!(data_listing, Vec::<String>::new());

        let root_listing = fs.read_dir("/").expect("read root");
        let metadata_listed = root_listing
            .iter()
            .any(|name| name == ".secure-exec-overlay");
        assert!(!metadata_listed);
    }

    /// A lower directory whose only child has been whited out counts as empty
    /// and can be removed.
    #[test]
    fn remove_dir_succeeds_when_only_lower_children_are_whited_out() {
        let mut base = MemoryFileSystem::new();
        base.mkdir("/a", true).expect("create lower directory");
        base.write_file("/a/c", b"child".to_vec())
            .expect("seed lower child");

        let mut fs = OverlayFileSystem::new(vec![base], OverlayMode::Ephemeral);
        fs.remove_file("/a/c").expect("whiteout lower child");
        fs.remove_dir("/a").expect("remove merged-empty directory");

        assert!(!fs.exists("/a"));
        assert_error_code(fs.read_dir("/a"), "ENOENT");
    }

    /// A lower directory with a still-visible child is refused with ENOTEMPTY
    /// and the child survives.
    #[test]
    fn remove_dir_still_rejects_visible_children() {
        let mut base = MemoryFileSystem::new();
        base.mkdir("/a", true).expect("create lower directory");
        base.write_file("/a/c", b"child".to_vec())
            .expect("seed lower child");

        let mut fs = OverlayFileSystem::new(vec![base], OverlayMode::Ephemeral);
        let removal = fs.remove_dir("/a");
        assert_error_code(removal, "ENOTEMPTY");
        assert!(fs.exists("/a/c"));
    }

    /// Asserts that `result` is an error whose code equals `expected`.
    fn assert_error_code<T>(result: VfsResult<T>, expected: &str)
    where
        T: std::fmt::Debug,
    {
        let failure = result.expect_err("expected operation to fail");
        assert_eq!(failure.code(), expected);
    }
}