Skip to main content

secure_exec_vfs_core/posix/
overlay_fs.rs

1use super::vfs::{
2    normalize_path, MemoryFileSystem, VfsError, VfsResult, VirtualDirEntry, VirtualFileSystem,
3    VirtualStat, VirtualUtimeSpec,
4};
5use base64::Engine;
6use std::collections::BTreeSet;
7
/// Upper bound used both for symlink-resolution recursion and for the
/// copy-up traversal depth check in this file.
const MAX_SNAPSHOT_DEPTH: usize = 1024;
/// Reserved directory holding overlay bookkeeping; merged lookups report it as
/// missing and it is hidden from root listings.
const OVERLAY_METADATA_ROOT: &str = "/.secure-exec-overlay";
/// Directory under the metadata root in which whiteout markers are stored.
const OVERLAY_WHITEOUT_DIR: &str = "/.secure-exec-overlay/whiteouts";
/// Directory under the metadata root in which opaque-directory markers are stored.
const OVERLAY_OPAQUE_DIR: &str = "/.secure-exec-overlay/opaque";
12
/// Selects how [`OverlayFileSystem::new`] sets up the writable layer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OverlayMode {
    /// A fresh in-memory upper layer is created to receive writes.
    Ephemeral,
    /// No upper layer is created and writes start out locked.
    ReadOnly,
}
18
/// Merged view over a stack of lower layers plus an optional upper layer.
#[derive(Debug)]
pub struct OverlayFileSystem {
    /// Lower layers, probed in order; the first layer containing an entry wins.
    /// The constructors guarantee at least one layer is present.
    lowers: Vec<MemoryFileSystem>,
    /// Layer that receives writes and overlay markers; `None` in read-only mode.
    upper: Option<MemoryFileSystem>,
    /// When set, `writable_upper` refuses access with `EROFS`.
    writes_locked: bool,
}
25
/// Kind of bookkeeping marker stored under the overlay metadata root.
#[derive(Debug, Clone, Copy)]
enum OverlayMarkerKind {
    /// Marker kept in `OVERLAY_WHITEOUT_DIR`; a path with this marker is
    /// reported as not found by the merged lookups.
    Whiteout,
    /// Marker kept in `OVERLAY_OPAQUE_DIR`; presumably stops lower-layer
    /// directory contents from showing through — TODO confirm against the
    /// directory-listing code outside this excerpt.
    Opaque,
}
31
/// Payload variants for a snapshotted entry.
///
/// NOTE(review): the code that builds and consumes snapshots is outside this
/// excerpt; the variant descriptions below are inferred from the payload types.
#[derive(Debug)]
enum OverlaySnapshotKind {
    /// A directory; carries no payload.
    Directory,
    /// A file together with a byte buffer (presumably its contents).
    File(Vec<u8>),
    /// A symbolic link together with a string (presumably its target).
    Symlink(String),
}
38
/// One snapshotted entry: a path, its stat record, and its kind-specific payload.
///
/// NOTE(review): producers and consumers are outside this excerpt.
#[derive(Debug)]
struct OverlaySnapshotEntry {
    /// Path of the entry.
    path: String,
    /// Stat record captured for the entry.
    stat: VirtualStat,
    /// Entry kind and payload.
    kind: OverlaySnapshotKind,
}
45
/// Running totals used when checking copy-up limits; both counters start at
/// zero via `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
struct OverlayCopyUpUsage {
    /// Accumulated byte count.
    total_bytes: u64,
    /// Accumulated inode (entry) count.
    inode_count: usize,
}
51
52impl OverlayFileSystem {
53    pub fn new(lowers: Vec<MemoryFileSystem>, mode: OverlayMode) -> Self {
54        let mut effective_lowers = lowers;
55        if effective_lowers.is_empty() {
56            effective_lowers.push(MemoryFileSystem::new());
57        }
58
59        let mut upper = match mode {
60            OverlayMode::Ephemeral => Some(MemoryFileSystem::new()),
61            OverlayMode::ReadOnly => None,
62        };
63        if let Some(upper_filesystem) = upper.as_mut() {
64            sync_upper_root_metadata(upper_filesystem, &effective_lowers);
65        }
66
67        Self {
68            lowers: effective_lowers,
69            upper,
70            writes_locked: matches!(mode, OverlayMode::ReadOnly),
71        }
72    }
73
74    pub fn with_upper(lowers: Vec<MemoryFileSystem>, upper: MemoryFileSystem) -> Self {
75        let mut effective_lowers = lowers;
76        if effective_lowers.is_empty() {
77            effective_lowers.push(MemoryFileSystem::new());
78        }
79
80        Self {
81            lowers: effective_lowers,
82            upper: Some(upper),
83            writes_locked: false,
84        }
85    }
86
    /// Locks the overlay against writes; afterwards `writable_upper` fails
    /// with `EROFS`. Nothing in this excerpt unlocks it again.
    pub fn lock_writes(&mut self) {
        self.writes_locked = true;
    }
90
    /// Returns `path` normalized by the VFS-level `normalize_path` helper.
    fn normalized(path: &str) -> String {
        normalize_path(path)
    }
94
95    fn parent_path(path: &str) -> String {
96        let normalized = Self::normalized(path);
97        if normalized == "/" {
98            return String::from("/");
99        }
100
101        match normalized.rsplit_once('/') {
102            Some(("", _)) | None => String::from("/"),
103            Some((parent, _)) => String::from(parent),
104        }
105    }
106
107    fn basename(path: &str) -> String {
108        let normalized = Self::normalized(path);
109        if normalized == "/" {
110            return String::from("/");
111        }
112        normalized
113            .rsplit('/')
114            .find(|component| !component.is_empty())
115            .unwrap_or("")
116            .to_owned()
117    }
118
119    fn validate_destination_parent(&mut self, path: &str) -> VfsResult<()> {
120        let parent = Self::parent_path(path);
121        let resolved_parent = self.resolve_merged_path(&parent, true, 0)?;
122        let stat = self.merged_lstat(&resolved_parent)?;
123        if !stat.is_directory {
124            return Err(Self::not_directory(&parent));
125        }
126        Ok(())
127    }
128
129    fn resolved_destination_path(&self, path: &str) -> VfsResult<String> {
130        let parent = Self::parent_path(path);
131        let resolved_parent = self.resolve_merged_path(&parent, true, 0)?;
132        Ok(Self::join_path(&resolved_parent, &Self::basename(path)))
133    }
134
135    fn resolve_merged_path(
136        &self,
137        path: &str,
138        follow_final_symlink: bool,
139        depth: usize,
140    ) -> VfsResult<String> {
141        if depth > MAX_SNAPSHOT_DEPTH {
142            return Err(VfsError::new(
143                "ELOOP",
144                format!("too many symbolic links while resolving '{path}'"),
145            ));
146        }
147
148        let normalized = Self::normalized(path);
149        if normalized == "/" {
150            return Ok(normalized);
151        }
152
153        let components: Vec<&str> = normalized
154            .split('/')
155            .filter(|component| !component.is_empty())
156            .collect();
157        let mut current = String::from("/");
158
159        for (index, component) in components.iter().enumerate() {
160            let candidate = Self::join_path(&current, component);
161            let is_final = index + 1 == components.len();
162            let should_follow = !is_final || follow_final_symlink;
163
164            if should_follow {
165                if let Ok(stat) = self.merged_lstat(&candidate) {
166                    if stat.is_symbolic_link {
167                        let target = self.read_link_inner(&candidate)?;
168                        let target_path = if target.starts_with('/') {
169                            Self::normalized(&target)
170                        } else {
171                            Self::normalized(&Self::join_path(
172                                &Self::parent_path(&candidate),
173                                &target,
174                            ))
175                        };
176                        let remainder = components[index + 1..].join("/");
177                        let next_path = if remainder.is_empty() {
178                            target_path
179                        } else {
180                            Self::normalized(&Self::join_path(&target_path, &remainder))
181                        };
182                        return self.resolve_merged_path(
183                            &next_path,
184                            follow_final_symlink,
185                            depth + 1,
186                        );
187                    }
188
189                    if !is_final && !stat.is_directory {
190                        return Err(Self::not_directory(&candidate));
191                    }
192                }
193            } else if let Ok(stat) = self.merged_lstat(&candidate) {
194                if !is_final && !stat.is_directory {
195                    return Err(Self::not_directory(&candidate));
196                }
197            }
198
199            current = candidate;
200        }
201
202        Ok(current)
203    }
204
205    fn destination_parent_copy_up_paths(&self, path: &str) -> VfsResult<Vec<String>> {
206        let parent = Self::parent_path(path);
207        let mut paths = Vec::new();
208        let mut seen = BTreeSet::new();
209        self.collect_destination_parent_copy_up_paths(&parent, &mut paths, &mut seen, 0)?;
210        Ok(paths)
211    }
212
213    fn collect_destination_parent_copy_up_paths(
214        &self,
215        parent: &str,
216        paths: &mut Vec<String>,
217        seen: &mut BTreeSet<String>,
218        depth: usize,
219    ) -> VfsResult<()> {
220        if depth > MAX_SNAPSHOT_DEPTH {
221            return Err(VfsError::new(
222                "ELOOP",
223                format!("too many symbolic links while resolving '{parent}'"),
224            ));
225        }
226
227        let normalized = Self::normalized(parent);
228        if normalized == "/" {
229            return Ok(());
230        }
231
232        let components: Vec<&str> = normalized
233            .split('/')
234            .filter(|component| !component.is_empty())
235            .collect();
236        let mut current = String::from("/");
237        for (index, component) in components.iter().enumerate() {
238            current = Self::join_path(&current, component);
239            let stat = self.merged_lstat(&current)?;
240
241            if stat.is_symbolic_link {
242                if !self.has_entry_in_upper(&current) && seen.insert(current.clone()) {
243                    paths.push(current.clone());
244                }
245
246                let target = self.read_link_inner(&current)?;
247                let target_path = if target.starts_with('/') {
248                    Self::normalized(&target)
249                } else {
250                    Self::normalized(&Self::join_path(&Self::parent_path(&current), &target))
251                };
252                let remainder = components[index + 1..].join("/");
253                let next_parent = if remainder.is_empty() {
254                    target_path
255                } else {
256                    Self::normalized(&Self::join_path(&target_path, &remainder))
257                };
258                return self.collect_destination_parent_copy_up_paths(
259                    &next_parent,
260                    paths,
261                    seen,
262                    depth + 1,
263                );
264            }
265
266            if self.find_lower_by_entry(&current).is_some()
267                && !self.has_entry_in_upper(&current)
268                && seen.insert(current.clone())
269            {
270                paths.push(current.clone());
271            }
272        }
273
274        Ok(())
275    }
276
    /// Encodes `path` as unpadded URL-safe base64 so it can serve as a single
    /// file name inside a marker directory.
    fn encode_marker_path(path: &str) -> String {
        base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(path)
    }
280
    /// Maps a marker kind to the metadata directory that stores it.
    fn marker_directory(kind: OverlayMarkerKind) -> &'static str {
        match kind {
            OverlayMarkerKind::Whiteout => OVERLAY_WHITEOUT_DIR,
            OverlayMarkerKind::Opaque => OVERLAY_OPAQUE_DIR,
        }
    }
287
288    fn marker_path(kind: OverlayMarkerKind, path: &str) -> String {
289        format!(
290            "{}/{}",
291            Self::marker_directory(kind),
292            Self::encode_marker_path(&Self::normalized(path))
293        )
294    }
295
296    fn is_internal_metadata_path(path: &str) -> bool {
297        let normalized = Self::normalized(path);
298        normalized == OVERLAY_METADATA_ROOT
299            || normalized.starts_with(&(String::from(OVERLAY_METADATA_ROOT) + "/"))
300    }
301
302    /// Returns true if `path`, or the location it resolves to through symlinks,
303    /// lands in the reserved overlay metadata namespace.
304    ///
305    /// The lexical [`is_internal_metadata_path`] check alone is bypassable: the
306    /// underlying `MemoryFileSystem` follows symlinks, so a guest-created symlink
307    /// whose resolved target enters `/.secure-exec-overlay` (directly, or via a
308    /// symlink to an ancestor such as `/`) would slip past a purely lexical guard
309    /// and let the guest read or tamper with whiteout/opaque markers (e.g.
310    /// resurrecting a deleted lower-layer file). Resolving before the check
311    /// closes that hole while leaving ordinary symlinks unaffected.
312    fn touches_internal_metadata(&self, path: &str) -> bool {
313        if Self::is_internal_metadata_path(path) {
314            return true;
315        }
316        if let Ok(resolved) = self.resolve_merged_path(path, true, 0) {
317            if Self::is_internal_metadata_path(&resolved) {
318                return true;
319            }
320        }
321        if let Ok(resolved) = self.resolved_destination_path(path) {
322            if Self::is_internal_metadata_path(&resolved) {
323                return true;
324            }
325        }
326        false
327    }
328
    /// Name of the root-level directory entry that is hidden from listings
    /// (the final component of `OVERLAY_METADATA_ROOT`).
    fn hidden_root_entry_name() -> &'static str {
        ".secure-exec-overlay"
    }
332
333    fn should_hide_directory_entry(path: &str, entry: &str) -> bool {
334        let normalized = Self::normalized(path);
335        normalized == "/" && entry == Self::hidden_root_entry_name()
336    }
337
338    fn should_ignore_raw_directory_entry(
339        upper: Option<&MemoryFileSystem>,
340        path: &str,
341        entry: &str,
342    ) -> bool {
343        if entry == "." || entry == ".." || Self::should_hide_directory_entry(path, entry) {
344            return true;
345        }
346
347        let entry_path = Self::join_path(path, entry);
348        Self::marker_exists_in_upper(upper, OverlayMarkerKind::Whiteout, &entry_path)
349    }
350
351    fn check_copy_up_usage_limits(
352        usage: &OverlayCopyUpUsage,
353        max_bytes: Option<u64>,
354        max_inodes: Option<usize>,
355    ) -> VfsResult<()> {
356        if let Some(limit) = max_bytes {
357            if usage.total_bytes > limit {
358                return Err(VfsError::new(
359                    "ENOSPC",
360                    format!(
361                        "overlay rename copy-up bytes {} exceed configured limit {}",
362                        usage.total_bytes, limit
363                    ),
364                ));
365            }
366        }
367
368        if let Some(limit) = max_inodes {
369            if usage.inode_count > limit {
370                return Err(VfsError::new(
371                    "ENOSPC",
372                    format!(
373                        "overlay rename copy-up inodes {} exceed configured limit {}",
374                        usage.inode_count, limit
375                    ),
376                ));
377            }
378        }
379
380        Ok(())
381    }
382
383    fn add_copy_up_usage(
384        usage: &mut OverlayCopyUpUsage,
385        bytes: u64,
386        inodes: usize,
387        max_bytes: Option<u64>,
388        max_inodes: Option<usize>,
389    ) -> VfsResult<()> {
390        usage.total_bytes = usage.total_bytes.saturating_add(bytes);
391        usage.inode_count = usage.inode_count.saturating_add(inodes);
392        Self::check_copy_up_usage_limits(usage, max_bytes, max_inodes)
393    }
394
395    fn remaining_inode_budget(
396        usage: &OverlayCopyUpUsage,
397        max_inodes: Option<usize>,
398    ) -> Option<usize> {
399        max_inodes.map(|limit| limit.saturating_sub(usage.inode_count))
400    }
401
402    fn copy_up_directory_entries_limited(
403        &mut self,
404        path: &str,
405        max_entries: Option<usize>,
406    ) -> VfsResult<Vec<String>> {
407        let Some(max_entries) = max_entries else {
408            return self.read_dir(path);
409        };
410
411        match self.read_dir_limited(path, max_entries) {
412            Ok(entries) => Ok(entries),
413            Err(error) if error.code() == "ENOMEM" => Err(VfsError::new(
414                "ENOSPC",
415                format!("overlay rename copy-up directory '{path}' exceeds configured inode limit"),
416            )),
417            Err(error) => Err(error),
418        }
419    }
420
421    fn directory_has_visible_entries_limited(&mut self, path: &str) -> VfsResult<bool> {
422        match self.read_dir_limited(path, 1) {
423            Ok(entries) => Ok(!entries.is_empty()),
424            Err(error) if error.code() == "ENOMEM" => Ok(true),
425            Err(error) => Err(error),
426        }
427    }
428
429    fn memory_subtree_usage_limited(
430        filesystem: &mut MemoryFileSystem,
431        path: &str,
432        max_bytes: Option<u64>,
433        max_inodes: Option<usize>,
434    ) -> VfsResult<OverlayCopyUpUsage> {
435        let mut usage = OverlayCopyUpUsage::default();
436        let mut visited = BTreeSet::new();
437        let mut pending = vec![Self::normalized(path)];
438        while let Some(current_path) = pending.pop() {
439            let stat = filesystem.lstat(&current_path)?;
440            if visited.insert(stat.ino) {
441                let bytes = if stat.is_directory && !stat.is_symbolic_link {
442                    0
443                } else {
444                    stat.size
445                };
446                Self::add_copy_up_usage(&mut usage, bytes, 1, max_bytes, max_inodes)?;
447            }
448
449            if stat.is_directory && !stat.is_symbolic_link {
450                let remaining = Self::remaining_inode_budget(&usage, max_inodes);
451                let children = if let Some(max_entries) = remaining {
452                    filesystem.read_dir_limited(&current_path, max_entries)?
453                } else {
454                    filesystem.read_dir(&current_path)?
455                };
456                for entry in children.into_iter().rev() {
457                    if matches!(entry.as_str(), "." | "..") {
458                        continue;
459                    }
460                    if Self::should_hide_directory_entry(&current_path, &entry) {
461                        continue;
462                    }
463                    pending.push(Self::join_path(&current_path, &entry));
464                }
465            }
466        }
467
468        Ok(usage)
469    }
470
471    fn memory_subtree_released_usage(
472        filesystem: &mut MemoryFileSystem,
473        path: &str,
474    ) -> VfsResult<OverlayCopyUpUsage> {
475        let mut usage = OverlayCopyUpUsage::default();
476        let mut visited = BTreeSet::new();
477        let mut pending = vec![Self::normalized(path)];
478        while let Some(current_path) = pending.pop() {
479            let stat = filesystem.lstat(&current_path)?;
480            if visited.insert(stat.ino) {
481                let subtree_links = filesystem.link_count_in_subtree(stat.ino, path) as u64;
482                if stat.is_directory || stat.nlink <= subtree_links {
483                    let bytes = if stat.is_directory && !stat.is_symbolic_link {
484                        0
485                    } else {
486                        stat.size
487                    };
488                    Self::add_copy_up_usage(&mut usage, bytes, 1, None, None)?;
489                }
490            }
491
492            if stat.is_directory && !stat.is_symbolic_link {
493                for entry in filesystem.read_dir(&current_path)?.into_iter().rev() {
494                    if matches!(entry.as_str(), "." | "..") {
495                        continue;
496                    }
497                    if Self::should_hide_directory_entry(&current_path, &entry) {
498                        continue;
499                    }
500                    pending.push(Self::join_path(&current_path, &entry));
501                }
502            }
503        }
504
505        Ok(usage)
506    }
507
508    fn upper_usage_limited(
509        &mut self,
510        max_bytes: Option<u64>,
511        max_inodes: Option<usize>,
512    ) -> VfsResult<OverlayCopyUpUsage> {
513        let Some(upper) = self.upper.as_mut() else {
514            return Ok(OverlayCopyUpUsage::default());
515        };
516
517        Self::memory_subtree_usage_limited(upper, "/", max_bytes, max_inodes)
518    }
519
520    fn upper_subtree_released_usage(&mut self, path: &str) -> VfsResult<OverlayCopyUpUsage> {
521        let Some(upper) = self.upper.as_mut() else {
522            return Ok(OverlayCopyUpUsage::default());
523        };
524
525        if !upper.exists(path) {
526            return Ok(OverlayCopyUpUsage::default());
527        }
528
529        Self::memory_subtree_released_usage(upper, path)
530    }
531
532    fn collect_copy_up_usage_limited(
533        &mut self,
534        path: &str,
535        usage: &mut OverlayCopyUpUsage,
536        max_bytes: Option<u64>,
537        max_inodes: Option<usize>,
538    ) -> VfsResult<()> {
539        let mut pending = vec![(Self::normalized(path), 0usize)];
540        while let Some((current_path, depth)) = pending.pop() {
541            if depth > MAX_SNAPSHOT_DEPTH {
542                return Err(VfsError::new(
543                    "EINVAL",
544                    format!("overlay snapshot depth limit exceeded at '{current_path}'"),
545                ));
546            }
547
548            let stat = self.merged_lstat(&current_path)?;
549            if !self.has_entry_in_upper(&current_path) {
550                let bytes = if stat.is_symbolic_link {
551                    self.read_link_inner(&current_path)?.len() as u64
552                } else if stat.is_directory {
553                    0
554                } else {
555                    stat.size
556                };
557                Self::add_copy_up_usage(usage, bytes, 1, max_bytes, max_inodes)?;
558            }
559
560            if stat.is_directory && !stat.is_symbolic_link {
561                let children = self.copy_up_directory_entries_limited(&current_path, max_inodes)?;
562                for entry in children.into_iter().rev() {
563                    pending.push((Self::join_path(&current_path, &entry), depth + 1));
564                }
565            }
566        }
567
568        Ok(())
569    }
570
571    fn collect_single_copy_up_usage_limited(
572        &mut self,
573        path: &str,
574        usage: &mut OverlayCopyUpUsage,
575        max_bytes: Option<u64>,
576        max_inodes: Option<usize>,
577    ) -> VfsResult<()> {
578        if self.has_entry_in_upper(path) {
579            return Ok(());
580        }
581
582        let stat = self.merged_lstat(path)?;
583        let bytes = if stat.is_symbolic_link {
584            self.read_link_inner(path)?.len() as u64
585        } else if stat.is_directory {
586            0
587        } else {
588            stat.size
589        };
590        Self::add_copy_up_usage(usage, bytes, 1, max_bytes, max_inodes)
591    }
592
593    pub fn check_rename_copy_up_limits(
594        &mut self,
595        old_path: &str,
596        new_path: &str,
597        max_bytes: Option<u64>,
598        max_inodes: Option<usize>,
599    ) -> VfsResult<()> {
600        let old_normalized = Self::normalized(old_path);
601        let new_normalized = Self::normalized(new_path);
602        if Self::is_internal_metadata_path(&old_normalized)
603            || Self::is_internal_metadata_path(&new_normalized)
604        {
605            return Err(VfsError::permission_denied("rename", old_path));
606        }
607
608        if old_normalized == "/" {
609            return Err(VfsError::permission_denied("rename", old_path));
610        }
611
612        if old_normalized == new_normalized {
613            return Ok(());
614        }
615
616        let source_stat = self.merged_lstat(old_path)?;
617        if self.writes_locked {
618            self.writable_upper(&old_normalized)?;
619        }
620        self.validate_destination_parent(&new_normalized)?;
621        let resolved_new_normalized = self.resolved_destination_path(&new_normalized)?;
622
623        if old_normalized == resolved_new_normalized {
624            return Ok(());
625        }
626
627        if source_stat.is_directory
628            && resolved_new_normalized.starts_with(&(old_normalized.clone() + "/"))
629        {
630            return Err(VfsError::new(
631                "EINVAL",
632                format!(
633                    "cannot move '{}' into its own descendant '{}'",
634                    old_path, new_path
635                ),
636            ));
637        }
638
639        let destination_parent_copy_up_paths =
640            self.destination_parent_copy_up_paths(&new_normalized)?;
641
642        if let Ok(destination_stat) = self.merged_lstat(&resolved_new_normalized) {
643            if destination_stat.is_directory
644                && !destination_stat.is_symbolic_link
645                && self.directory_has_visible_entries_limited(&resolved_new_normalized)?
646            {
647                return Err(Self::not_empty(&resolved_new_normalized));
648            }
649        }
650
651        let mut usage = self.upper_usage_limited(None, None)?;
652        if self.has_entry_in_upper(&resolved_new_normalized) {
653            let destination_usage = self.upper_subtree_released_usage(&resolved_new_normalized)?;
654            usage.total_bytes = usage
655                .total_bytes
656                .saturating_sub(destination_usage.total_bytes);
657            usage.inode_count = usage
658                .inode_count
659                .saturating_sub(destination_usage.inode_count);
660        }
661        Self::check_copy_up_usage_limits(&usage, max_bytes, max_inodes)?;
662        for path in destination_parent_copy_up_paths {
663            self.collect_single_copy_up_usage_limited(&path, &mut usage, max_bytes, max_inodes)?;
664        }
665        self.collect_copy_up_usage_limited(&old_normalized, &mut usage, max_bytes, max_inodes)?;
666
667        Self::check_copy_up_usage_limits(&usage, max_bytes, max_inodes)
668    }
669
    /// Reports whether a marker of `kind` for `path` exists in this overlay's
    /// upper layer (always `false` when there is no upper layer).
    fn marker_exists(&self, kind: OverlayMarkerKind, path: &str) -> bool {
        Self::marker_exists_in_upper(self.upper.as_ref(), kind, path)
    }
673
674    fn marker_exists_in_upper(
675        upper: Option<&MemoryFileSystem>,
676        kind: OverlayMarkerKind,
677        path: &str,
678    ) -> bool {
679        upper.is_some_and(|filesystem| filesystem.exists(&Self::marker_path(kind, path)))
680    }
681
    /// Reports whether `path` itself carries a whiteout marker. Ancestors of
    /// `path` are not consulted here.
    fn is_whited_out(&self, path: &str) -> bool {
        self.marker_exists(OverlayMarkerKind::Whiteout, path)
    }
685
686    fn ensure_metadata_directories_in_upper(&mut self, path: &str) -> VfsResult<()> {
687        let upper = self.writable_upper(path)?;
688        upper.mkdir(OVERLAY_METADATA_ROOT, true)?;
689        upper.mkdir(OVERLAY_WHITEOUT_DIR, true)?;
690        upper.mkdir(OVERLAY_OPAQUE_DIR, true)?;
691        Ok(())
692    }
693
694    fn set_marker(&mut self, kind: OverlayMarkerKind, path: &str, present: bool) -> VfsResult<()> {
695        let marker_path = Self::marker_path(kind, path);
696        if present {
697            self.ensure_metadata_directories_in_upper(path)?;
698            self.writable_upper(path)?
699                .write_file(&marker_path, Self::normalized(path).into_bytes())?;
700            return Ok(());
701        }
702
703        if self
704            .upper
705            .as_ref()
706            .is_some_and(|upper| upper.exists(&marker_path))
707        {
708            self.writable_upper(path)?.remove_file(&marker_path)?;
709        }
710        Ok(())
711    }
712
    /// Records a whiteout marker for `path` in the upper layer.
    fn add_whiteout(&mut self, path: &str) -> VfsResult<()> {
        self.set_marker(OverlayMarkerKind::Whiteout, path, true)
    }
716
    /// Removes the whiteout marker for `path`, if one is stored.
    fn remove_whiteout(&mut self, path: &str) -> VfsResult<()> {
        self.set_marker(OverlayMarkerKind::Whiteout, path, false)
    }
720
    /// Records an opaque marker for the directory at `path` in the upper layer.
    fn mark_opaque_directory(&mut self, path: &str) -> VfsResult<()> {
        self.set_marker(OverlayMarkerKind::Opaque, path, true)
    }
724
    /// Removes the opaque marker for `path`, if one is stored.
    fn clear_opaque_directory(&mut self, path: &str) -> VfsResult<()> {
        self.set_marker(OverlayMarkerKind::Opaque, path, false)
    }
728
    /// Removes both the whiteout and the opaque marker for `path`. The opaque
    /// marker is left untouched if removing the whiteout fails.
    fn clear_path_metadata(&mut self, path: &str) -> VfsResult<()> {
        self.remove_whiteout(path)?;
        self.clear_opaque_directory(path)
    }
733
734    fn join_path(base: &str, name: &str) -> String {
735        if base == "/" {
736            format!("/{name}")
737        } else {
738            format!("{base}/{name}")
739        }
740    }
741
742    fn rebase_path(path: &str, old_root: &str, new_root: &str) -> String {
743        if path == old_root {
744            return String::from(new_root);
745        }
746
747        format!("{new_root}{}", &path[old_root.len()..])
748    }
749
    /// Builds the `EROFS` error reported when `path` cannot be written.
    fn read_only_error(path: &str) -> VfsError {
        VfsError::new("EROFS", format!("read-only filesystem: {path}"))
    }
753
    /// Builds the `ENOENT` error for a missing entry at `path`.
    fn entry_not_found(path: &str) -> VfsError {
        VfsError::new("ENOENT", format!("no such file: {path}"))
    }
757
    /// Builds the `ENOENT` error for a missing directory at `path`.
    fn directory_not_found(path: &str) -> VfsError {
        VfsError::new("ENOENT", format!("no such directory: {path}"))
    }
761
    /// Builds the `EEXIST` error for an entry that already exists at `path`.
    fn already_exists(path: &str) -> VfsError {
        VfsError::new("EEXIST", format!("file exists: {path}"))
    }
765
    /// Builds the `ENOTDIR` error for a non-directory found at `path`.
    fn not_directory(path: &str) -> VfsError {
        VfsError::new("ENOTDIR", format!("not a directory: {path}"))
    }
769
770    fn writable_upper(&mut self, path: &str) -> VfsResult<&mut MemoryFileSystem> {
771        if self.writes_locked {
772            return Err(Self::read_only_error(path));
773        }
774        self.upper
775            .as_mut()
776            .ok_or_else(|| Self::read_only_error(path))
777    }
778
    /// Reports whether `filesystem.exists(path)` holds.
    // NOTE(review): `exists` presumably follows symlinks, unlike the `lstat`
    // probe in `has_entry_in_filesystem` — confirm against `MemoryFileSystem`.
    fn path_exists_in_filesystem(filesystem: &MemoryFileSystem, path: &str) -> bool {
        filesystem.exists(path)
    }
782
    /// Reports whether `filesystem` has an entry at `path`, judged by whether
    /// `lstat` succeeds.
    fn has_entry_in_filesystem(filesystem: &MemoryFileSystem, path: &str) -> bool {
        filesystem.lstat(path).is_ok()
    }
786
787    fn exists_in_upper(&self, path: &str) -> bool {
788        self.upper
789            .as_ref()
790            .is_some_and(|upper| Self::path_exists_in_filesystem(upper, path))
791    }
792
793    fn has_entry_in_upper(&self, path: &str) -> bool {
794        self.upper
795            .as_ref()
796            .is_some_and(|upper| Self::has_entry_in_filesystem(upper, path))
797    }
798
799    fn find_lower_by_exists(&self, path: &str) -> Option<usize> {
800        self.lowers
801            .iter()
802            .position(|lower| Self::path_exists_in_filesystem(lower, path))
803    }
804
805    fn find_lower_by_entry(&self, path: &str) -> Option<(usize, VirtualStat)> {
806        self.lowers
807            .iter()
808            .enumerate()
809            .find_map(|(index, lower)| lower.lstat(path).ok().map(|stat| (index, stat)))
810    }
811
812    fn merged_lstat(&self, path: &str) -> VfsResult<VirtualStat> {
813        if Self::is_internal_metadata_path(path) {
814            return Err(Self::entry_not_found(path));
815        }
816        if self.is_whited_out(path) {
817            return Err(Self::entry_not_found(path));
818        }
819        if self.has_entry_in_upper(path) {
820            return self
821                .upper
822                .as_ref()
823                .expect("upper must exist when entry exists")
824                .lstat(path);
825        }
826        self.find_lower_by_entry(path)
827            .map(|(_, stat)| stat)
828            .ok_or_else(|| Self::entry_not_found(path))
829    }
830
831    /// `read_link` body without the resolving metadata guard, for use by the
832    /// internal symlink-resolution helpers (`resolve_merged_path` and friends).
833    /// The public `read_link` wraps this with `touches_internal_metadata`;
834    /// resolution must not call back into that wrapper or it would recurse on a
835    /// symlink that points at itself's resolution path.
836    fn read_link_inner(&self, path: &str) -> VfsResult<String> {
837        if Self::is_internal_metadata_path(path) {
838            return Err(Self::entry_not_found(path));
839        }
840        if self.is_whited_out(path) {
841            return Err(Self::entry_not_found(path));
842        }
843        if self.has_entry_in_upper(path) {
844            return self
845                .upper
846                .as_ref()
847                .expect("upper must exist when path exists")
848                .read_link(path);
849        }
850        let Some((index, _)) = self.find_lower_by_entry(path) else {
851            return Err(Self::entry_not_found(path));
852        };
853        self.lowers[index].read_link(path)
854    }
855
856    fn ensure_ancestor_directories_in_upper(&mut self, path: &str) -> VfsResult<()> {
857        if Self::is_internal_metadata_path(path) {
858            return Err(VfsError::permission_denied("mkdir", path));
859        }
860        let normalized = Self::normalized(path);
861        let parts = normalized
862            .split('/')
863            .filter(|part| !part.is_empty())
864            .collect::<Vec<_>>();
865
866        let mut current = String::new();
867        for part in parts.iter().take(parts.len().saturating_sub(1)) {
868            current.push('/');
869            current.push_str(part);
870
871            if self.exists_in_upper(&current) {
872                continue;
873            }
874
875            if let Some(index) = self.find_lower_by_exists(&current) {
876                let stat = self.lowers[index].stat(&current)?;
877                if !stat.is_directory {
878                    return Err(Self::not_directory(&current));
879                }
880
881                let upper = self.writable_upper(&current)?;
882                upper.mkdir(&current, false)?;
883                upper.chmod(&current, stat.mode)?;
884                upper.chown(&current, stat.uid, stat.gid)?;
885                continue;
886            }
887
888            let upper = self.writable_upper(&current)?;
889            upper.mkdir(&current, false)?;
890        }
891
892        Ok(())
893    }
894
895    fn copy_up_path(&mut self, path: &str) -> VfsResult<()> {
896        if self.has_entry_in_upper(path) {
897            return Ok(());
898        }
899
900        self.ensure_ancestor_directories_in_upper(path)?;
901
902        let (lower_index, stat) = self
903            .find_lower_by_entry(path)
904            .ok_or_else(|| Self::entry_not_found(path))?;
905
906        if stat.is_symbolic_link {
907            let target = self.lowers[lower_index].read_link(path)?;
908            let upper = self.writable_upper(path)?;
909            upper.symlink(&target, path)?;
910            return Ok(());
911        }
912
913        if stat.is_directory {
914            let upper = self.writable_upper(path)?;
915            upper.mkdir(path, false)?;
916            upper.chmod(path, stat.mode)?;
917            upper.chown(path, stat.uid, stat.gid)?;
918            self.mark_opaque_directory(path)?;
919            return Ok(());
920        }
921
922        let data = self.lowers[lower_index].read_file(path)?;
923        let upper = self.writable_upper(path)?;
924        upper.write_file(path, data)?;
925        upper.chmod(path, stat.mode)?;
926        upper.chown(path, stat.uid, stat.gid)?;
927        Ok(())
928    }
929
930    fn materialize_destination_parent_in_upper(&mut self, path: &str) -> VfsResult<()> {
931        if self.has_entry_in_upper(path) {
932            return Ok(());
933        }
934
935        if self
936            .merged_lstat(path)
937            .is_ok_and(|stat| stat.is_symbolic_link)
938        {
939            return self.copy_up_path(path);
940        }
941
942        self.ensure_ancestor_directories_in_upper(path)?;
943        let stat = self.merged_lstat(path)?;
944        if !stat.is_directory || stat.is_symbolic_link {
945            return Err(Self::not_directory(path));
946        }
947
948        let upper = self.writable_upper(path)?;
949        upper.create_dir(path)?;
950        upper.chmod(path, stat.mode)?;
951        upper.chown(path, stat.uid, stat.gid)?;
952        Ok(())
953    }
954
955    fn path_exists_in_merged_view(&self, path: &str) -> bool {
956        if self.is_whited_out(path) {
957            return false;
958        }
959        if self.has_entry_in_upper(path) {
960            return true;
961        }
962        self.find_lower_by_entry(path).is_some()
963    }
964
965    fn not_empty(path: &str) -> VfsError {
966        VfsError::new("ENOTEMPTY", format!("directory not empty, rmdir '{path}'"))
967    }
968
969    fn collect_snapshot_entries(
970        &mut self,
971        path: &str,
972        entries: &mut Vec<OverlaySnapshotEntry>,
973    ) -> VfsResult<()> {
974        let mut pending = vec![(Self::normalized(path), 0usize)];
975        while let Some((current_path, depth)) = pending.pop() {
976            if depth > MAX_SNAPSHOT_DEPTH {
977                return Err(VfsError::new(
978                    "EINVAL",
979                    format!("overlay snapshot depth limit exceeded at '{current_path}'"),
980                ));
981            }
982
983            let stat = self.merged_lstat(&current_path)?;
984
985            if stat.is_symbolic_link {
986                entries.push(OverlaySnapshotEntry {
987                    path: current_path.clone(),
988                    stat,
989                    kind: OverlaySnapshotKind::Symlink(self.read_link_inner(&current_path)?),
990                });
991                continue;
992            }
993
994            if stat.is_directory {
995                entries.push(OverlaySnapshotEntry {
996                    path: current_path.clone(),
997                    stat,
998                    kind: OverlaySnapshotKind::Directory,
999                });
1000
1001                let children = self.read_dir_with_types_inner(&current_path)?;
1002                for entry in children.into_iter().rev() {
1003                    pending.push((Self::join_path(&current_path, &entry.name), depth + 1));
1004                }
1005                continue;
1006            }
1007
1008            entries.push(OverlaySnapshotEntry {
1009                path: current_path.clone(),
1010                stat,
1011                kind: OverlaySnapshotKind::File(self.read_file(&current_path)?),
1012            });
1013        }
1014        Ok(())
1015    }
1016
1017    fn remove_snapshot_entries(&mut self, entries: &[OverlaySnapshotEntry]) -> VfsResult<()> {
1018        for entry in entries.iter().rev() {
1019            if self.has_entry_in_upper(&entry.path) {
1020                match entry.kind {
1021                    OverlaySnapshotKind::Directory => {
1022                        self.writable_upper(&entry.path)?.remove_dir(&entry.path)?;
1023                    }
1024                    OverlaySnapshotKind::File(_) | OverlaySnapshotKind::Symlink(_) => {
1025                        self.writable_upper(&entry.path)?.remove_file(&entry.path)?;
1026                    }
1027                }
1028            }
1029
1030            if self.find_lower_by_entry(&entry.path).is_some() {
1031                self.clear_opaque_directory(&entry.path)?;
1032                self.add_whiteout(&entry.path)?;
1033            } else {
1034                self.clear_path_metadata(&entry.path)?;
1035            }
1036        }
1037
1038        Ok(())
1039    }
1040
1041    fn directory_has_raw_children(&mut self, path: &str) -> VfsResult<bool> {
1042        let normalized = Self::normalized(path);
1043        let mut directory_exists = false;
1044
1045        if let Some(upper) = self.upper.as_mut() {
1046            if let Ok(entries) = upper.read_dir(&normalized) {
1047                directory_exists = true;
1048                if entries.into_iter().any(|entry| {
1049                    !Self::should_ignore_raw_directory_entry(Some(&*upper), &normalized, &entry)
1050                }) {
1051                    return Ok(true);
1052                }
1053            }
1054        }
1055
1056        let upper = self.upper.as_ref();
1057        for lower in self.lowers.iter_mut().rev() {
1058            if let Ok(entries) = lower.read_dir(&normalized) {
1059                directory_exists = true;
1060                if entries.into_iter().any(|entry| {
1061                    !Self::should_ignore_raw_directory_entry(upper, &normalized, &entry)
1062                }) {
1063                    return Ok(true);
1064                }
1065            }
1066        }
1067
1068        if !directory_exists {
1069            return Err(Self::directory_not_found(path));
1070        }
1071
1072        Ok(false)
1073    }
1074
1075    fn read_dir_with_types_inner(&mut self, path: &str) -> VfsResult<Vec<VirtualDirEntry>> {
1076        if self.is_whited_out(path) {
1077            return Err(Self::directory_not_found(path));
1078        }
1079
1080        let normalized = Self::normalized(path);
1081        let mut directory_exists = false;
1082        let mut entries = Vec::<VirtualDirEntry>::new();
1083        let mut seen = BTreeSet::<String>::new();
1084        let upper = self.upper.as_ref();
1085        let include_lowers = !Self::marker_exists_in_upper(upper, OverlayMarkerKind::Opaque, path);
1086
1087        if include_lowers {
1088            for lower in self.lowers.iter_mut().rev() {
1089                if let Ok(lower_entries) = lower.read_dir_with_types(path) {
1090                    directory_exists = true;
1091                    for entry in lower_entries {
1092                        if entry.name == "."
1093                            || entry.name == ".."
1094                            || Self::should_hide_directory_entry(path, &entry.name)
1095                        {
1096                            continue;
1097                        }
1098                        let child_path = if normalized == "/" {
1099                            format!("/{}", entry.name)
1100                        } else {
1101                            format!("{normalized}/{}", entry.name)
1102                        };
1103                        if Self::marker_exists_in_upper(
1104                            upper,
1105                            OverlayMarkerKind::Whiteout,
1106                            &child_path,
1107                        ) || seen.contains(&entry.name)
1108                        {
1109                            continue;
1110                        }
1111                        seen.insert(entry.name.clone());
1112                        entries.push(entry);
1113                    }
1114                }
1115            }
1116        }
1117
1118        if let Some(upper) = self.upper.as_mut() {
1119            if let Ok(upper_entries) = upper.read_dir_with_types(path) {
1120                directory_exists = true;
1121                for entry in upper_entries {
1122                    if entry.name == "."
1123                        || entry.name == ".."
1124                        || Self::should_hide_directory_entry(path, &entry.name)
1125                    {
1126                        continue;
1127                    }
1128                    if let Some(index) = entries
1129                        .iter()
1130                        .position(|existing| existing.name == entry.name)
1131                    {
1132                        entries[index] = entry;
1133                    } else {
1134                        seen.insert(entry.name.clone());
1135                        entries.push(entry);
1136                    }
1137                }
1138            }
1139        }
1140
1141        if !directory_exists {
1142            return Err(Self::directory_not_found(path));
1143        }
1144
1145        Ok(entries)
1146    }
1147
1148    fn marker_paths_in_upper(&mut self, kind: OverlayMarkerKind) -> VfsResult<Vec<String>> {
1149        let Some(upper) = self.upper.as_mut() else {
1150            return Ok(Vec::new());
1151        };
1152
1153        let marker_dir = Self::marker_directory(kind);
1154        let entries = match upper.read_dir(marker_dir) {
1155            Ok(entries) => entries,
1156            Err(error) if error.code() == "ENOENT" => return Ok(Vec::new()),
1157            Err(error) => return Err(error),
1158        };
1159
1160        let mut marker_paths = Vec::new();
1161        for entry in entries {
1162            if entry == "." || entry == ".." {
1163                continue;
1164            }
1165
1166            let marker_file = Self::join_path(marker_dir, &entry);
1167            let marker_path =
1168                String::from_utf8(upper.read_file(&marker_file).map_err(|_| {
1169                    VfsError::io(format!("invalid overlay marker '{marker_file}'"))
1170                })?)
1171                .map_err(|_| VfsError::io(format!("invalid overlay marker '{marker_file}'")))?;
1172            marker_paths.push(Self::normalized(&marker_path));
1173        }
1174
1175        Ok(marker_paths)
1176    }
1177
1178    fn path_in_subtree(path: &str, root: &str) -> bool {
1179        path == root || path.starts_with(&(String::from(root) + "/"))
1180    }
1181
1182    fn clear_subtree_metadata(&mut self, path: &str) -> VfsResult<()> {
1183        let normalized = Self::normalized(path);
1184        for kind in [OverlayMarkerKind::Whiteout, OverlayMarkerKind::Opaque] {
1185            for marker_path in self.marker_paths_in_upper(kind)? {
1186                if Self::path_in_subtree(&marker_path, &normalized) {
1187                    self.set_marker(kind, &marker_path, false)?;
1188                }
1189            }
1190        }
1191        Ok(())
1192    }
1193
1194    fn copy_subtree_metadata(&mut self, old_root: &str, new_root: &str) -> VfsResult<()> {
1195        let old_normalized = Self::normalized(old_root);
1196        let new_normalized = Self::normalized(new_root);
1197
1198        for kind in [OverlayMarkerKind::Whiteout, OverlayMarkerKind::Opaque] {
1199            for marker_path in self.marker_paths_in_upper(kind)? {
1200                if Self::path_in_subtree(&marker_path, &old_normalized) {
1201                    let destination =
1202                        Self::rebase_path(&marker_path, &old_normalized, &new_normalized);
1203                    self.set_marker(kind, &destination, true)?;
1204                }
1205            }
1206        }
1207
1208        Ok(())
1209    }
1210
1211    fn stage_snapshot_entries_in_upper(
1212        &mut self,
1213        entries: &[OverlaySnapshotEntry],
1214    ) -> VfsResult<()> {
1215        for entry in entries {
1216            match &entry.kind {
1217                OverlaySnapshotKind::Directory => {
1218                    if !self.has_entry_in_upper(&entry.path) {
1219                        self.ensure_ancestor_directories_in_upper(&entry.path)?;
1220                        self.writable_upper(&entry.path)?.create_dir(&entry.path)?;
1221                    }
1222                    self.writable_upper(&entry.path)?
1223                        .chmod(&entry.path, entry.stat.mode)?;
1224                    self.writable_upper(&entry.path)?.chown(
1225                        &entry.path,
1226                        entry.stat.uid,
1227                        entry.stat.gid,
1228                    )?;
1229                    self.mark_opaque_directory(&entry.path)?;
1230                }
1231                OverlaySnapshotKind::File(data) => {
1232                    if self.has_entry_in_upper(&entry.path) {
1233                        continue;
1234                    }
1235                    self.ensure_ancestor_directories_in_upper(&entry.path)?;
1236                    self.writable_upper(&entry.path)?
1237                        .write_file(&entry.path, data.clone())?;
1238                    self.writable_upper(&entry.path)?
1239                        .chmod(&entry.path, entry.stat.mode)?;
1240                    self.writable_upper(&entry.path)?.chown(
1241                        &entry.path,
1242                        entry.stat.uid,
1243                        entry.stat.gid,
1244                    )?;
1245                }
1246                OverlaySnapshotKind::Symlink(target) => {
1247                    if self.has_entry_in_upper(&entry.path) {
1248                        continue;
1249                    }
1250                    self.ensure_ancestor_directories_in_upper(&entry.path)?;
1251                    self.writable_upper(&entry.path)?
1252                        .symlink(target, &entry.path)?;
1253                }
1254            }
1255        }
1256
1257        Ok(())
1258    }
1259}
1260
1261fn sync_upper_root_metadata(upper: &mut MemoryFileSystem, lowers: &[MemoryFileSystem]) {
1262    let Some(root_stat) = lowers.iter().find_map(|lower| lower.lstat("/").ok()) else {
1263        return;
1264    };
1265
1266    upper
1267        .chmod("/", root_stat.mode)
1268        .expect("overlay upper root should exist");
1269    upper
1270        .chown("/", root_stat.uid, root_stat.gid)
1271        .expect("overlay upper root should exist");
1272}
1273
1274impl VirtualFileSystem for OverlayFileSystem {
1275    fn read_file(&mut self, path: &str) -> VfsResult<Vec<u8>> {
1276        if self.touches_internal_metadata(path) {
1277            return Err(Self::entry_not_found(path));
1278        }
1279        if self.is_whited_out(path) {
1280            return Err(Self::entry_not_found(path));
1281        }
1282        if self.exists_in_upper(path) {
1283            return self
1284                .upper
1285                .as_mut()
1286                .expect("upper must exist when path exists")
1287                .read_file(path);
1288        }
1289        let Some(index) = self.find_lower_by_exists(path) else {
1290            return Err(Self::entry_not_found(path));
1291        };
1292        self.lowers[index].read_file(path)
1293    }
1294
1295    fn read_dir(&mut self, path: &str) -> VfsResult<Vec<String>> {
1296        if self.touches_internal_metadata(path) {
1297            return Err(Self::directory_not_found(path));
1298        }
1299        if self.is_whited_out(path) {
1300            return Err(Self::directory_not_found(path));
1301        }
1302
1303        let normalized = Self::normalized(path);
1304        let mut directory_exists = false;
1305        let mut entries = BTreeSet::new();
1306        let upper = self.upper.as_ref();
1307        let include_lowers = !Self::marker_exists_in_upper(upper, OverlayMarkerKind::Opaque, path);
1308
1309        if include_lowers {
1310            for lower in self.lowers.iter_mut().rev() {
1311                if let Ok(lower_entries) = lower.read_dir(path) {
1312                    directory_exists = true;
1313                    for entry in lower_entries {
1314                        if entry == "."
1315                            || entry == ".."
1316                            || Self::should_hide_directory_entry(path, &entry)
1317                        {
1318                            continue;
1319                        }
1320                        let child_path = if normalized == "/" {
1321                            format!("/{entry}")
1322                        } else {
1323                            format!("{normalized}/{entry}")
1324                        };
1325                        if !Self::marker_exists_in_upper(
1326                            upper,
1327                            OverlayMarkerKind::Whiteout,
1328                            &child_path,
1329                        ) {
1330                            entries.insert(entry);
1331                        }
1332                    }
1333                }
1334            }
1335        }
1336
1337        if let Some(upper) = self.upper.as_mut() {
1338            if let Ok(upper_entries) = upper.read_dir(path) {
1339                directory_exists = true;
1340                for entry in upper_entries {
1341                    if entry == "."
1342                        || entry == ".."
1343                        || Self::should_hide_directory_entry(path, &entry)
1344                    {
1345                        continue;
1346                    }
1347                    entries.insert(entry);
1348                }
1349            }
1350        }
1351
1352        if !directory_exists {
1353            return Err(Self::directory_not_found(path));
1354        }
1355
1356        Ok(entries.into_iter().collect())
1357    }
1358
1359    fn read_dir_limited(&mut self, path: &str, max_entries: usize) -> VfsResult<Vec<String>> {
1360        if self.touches_internal_metadata(path) {
1361            return Err(Self::directory_not_found(path));
1362        }
1363        if self.is_whited_out(path) {
1364            return Err(Self::directory_not_found(path));
1365        }
1366
1367        let normalized = Self::normalized(path);
1368        let mut directory_exists = false;
1369        let mut entries = BTreeSet::new();
1370        let upper = self.upper.as_ref();
1371        let include_lowers = !Self::marker_exists_in_upper(upper, OverlayMarkerKind::Opaque, path);
1372
1373        if include_lowers {
1374            for lower in self.lowers.iter_mut().rev() {
1375                let lower_entries = match lower.read_dir_filtered_limited(
1376                    path,
1377                    max_entries.saturating_sub(entries.len()),
1378                    |entry| {
1379                        if entry == "."
1380                            || entry == ".."
1381                            || Self::should_hide_directory_entry(path, entry)
1382                        {
1383                            return false;
1384                        }
1385                        let child_path = if normalized == "/" {
1386                            format!("/{entry}")
1387                        } else {
1388                            format!("{normalized}/{entry}")
1389                        };
1390                        !Self::marker_exists_in_upper(
1391                            upper,
1392                            OverlayMarkerKind::Whiteout,
1393                            &child_path,
1394                        ) && !entries.contains(entry)
1395                    },
1396                ) {
1397                    Ok(entries) => entries,
1398                    Err(error) if error.code() == "ENOENT" || error.code() == "ENOTDIR" => {
1399                        continue;
1400                    }
1401                    Err(error) => return Err(error),
1402                };
1403                directory_exists = true;
1404                for entry in lower_entries {
1405                    entries.insert(entry);
1406                    if entries.len() > max_entries {
1407                        return Err(VfsError::new(
1408                            "ENOMEM",
1409                            format!(
1410                                "directory listing for '{path}' exceeds configured limit of {max_entries} entries"
1411                            ),
1412                        ));
1413                    }
1414                }
1415            }
1416        }
1417
1418        if let Some(upper) = self.upper.as_mut() {
1419            let upper_entries = match upper.read_dir_filtered_limited(
1420                path,
1421                max_entries.saturating_sub(entries.len()),
1422                |entry| {
1423                    entry != "."
1424                        && entry != ".."
1425                        && !Self::should_hide_directory_entry(path, entry)
1426                        && !entries.contains(entry)
1427                },
1428            ) {
1429                Ok(entries) => entries,
1430                Err(error) if error.code() == "ENOENT" => Vec::new(),
1431                Err(error) => return Err(error),
1432            };
1433            directory_exists = directory_exists || upper.exists(path);
1434            for entry in upper_entries {
1435                if entry == "." || entry == ".." || Self::should_hide_directory_entry(path, &entry)
1436                {
1437                    continue;
1438                }
1439                entries.insert(entry);
1440                if entries.len() > max_entries {
1441                    return Err(VfsError::new(
1442                        "ENOMEM",
1443                        format!(
1444                            "directory listing for '{path}' exceeds configured limit of {max_entries} entries"
1445                        ),
1446                    ));
1447                }
1448            }
1449        }
1450
1451        if !directory_exists {
1452            return Err(Self::directory_not_found(path));
1453        }
1454
1455        Ok(entries.into_iter().collect())
1456    }
1457
1458    fn read_dir_with_types(&mut self, path: &str) -> VfsResult<Vec<VirtualDirEntry>> {
1459        if self.touches_internal_metadata(path) {
1460            return Err(Self::directory_not_found(path));
1461        }
1462        self.read_dir_with_types_inner(path)
1463    }
1464
1465    fn write_file(&mut self, path: &str, content: impl Into<Vec<u8>>) -> VfsResult<()> {
1466        if self.touches_internal_metadata(path) {
1467            return Err(VfsError::permission_denied("open", path));
1468        }
1469        self.clear_path_metadata(path)?;
1470        if self.find_lower_by_entry(path).is_some() {
1471            self.copy_up_path(path)?;
1472        } else {
1473            self.ensure_ancestor_directories_in_upper(path)?;
1474        }
1475        self.writable_upper(path)?.write_file(path, content.into())
1476    }
1477
1478    fn create_file_exclusive(&mut self, path: &str, content: impl Into<Vec<u8>>) -> VfsResult<()> {
1479        if self.touches_internal_metadata(path) {
1480            return Err(VfsError::permission_denied("open", path));
1481        }
1482        self.clear_path_metadata(path)?;
1483        if self.path_exists_in_merged_view(path) {
1484            return Err(Self::already_exists(path));
1485        }
1486        self.ensure_ancestor_directories_in_upper(path)?;
1487        self.writable_upper(path)?
1488            .create_file_exclusive(path, content.into())
1489    }
1490
1491    fn append_file(&mut self, path: &str, content: impl Into<Vec<u8>>) -> VfsResult<u64> {
1492        if self.touches_internal_metadata(path) {
1493            return Err(VfsError::permission_denied("open", path));
1494        }
1495        self.clear_path_metadata(path)?;
1496        if self.find_lower_by_entry(path).is_some() {
1497            self.copy_up_path(path)?;
1498        } else {
1499            self.ensure_ancestor_directories_in_upper(path)?;
1500        }
1501        self.writable_upper(path)?.append_file(path, content.into())
1502    }
1503
1504    fn create_dir(&mut self, path: &str) -> VfsResult<()> {
1505        if self.touches_internal_metadata(path) {
1506            return Err(VfsError::permission_denied("mkdir", path));
1507        }
1508        self.clear_path_metadata(path)?;
1509        if self.path_exists_in_merged_view(path) {
1510            return Err(Self::already_exists(path));
1511        }
1512        self.ensure_ancestor_directories_in_upper(path)?;
1513        self.writable_upper(path)?.create_dir(path)
1514    }
1515
1516    fn mkdir(&mut self, path: &str, recursive: bool) -> VfsResult<()> {
1517        if self.touches_internal_metadata(path) {
1518            return Err(VfsError::permission_denied("mkdir", path));
1519        }
1520        self.clear_path_metadata(path)?;
1521        if self.path_exists_in_merged_view(path) {
1522            let stat = self.merged_lstat(path)?;
1523            if recursive && stat.is_directory && !stat.is_symbolic_link {
1524                return Ok(());
1525            }
1526            return Err(Self::already_exists(path));
1527        }
1528        self.ensure_ancestor_directories_in_upper(path)?;
1529        self.writable_upper(path)?.mkdir(path, recursive)
1530    }
1531
1532    fn exists(&self, path: &str) -> bool {
1533        if self.touches_internal_metadata(path) {
1534            return false;
1535        }
1536        self.path_exists_in_merged_view(path)
1537    }
1538
1539    fn stat(&mut self, path: &str) -> VfsResult<VirtualStat> {
1540        if self.touches_internal_metadata(path) {
1541            return Err(Self::entry_not_found(path));
1542        }
1543        if self.is_whited_out(path) {
1544            return Err(Self::entry_not_found(path));
1545        }
1546        if self.exists_in_upper(path) {
1547            return self
1548                .upper
1549                .as_mut()
1550                .expect("upper must exist when path exists")
1551                .stat(path);
1552        }
1553        let Some(index) = self.find_lower_by_exists(path) else {
1554            return Err(Self::entry_not_found(path));
1555        };
1556        self.lowers[index].stat(path)
1557    }
1558
1559    fn remove_file(&mut self, path: &str) -> VfsResult<()> {
1560        if self.touches_internal_metadata(path) {
1561            return Err(VfsError::permission_denied("unlink", path));
1562        }
1563        if self.is_whited_out(path) {
1564            return Err(Self::entry_not_found(path));
1565        }
1566        let lower_exists = self.find_lower_by_exists(path).is_some();
1567        let upper_exists = self.exists_in_upper(path);
1568        if !lower_exists && !upper_exists {
1569            return Err(Self::entry_not_found(path));
1570        }
1571        if upper_exists {
1572            self.writable_upper(path)?.remove_file(path)?;
1573        } else {
1574            self.writable_upper(path)?;
1575        }
1576        self.clear_opaque_directory(path)?;
1577        self.add_whiteout(path)?;
1578        Ok(())
1579    }
1580
1581    fn remove_dir(&mut self, path: &str) -> VfsResult<()> {
1582        let normalized = Self::normalized(path);
1583        if self.touches_internal_metadata(&normalized) {
1584            return Err(VfsError::permission_denied("rmdir", path));
1585        }
1586        if normalized == "/" {
1587            return Err(VfsError::permission_denied("rmdir", path));
1588        }
1589
1590        let stat = match self.merged_lstat(path) {
1591            Ok(stat) => stat,
1592            Err(error) if error.code() == "ENOENT" => return Err(Self::directory_not_found(path)),
1593            Err(error) => return Err(error),
1594        };
1595
1596        if !stat.is_directory || stat.is_symbolic_link {
1597            return Err(Self::not_directory(path));
1598        }
1599
1600        if self.directory_has_raw_children(path)? {
1601            return Err(Self::not_empty(path));
1602        }
1603
1604        let lower_exists = self.find_lower_by_entry(path).is_some();
1605        let upper_exists = self.has_entry_in_upper(path);
1606        if upper_exists {
1607            self.writable_upper(path)?.remove_dir(&normalized)?;
1608        } else {
1609            self.writable_upper(path)?;
1610        }
1611        if lower_exists {
1612            self.clear_opaque_directory(path)?;
1613            self.add_whiteout(path)?;
1614        } else {
1615            self.clear_path_metadata(path)?;
1616        }
1617        Ok(())
1618    }
1619
    /// Moves the entry (and, for a directory, its whole subtree) at `old_path`
    /// to `new_path` in the merged view.
    ///
    /// The move is carried out in phases:
    /// 1. validate both paths and resolve the destination;
    /// 2. materialise the destination's parent chain in the upper layer;
    /// 3. snapshot the source subtree from the merged view;
    /// 4. clear an existing destination (it must be empty if it is a directory);
    /// 5. stage the snapshot in the upper layer at the source location, copy
    ///    the subtree's markers to the destination, and rename inside the
    ///    upper layer;
    /// 6. remove the source entries from the merged view.
    ///
    /// # Errors
    /// * permission denied when either path is an overlay bookkeeping path or
    ///   the source is `/`;
    /// * `EINVAL` when a directory would be moved into its own descendant;
    /// * `ENOTEMPTY` when the destination is a directory with visible entries;
    /// * lookup, validation and layer errors are propagated. The phases are
    ///   not rolled back when a later one fails.
    fn rename(&mut self, old_path: &str, new_path: &str) -> VfsResult<()> {
        let old_normalized = Self::normalized(old_path);
        let new_normalized = Self::normalized(new_path);
        if self.touches_internal_metadata(&old_normalized)
            || self.touches_internal_metadata(&new_normalized)
        {
            return Err(VfsError::permission_denied("rename", old_path));
        }

        // The filesystem root can never be moved.
        if old_normalized == "/" {
            return Err(VfsError::permission_denied("rename", old_path));
        }

        // Renaming a path onto itself is a no-op.
        if old_normalized == new_normalized {
            return Ok(());
        }

        // The source must be visible in the merged view; its type drives the
        // descendant check below.
        let source_stat = self.merged_lstat(old_path)?;
        self.validate_destination_parent(&new_normalized)?;
        // NOTE(review): presumably resolves symlinks in the destination's parent
        // chain — confirm against `resolved_destination_path`.
        let resolved_new_normalized = self.resolved_destination_path(&new_normalized)?;

        // Same no-op check, now against the resolved destination.
        if old_normalized == resolved_new_normalized {
            return Ok(());
        }

        // The trailing "/" keeps siblings that merely share a name prefix
        // (e.g. "/a" vs "/ab") from being treated as descendants.
        if source_stat.is_directory
            && resolved_new_normalized.starts_with(&(old_normalized.clone() + "/"))
        {
            return Err(VfsError::new(
                "EINVAL",
                format!(
                    "cannot move '{}' into its own descendant '{}'",
                    old_path, new_path
                ),
            ));
        }

        for path in self.destination_parent_copy_up_paths(&new_normalized)? {
            self.materialize_destination_parent_in_upper(&path)?;
        }

        // The snapshot is taken before the destination is touched, so it
        // reflects the source as it was when the call started.
        let mut snapshot_entries = Vec::new();
        self.collect_snapshot_entries(&old_normalized, &mut snapshot_entries)?;

        // An lstat failure here is treated as "no destination to replace".
        if let Ok(destination_stat) = self.merged_lstat(&resolved_new_normalized) {
            if destination_stat.is_directory
                && !destination_stat.is_symbolic_link
                && self.directory_has_visible_entries_limited(&resolved_new_normalized)?
            {
                return Err(Self::not_empty(&resolved_new_normalized));
            }

            // Drop the upper-layer copy of the destination, if any.
            if self.has_entry_in_upper(&resolved_new_normalized) {
                if destination_stat.is_directory && !destination_stat.is_symbolic_link {
                    self.writable_upper(&resolved_new_normalized)?
                        .remove_dir(&resolved_new_normalized)?;
                } else {
                    self.writable_upper(&resolved_new_normalized)?
                        .remove_file(&resolved_new_normalized)?;
                }
            }
            // Markers left under the old destination must not leak into the
            // subtree that is about to take its place.
            self.clear_subtree_metadata(&resolved_new_normalized)?;
        }

        // Make the whole source subtree exist in the upper layer, mirror its
        // markers at the destination, then move it within the upper layer.
        self.stage_snapshot_entries_in_upper(&snapshot_entries)?;
        self.copy_subtree_metadata(&old_normalized, &resolved_new_normalized)?;
        self.writable_upper(&old_normalized)?
            .rename(&old_normalized, &resolved_new_normalized)?;
        // Finally hide the source: remaining upper entries are removed and
        // entries still present in a lower layer are whited out.
        self.remove_snapshot_entries(&snapshot_entries)
    }
1690
1691    fn realpath(&self, path: &str) -> VfsResult<String> {
1692        if self.touches_internal_metadata(path) {
1693            return Err(Self::entry_not_found(path));
1694        }
1695        if self.is_whited_out(path) {
1696            return Err(Self::entry_not_found(path));
1697        }
1698        if self.exists_in_upper(path) {
1699            return self
1700                .upper
1701                .as_ref()
1702                .expect("upper must exist when path exists")
1703                .realpath(path);
1704        }
1705        let Some(index) = self.find_lower_by_exists(path) else {
1706            return Err(Self::entry_not_found(path));
1707        };
1708        self.lowers[index].realpath(path)
1709    }
1710
1711    fn symlink(&mut self, target: &str, link_path: &str) -> VfsResult<()> {
1712        if self.touches_internal_metadata(link_path) {
1713            return Err(VfsError::permission_denied("symlink", link_path));
1714        }
1715        self.clear_path_metadata(link_path)?;
1716        self.ensure_ancestor_directories_in_upper(link_path)?;
1717        self.writable_upper(link_path)?.symlink(target, link_path)
1718    }
1719
1720    fn read_link(&self, path: &str) -> VfsResult<String> {
1721        if self.touches_internal_metadata(path) {
1722            return Err(Self::entry_not_found(path));
1723        }
1724        if self.is_whited_out(path) {
1725            return Err(Self::entry_not_found(path));
1726        }
1727        if self.has_entry_in_upper(path) {
1728            return self
1729                .upper
1730                .as_ref()
1731                .expect("upper must exist when path exists")
1732                .read_link(path);
1733        }
1734        let Some((index, _)) = self.find_lower_by_entry(path) else {
1735            return Err(Self::entry_not_found(path));
1736        };
1737        self.lowers[index].read_link(path)
1738    }
1739
1740    fn lstat(&self, path: &str) -> VfsResult<VirtualStat> {
1741        if self.touches_internal_metadata(path) {
1742            return Err(Self::entry_not_found(path));
1743        }
1744        if self.is_whited_out(path) {
1745            return Err(Self::entry_not_found(path));
1746        }
1747        if self.has_entry_in_upper(path) {
1748            return self
1749                .upper
1750                .as_ref()
1751                .expect("upper must exist when path exists")
1752                .lstat(path);
1753        }
1754        self.find_lower_by_entry(path)
1755            .map(|(_, stat)| stat)
1756            .ok_or_else(|| Self::entry_not_found(path))
1757    }
1758
1759    fn link(&mut self, old_path: &str, new_path: &str) -> VfsResult<()> {
1760        if self.touches_internal_metadata(old_path) || self.touches_internal_metadata(new_path) {
1761            return Err(VfsError::permission_denied("link", new_path));
1762        }
1763        self.clear_path_metadata(new_path)?;
1764        self.copy_up_path(old_path)?;
1765        self.ensure_ancestor_directories_in_upper(new_path)?;
1766        self.writable_upper(new_path)?.link(old_path, new_path)
1767    }
1768
1769    fn chmod(&mut self, path: &str, mode: u32) -> VfsResult<()> {
1770        if self.touches_internal_metadata(path) {
1771            return Err(VfsError::permission_denied("chmod", path));
1772        }
1773        if self.is_whited_out(path) {
1774            return Err(Self::entry_not_found(path));
1775        }
1776        if !self.exists_in_upper(path) {
1777            self.copy_up_path(path)?;
1778        }
1779        self.writable_upper(path)?.chmod(path, mode)
1780    }
1781
1782    fn chown(&mut self, path: &str, uid: u32, gid: u32) -> VfsResult<()> {
1783        if self.touches_internal_metadata(path) {
1784            return Err(VfsError::permission_denied("chown", path));
1785        }
1786        if self.is_whited_out(path) {
1787            return Err(Self::entry_not_found(path));
1788        }
1789        if !self.exists_in_upper(path) {
1790            self.copy_up_path(path)?;
1791        }
1792        self.writable_upper(path)?.chown(path, uid, gid)
1793    }
1794
1795    fn utimes(&mut self, path: &str, atime_ms: u64, mtime_ms: u64) -> VfsResult<()> {
1796        if self.touches_internal_metadata(path) {
1797            return Err(VfsError::permission_denied("utime", path));
1798        }
1799        if self.is_whited_out(path) {
1800            return Err(Self::entry_not_found(path));
1801        }
1802        if !self.exists_in_upper(path) {
1803            self.copy_up_path(path)?;
1804        }
1805        self.writable_upper(path)?.utimes(path, atime_ms, mtime_ms)
1806    }
1807
1808    fn utimes_spec(
1809        &mut self,
1810        path: &str,
1811        atime: VirtualUtimeSpec,
1812        mtime: VirtualUtimeSpec,
1813        follow_symlinks: bool,
1814    ) -> VfsResult<()> {
1815        if self.touches_internal_metadata(path) {
1816            return Err(VfsError::permission_denied("utime", path));
1817        }
1818        if self.is_whited_out(path) {
1819            return Err(Self::entry_not_found(path));
1820        }
1821        if !self.exists_in_upper(path) {
1822            self.copy_up_path(path)?;
1823        }
1824        self.writable_upper(path)?
1825            .utimes_spec(path, atime, mtime, follow_symlinks)
1826    }
1827
1828    fn truncate(&mut self, path: &str, length: u64) -> VfsResult<()> {
1829        if self.touches_internal_metadata(path) {
1830            return Err(VfsError::permission_denied("truncate", path));
1831        }
1832        if self.is_whited_out(path) {
1833            return Err(Self::entry_not_found(path));
1834        }
1835        if !self.exists_in_upper(path) {
1836            self.copy_up_path(path)?;
1837        }
1838        self.writable_upper(path)?.truncate(path, length)
1839    }
1840
1841    fn pread(&mut self, path: &str, offset: u64, length: usize) -> VfsResult<Vec<u8>> {
1842        if self.touches_internal_metadata(path) {
1843            return Err(Self::entry_not_found(path));
1844        }
1845        if self.is_whited_out(path) {
1846            return Err(Self::entry_not_found(path));
1847        }
1848        if self.exists_in_upper(path) {
1849            return self
1850                .upper
1851                .as_mut()
1852                .expect("upper must exist when path exists")
1853                .pread(path, offset, length);
1854        }
1855        let Some(index) = self.find_lower_by_exists(path) else {
1856            return Err(Self::entry_not_found(path));
1857        };
1858        self.lowers[index].pread(path, offset, length)
1859    }
1860}
1861
#[cfg(test)]
mod tests {
    use super::{OverlayFileSystem, OverlayMode};
    use crate::posix::vfs::{MemoryFileSystem, VfsResult, VirtualFileSystem};

    /// Unwraps the error carried by `result` and checks its code against `expected`.
    fn assert_error_code<T: std::fmt::Debug>(result: VfsResult<T>, expected: &str) {
        let failure = result.expect_err("expected operation to fail");
        assert_eq!(failure.code(), expected);
    }

    /// Symlinks created by the guest must never expose, or allow edits to, the
    /// reserved overlay metadata namespace.
    #[test]
    fn symlink_into_metadata_namespace_cannot_read_or_resurrect_whiteouts() {
        let mut base_layer = MemoryFileSystem::new();
        base_layer
            .mkdir("/data", true)
            .expect("create lower directory");
        base_layer
            .write_file("/data/secret.txt", b"secret".to_vec())
            .expect("seed lower file");

        let mut fs = OverlayFileSystem::with_upper(vec![base_layer], MemoryFileSystem::new());

        // Deleting a file that lives in the lower layer hides it from the
        // merged view by recording a whiteout under the metadata root.
        fs.remove_file("/data/secret.txt")
            .expect("whiteout lower file");
        let visible_after_delete = fs.exists("/data/secret.txt");
        assert!(!visible_after_delete);

        // The link may be created, but it must not act as a window into the
        // reserved namespace.
        fs.symlink("/.secure-exec-overlay/whiteouts", "/escape")
            .expect("creating the symlink itself is allowed");

        // Directory listings through the link are refused.
        let listing_via_link = fs.read_dir("/escape");
        assert!(
            listing_via_link.is_err(),
            "listing the metadata namespace via a symlink must be denied"
        );

        // So are removals, which would otherwise bring the deleted file back.
        let removal_via_link = fs.remove_file("/escape/anything");
        assert!(
            removal_via_link.is_err(),
            "tampering with metadata via a symlink must be denied"
        );
        let visible_after_tamper = fs.exists("/data/secret.txt");
        assert!(
            !visible_after_tamper,
            "deleted lower-layer file must stay deleted"
        );

        // Going through a link to an ancestor such as `/` is closed as well.
        fs.symlink("/", "/rootlink")
            .expect("symlink to root is allowed");
        let listing_via_ancestor = fs.read_dir("/rootlink/.secure-exec-overlay/whiteouts");
        assert!(
            listing_via_ancestor.is_err(),
            "metadata must be unreachable via an ancestor symlink too"
        );
    }

    /// A whiteout written to the upper layer keeps hiding the lower entry after
    /// a fresh overlay is assembled around that same upper layer.
    #[test]
    fn whiteouts_persist_when_overlay_reopens_with_same_upper() {
        let mut base_layer = MemoryFileSystem::new();
        base_layer
            .mkdir("/data", true)
            .expect("create lower directory");
        base_layer
            .write_file("/data/base.txt", b"base".to_vec())
            .expect("seed lower file");
        let base_snapshot = base_layer.snapshot();

        let first_lower = MemoryFileSystem::from_snapshot(base_snapshot.clone());
        let mut first = OverlayFileSystem::with_upper(vec![first_lower], MemoryFileSystem::new());
        first
            .remove_file("/data/base.txt")
            .expect("whiteout lower file");

        // Rebuild the overlay from the same upper layer and an identical lower.
        let carried_upper = first.upper.take().expect("overlay upper");
        let second_lower = MemoryFileSystem::from_snapshot(base_snapshot);
        let mut reopened = OverlayFileSystem::with_upper(vec![second_lower], carried_upper);

        let still_visible = reopened.exists("/data/base.txt");
        assert!(!still_visible);
        let merged_listing = reopened.read_dir("/data").expect("read merged directory");
        assert_eq!(merged_listing, Vec::<String>::new());
    }

    /// Copying a lower directory up makes it opaque, and the metadata root never
    /// shows up in a listing of `/`.
    #[test]
    fn copied_up_directories_become_opaque_and_hide_overlay_metadata() {
        let mut base_layer = MemoryFileSystem::new();
        base_layer
            .mkdir("/data", true)
            .expect("create lower directory");
        base_layer
            .write_file("/data/base.txt", b"base".to_vec())
            .expect("seed lower file");

        let mut fs = OverlayFileSystem::new(vec![base_layer], OverlayMode::Ephemeral);
        fs.chmod("/data", 0o700).expect("copy up lower directory");

        let data_listing = fs.read_dir("/data").expect("read opaque directory");
        assert_eq!(data_listing, Vec::<String>::new());

        let root_listing = fs.read_dir("/").expect("read root");
        let metadata_hidden = root_listing
            .iter()
            .all(|name| name != ".secure-exec-overlay");
        assert!(metadata_hidden);
    }

    /// A directory whose only children are whited-out lower entries counts as
    /// empty and can be removed.
    #[test]
    fn remove_dir_succeeds_when_only_lower_children_are_whited_out() {
        let mut base_layer = MemoryFileSystem::new();
        base_layer.mkdir("/a", true).expect("create lower directory");
        base_layer
            .write_file("/a/c", b"child".to_vec())
            .expect("seed lower child");

        let mut fs = OverlayFileSystem::new(vec![base_layer], OverlayMode::Ephemeral);
        fs.remove_file("/a/c").expect("whiteout lower child");
        fs.remove_dir("/a").expect("remove merged-empty directory");

        let directory_visible = fs.exists("/a");
        assert!(!directory_visible);
        assert_error_code(fs.read_dir("/a"), "ENOENT");
    }

    /// A directory that still has a visible child cannot be removed.
    #[test]
    fn remove_dir_still_rejects_visible_children() {
        let mut base_layer = MemoryFileSystem::new();
        base_layer.mkdir("/a", true).expect("create lower directory");
        base_layer
            .write_file("/a/c", b"child".to_vec())
            .expect("seed lower child");

        let mut fs = OverlayFileSystem::new(vec![base_layer], OverlayMode::Ephemeral);
        assert_error_code(fs.remove_dir("/a"), "ENOTEMPTY");
        let child_visible = fs.exists("/a/c");
        assert!(child_visible);
    }
}