Skip to main content

zlayer_paths/
safe_fs.rs

//! Symlink-safe filesystem tree operations.
//!
//! Recursive `chmod`/`chown`/delete walkers and tar extraction routinely escape
//! a container rootfs through an absolute symlink. The canonical example is
//! `var/run -> /run`: a naive walker that decides "is this a directory?" with
//! [`std::path::Path::is_dir`] (which dereferences symlinks) recurses *through*
//! the link and mutates the host's `/run` — most painfully `chmod`/`chown`ing
//! the host's `/run/sshd`, which makes `sshd` reject every new connection
//! (`fatal: /run/sshd must be owned by root and not group or world-writable.`).
//! The TCP handshake still completes, so it looks exactly like a firewall drop
//! while being a filesystem-permissions bug, and only a reboot (tmpfs `/run`
//! recreated clean) recovers it.
//!
//! Every operation here uses [`std::fs::symlink_metadata`] (`lstat`, which does
//! NOT follow symlinks) to classify a node before touching it, and never
//! recurses into a symlink. The two helpers used by the OCI unpacker
//! ([`materialize_real_parent`] and [`lremove_if_symlink`]) additionally make
//! sure a write never lands *through* an on-disk symlinked parent that points
//! outside the destination tree.
//!
//! Prefer these over hand-rolled `read_dir` + `is_dir()` walks. The footguns
//! they avoid: `std::fs::{set_permissions, metadata, canonicalize}`,
//! `nix::unistd::chown`, and `Path::{is_dir, is_file}` all dereference symlinks;
//! only `symlink_metadata`/`lstat` and `remove_file` (on the link itself) do not.
25
26use std::io;
27use std::path::{Component, Path, PathBuf};
28
29/// Walk `root` depth-first **without ever following a symlink**, invoking
30/// `visit(path, metadata)` for every real file and directory (the `metadata`
31/// is the `lstat` result, so `metadata.is_dir()` is the true on-disk type).
32///
33/// Symlinks are skipped entirely — neither visited nor traversed — so the walk
34/// can never leave `root`'s subtree. Per-node `lstat`/`read_dir` failures are
35/// logged and skipped (best-effort traversal); only an error returned by
36/// `visit` aborts the walk and propagates.
37///
38/// # Errors
39///
40/// Returns the first error produced by `visit`.
41pub fn walk_no_follow<F>(root: &Path, mut visit: F) -> io::Result<()>
42where
43    F: FnMut(&Path, &std::fs::Metadata) -> io::Result<()>,
44{
45    let mut stack = vec![root.to_path_buf()];
46    while let Some(p) = stack.pop() {
47        // lstat — never follow a symlink when deciding what `p` is.
48        let md = match std::fs::symlink_metadata(&p) {
49            Ok(md) => md,
50            Err(e) => {
51                tracing::debug!(path = %p.display(), error = %e, "lstat failed during safe walk");
52                continue;
53            }
54        };
55        if md.file_type().is_symlink() {
56            // A symlink — even one pointing at a directory — is left entirely
57            // alone so the walk can never reach outside `root`.
58            continue;
59        }
60        visit(&p, &md)?;
61        if md.is_dir() {
62            match std::fs::read_dir(&p) {
63                Ok(entries) => stack.extend(entries.flatten().map(|e| e.path())),
64                Err(e) => {
65                    tracing::debug!(path = %p.display(), error = %e, "read_dir failed during safe walk");
66                }
67            }
68        }
69    }
70    Ok(())
71}
72
73/// `chgrp` every real entry under `path` to `gid` and set every real directory
74/// to `dir_mode` (e.g. `0o2775` for setgid + group write), skipping symlinks.
75///
76/// Best-effort: per-node failures are logged, not propagated (matches the
77/// daemon build-dir normalize semantics). The owner is left as-is; only the
78/// group and directory mode change.
79#[cfg(unix)]
80pub fn chgrp_setgid_tree(path: &Path, gid: nix::unistd::Gid, dir_mode: u32) {
81    use std::os::unix::fs::PermissionsExt;
82
83    if let Err(e) = std::fs::create_dir_all(path) {
84        tracing::debug!(path = %path.display(), error = %e, "could not create dir for chgrp_setgid_tree");
85        return;
86    }
87    let _ = walk_no_follow(path, |p, md| {
88        // `p` is a real file/dir here (symlinks are skipped by the walker), so
89        // `chown` has nothing to dereference.
90        if let Err(e) = nix::unistd::chown(p, None, Some(gid)) {
91            tracing::debug!(path = %p.display(), error = %e, "chgrp failed during tree normalize");
92        }
93        if md.is_dir() {
94            if let Err(e) = std::fs::set_permissions(p, std::fs::Permissions::from_mode(dir_mode)) {
95                tracing::debug!(path = %p.display(), error = %e, "chmod failed during tree normalize");
96            }
97        }
98        Ok(())
99    });
100}
101
102/// Make every real directory under `root` writable+executable by the owner
103/// (`0o700`) so a subsequent [`std::fs::remove_dir_all`] can delete a tree that
104/// contains read-only directories (e.g. Fedora's `0o555` `ca-trust`), skipping
105/// symlinks. Best-effort.
106///
107/// Call this immediately before `remove_dir_all(root)`.
108#[cfg(unix)]
109pub fn chmod_tree_writable(root: &Path) {
110    use std::os::unix::fs::PermissionsExt;
111
112    let _ = walk_no_follow(root, |p, md| {
113        if md.is_dir() {
114            if let Err(e) = std::fs::set_permissions(p, std::fs::Permissions::from_mode(0o700)) {
115                tracing::debug!(path = %p.display(), error = %e, "chmod-writable failed");
116            }
117        }
118        Ok(())
119    });
120}
121
122/// Apply `mode` to every real **file** under `root` (directories are left
123/// untouched — applying a file mode such as `0o644` to a directory would clear
124/// its execute bit and make it non-traversable), skipping symlinks. Used for
125/// build `COPY/ADD --chmod`.
126///
127/// # Errors
128///
129/// Returns the first `set_permissions` error.
130#[cfg(unix)]
131pub fn chmod_tree_files(root: &Path, mode: u32) -> io::Result<()> {
132    use std::os::unix::fs::PermissionsExt;
133
134    walk_no_follow(root, |p, md| {
135        if md.is_file() {
136            std::fs::set_permissions(p, std::fs::Permissions::from_mode(mode))
137        } else {
138            Ok(())
139        }
140    })
141}
142
143/// `chown` every real file and directory under `root` to `uid`/`gid` (either may
144/// be `None` to leave unchanged), skipping symlinks. Used for build
145/// `COPY/ADD --chown`. A no-op when both are `None`.
146///
147/// # Errors
148///
149/// Returns the first `chown` error.
150#[cfg(unix)]
151pub fn chown_tree(root: &Path, uid: Option<u32>, gid: Option<u32>) -> io::Result<()> {
152    if uid.is_none() && gid.is_none() {
153        return Ok(());
154    }
155    let owner_uid = uid.map(nix::unistd::Uid::from_raw);
156    let owner_gid = gid.map(nix::unistd::Gid::from_raw);
157    walk_no_follow(root, |p, _md| {
158        nix::unistd::chown(p, owner_uid, owner_gid)
159            .map_err(|e| io::Error::other(format!("chown failed on {}: {e}", p.display())))
160    })
161}
162
/// Delete `path` without ever following a symlink.
///
/// The entry is classified with `lstat`. A real directory is removed with
/// [`std::fs::remove_dir_all`] (which itself does not traverse symlinks);
/// anything else — regular file, symlink, socket, fifo, … — is unlinked with
/// [`std::fs::remove_file`], which removes the entry itself and never its
/// target. A path that does not exist, or that disappears before it can be
/// removed, counts as success.
///
/// Use this instead of `if path.is_dir() { remove_dir_all } else { remove_file }`:
/// that idiom's `is_dir()` follows symlinks and therefore deletes through a
/// link.
///
/// # Errors
///
/// Returns any `lstat` or removal error other than "not found".
pub fn remove_path_no_follow(path: &Path) -> io::Result<()> {
    let is_real_dir = match std::fs::symlink_metadata(path) {
        Ok(meta) => meta.is_dir(),
        Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(()),
        Err(err) => return Err(err),
    };

    let outcome = if is_real_dir {
        std::fs::remove_dir_all(path)
    } else {
        std::fs::remove_file(path)
    };

    // Someone else removing the entry first is not a failure.
    outcome.or_else(|err| {
        if err.kind() == io::ErrorKind::NotFound {
            Ok(())
        } else {
            Err(err)
        }
    })
}
193
/// Unlink `path` if — and only if — it is currently a symlink.
///
/// This lets a following `File::create`/`set_permissions`/`create_dir` land on
/// a fresh real entry rather than writing through the link. Nothing happens
/// when `path` is a real file or directory, when it does not exist, or when it
/// cannot be `lstat`'d at all.
///
/// # Errors
///
/// Returns the unlink error if removing the symlink fails.
pub fn lremove_if_symlink(path: &Path) -> io::Result<()> {
    // Any lstat failure is treated the same as "not a symlink".
    let is_link = std::fs::symlink_metadata(path)
        .map(|meta| meta.file_type().is_symlink())
        .unwrap_or(false);

    if is_link {
        std::fs::remove_file(path)
    } else {
        Ok(())
    }
}
208
/// Ensure every parent component of `full_path`, from `rootfs` down to
/// `full_path.parent()`, is a **real directory inside `rootfs`** — replacing any
/// component that is currently a symlink (or any other non-directory) with a
/// real directory — so a subsequent write to `full_path` cannot be redirected
/// outside `rootfs`.
///
/// This is the OCI-unpack defense: a layer may ship `var/run -> /run` and then a
/// later entry `var/run/sshd`; without this, `create_dir_all`/`File::create`
/// follow the link and write to the host's `/run`. Replacing a symlinked parent
/// with a directory is OCI-correct — a later layer is allowed to put a real
/// directory where an earlier layer had a symlink.
///
/// `full_path` must be `rootfs.join(<relative entry path>)`; if it does not lie
/// under `rootfs` this is a no-op (nothing outside `rootfs` is touched).
///
/// `rootfs` itself is not inspected; only the components below it are.
///
/// # Errors
///
/// * [`io::ErrorKind::InvalidInput`] if the part of `full_path` below `rootfs`
///   contains a `..` component. Such a path cannot be secured component by
///   component, so it is rejected before anything on disk is modified.
/// * Any error from unlinking a symlinked/foreign parent or creating a real
///   directory in its place.
pub fn materialize_real_parent(rootfs: &Path, full_path: &Path) -> io::Result<()> {
    let Some(parent) = full_path.parent() else {
        return Ok(());
    };
    // Only operate within rootfs; never touch anything above it.
    let Ok(rel) = parent.strip_prefix(rootfs) else {
        return Ok(());
    };

    // Fail closed on `..`. Skipping it would make `cur` below diverge from the
    // path the caller is about to write to, so this function would report
    // success without having secured that path (and would create unrelated
    // directories along the way). Checked up front so nothing is mutated.
    if rel.components().any(|c| matches!(c, Component::ParentDir)) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            format!(
                "refusing to materialize parents of a path containing `..`: {}",
                full_path.display()
            ),
        ));
    }

    let mut cur = rootfs.to_path_buf();
    for comp in rel.components() {
        // `rel` is relative and free of `..`; anything that is not a normal
        // component (a leading `.`) does not change the location.
        let Component::Normal(name) = comp else {
            continue;
        };
        cur.push(name);
        match std::fs::symlink_metadata(&cur) {
            // Already a real directory — good. (`lstat` metadata reports
            // `is_dir()` as false for a symlink, even one that targets a
            // directory.)
            Ok(md) if md.is_dir() => {}
            // A symlink would redirect the write outside rootfs, and any other
            // non-directory blocks the path — unlink the entry itself and put
            // a real directory in its place.
            Ok(_) => {
                std::fs::remove_file(&cur)?;
                std::fs::create_dir(&cur)?;
            }
            // Does not exist yet (or cannot be inspected) — try to create it;
            // a genuine problem surfaces as the `create_dir` error.
            Err(_) => {
                std::fs::create_dir(&cur)?;
            }
        }
    }
    Ok(())
}
267
/// Compute a rootfs-confined **relative** target for an **absolute** symlink.
///
/// `link_rel` is the symlink's path *relative to the rootfs root* (e.g.
/// `var/run` for a link at `<rootfs>/var/run`); `abs_target` is its current
/// absolute target (e.g. `/run`). Returns the equivalent relative target (e.g.
/// `../run`) that resolves to the **same** location *inside* the rootfs —
/// post-pivot the container's `/` IS the rootfs, so `/run` and `../run`
/// (from `/var`) are identical — but can never escape the rootfs for a
/// host-context operation that resolves the link before pivot_root.
///
/// `.` and `..` components in `abs_target` are resolved lexically, with `..`
/// clamped at the root (`/..` is `/`), so `/a/../run` yields the same result as
/// `/run` and `/../../etc` the same as `/etc`. The returned path therefore
/// contains exactly one `..` per directory level of the link and none
/// afterwards. Lexical resolution does not consult the filesystem, so it does
/// not account for `a` in `/a/../run` being a symlink itself.
///
/// Returns `None` when `abs_target` is not absolute (nothing to rewrite) or
/// `link_rel` has no parent.
#[must_use]
pub fn relativize_abs_symlink(link_rel: &Path, abs_target: &Path) -> Option<PathBuf> {
    if !abs_target.is_absolute() {
        return None;
    }
    // Depth of the directory CONTAINING the link, in normal components from the
    // rootfs root (`var/run` -> parent `var` -> depth 1 -> one `..`).
    let parent = link_rel.parent()?;
    let depth = parent
        .components()
        .filter(|c| matches!(c, Component::Normal(_)))
        .count();

    // The absolute target, lexically normalized and without its leading `/`,
    // IS the path relative to the rootfs root.
    let mut target_parts = Vec::new();
    for comp in abs_target.components() {
        match comp {
            Component::Normal(name) => target_parts.push(name),
            // Step back one level; popping an empty stack is the clamp at the
            // root. Merely dropping the `..` would leave `/a/../run` pointing
            // at `a/run`, which is a different location.
            Component::ParentDir => {
                target_parts.pop();
            }
            Component::Prefix(_) | Component::RootDir | Component::CurDir => {}
        }
    }

    let mut out = PathBuf::new();
    for _ in 0..depth {
        out.push("..");
    }
    for part in &target_parts {
        out.push(part);
    }
    if out.as_os_str().is_empty() {
        // Target is the rootfs root and the link sits directly under it. A
        // deeper link already points at the root via its `..` prefix.
        out.push(".");
    }
    Some(out)
}
316
#[cfg(test)]
mod tests {
    use super::*;

    /// Build `<base>/rootfs/var/run -> <base>/host_run` (an absolute escaping
    /// symlink, like a container's `var/run -> /run`) plus a sentinel file
    /// `<base>/host_run/sshd`. Returns (rootfs, host_run, sentinel).
    ///
    /// Unix-only: the fixture is meaningless without the symlink, so every test
    /// that relies on it is gated the same way rather than passing vacuously.
    #[cfg(unix)]
    fn rootfs_with_escaping_symlink(base: &Path) -> (PathBuf, PathBuf, PathBuf) {
        let rootfs = base.join("rootfs");
        let host_run = base.join("host_run");
        std::fs::create_dir_all(rootfs.join("var")).unwrap();
        std::fs::create_dir_all(&host_run).unwrap();
        let sentinel = host_run.join("sshd");
        std::fs::write(&sentinel, b"i am the host sshd runtime dir contents").unwrap();
        std::os::unix::fs::symlink(&host_run, rootfs.join("var").join("run")).unwrap();
        (rootfs, host_run, sentinel)
    }

    #[cfg(unix)]
    #[test]
    fn chmod_tree_files_does_not_follow_symlink_out_of_root() {
        use std::os::unix::fs::PermissionsExt;
        let tmp = tempfile::tempdir().unwrap();
        let (rootfs, _host_run, sentinel) = rootfs_with_escaping_symlink(tmp.path());
        // Put a real file inside the rootfs so we know chmod actually ran.
        let inside = rootfs.join("var").join("inside.txt");
        std::fs::write(&inside, b"x").unwrap();
        let before = std::fs::metadata(&sentinel).unwrap().permissions().mode() & 0o777;

        chmod_tree_files(&rootfs, 0o600).unwrap();

        // The host sentinel (reachable only through the var/run symlink) is untouched.
        let after = std::fs::metadata(&sentinel).unwrap().permissions().mode() & 0o777;
        assert_eq!(before, after, "sentinel host file mode must be unchanged");
        // The in-rootfs file WAS chmod'd.
        assert_eq!(
            std::fs::metadata(&inside).unwrap().permissions().mode() & 0o777,
            0o600
        );
    }

    #[cfg(unix)]
    #[test]
    fn walk_skips_symlinked_dir() {
        let tmp = tempfile::tempdir().unwrap();
        let (rootfs, _host_run, _sentinel) = rootfs_with_escaping_symlink(tmp.path());
        let mut visited = Vec::new();
        walk_no_follow(&rootfs, |p, _md| {
            visited.push(p.to_path_buf());
            Ok(())
        })
        .unwrap();
        // The symlink `var/run` itself is never visited, and nothing under the
        // host_run target is visited.
        assert!(visited.iter().any(|p| p.ends_with("var")));
        assert!(
            !visited.iter().any(|p| p.ends_with("var/run")),
            "symlink must not be visited"
        );
        assert!(
            !visited.iter().any(|p| p.ends_with("sshd")),
            "must not cross the symlink into the host dir"
        );
    }

    #[cfg(unix)]
    #[test]
    fn materialize_real_parent_replaces_escaping_symlink() {
        let tmp = tempfile::tempdir().unwrap();
        let (rootfs, _host_run, sentinel) = rootfs_with_escaping_symlink(tmp.path());

        // A later entry wants to write rootfs/var/run/sshd. Materialize parents.
        let target = rootfs.join("var").join("run").join("sshd");
        materialize_real_parent(&rootfs, &target).unwrap();

        // var/run is now a REAL directory inside rootfs, not a symlink.
        let md = std::fs::symlink_metadata(rootfs.join("var").join("run")).unwrap();
        assert!(md.is_dir(), "var/run must be a real dir now");
        assert!(
            !md.file_type().is_symlink(),
            "var/run must not be a symlink"
        );

        // Writing the target now stays inside rootfs; the host sentinel is intact.
        std::fs::write(&target, b"contained").unwrap();
        assert_eq!(
            std::fs::read(&sentinel).unwrap(),
            b"i am the host sshd runtime dir contents",
            "host sentinel must be untouched"
        );
        assert!(target.exists());
    }

    #[cfg(unix)]
    #[test]
    fn remove_path_no_follow_unlinks_symlink_not_target() {
        let tmp = tempfile::tempdir().unwrap();
        let (rootfs, _host_run, sentinel) = rootfs_with_escaping_symlink(tmp.path());
        let link = rootfs.join("var").join("run");

        remove_path_no_follow(&link).unwrap();

        assert!(
            std::fs::symlink_metadata(&link).is_err(),
            "the symlink itself must be gone"
        );
        assert!(
            sentinel.exists(),
            "the symlink target (host file) must NOT be deleted"
        );
    }

    #[test]
    fn lremove_if_symlink_only_removes_links() {
        let tmp = tempfile::tempdir().unwrap();
        let real = tmp.path().join("real.txt");
        std::fs::write(&real, b"keep").unwrap();
        lremove_if_symlink(&real).unwrap();
        assert!(real.exists(), "a real file must not be removed");

        #[cfg(unix)]
        {
            let link = tmp.path().join("link");
            std::os::unix::fs::symlink(&real, &link).unwrap();
            lremove_if_symlink(&link).unwrap();
            assert!(std::fs::symlink_metadata(&link).is_err(), "link removed");
            assert!(real.exists(), "link target preserved");
        }
    }

    #[test]
    fn relativize_abs_symlink_confines_targets() {
        // The canonical escape: var/run -> /run becomes var/run -> ../run.
        assert_eq!(
            relativize_abs_symlink(Path::new("var/run"), Path::new("/run")),
            Some(PathBuf::from("../run"))
        );
        // Deeper link: var/lock -> /run/lock => ../run/lock.
        assert_eq!(
            relativize_abs_symlink(Path::new("var/lock"), Path::new("/run/lock")),
            Some(PathBuf::from("../run/lock"))
        );
        // Top-level link (directly under rootfs): bin -> /usr/bin => usr/bin.
        assert_eq!(
            relativize_abs_symlink(Path::new("bin"), Path::new("/usr/bin")),
            Some(PathBuf::from("usr/bin"))
        );
        // Two-deep: a/b/c -> /x => ../../x.
        assert_eq!(
            relativize_abs_symlink(Path::new("a/b/c"), Path::new("/x")),
            Some(PathBuf::from("../../x"))
        );
        // Target is the rootfs root.
        assert_eq!(
            relativize_abs_symlink(Path::new("here"), Path::new("/")),
            Some(PathBuf::from("."))
        );
        assert_eq!(
            relativize_abs_symlink(Path::new("a/here"), Path::new("/")),
            Some(PathBuf::from(".."))
        );
        // Already relative: nothing to do.
        assert_eq!(
            relativize_abs_symlink(Path::new("var/run"), Path::new("../run")),
            None
        );
    }
}