zlayer_paths/safe_fs.rs
1//! Symlink-safe filesystem tree operations.
2//!
3//! Recursive `chmod`/`chown`/delete walkers and tar extraction routinely escape
4//! a container rootfs through an absolute symlink. The standard one is
5//! `var/run -> /run`: a naive walker that decides "is this a directory?" with
6//! [`std::path::Path::is_dir`] (which dereferences symlinks) recurses *through*
7//! the link and mutates the host's `/run` — most painfully `chmod`/`chown`ing
8//! the host's `/run/sshd`, which makes `sshd` reject every new connection
9//! (`fatal: /run/sshd must be owned by root and not group or world-writable.`).
10//! The TCP handshake still completes, so it looks exactly like a firewall drop
11//! while being a filesystem-permissions bug, and only a reboot (tmpfs `/run`
12//! recreated clean) recovers it.
13//!
14//! Every operation here uses [`std::fs::symlink_metadata`] (`lstat`, which does
15//! NOT follow symlinks) to classify a node before touching it, and never
16//! recurses into a symlink. The two helpers used by the OCI unpacker
17//! ([`materialize_real_parent`] and [`lremove_if_symlink`]) additionally make
18//! sure a write never lands *through* an on-disk symlinked parent that points
19//! outside the destination tree.
20//!
21//! Prefer these over hand-rolled `read_dir` + `is_dir()` walks. The footguns
22//! they avoid: `std::fs::{set_permissions, metadata, canonicalize}`,
23//! `nix::unistd::chown`, and `Path::{is_dir, is_file}` all dereference symlinks;
24//! only `symlink_metadata`/`lstat` and `remove_file` (on the link itself) do not.
25
26use std::io;
27use std::path::{Component, Path, PathBuf};
28
29/// Walk `root` depth-first **without ever following a symlink**, invoking
30/// `visit(path, metadata)` for every real file and directory (the `metadata`
31/// is the `lstat` result, so `metadata.is_dir()` is the true on-disk type).
32///
33/// Symlinks are skipped entirely — neither visited nor traversed — so the walk
34/// can never leave `root`'s subtree. Per-node `lstat`/`read_dir` failures are
35/// logged and skipped (best-effort traversal); only an error returned by
36/// `visit` aborts the walk and propagates.
37///
38/// # Errors
39///
40/// Returns the first error produced by `visit`.
41pub fn walk_no_follow<F>(root: &Path, mut visit: F) -> io::Result<()>
42where
43 F: FnMut(&Path, &std::fs::Metadata) -> io::Result<()>,
44{
45 let mut stack = vec![root.to_path_buf()];
46 while let Some(p) = stack.pop() {
47 // lstat — never follow a symlink when deciding what `p` is.
48 let md = match std::fs::symlink_metadata(&p) {
49 Ok(md) => md,
50 Err(e) => {
51 tracing::debug!(path = %p.display(), error = %e, "lstat failed during safe walk");
52 continue;
53 }
54 };
55 if md.file_type().is_symlink() {
56 // A symlink — even one pointing at a directory — is left entirely
57 // alone so the walk can never reach outside `root`.
58 continue;
59 }
60 visit(&p, &md)?;
61 if md.is_dir() {
62 match std::fs::read_dir(&p) {
63 Ok(entries) => stack.extend(entries.flatten().map(|e| e.path())),
64 Err(e) => {
65 tracing::debug!(path = %p.display(), error = %e, "read_dir failed during safe walk");
66 }
67 }
68 }
69 }
70 Ok(())
71}
72
73/// `chgrp` every real entry under `path` to `gid` and set every real directory
74/// to `dir_mode` (e.g. `0o2775` for setgid + group write), skipping symlinks.
75///
76/// Best-effort: per-node failures are logged, not propagated (matches the
77/// daemon build-dir normalize semantics). The owner is left as-is; only the
78/// group and directory mode change.
79#[cfg(unix)]
80pub fn chgrp_setgid_tree(path: &Path, gid: nix::unistd::Gid, dir_mode: u32) {
81 use std::os::unix::fs::PermissionsExt;
82
83 if let Err(e) = std::fs::create_dir_all(path) {
84 tracing::debug!(path = %path.display(), error = %e, "could not create dir for chgrp_setgid_tree");
85 return;
86 }
87 let _ = walk_no_follow(path, |p, md| {
88 // `p` is a real file/dir here (symlinks are skipped by the walker), so
89 // `chown` has nothing to dereference.
90 if let Err(e) = nix::unistd::chown(p, None, Some(gid)) {
91 tracing::debug!(path = %p.display(), error = %e, "chgrp failed during tree normalize");
92 }
93 if md.is_dir() {
94 if let Err(e) = std::fs::set_permissions(p, std::fs::Permissions::from_mode(dir_mode)) {
95 tracing::debug!(path = %p.display(), error = %e, "chmod failed during tree normalize");
96 }
97 }
98 Ok(())
99 });
100}
101
102/// Make every real directory under `root` writable+executable by the owner
103/// (`0o700`) so a subsequent [`std::fs::remove_dir_all`] can delete a tree that
104/// contains read-only directories (e.g. Fedora's `0o555` `ca-trust`), skipping
105/// symlinks. Best-effort.
106///
107/// Call this immediately before `remove_dir_all(root)`.
108#[cfg(unix)]
109pub fn chmod_tree_writable(root: &Path) {
110 use std::os::unix::fs::PermissionsExt;
111
112 let _ = walk_no_follow(root, |p, md| {
113 if md.is_dir() {
114 if let Err(e) = std::fs::set_permissions(p, std::fs::Permissions::from_mode(0o700)) {
115 tracing::debug!(path = %p.display(), error = %e, "chmod-writable failed");
116 }
117 }
118 Ok(())
119 });
120}
121
122/// Apply `mode` to every real **file** under `root` (directories are left
123/// untouched — applying a file mode such as `0o644` to a directory would clear
124/// its execute bit and make it non-traversable), skipping symlinks. Used for
125/// build `COPY/ADD --chmod`.
126///
127/// # Errors
128///
129/// Returns the first `set_permissions` error.
130#[cfg(unix)]
131pub fn chmod_tree_files(root: &Path, mode: u32) -> io::Result<()> {
132 use std::os::unix::fs::PermissionsExt;
133
134 walk_no_follow(root, |p, md| {
135 if md.is_file() {
136 std::fs::set_permissions(p, std::fs::Permissions::from_mode(mode))
137 } else {
138 Ok(())
139 }
140 })
141}
142
143/// `chown` every real file and directory under `root` to `uid`/`gid` (either may
144/// be `None` to leave unchanged), skipping symlinks. Used for build
145/// `COPY/ADD --chown`. A no-op when both are `None`.
146///
147/// # Errors
148///
149/// Returns the first `chown` error.
150#[cfg(unix)]
151pub fn chown_tree(root: &Path, uid: Option<u32>, gid: Option<u32>) -> io::Result<()> {
152 if uid.is_none() && gid.is_none() {
153 return Ok(());
154 }
155 let owner_uid = uid.map(nix::unistd::Uid::from_raw);
156 let owner_gid = gid.map(nix::unistd::Gid::from_raw);
157 walk_no_follow(root, |p, _md| {
158 nix::unistd::chown(p, owner_uid, owner_gid)
159 .map_err(|e| io::Error::other(format!("chown failed on {}: {e}", p.display())))
160 })
161}
162
/// Delete `path` without ever following a symlink.
///
/// The node is classified with `lstat`: a real directory is removed with
/// [`std::fs::remove_dir_all`]; anything else — regular file, symlink, socket,
/// fifo, … — is unlinked with [`std::fs::remove_file`], which removes the entry
/// itself and never its target. A path that does not exist (either up front or
/// by the time the removal runs) counts as success.
///
/// Use this instead of the
/// `if path.is_dir() { remove_dir_all } else { remove_file }` idiom, whose
/// `is_dir()` follows symlinks and so deletes through a link.
///
/// # Errors
///
/// Returns any underlying `lstat` or removal error other than "not found".
pub fn remove_path_no_follow(path: &Path) -> io::Result<()> {
    // "Already gone" is the outcome we wanted, so fold it into success.
    let absent_is_ok = |outcome: io::Result<()>| match outcome {
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
        other => other,
    };

    let is_real_dir = match std::fs::symlink_metadata(path) {
        Ok(md) => md.is_dir(),
        Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(()),
        Err(e) => return Err(e),
    };

    absent_is_ok(if is_real_dir {
        std::fs::remove_dir_all(path)
    } else {
        std::fs::remove_file(path)
    })
}
193
/// Unlink `path` if — and only if — it is currently a symlink, so that a
/// following `File::create`/`set_permissions`/`create_dir` lands on a fresh
/// real entry instead of writing through the link.
///
/// Does nothing when `path` is absent, is not a symlink, or cannot be
/// `lstat`'d at all.
///
/// # Errors
///
/// Returns the unlink error if removing the symlink fails.
pub fn lremove_if_symlink(path: &Path) -> io::Result<()> {
    // An lstat failure is treated the same as "not a symlink": nothing to do.
    let is_link = std::fs::symlink_metadata(path)
        .map(|md| md.file_type().is_symlink())
        .unwrap_or(false);

    if is_link {
        // `remove_file` unlinks the link itself, never its target.
        std::fs::remove_file(path)
    } else {
        Ok(())
    }
}
208
/// Ensure every parent component of `full_path`, from `rootfs` down to
/// `full_path.parent()`, is a **real directory inside `rootfs`** — replacing any
/// component that is currently a symlink (or any other non-directory) with a
/// real directory — so a subsequent write to `full_path` cannot be redirected
/// outside `rootfs`.
///
/// This is the OCI-unpack defense: a layer may ship `var/run -> /run` and then a
/// later entry `var/run/sshd`; without this, `create_dir_all`/`File::create`
/// follow the link and write to the host's `/run`. Replacing a symlinked parent
/// with a directory is OCI-correct — a later layer is allowed to put a real
/// directory where an earlier layer had a symlink.
///
/// `full_path` must be `rootfs.join(<relative entry path>)`; if it does not lie
/// under `rootfs` this is a no-op (nothing outside `rootfs` is touched).
///
/// The entry path is expected to have been validated already, but a `..`
/// component is still rejected here rather than skipped: skipping it would
/// report success while materializing a *different* directory chain than the
/// one the later write actually traverses. The check runs before anything on
/// disk is modified.
///
/// # Errors
///
/// * [`io::ErrorKind::InvalidInput`] if the part of `full_path` below `rootfs`
///   contains a `..` component.
/// * Any `lstat` error other than "not found" on a parent component.
/// * Any error from unlinking a symlinked/foreign parent or from creating a
///   real directory.
pub fn materialize_real_parent(rootfs: &Path, full_path: &Path) -> io::Result<()> {
    let Some(parent) = full_path.parent() else {
        return Ok(());
    };
    // Only operate within rootfs; never touch anything above it.
    let Ok(rel) = parent.strip_prefix(rootfs) else {
        return Ok(());
    };

    // Fail closed on `..` before touching the disk: the loop below only
    // descends through normal components, so it cannot vouch for a path that
    // climbs back up.
    if rel.components().any(|c| matches!(c, Component::ParentDir)) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            format!(
                "refusing to materialize parents of {}: path contains `..`",
                full_path.display()
            ),
        ));
    }

    let mut cur = rootfs.to_path_buf();
    for comp in rel.components() {
        // Anything that is not a plain name (a leading `.`, a root or prefix
        // left over from an empty `rootfs`) carries no directory to create.
        let Component::Normal(name) = comp else {
            continue;
        };
        cur.push(name);
        match std::fs::symlink_metadata(&cur) {
            // Already a real directory — good. (`is_dir()` on an lstat result
            // is false for a symlink, even one that points at a directory.)
            Ok(md) if md.is_dir() => {}
            // A symlink would redirect the write outside rootfs, and any other
            // non-directory blocks the path — unlink the entry itself and put
            // a real directory in its place.
            Ok(_) => {
                std::fs::remove_file(&cur)?;
                std::fs::create_dir(&cur)?;
            }
            // Does not exist yet — create it.
            Err(e) if e.kind() == io::ErrorKind::NotFound => {
                std::fs::create_dir(&cur)?;
            }
            // Could not even classify the node: report that, rather than
            // masking it behind whatever `create_dir` would fail with.
            Err(e) => return Err(e),
        }
    }
    Ok(())
}
267
/// Compute a rootfs-confined **relative** target for an **absolute** symlink.
///
/// `link_rel` is the symlink's path *relative to the rootfs root* (e.g.
/// `var/run` for a link at `<rootfs>/var/run`); `abs_target` is its current
/// absolute target (e.g. `/run`). Returns the equivalent relative target (e.g.
/// `../run`) that resolves to the **same** location *inside* the rootfs —
/// post-pivot the container's `/` IS the rootfs, so `/run` and `../run`
/// (from `/var`) are identical — but can never escape the rootfs for a
/// host-context operation that resolves the link before pivot_root.
///
/// `..` components inside `abs_target` are resolved lexically and clamped at
/// the root (as `/..` is `/`), so `/run/../etc` becomes `../etc` from `var/`
/// and the result never contains a `..` beyond the prefix that climbs from the
/// link's directory to the rootfs root. The resolution is purely textual: if an
/// intermediate component of the target is itself a symlink, the kernel's
/// answer for the original target may differ.
///
/// Returns `None` when `abs_target` is not absolute (nothing to rewrite) or
/// `link_rel` has no parent.
#[must_use]
pub fn relativize_abs_symlink(link_rel: &Path, abs_target: &Path) -> Option<PathBuf> {
    if !abs_target.is_absolute() {
        return None;
    }
    // Depth of the directory CONTAINING the link, in normal components from the
    // rootfs root (`var/run` -> parent `var` -> depth 1 -> one `..`).
    let parent = link_rel.parent()?;
    let depth = parent
        .components()
        .filter(|c| matches!(c, Component::Normal(_)))
        .count();

    // The absolute target without its leading `/` IS the path relative to the
    // rootfs root. Resolve `..` as we go instead of dropping it: dropping would
    // turn `/run/../etc` into `run/etc`, which is a different location.
    let mut target_rel = PathBuf::new();
    for comp in abs_target.components() {
        match comp {
            Component::Normal(name) => target_rel.push(name),
            Component::ParentDir => {
                // `pop` on an empty path is a no-op, which is exactly the
                // "`..` at the root stays at the root" rule.
                target_rel.pop();
            }
            Component::RootDir | Component::CurDir | Component::Prefix(_) => {}
        }
    }

    let mut out = PathBuf::new();
    for _ in 0..depth {
        out.push("..");
    }
    if target_rel.as_os_str().is_empty() {
        // Target was `/` (the rootfs root). A link directly under rootfs maps
        // to `.`; deeper links already point at the root via the `..` prefix.
        if out.as_os_str().is_empty() {
            out.push(".");
        }
    } else {
        out.push(&target_rel);
    }
    Some(out)
}
316
#[cfg(test)]
mod tests {
    use super::*;

    /// Body of the "host" sentinel file; any test that compares against it
    /// proves the host side was left untouched.
    const SENTINEL_BODY: &[u8] = b"i am the host sshd runtime dir contents";

    /// Lay out `<base>/rootfs/var/run -> <base>/host_run` — an absolute symlink
    /// that escapes the rootfs, just like a container's `var/run -> /run` — and
    /// a sentinel file `<base>/host_run/sshd`. The symlink is only created on
    /// Unix. Returns `(rootfs, host_run, sentinel)`.
    fn rootfs_with_escaping_symlink(base: &Path) -> (PathBuf, PathBuf, PathBuf) {
        let rootfs = base.join("rootfs");
        let host_run = base.join("host_run");
        let sentinel = host_run.join("sshd");

        std::fs::create_dir_all(rootfs.join("var")).unwrap();
        std::fs::create_dir_all(&host_run).unwrap();
        std::fs::write(&sentinel, SENTINEL_BODY).unwrap();
        #[cfg(unix)]
        std::os::unix::fs::symlink(&host_run, rootfs.join("var").join("run")).unwrap();

        (rootfs, host_run, sentinel)
    }

    #[cfg(unix)]
    #[test]
    fn chmod_tree_files_does_not_follow_symlink_out_of_root() {
        use std::os::unix::fs::PermissionsExt;

        // Permission bits of `p` (following symlinks, like the original checks).
        let perm_bits = |p: &Path| std::fs::metadata(p).unwrap().permissions().mode() & 0o777;

        let tmp = tempfile::tempdir().unwrap();
        let (rootfs, _host_run, sentinel) = rootfs_with_escaping_symlink(tmp.path());
        // A real file inside the rootfs proves that chmod actually ran.
        let inside = rootfs.join("var").join("inside.txt");
        std::fs::write(&inside, b"x").unwrap();
        let sentinel_before = perm_bits(&sentinel);

        chmod_tree_files(&rootfs, 0o600).unwrap();

        // The host sentinel is reachable only through the var/run symlink and
        // must not have been touched.
        assert_eq!(
            sentinel_before,
            perm_bits(&sentinel),
            "sentinel host file mode must be unchanged"
        );
        // The in-rootfs file WAS chmod'd.
        assert_eq!(perm_bits(&inside), 0o600);
    }

    #[cfg(unix)]
    #[test]
    fn walk_skips_symlinked_dir() {
        let tmp = tempfile::tempdir().unwrap();
        let (rootfs, _host_run, _sentinel) = rootfs_with_escaping_symlink(tmp.path());

        let mut seen: Vec<PathBuf> = Vec::new();
        walk_no_follow(&rootfs, |node, _| {
            seen.push(node.to_path_buf());
            Ok(())
        })
        .unwrap();

        let saw = |suffix: &str| seen.iter().any(|p| p.ends_with(suffix));
        // The real directory is visited; the symlink `var/run` is not, and
        // nothing beneath its host_run target is either.
        assert!(saw("var"));
        assert!(!saw("var/run"), "symlink must not be visited");
        assert!(!saw("sshd"), "must not cross the symlink into the host dir");
    }

    #[test]
    fn materialize_real_parent_replaces_escaping_symlink() {
        let tmp = tempfile::tempdir().unwrap();
        let (rootfs, _host_run, sentinel) = rootfs_with_escaping_symlink(tmp.path());
        let var_run = rootfs.join("var").join("run");

        // A later entry wants to write rootfs/var/run/sshd. Materialize parents.
        let target = var_run.join("sshd");
        materialize_real_parent(&rootfs, &target).unwrap();

        // var/run is now a REAL directory inside rootfs, not a symlink.
        let md = std::fs::symlink_metadata(&var_run).unwrap();
        assert!(md.is_dir(), "var/run must be a real dir now");
        assert!(
            !md.file_type().is_symlink(),
            "var/run must not be a symlink"
        );

        // Writing the target now stays inside rootfs; the host sentinel is intact.
        std::fs::write(&target, b"contained").unwrap();
        assert_eq!(
            std::fs::read(&sentinel).unwrap(),
            SENTINEL_BODY,
            "host sentinel must be untouched"
        );
        assert!(target.exists());
    }

    #[test]
    fn remove_path_no_follow_unlinks_symlink_not_target() {
        let tmp = tempfile::tempdir().unwrap();
        let (rootfs, _host_run, sentinel) = rootfs_with_escaping_symlink(tmp.path());
        let link = rootfs.join("var").join("run");

        remove_path_no_follow(&link).unwrap();

        assert!(
            std::fs::symlink_metadata(&link).is_err(),
            "the symlink itself must be gone"
        );
        assert!(
            sentinel.exists(),
            "the symlink target (host file) must NOT be deleted"
        );
    }

    #[test]
    fn lremove_if_symlink_only_removes_links() {
        let tmp = tempfile::tempdir().unwrap();
        let real = tmp.path().join("real.txt");
        std::fs::write(&real, b"keep").unwrap();

        lremove_if_symlink(&real).unwrap();
        assert!(real.exists(), "a real file must not be removed");

        #[cfg(unix)]
        {
            let link = tmp.path().join("link");
            std::os::unix::fs::symlink(&real, &link).unwrap();

            lremove_if_symlink(&link).unwrap();
            assert!(std::fs::symlink_metadata(&link).is_err(), "link removed");
            assert!(real.exists(), "link target preserved");
        }
    }

    #[test]
    fn relativize_abs_symlink_confines_targets() {
        // (link path relative to rootfs, current target, expected rewrite)
        let cases: [(&str, &str, Option<&str>); 7] = [
            // The canonical escape: var/run -> /run becomes var/run -> ../run.
            ("var/run", "/run", Some("../run")),
            // Deeper target: var/lock -> /run/lock => ../run/lock.
            ("var/lock", "/run/lock", Some("../run/lock")),
            // Top-level link (directly under rootfs): bin -> /usr/bin => usr/bin.
            ("bin", "/usr/bin", Some("usr/bin")),
            // Two-deep: a/b/c -> /x => ../../x.
            ("a/b/c", "/x", Some("../../x")),
            // Target is the rootfs root.
            ("here", "/", Some(".")),
            ("a/here", "/", Some("..")),
            // Already relative: nothing to do.
            ("var/run", "../run", None),
        ];

        for (link, target, expected) in cases {
            assert_eq!(
                relativize_abs_symlink(Path::new(link), Path::new(target)),
                expected.map(PathBuf::from)
            );
        }
    }
}