zlayer_agent/capability.rs
1//! Daemon capability survey.
2//!
3//! Probes the runtime environment of the zlayer daemon (root vs. non-root,
4//! host vs. nested in a container, cgroup v2 path, `CAP_NET_ADMIN`, presence
5//! of `/dev/net/tun`, and writability of the cgroup root) and derives a coarse
6//! [`DaemonMode`] from those signals.
7//!
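//! Most probes are intentionally cheap and non-destructive — a handful of
//! syscalls, no TUN interfaces created, no cgroup writes. The two overlay
//! rootfs probes are heavier: each performs a real throwaway mount + unmount
//! in a temp dir (the fuse variant also spawns helper processes), which is why
//! [`DaemonCapabilities::get`] memoises the survey. The struct is still safe
//! to construct multiple times.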
11//!
12//! Non-Linux targets report a fixed degraded survey since the kernel features
13//! these probes target are Linux-only.
14
15use std::sync::OnceLock;
16
17use serde::{Deserialize, Serialize};
18
/// Coarse classification of the daemon's effective execution environment.
///
/// Derived from the boolean fields on [`DaemonCapabilities`] by
/// [`DaemonCapabilities::probe`]. Serialized in `snake_case` (`full`,
/// `nested_adaptive`, `degraded`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum DaemonMode {
    /// Host-level execution: all caps, can write cgroup root, can create overlay.
    ///
    /// Chosen only when the daemon is not nested, the cgroup root is writable,
    /// `CAP_NET_ADMIN` is effective, and `/dev/net/tun` can be opened.
    Full,
    /// Inside a container: scoped to a sub-cgroup; some caps may be present.
    ///
    /// Also the result whenever the cgroup root is writable or a cgroup-v2
    /// path was detected, but the [`DaemonMode::Full`] requirements are not
    /// all met.
    NestedAdaptive,
    /// Missing privileges required for any meaningful container creation.
    ///
    /// Chosen when the cgroup root is not writable and no cgroup-v2 path was
    /// detected.
    Degraded,
}
32
/// Snapshot of the daemon's effective capabilities and execution environment.
///
/// Construct via [`DaemonCapabilities::probe`], or read the process-wide
/// memoised copy via [`DaemonCapabilities::get`].
///
/// The struct intentionally exposes independent capability bits as separate
/// booleans rather than collapsing them into an enum — each bit corresponds to
/// an orthogonal kernel feature (cgroup write, `CAP_NET_ADMIN`, TUN access,
/// root-ness) and downstream code wants to inspect them independently when
/// deciding what to gate.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DaemonCapabilities {
    /// `true` if the process is running as uid 0.
    pub is_root: bool,
    /// `true` if the process appears to be inside a container (non-root cgroup
    /// v2 path). Always equal to `cgroup_parent.is_some()`.
    pub is_nested: bool,
    /// The cgroup v2 path of the current process, if any (e.g.
    /// `/system.slice/zlayer.service`). `None` on the cgroup root, on
    /// cgroup-v1-only hosts, on non-Linux, or on read errors.
    pub cgroup_parent: Option<String>,
    /// `true` if `access(2)` reports `/sys/fs/cgroup/cgroup.subtree_control`
    /// as writable (`W_OK`). Coarse, non-destructive signal — nothing is
    /// written, so it does not guarantee an actual write will succeed.
    pub can_write_cgroup_root: bool,
    /// `true` if `CAP_NET_ADMIN` is present in the process's *effective* set
    /// (Linux only).
    pub has_cap_net_admin: bool,
    /// `true` if `/dev/net/tun` can be opened r/w in non-blocking mode. Any
    /// open error (EACCES/EPERM/ENOENT/ENXIO or otherwise) counts as
    /// unavailable. The fd is dropped immediately.
    pub tun_device_available: bool,
    /// `true` if the daemon can build container rootfs as an overlayfs mount
    /// (shared read-only lowerdirs + per-container upperdir) instead of a full
    /// per-container copy of every layer. Requires ALL of: root or
    /// `CAP_SYS_ADMIN`, `overlay` listed in `/proc/filesystems`, and a probe
    /// overlay mount in a temp dir that succeeds and immediately unmounts.
    /// Computed once at startup (the result cannot change for a running daemon)
    /// and cached with the rest of the survey.
    pub overlayfs_rootfs_available: bool,
    /// `true` if the daemon can build a container rootfs as a ROOTLESS
    /// `fuse-overlayfs` mount — the userspace overlay backend that needs neither
    /// `CAP_SYS_ADMIN` nor `CAP_MKNOD`. It still gives the shared-layer dedup of
    /// the kernel path (shared read-only lowerdirs + a per-container upperdir),
    /// but represents whiteouts the rootless way fuse-overlayfs reads: a plain
    /// `.wh.<name>` regular file and the `user.overlay.opaque` xattr (both
    /// settable without privilege) instead of `0:0` char devices /
    /// `trusted.overlay.opaque`. Requires ALL of: the `fuse-overlayfs` binary on
    /// `PATH`, `/dev/fuse` openable, a `fusermount3`/`fusermount` helper on
    /// `PATH`, and a probe rootless mount that succeeds and immediately
    /// unmounts. Independent of root / `CAP_SYS_ADMIN` — this is the
    /// fallback the daemon uses when [`Self::overlayfs_rootfs_available`] is
    /// false. Computed once at startup and cached.
    pub fuse_overlayfs_rootfs_available: bool,
    /// Coarse classification derived from the above fields.
    pub effective_mode: DaemonMode,
}
88
/// Process-wide memoised capability survey. Seeded by the first call to
/// [`DaemonCapabilities::get`] or [`DaemonCapabilities::seed`]; once filled,
/// the cell is never replaced and both accessors return the stored value.
static CAPS: OnceLock<DaemonCapabilities> = OnceLock::new();
92
93impl DaemonCapabilities {
94 /// Returns the process-wide capability snapshot, probing on first call.
95 ///
96 /// Subsequent calls return the same memoised instance — capabilities of a
97 /// running daemon do not change at runtime, so re-probing would be wasted
98 /// syscalls and could create the illusion that the daemon's behaviour can
99 /// shift mid-flight.
100 pub fn get() -> &'static Self {
101 CAPS.get_or_init(Self::probe)
102 }
103
104 /// Eagerly seed the memoised survey with an explicit probe result.
105 ///
106 /// Useful at daemon startup to force the probe to happen at a known point
107 /// (so the banner log appears in the expected place). Returns the stored
108 /// instance — if the cache was already seeded, the existing value wins
109 /// and the passed-in `caps` is dropped (probe is pure, so this is fine).
110 ///
111 /// # Panics
112 ///
113 /// In practice this never panics — `OnceLock::set` either stores the
114 /// value or rejects it because the cell is already filled, and in both
115 /// cases the subsequent `get()` returns `Some`. The `expect` exists only
116 /// to satisfy the type system.
117 pub fn seed(caps: Self) -> &'static Self {
118 let _ = CAPS.set(caps);
119 CAPS.get()
120 .expect("CAPS is filled after set or was already filled")
121 }
122
123 /// Probe the running daemon's effective capabilities.
124 ///
125 /// Cheap — a handful of syscalls and no resource allocation. Prefer
126 /// [`DaemonCapabilities::get`] when you want the process-wide memoised
127 /// value; call this directly only when you intentionally want a fresh
128 /// snapshot (e.g. tests).
129 #[must_use]
130 pub fn probe() -> Self {
131 let is_root = zlayer_paths::is_root();
132 let cgroup_parent = current_cgroup_v2_path();
133 let is_nested = cgroup_parent.is_some();
134 let can_write_cgroup_root = probe_can_write_cgroup_root();
135 let has_cap_net_admin = probe_has_cap_net_admin();
136 let tun_device_available = probe_tun_device_available();
137 let overlayfs_rootfs_available = probe_overlayfs_rootfs_available(is_root);
138 let fuse_overlayfs_rootfs_available = probe_fuse_overlayfs_rootfs_available();
139
140 let effective_mode =
141 if !is_nested && can_write_cgroup_root && has_cap_net_admin && tun_device_available {
142 DaemonMode::Full
143 } else if can_write_cgroup_root || cgroup_parent.is_some() {
144 DaemonMode::NestedAdaptive
145 } else {
146 DaemonMode::Degraded
147 };
148
149 Self {
150 is_root,
151 is_nested,
152 cgroup_parent,
153 can_write_cgroup_root,
154 has_cap_net_admin,
155 tun_device_available,
156 overlayfs_rootfs_available,
157 fuse_overlayfs_rootfs_available,
158 effective_mode,
159 }
160 }
161}
162
/// Work out whether the daemon's capability state rules out overlay
/// networking. Pure and side-effect-free, so it can be unit-tested without
/// touching the host's real capability state.
///
/// Returns `None` when overlay is viable (both inputs `true`). Otherwise
/// returns `Some(reason)` naming whichever prerequisite(s) are missing; the
/// daemon must then fall back to host networking (or hard-error if the
/// operator passed `--require-overlay`).
///
/// Call this ONLY when the operator did not already request host networking —
/// an explicit `--host-network` is a deliberate choice, not a degraded state.
#[must_use]
pub fn capability_overlay_fallback(
    has_cap_net_admin: bool,
    tun_device_available: bool,
) -> Option<String> {
    if has_cap_net_admin && tun_device_available {
        return None;
    }

    let reason = if has_cap_net_admin {
        // Only the TUN device is missing.
        "/dev/net/tun is not available"
    } else if tun_device_available {
        // Only the capability is missing.
        "CAP_NET_ADMIN is not in the daemon's effective set"
    } else {
        "CAP_NET_ADMIN is not in the daemon's effective set and /dev/net/tun is not available"
    };
    Some(reason.to_owned())
}
188
/// Decide whether the daemon can run the overlay in fully rootless mode: the
/// overlay daemon wraps itself in its own user+network namespace (holding
/// `CAP_NET_ADMIN` over its OWN netns only) and uses pasta for egress, instead
/// of requiring host root or a setcap'd binary.
///
/// Returns `true` only when ALL of the following hold:
/// - the daemon is NOT root (a root daemon should use the normal root overlay
///   path),
/// - it does NOT already hold effective `CAP_NET_ADMIN` (if it does, the
///   setcap/root overlay path is simpler and gives host-level networking),
/// - `/dev/net/tun` is openable (boringtun needs it for the TUN device), and
/// - the `pasta` (passt) egress helper is available on the host.
///
/// Pure and side-effect-free so it can be unit-tested without namespaces.
#[must_use]
#[allow(clippy::fn_params_excessive_bools)] // parallel capability probe flags, intentionally flat
pub fn can_rootless_overlay(
    is_root: bool,
    has_cap_net_admin: bool,
    tun_device_available: bool,
    pasta_available: bool,
) -> bool {
    // Any existing privilege means the privileged overlay path applies instead.
    let unprivileged = !(is_root || has_cap_net_admin);
    let helpers_present = tun_device_available && pasta_available;
    unprivileged && helpers_present
}
212
/// Pure parser for the contents of `/proc/self/cgroup`.
///
/// Looks at the FIRST line starting with `0::` (the cgroup-v2 entry) and
/// returns the remainder with surrounding whitespace trimmed. Returns `None`
/// when:
/// - no line starts with `0::` (cgroup-v1-only host, or empty input), or
/// - the trimmed v2 path is empty or exactly `/` (host root — bare-metal, no
///   enclosing cgroup).
#[cfg(target_os = "linux")]
fn parse_cgroup_v2_line(content: &str) -> Option<String> {
    let v2_path = content
        .lines()
        .find_map(|entry| entry.strip_prefix("0::"))?
        .trim();

    match v2_path {
        "" | "/" => None,
        path => Some(path.to_owned()),
    }
}
233
234/// Returns the current process's cgroup-v2 path, if any.
235///
236/// On Linux reads `/proc/self/cgroup` and delegates to `parse_cgroup_v2_line`.
237/// On non-Linux always returns `None`. Returns `None` on any read error or
238/// when the process is at the cgroup-v2 root (bare-metal case).
239#[cfg(target_os = "linux")]
240#[must_use]
241pub fn current_cgroup_v2_path() -> Option<String> {
242 let content = std::fs::read_to_string("/proc/self/cgroup").ok()?;
243 parse_cgroup_v2_line(&content)
244}
245
246#[cfg(not(target_os = "linux"))]
247#[must_use]
248pub fn current_cgroup_v2_path() -> Option<String> {
249 None
250}
251
/// Pure path computation: map a cgroup-v2 scope (as reported by
/// `/proc/self/cgroup`) to the `<scope>/containers` parent under which new
/// container cgroups should be created.
///
/// A trailing `/init` component is dropped first, so a daemon that has
/// already been moved into its `init` leaf by an earlier call still resolves
/// to the real scope; this is what makes
/// [`ensure_daemon_leaf_and_container_parent`] idempotent. Any trailing `/`
/// left on the scope is trimmed before `/containers` is appended.
#[cfg(target_os = "linux")]
fn compute_target_parent(scope: &str) -> String {
    let anchor = scope
        .strip_suffix("/init")
        .unwrap_or(scope)
        .trim_end_matches('/');
    [anchor, "/containers"].concat()
}
266
267/// Migrate the current daemon process into a `<scope>/init` sub-cgroup and
268/// return the sibling `<scope>/containers` path as the parent for future
269/// container cgroups. Idempotent — safe to call multiple times.
270///
271/// Returns `None` on non-Linux, when `/proc/self/cgroup` can't be parsed,
272/// when `/sys/fs/cgroup` is read-only, or when the mkdir/PID-write fails.
273/// Callers should fall back to the raw `current_cgroup_v2_path()` value in
274/// those cases (the auto-detect path will surface the underlying error).
275#[cfg(target_os = "linux")]
276#[must_use]
277pub fn ensure_daemon_leaf_and_container_parent() -> Option<String> {
278 let scope = current_cgroup_v2_path()?;
279 let containers = compute_target_parent(&scope);
280 // Idempotency: if we're already in `<base>/init`, just return the sibling.
281 if scope.ends_with("/init") {
282 let containers_fs = format!("/sys/fs/cgroup{containers}");
283 match std::fs::create_dir_all(&containers_fs) {
284 Ok(()) => {}
285 Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
286 Err(_) => return None,
287 }
288 return Some(containers);
289 }
290
291 let scope = scope.trim_end_matches('/').to_string();
292 let mount = "/sys/fs/cgroup";
293 let init_dir = format!("{mount}{scope}/init");
294
295 match std::fs::create_dir_all(&init_dir) {
296 Ok(()) => {}
297 Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
298 Err(_) => return None,
299 }
300
301 let pid_path = format!("{init_dir}/cgroup.procs");
302 let pid_str = format!("{}", std::process::id());
303 if std::fs::write(&pid_path, &pid_str).is_err() {
304 // Already migrated? Re-check /proc/self/cgroup before giving up.
305 let now = current_cgroup_v2_path()?;
306 if now != format!("{scope}/init") {
307 return None;
308 }
309 }
310
311 // Verify the migration actually moved us into <scope>/init.
312 let after = current_cgroup_v2_path()?;
313 if after != format!("{scope}/init") {
314 return None;
315 }
316
317 let containers_dir = format!("{mount}{containers}");
318 match std::fs::create_dir_all(&containers_dir) {
319 Ok(()) => {}
320 Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
321 Err(_) => return None,
322 }
323
324 Some(containers)
325}
326
327#[cfg(not(target_os = "linux"))]
328#[must_use]
329pub fn ensure_daemon_leaf_and_container_parent() -> Option<String> {
330 None
331}
332
/// Top-level cgroup-v2 node (relative to the cgroup-v2 mount) under which a
/// writable root host daemon roots container cgroups. Kept deliberately
/// OUTSIDE the daemon's own systemd unit cgroup (`/system.slice/zlayer.service`)
/// so that containers which survive a daemon stop (`KillMode=process`) never
/// turn the unit's cgroup into a populated inner node. A populated inner node
/// makes systemd's re-fork of the daemon fail with `EBUSY`
/// (`Failed to spawn executor: Device or resource busy` / `Result: resources`),
/// wedging the restart loop until the orphans happen to die. Mirrors how
/// Docker/containerd root containers under their own top-level hierarchy
/// (`/sys/fs/cgroup/docker/...`) rather than under their service unit.
///
/// The name has no `.slice`/`.scope` suffix so systemd treats it as foreign
/// and never tries to reconcile or prune it.
///
/// The value carries its leading `/` because it is appended directly to the
/// `/sys/fs/cgroup` mount path and prefixed to `/containers` when paths are
/// built from it.
#[cfg(target_os = "linux")]
const HOST_CONTAINER_ROOT: &str = "/zlayer";
348
/// Controllers delegated down the host container hierarchy so libcontainer can
/// apply cpu/memory/pids/io limits on the leaf container cgroup. Only those
/// actually available at each level (per `cgroup.controllers`) are enabled, so
/// a host missing a controller degrades gracefully instead of erroring.
///
/// Each name is written to `cgroup.subtree_control` as a `+<name>` token by
/// `enable_available_controllers`.
#[cfg(target_os = "linux")]
const HOST_CGROUP_CONTROLLERS: &[&str] = &["cpu", "cpuset", "io", "memory", "pids"];
355
356/// Pure path computation: the host-mode container parent, `<root>/containers`,
357/// relative to the cgroup-v2 mount.
358#[cfg(target_os = "linux")]
359#[must_use]
360fn compute_host_container_parent() -> String {
361 format!("{HOST_CONTAINER_ROOT}/containers")
362}
363
364/// Enable every wanted controller that is actually available at `dir`
365/// (a `/sys/fs/cgroup/...` path) by writing `+<ctrl>` tokens to its
366/// `cgroup.subtree_control`. Best-effort: filtering to available controllers
367/// avoids the `EINVAL` a single unavailable token would cause, and any write
368/// error is ignored (libcontainer will surface a real failure later if a
369/// required controller is genuinely missing).
370#[cfg(target_os = "linux")]
371fn enable_available_controllers(dir: &str) {
372 let available =
373 std::fs::read_to_string(format!("{dir}/cgroup.controllers")).unwrap_or_default();
374 let tokens: Vec<String> = HOST_CGROUP_CONTROLLERS
375 .iter()
376 .filter(|c| available.split_whitespace().any(|a| a == **c))
377 .map(|c| format!("+{c}"))
378 .collect();
379 if tokens.is_empty() {
380 return;
381 }
382 let _ = std::fs::write(format!("{dir}/cgroup.subtree_control"), tokens.join(" "));
383}
384
385/// Ensure the top-level host container hierarchy exists and has controllers
386/// delegated, returning the container parent path (`/zlayer/containers`,
387/// relative to the cgroup-v2 mount) for libcontainer's `cgroupsPath`.
388///
389/// Only meaningful when the daemon can write the cgroup-v2 root (root host
390/// daemon — `DaemonCapabilities::can_write_cgroup_root`). Returns `None` on
391/// non-Linux, or when the mkdir fails (e.g. a read-only `/sys/fs/cgroup`),
392/// in which case callers fall back to in-scope placement.
393///
394/// Unlike [`ensure_daemon_leaf_and_container_parent`], this does NOT migrate
395/// the daemon PID: with containers rooted outside the unit cgroup, the unit
396/// cgroup stays a clean leaf that systemd can always re-attach to on restart,
397/// so no `init` leaf split is needed.
398#[cfg(target_os = "linux")]
399#[must_use]
400pub fn ensure_host_container_parent() -> Option<String> {
401 let mount = "/sys/fs/cgroup";
402 let containers = compute_host_container_parent();
403 let root_fs = format!("{mount}{HOST_CONTAINER_ROOT}");
404 let containers_fs = format!("{mount}{containers}");
405
406 for dir in [&root_fs, &containers_fs] {
407 match std::fs::create_dir_all(dir) {
408 Ok(()) => {}
409 Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
410 Err(_) => return None,
411 }
412 }
413
414 // Delegate controllers down both levels so libcontainer can set limits on
415 // the leaf `<root>/containers/<id>` cgroup it creates.
416 enable_available_controllers(&root_fs);
417 enable_available_controllers(&containers_fs);
418
419 Some(containers)
420}
421
422#[cfg(not(target_os = "linux"))]
423#[must_use]
424pub fn ensure_host_container_parent() -> Option<String> {
425 None
426}
427
428/// Depth-first remove a cgroup-v2 directory tree rooted at `dir`.
429///
430/// A cgroup-v2 parent cannot be `rmdir`'d while it still has child cgroups, so
431/// child directories are removed first (post-order). Best-effort throughout:
432/// a `NotFound` is treated as success (idempotent), and any other error is
433/// logged at `warn!` but does not abort the recursion — reaping as many leaves
434/// as possible is better than bailing on the first `EBUSY`.
435#[cfg(target_os = "linux")]
436fn remove_cgroup_tree(dir: &std::path::Path) {
437 // Best-effort: evacuate any survivors before attempting rmdir. `cgroup.kill`
438 // (kernel >= 5.14) SIGKILLs the whole subtree atomically; ignore failure on
439 // older kernels or when the file is absent.
440 let _ = std::fs::write(dir.join("cgroup.kill"), "1");
441
442 match std::fs::read_dir(dir) {
443 Ok(entries) => {
444 for entry in entries.flatten() {
445 let path = entry.path();
446 if entry.file_type().is_ok_and(|t| t.is_dir()) {
447 // Child cgroup: recurse first — a v2 parent can't be
448 // rmdir'd while children exist.
449 remove_cgroup_tree(&path);
450 } else {
451 // On real cgroupfs the control files (cgroup.procs, etc.)
452 // are removed implicitly by rmdir, so this is normally a
453 // NotFound no-op; on a plain filesystem (and in tests) it
454 // unlinks the leftover so the dir can be removed.
455 let _ = std::fs::remove_file(&path);
456 }
457 }
458 }
459 Err(e) if e.kind() == std::io::ErrorKind::NotFound => return,
460 Err(e) => {
461 tracing::warn!(cgroup = %dir.display(), error = %e, "cgroup read_dir failed");
462 }
463 }
464
465 match std::fs::remove_dir(dir) {
466 Ok(()) => {}
467 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
468 Err(e) => {
469 tracing::warn!(cgroup = %dir.display(), error = %e, "cgroup rmdir failed");
470 }
471 }
472}
473
474/// Reap the host-mode leaf cgroup for `container_id` under `base`
475/// (`<base>/zlayer/containers/<container_id>`), depth-first.
476///
477/// Split out from [`remove_host_container_cgroup`] so the recursion can be
478/// exercised by a unit test against a temp directory instead of the real
479/// `/sys/fs/cgroup` mount. Idempotent: a missing leaf is a no-op.
480#[cfg(target_os = "linux")]
481fn remove_host_container_cgroup_at(base: &str, container_id: &str) {
482 let leaf = std::path::PathBuf::from(base).join(format!("zlayer/containers/{container_id}"));
483 if !leaf.exists() {
484 return;
485 }
486 remove_cgroup_tree(&leaf);
487}
488
489/// Best-effort removal of the host-mode container cgroup at
490/// `/sys/fs/cgroup/zlayer/containers/<container_id>`.
491///
492/// libcontainer's `delete()` normally reaps the leaf cgroup, but
493/// systemd-cgroup races and cgroup-v2 unified hiccups can leave a stale, empty
494/// directory behind. Because the next `create_container` rebuilds the same
495/// `<root>/containers/<id>` path, that orphan trips libcontainer's `build()`
496/// with `could not delete` on restart/scale. This reaps it directly at the
497/// real path (the old `read_dir`-the-mount scan never matched two levels down).
498///
499/// Idempotent (`NotFound` is ignored) and best-effort (`EBUSY`/other errors are
500/// logged, not propagated). No-op on non-Linux.
501#[cfg(target_os = "linux")]
502pub fn remove_host_container_cgroup(container_id: &str) {
503 remove_host_container_cgroup_at("/sys/fs/cgroup", container_id);
504}
505
506#[cfg(not(target_os = "linux"))]
507pub fn remove_host_container_cgroup(_container_id: &str) {}
508
509#[cfg(target_os = "linux")]
510fn probe_can_write_cgroup_root() -> bool {
511 use std::ffi::CString;
512
513 let Ok(path) = CString::new("/sys/fs/cgroup/cgroup.subtree_control") else {
514 return false;
515 };
516 // SAFETY: access(2) is a read-only syscall that takes a pointer to a
517 // NUL-terminated C string. The kernel does not retain the pointer.
518 #[allow(unsafe_code)]
519 let rc = unsafe { libc::access(path.as_ptr(), libc::W_OK) };
520 rc == 0
521}
522
523#[cfg(not(target_os = "linux"))]
524fn probe_can_write_cgroup_root() -> bool {
525 false
526}
527
/// Pure parser for the contents of `/proc/self/status`: `true` if the first
/// `CapEff:` line parses as a hexadecimal mask with `bit` set.
///
/// Returns `false` when there is no `CapEff:` line or its value is not valid
/// hex. Takes the bit as a parameter so it is not tied to one capability, and
/// is split out so the logic is unit-testable without the host's real `/proc`.
#[cfg(target_os = "linux")]
fn cap_eff_has_bit(status: &str, bit: u64) -> bool {
    status
        .lines()
        .find_map(|line| line.strip_prefix("CapEff:"))
        .and_then(|hex| u64::from_str_radix(hex.trim(), 16).ok())
        .is_some_and(|eff| eff & bit != 0)
}

/// `true` if `CAP_NET_ADMIN` is in the process's *effective* capability set,
/// as reported by the `CapEff:` line of `/proc/self/status`. Any read or parse
/// failure yields `false`.
#[cfg(target_os = "linux")]
fn probe_has_cap_net_admin() -> bool {
    // CAP_NET_ADMIN is bit 12 in the Linux capability bitmask.
    // We need it in the EFFECTIVE set (`CapEff`), not just the bounding set
    // (`CapBnd`). A regular user process has full CapBnd by default but empty
    // CapPrm/CapEff — checking PR_CAPBSET_READ gives a false positive that
    // makes the daemon think it can create TUN/WG interfaces when it cannot.
    const CAP_NET_ADMIN_BIT: u64 = 1 << 12;
    std::fs::read_to_string("/proc/self/status")
        .is_ok_and(|status| cap_eff_has_bit(&status, CAP_NET_ADMIN_BIT))
}

/// Non-Linux stub: Linux capabilities do not exist here, so always `false`.
#[cfg(not(target_os = "linux"))]
fn probe_has_cap_net_admin() -> bool {
    false
}
555
556#[cfg(target_os = "linux")]
557fn probe_tun_device_available() -> bool {
558 use std::os::unix::fs::OpenOptionsExt;
559
560 // Opening /dev/net/tun without any ioctls is benign and does not allocate
561 // a TUN interface. The fd is dropped immediately when this scope ends.
562 // Any open error — missing device, no perms, kernel module not loaded,
563 // FD exhaustion — means we can't actually use TUN. Treat as unavailable.
564 std::fs::OpenOptions::new()
565 .read(true)
566 .write(true)
567 .custom_flags(libc::O_NONBLOCK)
568 .open("/dev/net/tun")
569 .is_ok()
570}
571
572#[cfg(not(target_os = "linux"))]
573fn probe_tun_device_available() -> bool {
574 false
575}
576
/// `CAP_SYS_ADMIN` bit (21) in the Linux capability bitmask. Required to call
/// `mount(2)` for the overlay probe.
#[cfg(target_os = "linux")]
const CAP_SYS_ADMIN_BIT: u64 = 1 << 21;

/// `true` if `CAP_SYS_ADMIN` is in the process's *effective* set, read from
/// the first `CapEff:` line of `/proc/self/status` (the effective set, not
/// the bounding set, is what `mount(2)` actually checks). A missing file, a
/// missing line, or an unparsable hex value all yield `false`.
#[cfg(target_os = "linux")]
fn probe_has_cap_sys_admin() -> bool {
    let Ok(status) = std::fs::read_to_string("/proc/self/status") else {
        return false;
    };
    let Some(raw_mask) = status
        .lines()
        .find_map(|entry| entry.strip_prefix("CapEff:"))
    else {
        return false;
    };
    match u64::from_str_radix(raw_mask.trim(), 16) {
        Ok(mask) => (mask & CAP_SYS_ADMIN_BIT) != 0,
        Err(_) => false,
    }
}
600
/// Pure parser for `/proc/filesystems`: `true` if some line's last
/// whitespace-separated token is exactly `overlay`. Each line is either
/// `\t<fs>` or `nodev\t<fs>`; overlay is a `nodev` filesystem. Kept separate
/// from the file read so it can be unit-tested without the host's real
/// `/proc`.
#[cfg(target_os = "linux")]
fn proc_filesystems_has_overlay(content: &str) -> bool {
    for entry in content.lines() {
        // The filesystem name is always the final column.
        if matches!(entry.split_whitespace().next_back(), Some("overlay")) {
            return true;
        }
    }
    false
}
611
612/// Probe whether the daemon can use an overlayfs rootfs.
613///
614/// Returns `true` iff ALL hold:
615/// 1. the daemon is root OR holds effective `CAP_SYS_ADMIN` (needed for
616/// `mount(2)`),
617/// 2. `overlay` is registered in `/proc/filesystems`, and
618/// 3. a real probe overlay mount in a fresh temp dir succeeds (and is then
619/// immediately unmounted).
620///
621/// The mount probe is the authoritative signal — kernels can list `overlay`
622/// yet reject the mount (e.g. inside an unprivileged userns, or on a backing
623/// filesystem overlay won't accept). Doing a real mount+unmount once at startup
624/// is cheap and removes the guesswork.
625#[cfg(target_os = "linux")]
626fn probe_overlayfs_rootfs_available(is_root: bool) -> bool {
627 if !is_root && !probe_has_cap_sys_admin() {
628 return false;
629 }
630 let Ok(content) = std::fs::read_to_string("/proc/filesystems") else {
631 return false;
632 };
633 if !proc_filesystems_has_overlay(&content) {
634 return false;
635 }
636 probe_overlay_mount_roundtrip()
637}
638
639#[cfg(not(target_os = "linux"))]
640fn probe_overlayfs_rootfs_available(_is_root: bool) -> bool {
641 false
642}
643
644/// Attempt a throwaway overlay mount (lower+upper+work+merged, all under one
645/// temp dir) and immediately unmount it. Returns `true` only if both the mount
646/// and unmount succeed. Best-effort cleanup of the temp dir on every path.
647#[cfg(target_os = "linux")]
648fn probe_overlay_mount_roundtrip() -> bool {
649 use nix::mount::{mount, umount2, MntFlags, MsFlags};
650
651 let Ok(base) = tempfile::Builder::new()
652 .prefix("zlayer-ovl-probe-")
653 .tempdir()
654 else {
655 return false;
656 };
657 let lower = base.path().join("lower");
658 let upper = base.path().join("upper");
659 let work = base.path().join("work");
660 let merged = base.path().join("merged");
661 for d in [&lower, &upper, &work, &merged] {
662 if std::fs::create_dir_all(d).is_err() {
663 return false;
664 }
665 }
666
667 let opts = format!(
668 "lowerdir={},upperdir={},workdir={}",
669 lower.display(),
670 upper.display(),
671 work.display()
672 );
673
674 let mounted = mount(
675 Some("overlay"),
676 &merged,
677 Some("overlay"),
678 MsFlags::empty(),
679 Some(opts.as_str()),
680 )
681 .is_ok();
682
683 if !mounted {
684 return false;
685 }
686
687 // Unmount; lazy-detach as a fallback so the probe never leaves a mount
688 // behind even if the eager umount races. tempdir Drop then removes the
689 // tree. The probe is "available" only if we both mounted AND cleaned up.
690 umount2(&merged, MntFlags::empty()).is_ok() || umount2(&merged, MntFlags::MNT_DETACH).is_ok()
691}
692
/// Locate an executable named `name` on `PATH`, returning its full path.
///
/// Pure helper (a `:`-split scan of `path_var`) so the probe logic can be
/// unit-tested by passing an explicit `path_var`. Empty `PATH` entries are
/// skipped, and an empty `name` never matches. The first directory holding a
/// match wins.
///
/// A candidate must be a regular file, or a symlink that resolves to one —
/// a directory that happens to share the name is skipped. The executable bit
/// is not checked here (the later spawn surfaces a non-executable file as a
/// real error). Linux-only — the fuse path is Linux-only.
#[cfg(target_os = "linux")]
fn which_in(name: &str, path_var: &str) -> Option<std::path::PathBuf> {
    if name.is_empty() {
        return None;
    }
    path_var
        .split(':')
        .filter(|dir| !dir.is_empty())
        .map(|dir| std::path::Path::new(dir).join(name))
        // `is_file` follows symlinks and rejects directories, unlike `exists`.
        .find(|candidate| candidate.is_file())
}
713
714/// `fuse-overlayfs` binary path, resolved from the process `PATH`. Linux-only.
715#[cfg(target_os = "linux")]
716fn fuse_overlayfs_binary() -> Option<std::path::PathBuf> {
717 let path_var = std::env::var("PATH").ok()?;
718 which_in("fuse-overlayfs", &path_var)
719}
720
721/// The fusermount helper to use for unmounting a `fuse-overlayfs` mount,
722/// preferring the FUSE3 `fusermount3` and falling back to `fusermount`.
723/// Returns the resolved binary path, or `None` if neither is on `PATH`.
724/// Linux-only.
725#[cfg(target_os = "linux")]
726#[must_use]
727pub fn fusermount_binary() -> Option<std::path::PathBuf> {
728 let path_var = std::env::var("PATH").ok()?;
729 which_in("fusermount3", &path_var).or_else(|| which_in("fusermount", &path_var))
730}
731
732#[cfg(not(target_os = "linux"))]
733#[must_use]
734pub fn fusermount_binary() -> Option<std::path::PathBuf> {
735 None
736}
737
/// `true` if `/dev/fuse` can be opened read/write — the FUSE control device a
/// userspace `fuse-overlayfs` daemon needs to back its mount. The open is
/// benign and allocates no FUSE connection; the fd is dropped immediately.
/// Any error (missing node, no perms, module not loaded) is reported as
/// `false`, meaning rootless fuse-overlay is unusable. Linux-only.
#[cfg(target_os = "linux")]
fn probe_dev_fuse_available() -> bool {
    let mut request = std::fs::OpenOptions::new();
    request.read(true).write(true);
    match request.open("/dev/fuse") {
        Ok(_fd) => true,
        Err(_) => false,
    }
}
751
752/// Probe whether the daemon can use a ROOTLESS `fuse-overlayfs` rootfs.
753///
754/// Returns `true` iff ALL hold:
755/// 1. the `fuse-overlayfs` binary is on `PATH`,
756/// 2. `/dev/fuse` is openable r/w, and
757/// 3. a real probe `fuse-overlayfs` mount in a fresh temp dir succeeds (and is
758/// then immediately unmounted via `fusermount`).
759///
760/// Deliberately independent of root / `CAP_SYS_ADMIN`: this is the path that
761/// lets an unprivileged daemon still get shared-layer dedup. The mount probe is
762/// authoritative — a host can have the binary and `/dev/fuse` yet reject the
763/// mount (e.g. no `user_allow_other`, a hardened FUSE sysctl, or a sandbox), so
764/// a real mount+unmount once at startup removes the guesswork. Mirrors the
765/// kernel-overlay probe's "actually do it once" philosophy.
766#[cfg(target_os = "linux")]
767fn probe_fuse_overlayfs_rootfs_available() -> bool {
768 let Some(bin) = fuse_overlayfs_binary() else {
769 return false;
770 };
771 if !probe_dev_fuse_available() {
772 return false;
773 }
774 let Some(fusermount) = fusermount_binary() else {
775 return false;
776 };
777 probe_fuse_overlay_mount_roundtrip(&bin, &fusermount)
778}
779
780#[cfg(not(target_os = "linux"))]
781fn probe_fuse_overlayfs_rootfs_available() -> bool {
782 false
783}
784
785/// Attempt a throwaway rootless `fuse-overlayfs` mount and immediately unmount
786/// it. Returns `true` only if both the mount and the unmount succeed. The
787/// backing FUSE daemon self-daemonizes (reparenting to PID 1), so the spawned
788/// `fuse-overlayfs` process returns promptly and we wait on it. Best-effort
789/// cleanup of the temp dir on every path. Linux-only.
790#[cfg(target_os = "linux")]
791fn probe_fuse_overlay_mount_roundtrip(bin: &std::path::Path, fusermount: &std::path::Path) -> bool {
792 let Ok(base) = tempfile::Builder::new()
793 .prefix("zlayer-fuse-ovl-probe-")
794 .tempdir()
795 else {
796 return false;
797 };
798 let lower = base.path().join("lower");
799 let upper = base.path().join("upper");
800 let work = base.path().join("work");
801 let merged = base.path().join("merged");
802 for d in [&lower, &upper, &work, &merged] {
803 if std::fs::create_dir_all(d).is_err() {
804 return false;
805 }
806 }
807
808 let opts = format!(
809 "lowerdir={},upperdir={},workdir={}",
810 lower.display(),
811 upper.display(),
812 work.display()
813 );
814
815 let mounted = std::process::Command::new(bin)
816 .arg("-o")
817 .arg(&opts)
818 .arg(&merged)
819 .stdin(std::process::Stdio::null())
820 .stdout(std::process::Stdio::null())
821 .stderr(std::process::Stdio::null())
822 .status()
823 .is_ok_and(|s| s.success());
824
825 if !mounted {
826 return false;
827 }
828
829 // Unmount via fusermount; tempdir Drop then removes the tree. The probe is
830 // "available" only if we both mounted AND cleaned up.
831 std::process::Command::new(fusermount)
832 .arg("-u")
833 .arg(&merged)
834 .stdin(std::process::Stdio::null())
835 .stdout(std::process::Stdio::null())
836 .stderr(std::process::Stdio::null())
837 .status()
838 .is_ok_and(|s| s.success())
839}
840
#[cfg(test)]
mod tests {
    use super::*;

    /// A live probe must complete, and `is_nested` must be set exactly when a
    /// parent cgroup was resolved.
    #[test]
    fn probe_does_not_panic_and_is_nested_agrees_with_cgroup_parent() {
        let survey = DaemonCapabilities::probe();
        assert_eq!(survey.is_nested, survey.cgroup_parent.is_some());
    }

    #[cfg(target_os = "linux")]
    #[test]
    fn probe_has_cap_net_admin_matches_cap_eff() {
        // Bit index of CAP_NET_ADMIN within the effective-capability mask.
        const CAP_NET_ADMIN_BIT: u32 = 12;

        // Whether the capability is held depends on how the tests are launched
        // (plain user, root, setcap'd binary), so no fixed value is asserted:
        // the probe only has to agree with the `CapEff` mask reported in
        // /proc/self/status — that agreement is the point of the bug fix.
        let proc_status = std::fs::read_to_string("/proc/self/status").unwrap();
        let mask_hex = proc_status
            .lines()
            .find(|line| line.starts_with("CapEff:"))
            .expect("CapEff: present in /proc/self/status")
            .trim_start_matches("CapEff:")
            .trim();
        let effective = u64::from_str_radix(mask_hex, 16).unwrap();
        let holds_net_admin = (effective >> CAP_NET_ADMIN_BIT) & 1 == 1;
        assert_eq!(super::probe_has_cap_net_admin(), holds_net_admin);
    }

    /// Stand-alone copy of the mode classification performed by `probe()`.
    /// Living in the test module lets the tables below pin the decision logic
    /// without depending on what the host running the tests is allowed to do.
    #[allow(clippy::fn_params_excessive_bools)]
    fn classify(
        is_nested: bool,
        can_write_cgroup_root: bool,
        has_cap_net_admin: bool,
        tun_device_available: bool,
        cgroup_parent_is_some: bool,
    ) -> DaemonMode {
        let host_with_everything =
            !is_nested && can_write_cgroup_root && has_cap_net_admin && tun_device_available;
        let has_cgroup_anchor = can_write_cgroup_root || cgroup_parent_is_some;
        match (host_with_everything, has_cgroup_anchor) {
            (true, _) => DaemonMode::Full,
            (false, true) => DaemonMode::NestedAdaptive,
            (false, false) => DaemonMode::Degraded,
        }
    }

    #[test]
    fn effective_mode_full_requires_all_four_signals() {
        // Not nested, writable cgroup root, CAP_NET_ADMIN and TUN together.
        assert_eq!(
            classify(false, true, true, true, false),
            DaemonMode::Full,
            "all four signals set should be Full"
        );
        // Each row flips exactly one of the four signals away from "Full".
        let one_signal_dropped = [
            (true, true, true, true, true),
            (false, false, true, true, false),
            (false, true, false, true, false),
            (false, true, true, false, false),
        ];
        for (nested, cgroup_writable, net_admin, tun, has_parent) in one_signal_dropped {
            assert_ne!(
                classify(nested, cgroup_writable, net_admin, tun, has_parent),
                DaemonMode::Full
            );
        }
    }

    #[test]
    fn effective_mode_nested_adaptive_when_writable_or_has_parent() {
        // Cgroup root is writable, but the remaining Full signals are absent.
        let writable_root_only = classify(false, true, false, false, false);
        assert_eq!(writable_root_only, DaemonMode::NestedAdaptive);

        // Nested beneath a resolved parent cgroup with nothing else available.
        let nested_with_parent = classify(true, false, false, false, true);
        assert_eq!(nested_with_parent, DaemonMode::NestedAdaptive);
    }

    #[test]
    fn effective_mode_degraded_when_no_writable_path() {
        // Neither a writable root nor a parent cgroup: nothing to work with.
        let nothing_usable = classify(false, false, false, false, false);
        assert_eq!(nothing_usable, DaemonMode::Degraded);

        // `is_nested` on its own does not help: without a resolved parent and
        // without root write access there is no writable cgroup to anchor
        // under, so the result stays Degraded.
        let nested_without_parent = classify(true, false, false, false, false);
        assert_eq!(nested_without_parent, DaemonMode::Degraded);
    }

    #[test]
    fn overlay_fallback_none_only_when_both_present() {
        let fallback = super::capability_overlay_fallback(true, true);
        assert!(fallback.is_none());
    }

    #[test]
    fn overlay_fallback_reports_missing_cap_net_admin() {
        let why = super::capability_overlay_fallback(false, true).expect("should fall back");
        assert!(why.contains("CAP_NET_ADMIN"));
    }

    #[test]
    fn overlay_fallback_reports_missing_tun() {
        let why = super::capability_overlay_fallback(true, false).expect("should fall back");
        assert!(why.contains("/dev/net/tun"));
    }

    #[test]
    fn overlay_fallback_reports_both_missing() {
        let why = super::capability_overlay_fallback(false, false).expect("should fall back");
        // Both missing prerequisites have to be named in the reason.
        for missing in ["CAP_NET_ADMIN", "/dev/net/tun"] {
            assert!(why.contains(missing));
        }
    }

    #[test]
    fn rootless_overlay_requires_nonroot_no_cap_tun_and_pasta() {
        // Accepted combination: not root, no CAP_NET_ADMIN, TUN and pasta present.
        let accepted = super::can_rootless_overlay(false, false, true, true);
        assert!(accepted);
    }

    #[test]
    fn rootless_overlay_rejected_when_root() {
        let accepted = super::can_rootless_overlay(true, false, true, true);
        assert!(!accepted);
    }

    #[test]
    fn rootless_overlay_rejected_when_already_has_cap_net_admin() {
        let accepted = super::can_rootless_overlay(false, true, true, true);
        assert!(!accepted);
    }

    #[test]
    fn rootless_overlay_rejected_without_tun() {
        let accepted = super::can_rootless_overlay(false, false, false, true);
        assert!(!accepted);
    }

    #[test]
    fn rootless_overlay_rejected_without_pasta() {
        let accepted = super::can_rootless_overlay(false, false, true, false);
        assert!(!accepted);
    }

    /// A probed survey must survive a JSON encode/decode cycle with every
    /// checked field intact.
    #[test]
    fn serializes_round_trip_via_serde_json() {
        let original = DaemonCapabilities::probe();
        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded: DaemonCapabilities = serde_json::from_str(&encoded).expect("deserialize");

        assert_eq!(decoded.is_root, original.is_root);
        assert_eq!(decoded.is_nested, original.is_nested);
        assert_eq!(decoded.cgroup_parent, original.cgroup_parent);
        assert_eq!(
            decoded.can_write_cgroup_root,
            original.can_write_cgroup_root
        );
        assert_eq!(decoded.has_cap_net_admin, original.has_cap_net_admin);
        assert_eq!(decoded.tun_device_available, original.tun_device_available);
        assert_eq!(
            decoded.overlayfs_rootfs_available,
            original.overlayfs_rootfs_available
        );
        assert_eq!(
            decoded.fuse_overlayfs_rootfs_available,
            original.fuse_overlayfs_rootfs_available
        );
        assert_eq!(decoded.effective_mode, original.effective_mode);
    }

    #[cfg(target_os = "linux")]
    mod fuse_probe {
        use super::super::{fusermount_binary, which_in};

        #[test]
        fn which_in_finds_binary_in_first_matching_dir() {
            // One temp dir holds a fake binary, a second one holds nothing.
            let with_binary = tempfile::tempdir().unwrap();
            let fake_bin = with_binary.path().join("fuse-overlayfs");
            std::fs::write(&fake_bin, b"#!/bin/sh\n").unwrap();
            let without_binary = tempfile::tempdir().unwrap();

            // The empty dir comes first on PATH, so the scan has to move past
            // it and resolve the full path inside the second entry.
            let path_var = format!(
                "{}:{}",
                without_binary.path().display(),
                with_binary.path().display()
            );
            let resolved = which_in("fuse-overlayfs", &path_var);
            assert_eq!(resolved.as_deref(), Some(fake_bin.as_path()));
        }

        #[test]
        fn which_in_none_when_absent_and_ignores_empty_segments() {
            let empty_dir = tempfile::tempdir().unwrap();
            // The leading and trailing colons produce empty PATH segments;
            // those must be skipped, not joined against (which would
            // otherwise resolve `/fuse-overlayfs`).
            let path_var = format!(":{}:", empty_dir.path().display());
            assert!(which_in("fuse-overlayfs", &path_var).is_none());
            // Looking up an empty name never resolves to anything.
            assert!(which_in("", &path_var).is_none());
        }

        #[test]
        fn fusermount_binary_resolves_or_none_without_panic() {
            // Hosts usually ship fusermount3 or fusermount, but that is not
            // guaranteed: the lookup only has to be total (no panic), and any
            // path it does return must point at something that exists.
            let Some(resolved) = fusermount_binary() else {
                return;
            };
            assert!(
                resolved.exists(),
                "resolved fusermount must exist: {}",
                resolved.display()
            );
        }
    }

    #[cfg(target_os = "linux")]
    mod proc_filesystems {
        use super::super::proc_filesystems_has_overlay;

        #[test]
        fn detects_overlay_as_nodev_filesystem() {
            // Realistic listing: device-less filesystems carry a "nodev"
            // marker followed by a tab, and overlay is one of them.
            let listing = "nodev\tsysfs\nnodev\ttmpfs\nnodev\toverlay\n\text4\n";
            assert!(proc_filesystems_has_overlay(listing));
        }

        #[test]
        fn absent_when_overlay_not_listed() {
            let listing = "nodev\tsysfs\nnodev\ttmpfs\n\text4\n\txfs\n";
            assert!(!proc_filesystems_has_overlay(listing));
        }

        #[test]
        fn does_not_match_substring_overlayfs() {
            // The match is on the whole final token, so a filesystem whose
            // name merely contains "overlay" must not count.
            let listing = "nodev\toverlayfs2\n\text4\n";
            assert!(!proc_filesystems_has_overlay(listing));
        }

        #[test]
        fn empty_input_is_false() {
            assert!(!proc_filesystems_has_overlay(""));
        }
    }

    /// The overlay probe must not panic and must be self-consistent: reporting
    /// the kernel overlay rootfs as available implies the daemon is root or
    /// holds `CAP_SYS_ADMIN`. The actual value varies with how the tests are
    /// run, so only that implication is checked.
    #[cfg(target_os = "linux")]
    #[test]
    fn overlay_probe_consistent_with_privilege() {
        let survey = DaemonCapabilities::probe();
        if !survey.overlayfs_rootfs_available {
            // Nothing to verify when overlay is reported unavailable.
            return;
        }
        assert!(
            survey.is_root || super::probe_has_cap_sys_admin(),
            "overlay availability must imply root or CAP_SYS_ADMIN"
        );
    }

    #[test]
    fn daemon_mode_serde_uses_snake_case() {
        // Each variant paired with its expected JSON string form.
        let wire_forms = [
            (DaemonMode::Full, "\"full\""),
            (DaemonMode::NestedAdaptive, "\"nested_adaptive\""),
            (DaemonMode::Degraded, "\"degraded\""),
        ];
        for (mode, expected_json) in wire_forms {
            assert_eq!(serde_json::to_string(&mode).unwrap(), expected_json);
        }
    }

    #[cfg(target_os = "linux")]
    mod target_parent {
        use super::super::compute_target_parent;

        #[test]
        fn idempotent_when_already_under_init() {
            // (input scope, expected container parent)
            let cases = [
                // Pre-fix path: the scope is the systemd-run scope itself.
                (
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope",
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope/containers",
                ),
                // Already migrated: a trailing `/init` is stripped before
                // re-anchoring, giving the same parent as above.
                (
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope/init",
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope/containers",
                ),
                // A trailing slash is harmless ...
                ("/foo/bar/", "/foo/bar/containers"),
                // ... and a short path ending in `/init` re-anchors the same way.
                ("/foo/bar/init", "/foo/bar/containers"),
            ];
            for (scope, expected_parent) in cases {
                assert_eq!(compute_target_parent(scope), expected_parent);
            }
        }
    }

    #[cfg(target_os = "linux")]
    mod host_parent {
        use super::super::{compute_host_container_parent, HOST_CONTAINER_ROOT};

        #[test]
        fn host_parent_is_top_level_and_outside_any_unit() {
            // Host-mode containers belong under a top-level node rather than
            // `/system.slice/zlayer.service/...`, so that a KillMode=process
            // survivor can never wedge the unit's restart with EBUSY.
            assert_eq!(compute_host_container_parent(), "/zlayer/containers");
            assert!(compute_host_container_parent().starts_with(HOST_CONTAINER_ROOT));
            for forbidden in ["zlayer.service", ".slice"] {
                assert!(!compute_host_container_parent().contains(forbidden));
            }
        }
    }

    #[cfg(target_os = "linux")]
    mod host_cgroup_reap {
        use super::super::remove_host_container_cgroup_at;
        use std::fs;

        // Regression for the recreate bug: an earlier instance left an empty
        // cgroup tree behind at `<root>/zlayer/containers/<id>`, including a
        // nested child cgroup (possible for a v2 leaf with delegated
        // controllers). The reaper has to remove children first and then the
        // leaf, so the next create_container finds a clean slot.
        #[test]
        fn reaps_stale_empty_cgroup_tree_depth_first() {
            let base = tempfile::tempdir().expect("tempdir");
            let root = base.path().to_str().unwrap();
            let id = "zata-storage-rep-1";

            let shared_parent = base.path().join("zlayer/containers");
            let leaf = shared_parent.join(id);
            // A v2 parent cannot be rmdir'd while children exist, so the
            // nested directory is exactly what the recursion must handle.
            let nested = leaf.join("child-scope");
            fs::create_dir_all(&nested).expect("create nested cgroup tree");
            // Stand-ins for the cgroup-v2 control files in each directory.
            for dir in [&leaf, &nested] {
                fs::write(dir.join("cgroup.procs"), "").unwrap();
            }
            assert!(leaf.exists(), "precondition: stale leaf exists");

            remove_host_container_cgroup_at(root, id);

            assert!(
                !leaf.exists(),
                "stale cgroup leaf must be reaped (depth-first removal of children + leaf)"
            );
            // The `containers` directory is shared across ids and must stay.
            assert!(
                shared_parent.exists(),
                "shared containers parent must survive"
            );
        }

        #[test]
        fn idempotent_when_leaf_absent() {
            let base = tempfile::tempdir().expect("tempdir");
            let root = base.path().to_str().unwrap();
            // The leaf was never created: the call must be a silent no-op
            // rather than a panic.
            remove_host_container_cgroup_at(root, "never-existed");
        }
    }

    #[cfg(target_os = "linux")]
    mod cgroup_parser {
        use super::super::parse_cgroup_v2_line;

        #[test]
        fn parse_cgroup_v2_root_returns_none() {
            // The root cgroup (`/`) is reported as "no parent".
            assert!(parse_cgroup_v2_line("0::/\n").is_none());
        }

        #[test]
        fn parse_cgroup_v2_path_returns_some() {
            let parsed = parse_cgroup_v2_line("0::/system.slice/forgejo-runner.service\n");
            assert_eq!(
                parsed.as_deref(),
                Some("/system.slice/forgejo-runner.service")
            );
        }

        #[test]
        fn parse_cgroup_v2_hybrid_finds_v2_line() {
            // Hybrid hierarchy: v1 controller lines come before the `0::` entry.
            let hybrid = "12:devices:/user.slice\n11:memory:/user.slice\n0::/foo\n";
            assert_eq!(parse_cgroup_v2_line(hybrid).as_deref(), Some("/foo"));
        }

        #[test]
        fn parse_cgroup_v2_no_newline() {
            // A missing trailing newline must not affect the parsed path.
            assert_eq!(parse_cgroup_v2_line("0::/bar").as_deref(), Some("/bar"));
        }

        #[test]
        fn parse_cgroup_v2_missing_returns_none() {
            assert!(parse_cgroup_v2_line("").is_none());
        }
    }
}