Skip to main content

zlayer_agent/
capability.rs

1//! Daemon capability survey.
2//!
3//! Probes the runtime environment of the zlayer daemon (root vs. non-root,
4//! host vs. nested in a container, cgroup v2 path, `CAP_NET_ADMIN`, presence
5//! of `/dev/net/tun`, and writability of the cgroup root) and derives a coarse
6//! [`DaemonMode`] from those signals.
7//!
8//! All probes are intentionally cheap and non-destructive — a handful of
9//! syscalls, no allocations of kernel resources (no TUN interfaces, no cgroup
10//! writes). The struct is safe to construct multiple times.
11//!
12//! Non-Linux targets report a fixed degraded survey since the kernel features
13//! these probes target are Linux-only.
14
15use std::sync::OnceLock;
16
17use serde::{Deserialize, Serialize};
18
/// Coarse classification of the daemon's effective execution environment.
///
/// Derived from the boolean fields on [`DaemonCapabilities`] by
/// [`DaemonCapabilities::probe`]. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum DaemonMode {
    /// Host-level execution: all caps, can write cgroup root, can create overlay.
    ///
    /// The probe selects this when the daemon is not nested, can write the
    /// cgroup root, holds effective `CAP_NET_ADMIN`, and can open
    /// `/dev/net/tun`.
    Full,
    /// Inside a container: scoped to a sub-cgroup; some caps may be present.
    ///
    /// The probe selects this when `Full` does not apply but either the cgroup
    /// root is writable or the process reports a non-root cgroup-v2 path.
    NestedAdaptive,
    /// Missing privileges required for any meaningful container creation.
    ///
    /// The probe selects this when neither of the other modes applies.
    Degraded,
}
32
/// Snapshot of the daemon's effective capabilities and execution environment.
///
/// Construct via [`DaemonCapabilities::probe`], or use
/// [`DaemonCapabilities::get`] for the process-wide memoised instance.
///
/// The struct intentionally exposes independent capability bits as separate
/// booleans rather than collapsing them into an enum — each bit corresponds to
/// an orthogonal kernel feature (cgroup write, `CAP_NET_ADMIN`, TUN access,
/// root-ness) and downstream code wants to inspect them independently when
/// deciding what to gate.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DaemonCapabilities {
    /// `true` if the process is running as uid 0 (as reported by
    /// `zlayer_paths::is_root`).
    pub is_root: bool,
    /// `true` if the process appears to be inside a container (non-root cgroup
    /// v2 path). Set exactly when [`Self::cgroup_parent`] is `Some`.
    pub is_nested: bool,
    /// The cgroup v2 path of the current process, if any (e.g.
    /// `/system.slice/zlayer.service`). `None` on the cgroup root, on
    /// cgroup-v1-only hosts, on non-Linux, or on read errors.
    pub cgroup_parent: Option<String>,
    /// `true` if `access(2)` with `W_OK` succeeds on the cgroup root's
    /// `/sys/fs/cgroup/cgroup.subtree_control`. Coarse, non-destructive
    /// signal — does not guarantee an actual write will succeed. Always
    /// `false` on non-Linux.
    pub can_write_cgroup_root: bool,
    /// `true` if `CAP_NET_ADMIN` is present in the process's *effective* set
    /// (Linux only).
    pub has_cap_net_admin: bool,
    /// `true` if `/dev/net/tun` can be opened r/w in non-blocking mode. Any
    /// open error counts as "unavailable". The fd is dropped immediately.
    pub tun_device_available: bool,
    /// `true` if the daemon can build container rootfs as an overlayfs mount
    /// (shared read-only lowerdirs + per-container upperdir) instead of a full
    /// per-container copy of every layer. Requires ALL of: root or
    /// `CAP_SYS_ADMIN`, `overlay` listed in `/proc/filesystems`, and a probe
    /// overlay mount in a temp dir that succeeds and immediately unmounts.
    /// Computed once at startup (the result cannot change for a running daemon)
    /// and cached with the rest of the survey.
    pub overlayfs_rootfs_available: bool,
    /// `true` if the daemon can build a container rootfs as a ROOTLESS
    /// `fuse-overlayfs` mount — the userspace overlay backend that needs neither
    /// `CAP_SYS_ADMIN` nor `CAP_MKNOD`. It still gives the shared-layer dedup of
    /// the kernel path (shared read-only lowerdirs + a per-container upperdir),
    /// but represents whiteouts the rootless way fuse-overlayfs reads: a plain
    /// `.wh.<name>` regular file and the `user.overlay.opaque` xattr (both
    /// settable without privilege) instead of `0:0` char devices /
    /// `trusted.overlay.opaque`. Requires ALL of: the `fuse-overlayfs` binary on
    /// `PATH`, `/dev/fuse` openable, and a probe rootless mount that succeeds and
    /// immediately unmounts. Independent of root / `CAP_SYS_ADMIN` — this is the
    /// fallback the daemon uses when [`Self::overlayfs_rootfs_available`] is
    /// false. Computed once at startup and cached.
    pub fuse_overlayfs_rootfs_available: bool,
    /// Coarse classification derived from the above fields.
    pub effective_mode: DaemonMode,
}
88
/// Process-wide memoised capability survey. Seeded by the first call to
/// [`DaemonCapabilities::get`] or [`DaemonCapabilities::seed`]; whichever
/// fills the cell first wins and the stored value is never replaced.
static CAPS: OnceLock<DaemonCapabilities> = OnceLock::new();
92
93impl DaemonCapabilities {
94    /// Returns the process-wide capability snapshot, probing on first call.
95    ///
96    /// Subsequent calls return the same memoised instance — capabilities of a
97    /// running daemon do not change at runtime, so re-probing would be wasted
98    /// syscalls and could create the illusion that the daemon's behaviour can
99    /// shift mid-flight.
100    pub fn get() -> &'static Self {
101        CAPS.get_or_init(Self::probe)
102    }
103
104    /// Eagerly seed the memoised survey with an explicit probe result.
105    ///
106    /// Useful at daemon startup to force the probe to happen at a known point
107    /// (so the banner log appears in the expected place). Returns the stored
108    /// instance — if the cache was already seeded, the existing value wins
109    /// and the passed-in `caps` is dropped (probe is pure, so this is fine).
110    ///
111    /// # Panics
112    ///
113    /// In practice this never panics — `OnceLock::set` either stores the
114    /// value or rejects it because the cell is already filled, and in both
115    /// cases the subsequent `get()` returns `Some`. The `expect` exists only
116    /// to satisfy the type system.
117    pub fn seed(caps: Self) -> &'static Self {
118        let _ = CAPS.set(caps);
119        CAPS.get()
120            .expect("CAPS is filled after set or was already filled")
121    }
122
123    /// Probe the running daemon's effective capabilities.
124    ///
125    /// Cheap — a handful of syscalls and no resource allocation. Prefer
126    /// [`DaemonCapabilities::get`] when you want the process-wide memoised
127    /// value; call this directly only when you intentionally want a fresh
128    /// snapshot (e.g. tests).
129    #[must_use]
130    pub fn probe() -> Self {
131        let is_root = zlayer_paths::is_root();
132        let cgroup_parent = current_cgroup_v2_path();
133        let is_nested = cgroup_parent.is_some();
134        let can_write_cgroup_root = probe_can_write_cgroup_root();
135        let has_cap_net_admin = probe_has_cap_net_admin();
136        let tun_device_available = probe_tun_device_available();
137        let overlayfs_rootfs_available = probe_overlayfs_rootfs_available(is_root);
138        let fuse_overlayfs_rootfs_available = probe_fuse_overlayfs_rootfs_available();
139
140        let effective_mode =
141            if !is_nested && can_write_cgroup_root && has_cap_net_admin && tun_device_available {
142                DaemonMode::Full
143            } else if can_write_cgroup_root || cgroup_parent.is_some() {
144                DaemonMode::NestedAdaptive
145            } else {
146                DaemonMode::Degraded
147            };
148
149        Self {
150            is_root,
151            is_nested,
152            cgroup_parent,
153            can_write_cgroup_root,
154            has_cap_net_admin,
155            tun_device_available,
156            overlayfs_rootfs_available,
157            fuse_overlayfs_rootfs_available,
158            effective_mode,
159        }
160    }
161}
162
/// Work out whether the daemon's capability state rules out overlay
/// networking, forcing a fallback to host networking.
///
/// Returns `None` when both prerequisites are present (overlay is viable).
/// Otherwise returns `Some(reason)` naming exactly what is missing; the caller
/// then falls back to host networking, or hard-errors if the operator passed
/// `--require-overlay`.
///
/// The function is pure — it only looks at its two arguments — so it can be
/// unit-tested without touching the host's real capability state.
///
/// Call this ONLY when the operator did not already request host networking —
/// an explicit `--host-network` is a deliberate choice, not a degraded state.
#[must_use]
pub fn capability_overlay_fallback(
    has_cap_net_admin: bool,
    tun_device_available: bool,
) -> Option<String> {
    // Happy path first: nothing is missing.
    if has_cap_net_admin && tun_device_available {
        return None;
    }

    let reason = if has_cap_net_admin {
        // Only the TUN device is missing.
        "/dev/net/tun is not available"
    } else if tun_device_available {
        // Only the capability is missing.
        "CAP_NET_ADMIN is not in the daemon's effective set"
    } else {
        "CAP_NET_ADMIN is not in the daemon's effective set and /dev/net/tun is not available"
    };
    Some(reason.to_owned())
}
188
/// Report whether the overlay can run in fully rootless mode, where the
/// overlay daemon wraps itself in its own user+network namespace (holding
/// `CAP_NET_ADMIN` over its OWN netns only) and uses pasta for egress, instead
/// of requiring host root or a setcap'd binary.
///
/// Every one of these must hold:
/// - the daemon is NOT root (a root daemon should use the normal root overlay
///   path),
/// - it does NOT already hold effective `CAP_NET_ADMIN` (if it does, the
///   setcap/root overlay path is simpler and gives host-level networking),
/// - `/dev/net/tun` is openable (boringtun needs it for the TUN device), and
/// - the `pasta` (passt) egress helper is available on the host.
///
/// Pure function of its arguments, so it is unit-testable without namespaces.
#[must_use]
#[allow(clippy::fn_params_excessive_bools)] // parallel capability probe flags, intentionally flat
pub fn can_rootless_overlay(
    is_root: bool,
    has_cap_net_admin: bool,
    tun_device_available: bool,
    pasta_available: bool,
) -> bool {
    // Privileged daemons take the regular (root / setcap) overlay path.
    if is_root || has_cap_net_admin {
        return false;
    }
    tun_device_available && pasta_available
}
212
/// Extract the cgroup-v2 path from the text of `/proc/self/cgroup`.
///
/// Only the first line starting with `0::` is considered; its remainder is
/// trimmed of surrounding whitespace and returned. The result is `None` when:
/// - no line starts with `0::` (cgroup-v1-only host, or empty input), or
/// - the trimmed path is empty or exactly `/` (the cgroup-v2 root).
#[cfg(target_os = "linux")]
fn parse_cgroup_v2_line(content: &str) -> Option<String> {
    let v2_path = content
        .lines()
        .find_map(|entry| entry.strip_prefix("0::"))?
        .trim();

    match v2_path {
        "" | "/" => None,
        path => Some(path.to_owned()),
    }
}
233
234/// Returns the current process's cgroup-v2 path, if any.
235///
236/// On Linux reads `/proc/self/cgroup` and delegates to `parse_cgroup_v2_line`.
237/// On non-Linux always returns `None`. Returns `None` on any read error or
238/// when the process is at the cgroup-v2 root (bare-metal case).
239#[cfg(target_os = "linux")]
240#[must_use]
241pub fn current_cgroup_v2_path() -> Option<String> {
242    let content = std::fs::read_to_string("/proc/self/cgroup").ok()?;
243    parse_cgroup_v2_line(&content)
244}
245
246#[cfg(not(target_os = "linux"))]
247#[must_use]
248pub fn current_cgroup_v2_path() -> Option<String> {
249    None
250}
251
/// Pure path computation: given a cgroup-v2 scope reported by
/// `/proc/self/cgroup`, return the sibling `<scope>/containers` parent that
/// should be used for new container cgroups.
///
/// Trailing slashes on `scope` are ignored. If the (slash-trimmed) scope ends
/// with `/init` — the daemon has already been migrated into the `init` leaf by
/// a previous call — that suffix is stripped and the result is anchored at the
/// real scope. This makes [`ensure_daemon_leaf_and_container_parent`]
/// idempotent.
#[cfg(target_os = "linux")]
fn compute_target_parent(scope: &str) -> String {
    // Normalise first: otherwise a trailing slash (`<scope>/init/`) hides the
    // `/init` suffix and the result lands *inside* the init leaf.
    let scope = scope.trim_end_matches('/');
    let base = scope
        .strip_suffix("/init")
        .unwrap_or(scope)
        .trim_end_matches('/');
    format!("{base}/containers")
}
266
267/// Migrate the current daemon process into a `<scope>/init` sub-cgroup and
268/// return the sibling `<scope>/containers` path as the parent for future
269/// container cgroups. Idempotent — safe to call multiple times.
270///
271/// Returns `None` on non-Linux, when `/proc/self/cgroup` can't be parsed,
272/// when `/sys/fs/cgroup` is read-only, or when the mkdir/PID-write fails.
273/// Callers should fall back to the raw `current_cgroup_v2_path()` value in
274/// those cases (the auto-detect path will surface the underlying error).
275#[cfg(target_os = "linux")]
276#[must_use]
277pub fn ensure_daemon_leaf_and_container_parent() -> Option<String> {
278    let scope = current_cgroup_v2_path()?;
279    let containers = compute_target_parent(&scope);
280    // Idempotency: if we're already in `<base>/init`, just return the sibling.
281    if scope.ends_with("/init") {
282        let containers_fs = format!("/sys/fs/cgroup{containers}");
283        match std::fs::create_dir_all(&containers_fs) {
284            Ok(()) => {}
285            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
286            Err(_) => return None,
287        }
288        return Some(containers);
289    }
290
291    let scope = scope.trim_end_matches('/').to_string();
292    let mount = "/sys/fs/cgroup";
293    let init_dir = format!("{mount}{scope}/init");
294
295    match std::fs::create_dir_all(&init_dir) {
296        Ok(()) => {}
297        Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
298        Err(_) => return None,
299    }
300
301    let pid_path = format!("{init_dir}/cgroup.procs");
302    let pid_str = format!("{}", std::process::id());
303    if std::fs::write(&pid_path, &pid_str).is_err() {
304        // Already migrated? Re-check /proc/self/cgroup before giving up.
305        let now = current_cgroup_v2_path()?;
306        if now != format!("{scope}/init") {
307            return None;
308        }
309    }
310
311    // Verify the migration actually moved us into <scope>/init.
312    let after = current_cgroup_v2_path()?;
313    if after != format!("{scope}/init") {
314        return None;
315    }
316
317    let containers_dir = format!("{mount}{containers}");
318    match std::fs::create_dir_all(&containers_dir) {
319        Ok(()) => {}
320        Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
321        Err(_) => return None,
322    }
323
324    Some(containers)
325}
326
327#[cfg(not(target_os = "linux"))]
328#[must_use]
329pub fn ensure_daemon_leaf_and_container_parent() -> Option<String> {
330    None
331}
332
/// Top-level cgroup-v2 node (relative to the cgroup-v2 mount) under which a
/// writable root host daemon roots container cgroups. Kept deliberately
/// OUTSIDE the daemon's own systemd unit cgroup (`/system.slice/zlayer.service`)
/// so that containers which survive a daemon stop (`KillMode=process`) never
/// turn the unit's cgroup into a populated inner node. A populated inner node
/// makes systemd's re-fork of the daemon fail with `EBUSY`
/// (`Failed to spawn executor: Device or resource busy` / `Result: resources`),
/// wedging the restart loop until the orphans happen to die. Mirrors how
/// Docker/containerd root containers under their own top-level hierarchy
/// (`/sys/fs/cgroup/docker/...`) rather than under their service unit.
///
/// The name has no `.slice`/`.scope` suffix so systemd treats it as foreign
/// and never tries to reconcile or prune it.
///
/// The value starts with `/` because it is appended directly to the
/// `/sys/fs/cgroup` mount path; container cgroups live one level further
/// down, under `<root>/containers`.
#[cfg(target_os = "linux")]
const HOST_CONTAINER_ROOT: &str = "/zlayer";
348
/// Controllers delegated down the host container hierarchy so libcontainer can
/// apply cpu/memory/pids/io limits on the leaf container cgroup. Only those
/// actually available at each level (per `cgroup.controllers`) are enabled, so
/// a host missing a controller degrades gracefully instead of erroring.
///
/// Each entry is a bare controller name; it is turned into a `+<name>` token
/// when written to a level's `cgroup.subtree_control`.
#[cfg(target_os = "linux")]
const HOST_CGROUP_CONTROLLERS: &[&str] = &["cpu", "cpuset", "io", "memory", "pids"];
355
356/// Pure path computation: the host-mode container parent, `<root>/containers`,
357/// relative to the cgroup-v2 mount.
358#[cfg(target_os = "linux")]
359#[must_use]
360fn compute_host_container_parent() -> String {
361    format!("{HOST_CONTAINER_ROOT}/containers")
362}
363
364/// Enable every wanted controller that is actually available at `dir`
365/// (a `/sys/fs/cgroup/...` path) by writing `+<ctrl>` tokens to its
366/// `cgroup.subtree_control`. Best-effort: filtering to available controllers
367/// avoids the `EINVAL` a single unavailable token would cause, and any write
368/// error is ignored (libcontainer will surface a real failure later if a
369/// required controller is genuinely missing).
370#[cfg(target_os = "linux")]
371fn enable_available_controllers(dir: &str) {
372    let available =
373        std::fs::read_to_string(format!("{dir}/cgroup.controllers")).unwrap_or_default();
374    let tokens: Vec<String> = HOST_CGROUP_CONTROLLERS
375        .iter()
376        .filter(|c| available.split_whitespace().any(|a| a == **c))
377        .map(|c| format!("+{c}"))
378        .collect();
379    if tokens.is_empty() {
380        return;
381    }
382    let _ = std::fs::write(format!("{dir}/cgroup.subtree_control"), tokens.join(" "));
383}
384
385/// Ensure the top-level host container hierarchy exists and has controllers
386/// delegated, returning the container parent path (`/zlayer/containers`,
387/// relative to the cgroup-v2 mount) for libcontainer's `cgroupsPath`.
388///
389/// Only meaningful when the daemon can write the cgroup-v2 root (root host
390/// daemon — `DaemonCapabilities::can_write_cgroup_root`). Returns `None` on
391/// non-Linux, or when the mkdir fails (e.g. a read-only `/sys/fs/cgroup`),
392/// in which case callers fall back to in-scope placement.
393///
394/// Unlike [`ensure_daemon_leaf_and_container_parent`], this does NOT migrate
395/// the daemon PID: with containers rooted outside the unit cgroup, the unit
396/// cgroup stays a clean leaf that systemd can always re-attach to on restart,
397/// so no `init` leaf split is needed.
398#[cfg(target_os = "linux")]
399#[must_use]
400pub fn ensure_host_container_parent() -> Option<String> {
401    let mount = "/sys/fs/cgroup";
402    let containers = compute_host_container_parent();
403    let root_fs = format!("{mount}{HOST_CONTAINER_ROOT}");
404    let containers_fs = format!("{mount}{containers}");
405
406    for dir in [&root_fs, &containers_fs] {
407        match std::fs::create_dir_all(dir) {
408            Ok(()) => {}
409            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
410            Err(_) => return None,
411        }
412    }
413
414    // Delegate controllers down both levels so libcontainer can set limits on
415    // the leaf `<root>/containers/<id>` cgroup it creates.
416    enable_available_controllers(&root_fs);
417    enable_available_controllers(&containers_fs);
418
419    Some(containers)
420}
421
422#[cfg(not(target_os = "linux"))]
423#[must_use]
424pub fn ensure_host_container_parent() -> Option<String> {
425    None
426}
427
428/// Depth-first remove a cgroup-v2 directory tree rooted at `dir`.
429///
430/// A cgroup-v2 parent cannot be `rmdir`'d while it still has child cgroups, so
431/// child directories are removed first (post-order). Best-effort throughout:
432/// a `NotFound` is treated as success (idempotent), and any other error is
433/// logged at `warn!` but does not abort the recursion — reaping as many leaves
434/// as possible is better than bailing on the first `EBUSY`.
435#[cfg(target_os = "linux")]
436fn remove_cgroup_tree(dir: &std::path::Path) {
437    // Best-effort: evacuate any survivors before attempting rmdir. `cgroup.kill`
438    // (kernel >= 5.14) SIGKILLs the whole subtree atomically; ignore failure on
439    // older kernels or when the file is absent.
440    let _ = std::fs::write(dir.join("cgroup.kill"), "1");
441
442    match std::fs::read_dir(dir) {
443        Ok(entries) => {
444            for entry in entries.flatten() {
445                let path = entry.path();
446                if entry.file_type().is_ok_and(|t| t.is_dir()) {
447                    // Child cgroup: recurse first — a v2 parent can't be
448                    // rmdir'd while children exist.
449                    remove_cgroup_tree(&path);
450                } else {
451                    // On real cgroupfs the control files (cgroup.procs, etc.)
452                    // are removed implicitly by rmdir, so this is normally a
453                    // NotFound no-op; on a plain filesystem (and in tests) it
454                    // unlinks the leftover so the dir can be removed.
455                    let _ = std::fs::remove_file(&path);
456                }
457            }
458        }
459        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return,
460        Err(e) => {
461            tracing::warn!(cgroup = %dir.display(), error = %e, "cgroup read_dir failed");
462        }
463    }
464
465    match std::fs::remove_dir(dir) {
466        Ok(()) => {}
467        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
468        Err(e) => {
469            tracing::warn!(cgroup = %dir.display(), error = %e, "cgroup rmdir failed");
470        }
471    }
472}
473
474/// Reap the host-mode leaf cgroup for `container_id` under `base`
475/// (`<base>/zlayer/containers/<container_id>`), depth-first.
476///
477/// Split out from [`remove_host_container_cgroup`] so the recursion can be
478/// exercised by a unit test against a temp directory instead of the real
479/// `/sys/fs/cgroup` mount. Idempotent: a missing leaf is a no-op.
480#[cfg(target_os = "linux")]
481fn remove_host_container_cgroup_at(base: &str, container_id: &str) {
482    let leaf = std::path::PathBuf::from(base).join(format!("zlayer/containers/{container_id}"));
483    if !leaf.exists() {
484        return;
485    }
486    remove_cgroup_tree(&leaf);
487}
488
489/// Best-effort removal of the host-mode container cgroup at
490/// `/sys/fs/cgroup/zlayer/containers/<container_id>`.
491///
492/// libcontainer's `delete()` normally reaps the leaf cgroup, but
493/// systemd-cgroup races and cgroup-v2 unified hiccups can leave a stale, empty
494/// directory behind. Because the next `create_container` rebuilds the same
495/// `<root>/containers/<id>` path, that orphan trips libcontainer's `build()`
496/// with `could not delete` on restart/scale. This reaps it directly at the
497/// real path (the old `read_dir`-the-mount scan never matched two levels down).
498///
499/// Idempotent (`NotFound` is ignored) and best-effort (`EBUSY`/other errors are
500/// logged, not propagated). No-op on non-Linux.
501#[cfg(target_os = "linux")]
502pub fn remove_host_container_cgroup(container_id: &str) {
503    remove_host_container_cgroup_at("/sys/fs/cgroup", container_id);
504}
505
506#[cfg(not(target_os = "linux"))]
507pub fn remove_host_container_cgroup(_container_id: &str) {}
508
509#[cfg(target_os = "linux")]
510fn probe_can_write_cgroup_root() -> bool {
511    use std::ffi::CString;
512
513    let Ok(path) = CString::new("/sys/fs/cgroup/cgroup.subtree_control") else {
514        return false;
515    };
516    // SAFETY: access(2) is a read-only syscall that takes a pointer to a
517    // NUL-terminated C string. The kernel does not retain the pointer.
518    #[allow(unsafe_code)]
519    let rc = unsafe { libc::access(path.as_ptr(), libc::W_OK) };
520    rc == 0
521}
522
523#[cfg(not(target_os = "linux"))]
524fn probe_can_write_cgroup_root() -> bool {
525    false
526}
527
/// `true` if `CAP_NET_ADMIN` is in the process's *effective* capability set.
///
/// Reads `/proc/self/status`, takes the first `CapEff:` line, parses its
/// value as hexadecimal and tests bit 12. Any failure along the way (file
/// unreadable, line absent, value not hex) yields `false`.
#[cfg(target_os = "linux")]
fn probe_has_cap_net_admin() -> bool {
    // CAP_NET_ADMIN is bit 12 in the Linux capability bitmask.
    // The EFFECTIVE set (`CapEff`) is what matters, not the bounding set
    // (`CapBnd`): a regular user process has full CapBnd by default but empty
    // CapPrm/CapEff, so a bounding-set check would be a false positive that
    // makes the daemon think it can create TUN/WG interfaces when it cannot.
    const CAP_NET_ADMIN_BIT: u64 = 1 << 12;

    let Ok(status) = std::fs::read_to_string("/proc/self/status") else {
        return false;
    };
    status
        .lines()
        .find_map(|row| row.strip_prefix("CapEff:"))
        .and_then(|hex| u64::from_str_radix(hex.trim(), 16).ok())
        .is_some_and(|effective| effective & CAP_NET_ADMIN_BIT != 0)
}

/// Non-Linux fallback: Linux capabilities do not exist here.
#[cfg(not(target_os = "linux"))]
fn probe_has_cap_net_admin() -> bool {
    false
}
555
556#[cfg(target_os = "linux")]
557fn probe_tun_device_available() -> bool {
558    use std::os::unix::fs::OpenOptionsExt;
559
560    // Opening /dev/net/tun without any ioctls is benign and does not allocate
561    // a TUN interface. The fd is dropped immediately when this scope ends.
562    // Any open error — missing device, no perms, kernel module not loaded,
563    // FD exhaustion — means we can't actually use TUN. Treat as unavailable.
564    std::fs::OpenOptions::new()
565        .read(true)
566        .write(true)
567        .custom_flags(libc::O_NONBLOCK)
568        .open("/dev/net/tun")
569        .is_ok()
570}
571
572#[cfg(not(target_os = "linux"))]
573fn probe_tun_device_available() -> bool {
574    false
575}
576
/// Mask for `CAP_SYS_ADMIN` (bit 21) in the Linux capability bitmask — the
/// capability `mount(2)` requires for the overlay probe.
#[cfg(target_os = "linux")]
const CAP_SYS_ADMIN_BIT: u64 = 1 << 21;

/// `true` if `CAP_SYS_ADMIN` is in the process's *effective* set.
///
/// Uses the same `/proc/self/status` `CapEff:` parse as
/// [`probe_has_cap_net_admin`]: the first `CapEff:` line is read as hex and
/// tested against [`CAP_SYS_ADMIN_BIT`]. The effective set, not the bounding
/// set, is what `mount(2)` actually checks. Unreadable file, missing line or
/// unparsable value all yield `false`.
#[cfg(target_os = "linux")]
fn probe_has_cap_sys_admin() -> bool {
    let Ok(status) = std::fs::read_to_string("/proc/self/status") else {
        return false;
    };
    status
        .lines()
        .find_map(|row| row.strip_prefix("CapEff:"))
        .and_then(|hex| u64::from_str_radix(hex.trim(), 16).ok())
        .is_some_and(|effective| effective & CAP_SYS_ADMIN_BIT != 0)
}
600
/// Decide, from the text of `/proc/filesystems`, whether `overlay` is a
/// registered filesystem.
///
/// Each line is either `\t<fs>` or `nodev\t<fs>` (overlay is a `nodev`
/// filesystem), so the filesystem name is always the last
/// whitespace-separated token; a line matches when that token is exactly
/// `overlay`. Kept pure so it is unit-testable without the host's real
/// `/proc`.
#[cfg(target_os = "linux")]
fn proc_filesystems_has_overlay(content: &str) -> bool {
    for row in content.lines() {
        if matches!(row.split_whitespace().next_back(), Some("overlay")) {
            return true;
        }
    }
    false
}
611
612/// Probe whether the daemon can use an overlayfs rootfs.
613///
614/// Returns `true` iff ALL hold:
615/// 1. the daemon is root OR holds effective `CAP_SYS_ADMIN` (needed for
616///    `mount(2)`),
617/// 2. `overlay` is registered in `/proc/filesystems`, and
618/// 3. a real probe overlay mount in a fresh temp dir succeeds (and is then
619///    immediately unmounted).
620///
621/// The mount probe is the authoritative signal — kernels can list `overlay`
622/// yet reject the mount (e.g. inside an unprivileged userns, or on a backing
623/// filesystem overlay won't accept). Doing a real mount+unmount once at startup
624/// is cheap and removes the guesswork.
625#[cfg(target_os = "linux")]
626fn probe_overlayfs_rootfs_available(is_root: bool) -> bool {
627    if !is_root && !probe_has_cap_sys_admin() {
628        return false;
629    }
630    let Ok(content) = std::fs::read_to_string("/proc/filesystems") else {
631        return false;
632    };
633    if !proc_filesystems_has_overlay(&content) {
634        return false;
635    }
636    probe_overlay_mount_roundtrip()
637}
638
639#[cfg(not(target_os = "linux"))]
640fn probe_overlayfs_rootfs_available(_is_root: bool) -> bool {
641    false
642}
643
644/// Attempt a throwaway overlay mount (lower+upper+work+merged, all under one
645/// temp dir) and immediately unmount it. Returns `true` only if both the mount
646/// and unmount succeed. Best-effort cleanup of the temp dir on every path.
647#[cfg(target_os = "linux")]
648fn probe_overlay_mount_roundtrip() -> bool {
649    use nix::mount::{mount, umount2, MntFlags, MsFlags};
650
651    let Ok(base) = tempfile::Builder::new()
652        .prefix("zlayer-ovl-probe-")
653        .tempdir()
654    else {
655        return false;
656    };
657    let lower = base.path().join("lower");
658    let upper = base.path().join("upper");
659    let work = base.path().join("work");
660    let merged = base.path().join("merged");
661    for d in [&lower, &upper, &work, &merged] {
662        if std::fs::create_dir_all(d).is_err() {
663            return false;
664        }
665    }
666
667    let opts = format!(
668        "lowerdir={},upperdir={},workdir={}",
669        lower.display(),
670        upper.display(),
671        work.display()
672    );
673
674    let mounted = mount(
675        Some("overlay"),
676        &merged,
677        Some("overlay"),
678        MsFlags::empty(),
679        Some(opts.as_str()),
680    )
681    .is_ok();
682
683    if !mounted {
684        return false;
685    }
686
687    // Unmount; lazy-detach as a fallback so the probe never leaves a mount
688    // behind even if the eager umount races. tempdir Drop then removes the
689    // tree. The probe is "available" only if we both mounted AND cleaned up.
690    umount2(&merged, MntFlags::empty()).is_ok() || umount2(&merged, MntFlags::MNT_DETACH).is_ok()
691}
692
/// Locate an executable named `name` on `PATH`, returning its full path.
///
/// Pure helper (a `:`-split scan of `path_var`, in order) so the probe logic
/// can be unit-tested by passing an explicit `path_var`. Empty `PATH` entries
/// are skipped, and an empty `name` never matches.
///
/// A candidate must be a regular file, or a symlink that resolves to one — a
/// directory that merely shares the name is skipped and the scan continues.
/// The executable bit is not checked here (the later spawn surfaces a
/// non-executable file as a real error). Linux-only — the fuse path is
/// Linux-only.
#[cfg(target_os = "linux")]
fn which_in(name: &str, path_var: &str) -> Option<std::path::PathBuf> {
    if name.is_empty() {
        return None;
    }
    path_var
        .split(':')
        .filter(|dir| !dir.is_empty())
        .map(|dir| std::path::Path::new(dir).join(name))
        // `is_file` follows symlinks and rejects directories, unlike `exists`.
        .find(|candidate| candidate.is_file())
}
713
714/// `fuse-overlayfs` binary path, resolved from the process `PATH`. Linux-only.
715#[cfg(target_os = "linux")]
716fn fuse_overlayfs_binary() -> Option<std::path::PathBuf> {
717    let path_var = std::env::var("PATH").ok()?;
718    which_in("fuse-overlayfs", &path_var)
719}
720
721/// The fusermount helper to use for unmounting a `fuse-overlayfs` mount,
722/// preferring the FUSE3 `fusermount3` and falling back to `fusermount`.
723/// Returns the resolved binary path, or `None` if neither is on `PATH`.
724/// Linux-only.
725#[cfg(target_os = "linux")]
726#[must_use]
727pub fn fusermount_binary() -> Option<std::path::PathBuf> {
728    let path_var = std::env::var("PATH").ok()?;
729    which_in("fusermount3", &path_var).or_else(|| which_in("fusermount", &path_var))
730}
731
732#[cfg(not(target_os = "linux"))]
733#[must_use]
734pub fn fusermount_binary() -> Option<std::path::PathBuf> {
735    None
736}
737
/// Report whether `/dev/fuse` — the FUSE control device a userspace
/// `fuse-overlayfs` daemon needs to back its mount — opens for read+write.
///
/// The open is benign and allocates no FUSE connection; the handle is dropped
/// before returning. Every failure (missing node, no perms, module not
/// loaded) collapses to `false`, meaning rootless fuse-overlay is unusable.
/// Linux-only.
#[cfg(target_os = "linux")]
fn probe_dev_fuse_available() -> bool {
    let attempt = std::fs::File::options()
        .read(true)
        .write(true)
        .open("/dev/fuse");
    // Only success/failure matters; any opened handle is released right here.
    attempt.is_ok()
}
751
752/// Probe whether the daemon can use a ROOTLESS `fuse-overlayfs` rootfs.
753///
754/// Returns `true` iff ALL hold:
755/// 1. the `fuse-overlayfs` binary is on `PATH`,
756/// 2. `/dev/fuse` is openable r/w, and
757/// 3. a real probe `fuse-overlayfs` mount in a fresh temp dir succeeds (and is
758///    then immediately unmounted via `fusermount`).
759///
760/// Deliberately independent of root / `CAP_SYS_ADMIN`: this is the path that
761/// lets an unprivileged daemon still get shared-layer dedup. The mount probe is
762/// authoritative — a host can have the binary and `/dev/fuse` yet reject the
763/// mount (e.g. no `user_allow_other`, a hardened FUSE sysctl, or a sandbox), so
764/// a real mount+unmount once at startup removes the guesswork. Mirrors the
765/// kernel-overlay probe's "actually do it once" philosophy.
766#[cfg(target_os = "linux")]
767fn probe_fuse_overlayfs_rootfs_available() -> bool {
768    let Some(bin) = fuse_overlayfs_binary() else {
769        return false;
770    };
771    if !probe_dev_fuse_available() {
772        return false;
773    }
774    let Some(fusermount) = fusermount_binary() else {
775        return false;
776    };
777    probe_fuse_overlay_mount_roundtrip(&bin, &fusermount)
778}
779
780#[cfg(not(target_os = "linux"))]
781fn probe_fuse_overlayfs_rootfs_available() -> bool {
782    false
783}
784
785/// Attempt a throwaway rootless `fuse-overlayfs` mount and immediately unmount
786/// it. Returns `true` only if both the mount and the unmount succeed. The
787/// backing FUSE daemon self-daemonizes (reparenting to PID 1), so the spawned
788/// `fuse-overlayfs` process returns promptly and we wait on it. Best-effort
789/// cleanup of the temp dir on every path. Linux-only.
790#[cfg(target_os = "linux")]
791fn probe_fuse_overlay_mount_roundtrip(bin: &std::path::Path, fusermount: &std::path::Path) -> bool {
792    let Ok(base) = tempfile::Builder::new()
793        .prefix("zlayer-fuse-ovl-probe-")
794        .tempdir()
795    else {
796        return false;
797    };
798    let lower = base.path().join("lower");
799    let upper = base.path().join("upper");
800    let work = base.path().join("work");
801    let merged = base.path().join("merged");
802    for d in [&lower, &upper, &work, &merged] {
803        if std::fs::create_dir_all(d).is_err() {
804            return false;
805        }
806    }
807
808    let opts = format!(
809        "lowerdir={},upperdir={},workdir={}",
810        lower.display(),
811        upper.display(),
812        work.display()
813    );
814
815    let mounted = std::process::Command::new(bin)
816        .arg("-o")
817        .arg(&opts)
818        .arg(&merged)
819        .stdin(std::process::Stdio::null())
820        .stdout(std::process::Stdio::null())
821        .stderr(std::process::Stdio::null())
822        .status()
823        .is_ok_and(|s| s.success());
824
825    if !mounted {
826        return false;
827    }
828
829    // Unmount via fusermount; tempdir Drop then removes the tree. The probe is
830    // "available" only if we both mounted AND cleaned up.
831    std::process::Command::new(fusermount)
832        .arg("-u")
833        .arg(&merged)
834        .stdin(std::process::Stdio::null())
835        .stdout(std::process::Stdio::null())
836        .stderr(std::process::Stdio::null())
837        .status()
838        .is_ok_and(|s| s.success())
839}
840
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a live probe completes, and `is_nested` is set exactly when
    /// a cgroup parent was resolved.
    #[test]
    fn probe_does_not_panic_and_is_nested_agrees_with_cgroup_parent() {
        let survey = DaemonCapabilities::probe();
        let has_parent = survey.cgroup_parent.is_some();
        assert_eq!(survey.is_nested, has_parent);
    }

    #[cfg(target_os = "linux")]
    #[test]
    fn probe_has_cap_net_admin_matches_cap_eff() {
        // Bit of the effective-capability mask this test inspects.
        const NET_ADMIN_BIT: u32 = 12;

        // The real capability state depends on how the test is launched
        // (regular user, root, setcap'd binary). Whatever it is, the probe
        // MUST agree with the `CapEff:` line of /proc/self/status — that
        // agreement is the whole point of the bug fix.
        let proc_status = std::fs::read_to_string("/proc/self/status").unwrap();
        let mask_hex = proc_status
            .lines()
            .find(|line| line.starts_with("CapEff:"))
            .expect("CapEff: present in /proc/self/status")
            .trim_start_matches("CapEff:")
            .trim();
        let effective = u64::from_str_radix(mask_hex, 16).unwrap();
        let expected = (effective >> NET_ADMIN_BIT) & 1 == 1;
        assert_eq!(super::probe_has_cap_net_admin(), expected);
    }

    /// Test-local, pure copy of the mode classification done in `probe()`, so
    /// the tables below can pin behaviour independently of the host's actual
    /// capability state.
    #[allow(clippy::fn_params_excessive_bools)]
    fn classify(
        is_nested: bool,
        can_write_cgroup_root: bool,
        has_cap_net_admin: bool,
        tun_device_available: bool,
        cgroup_parent_is_some: bool,
    ) -> DaemonMode {
        let every_full_signal =
            !is_nested && can_write_cgroup_root && has_cap_net_admin && tun_device_available;
        if every_full_signal {
            return DaemonMode::Full;
        }
        if can_write_cgroup_root || cgroup_parent_is_some {
            DaemonMode::NestedAdaptive
        } else {
            DaemonMode::Degraded
        }
    }

    #[test]
    fn effective_mode_full_requires_all_four_signals() {
        // Every signal lined up the right way → Full.
        assert_eq!(
            classify(false, true, true, true, false),
            DaemonMode::Full,
            "all four signals set should be Full"
        );

        // Each row flips exactly one of the four signals; none may be Full.
        let one_signal_dropped = [
            (true, true, true, true, true),
            (false, false, true, true, false),
            (false, true, false, true, false),
            (false, true, true, false, false),
        ];
        for (nested, cgroup_root, net_admin, tun, parent) in one_signal_dropped {
            assert_ne!(
                classify(nested, cgroup_root, net_admin, tun, parent),
                DaemonMode::Full
            );
        }
    }

    #[test]
    fn effective_mode_nested_adaptive_when_writable_or_has_parent() {
        // Root cgroup is writable but the other Full signals are absent.
        let writable_root_only = classify(false, true, false, false, false);
        assert_eq!(writable_root_only, DaemonMode::NestedAdaptive);

        // Nested under a parent cgroup with nothing else available.
        let nested_with_parent = classify(true, false, false, false, true);
        assert_eq!(nested_with_parent, DaemonMode::NestedAdaptive);
    }

    #[test]
    fn effective_mode_degraded_when_no_writable_path() {
        // Neither a writable root nor a parent: nothing usable.
        let nothing_usable = classify(false, false, false, false, false);
        assert_eq!(nothing_usable, DaemonMode::Degraded);

        // `is_nested` alone — no resolved parent, no root write — offers no
        // writable cgroup to anchor under, so this is still Degraded.
        let nested_without_parent = classify(true, false, false, false, false);
        assert_eq!(nested_without_parent, DaemonMode::Degraded);
    }

    #[test]
    fn overlay_fallback_none_only_when_both_present() {
        let verdict = super::capability_overlay_fallback(true, true);
        assert!(verdict.is_none());
    }

    #[test]
    fn overlay_fallback_reports_missing_cap_net_admin() {
        let why = super::capability_overlay_fallback(false, true).expect("should fall back");
        assert!(why.contains("CAP_NET_ADMIN"));
    }

    #[test]
    fn overlay_fallback_reports_missing_tun() {
        let why = super::capability_overlay_fallback(true, false).expect("should fall back");
        assert!(why.contains("/dev/net/tun"));
    }

    #[test]
    fn overlay_fallback_reports_both_missing() {
        let why = super::capability_overlay_fallback(false, false).expect("should fall back");
        for needle in ["CAP_NET_ADMIN", "/dev/net/tun"] {
            assert!(why.contains(needle));
        }
    }

    #[test]
    fn rootless_overlay_requires_nonroot_no_cap_tun_and_pasta() {
        // Happy path: non-root, no cap, tun present, pasta present.
        let accepted = super::can_rootless_overlay(false, false, true, true);
        assert!(accepted);
    }

    #[test]
    fn rootless_overlay_rejected_when_root() {
        let accepted = super::can_rootless_overlay(true, false, true, true);
        assert!(!accepted);
    }

    #[test]
    fn rootless_overlay_rejected_when_already_has_cap_net_admin() {
        let accepted = super::can_rootless_overlay(false, true, true, true);
        assert!(!accepted);
    }

    #[test]
    fn rootless_overlay_rejected_without_tun() {
        let accepted = super::can_rootless_overlay(false, false, false, true);
        assert!(!accepted);
    }

    #[test]
    fn rootless_overlay_rejected_without_pasta() {
        let accepted = super::can_rootless_overlay(false, false, true, false);
        assert!(!accepted);
    }

    #[test]
    fn serializes_round_trip_via_serde_json() {
        let original = DaemonCapabilities::probe();
        let encoded = serde_json::to_string(&original).expect("serialize");
        let restored: DaemonCapabilities = serde_json::from_str(&encoded).expect("deserialize");

        // Field-by-field comparison of the decoded survey against the source.
        macro_rules! assert_field_survives {
            ($($field:ident),+ $(,)?) => {
                $(assert_eq!(restored.$field, original.$field);)+
            };
        }
        assert_field_survives!(
            is_root,
            is_nested,
            cgroup_parent,
            can_write_cgroup_root,
            has_cap_net_admin,
            tun_device_available,
            overlayfs_rootfs_available,
            fuse_overlayfs_rootfs_available,
            effective_mode,
        );
    }

    #[cfg(target_os = "linux")]
    mod fuse_probe {
        use super::super::{fusermount_binary, which_in};

        #[test]
        fn which_in_finds_binary_in_first_matching_dir() {
            // One temp dir holds a fake binary, a second one holds nothing.
            let with_bin = tempfile::tempdir().unwrap();
            let expected = with_bin.path().join("fuse-overlayfs");
            std::fs::write(&expected, b"#!/bin/sh\n").unwrap();
            let without_bin = tempfile::tempdir().unwrap();

            // PATH lists the empty dir first, then the dir with the binary;
            // the scan must resolve to the binary's full path.
            let path_var = [without_bin.path(), with_bin.path()]
                .map(|dir| dir.display().to_string())
                .join(":");
            assert_eq!(which_in("fuse-overlayfs", &path_var), Some(expected));
        }

        #[test]
        fn which_in_none_when_absent_and_ignores_empty_segments() {
            let empty_dir = tempfile::tempdir().unwrap();
            // The leading and trailing `:` produce empty segments; those must
            // be skipped, never joined against.
            let path_var = format!(":{}:", empty_dir.path().display());
            let lookup = which_in("fuse-overlayfs", &path_var);
            assert!(lookup.is_none());

            // An empty name never resolves.
            let empty_name_lookup = which_in("", &path_var);
            assert!(empty_name_lookup.is_none());
        }

        #[test]
        fn fusermount_binary_resolves_or_none_without_panic() {
            // A helper is typically installed, but the only hard requirements
            // are that the call is total (no panic) and that a resolved path
            // points at something that exists.
            let Some(resolved) = fusermount_binary() else {
                return;
            };
            assert!(
                resolved.exists(),
                "resolved fusermount must exist: {}",
                resolved.display()
            );
        }
    }

    #[cfg(target_os = "linux")]
    mod proc_filesystems {
        use super::super::proc_filesystems_has_overlay;

        #[test]
        fn detects_overlay_as_nodev_filesystem() {
            // Real-world shape: `nodev` marker, a tab, then the fs name;
            // overlay is one of those entries.
            let listing = "nodev\tsysfs\nnodev\ttmpfs\nnodev\toverlay\n\text4\n";
            let found = proc_filesystems_has_overlay(listing);
            assert!(found);
        }

        #[test]
        fn absent_when_overlay_not_listed() {
            let listing = "nodev\tsysfs\nnodev\ttmpfs\n\text4\n\txfs\n";
            let found = proc_filesystems_has_overlay(listing);
            assert!(!found);
        }

        #[test]
        fn does_not_match_substring_overlayfs() {
            // A filesystem whose name merely contains "overlay" must not
            // count: the whole final token is compared, not a substring.
            let listing = "nodev\toverlayfs2\n\text4\n";
            let found = proc_filesystems_has_overlay(listing);
            assert!(!found);
        }

        #[test]
        fn empty_input_is_false() {
            let found = proc_filesystems_has_overlay("");
            assert!(!found);
        }
    }

    /// The overlay probe must not panic and must be internally consistent:
    /// reported availability implies the daemon is root or holds
    /// `CAP_SYS_ADMIN`. The concrete value varies with how the test runs (root
    /// vs not), so only the implication is asserted.
    #[cfg(target_os = "linux")]
    #[test]
    fn overlay_probe_consistent_with_privilege() {
        let survey = DaemonCapabilities::probe();
        if !survey.overlayfs_rootfs_available {
            // Nothing to check when the probe reports "unavailable".
            return;
        }
        assert!(
            survey.is_root || super::probe_has_cap_sys_admin(),
            "overlay availability must imply root or CAP_SYS_ADMIN"
        );
    }

    #[test]
    fn daemon_mode_serde_uses_snake_case() {
        let expectations = [
            (DaemonMode::Full, "\"full\""),
            (DaemonMode::NestedAdaptive, "\"nested_adaptive\""),
            (DaemonMode::Degraded, "\"degraded\""),
        ];
        for (mode, wire) in expectations {
            assert_eq!(serde_json::to_string(&mode).unwrap(), wire);
        }
    }

    #[cfg(target_os = "linux")]
    mod target_parent {
        use super::super::compute_target_parent;

        #[test]
        fn idempotent_when_already_under_init() {
            let cases = [
                // Pre-fix path: scope is the systemd-run scope itself.
                (
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope",
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope/containers",
                ),
                // Already migrated: scope ends with /init — strip and re-anchor.
                (
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope/init",
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope/containers",
                ),
                // A trailing slash and a short `/init` form land on the same target.
                ("/foo/bar/", "/foo/bar/containers"),
                ("/foo/bar/init", "/foo/bar/containers"),
            ];
            for (scope, expected) in cases {
                assert_eq!(compute_target_parent(scope), expected);
            }
        }
    }

    #[cfg(target_os = "linux")]
    mod host_parent {
        use super::super::{compute_host_container_parent, HOST_CONTAINER_ROOT};

        #[test]
        fn host_parent_is_top_level_and_outside_any_unit() {
            // Host-mode containers must sit under a top-level node, NOT under
            // `/system.slice/zlayer.service/...`, so that a KillMode=process
            // survivor can never wedge the unit's restart with EBUSY.
            let parent = compute_host_container_parent();
            assert_eq!(parent, "/zlayer/containers");
            assert!(parent.starts_with(HOST_CONTAINER_ROOT));
            for forbidden in ["zlayer.service", ".slice"] {
                assert!(!parent.contains(forbidden));
            }
        }
    }

    #[cfg(target_os = "linux")]
    mod host_cgroup_reap {
        use super::super::remove_host_container_cgroup_at;
        use std::fs;

        // Reproduces the recreate bug: an earlier instance left an empty cgroup
        // tree at `<root>/zlayer/containers/<id>`, including a nested child
        // cgroup (possible for a v2 leaf with delegated controllers). The
        // reaper must remove children depth-first and then the leaf, so the
        // next create_container starts from a clean slot.
        #[test]
        fn reaps_stale_empty_cgroup_tree_depth_first() {
            let sandbox = tempfile::tempdir().expect("tempdir");
            let sandbox_root = sandbox.path().to_str().unwrap();
            let id = "zata-storage-rep-1";

            let containers_dir = sandbox.path().join("zlayer/containers");
            let stale_leaf = containers_dir.join(id);
            // A nested child cgroup dir: a v2 parent cannot be rmdir'd while
            // children exist, which is exactly what the recursion must handle.
            let nested = stale_leaf.join("child-scope");
            fs::create_dir_all(&nested).expect("create nested cgroup tree");
            // Stand-ins for the cgroup-v2 control files in leaf and child.
            for cgroup_dir in [&stale_leaf, &nested] {
                fs::write(cgroup_dir.join("cgroup.procs"), "").unwrap();
            }
            assert!(stale_leaf.exists(), "precondition: stale leaf exists");

            remove_host_container_cgroup_at(sandbox_root, id);

            assert!(
                !stale_leaf.exists(),
                "stale cgroup leaf must be reaped (depth-first removal of children + leaf)"
            );
            // The `containers` parent is shared across ids and must remain.
            assert!(
                containers_dir.exists(),
                "shared containers parent must survive"
            );
        }

        #[test]
        fn idempotent_when_leaf_absent() {
            let sandbox = tempfile::tempdir().expect("tempdir");
            let sandbox_root = sandbox.path().to_str().unwrap();
            // No leaf was ever created: the call must be a no-op, not a panic.
            remove_host_container_cgroup_at(sandbox_root, "never-existed");
        }
    }

    #[cfg(target_os = "linux")]
    mod cgroup_parser {
        use super::super::parse_cgroup_v2_line;

        #[test]
        fn parse_cgroup_v2_root_returns_none() {
            let parsed = parse_cgroup_v2_line("0::/\n");
            assert!(parsed.is_none());
        }

        #[test]
        fn parse_cgroup_v2_path_returns_some() {
            let parsed = parse_cgroup_v2_line("0::/system.slice/forgejo-runner.service\n");
            assert_eq!(
                parsed.as_deref(),
                Some("/system.slice/forgejo-runner.service")
            );
        }

        #[test]
        fn parse_cgroup_v2_hybrid_finds_v2_line() {
            // Two v1 controller lines precede the v2 (`0::`) line.
            let hybrid = "12:devices:/user.slice\n11:memory:/user.slice\n0::/foo\n";
            let parsed = parse_cgroup_v2_line(hybrid);
            assert_eq!(parsed.as_deref(), Some("/foo"));
        }

        #[test]
        fn parse_cgroup_v2_no_newline() {
            let parsed = parse_cgroup_v2_line("0::/bar");
            assert_eq!(parsed.as_deref(), Some("/bar"));
        }

        #[test]
        fn parse_cgroup_v2_missing_returns_none() {
            let parsed = parse_cgroup_v2_line("");
            assert!(parsed.is_none());
        }
    }
}