Skip to main content

zlayer_agent/
capability.rs

//! Daemon capability survey.
//!
//! Probes the runtime environment of the zlayer daemon (root vs. non-root,
//! host vs. nested in a container, cgroup v2 path, `CAP_NET_ADMIN`, presence
//! of `/dev/net/tun`, and writability of the cgroup root) and derives a coarse
//! [`DaemonMode`] from those signals.
//!
//! All probes are intentionally cheap and non-destructive — a handful of
//! syscalls, no allocations of kernel resources (no TUN interfaces, no cgroup
//! writes). The struct is safe to construct multiple times.
//!
//! Non-Linux targets report a fixed degraded survey since the kernel features
//! these probes target are Linux-only.
14
15use std::sync::OnceLock;
16
17use serde::{Deserialize, Serialize};
18
19/// Coarse classification of the daemon's effective execution environment.
20///
21/// Derived from the boolean fields on [`DaemonCapabilities`].
22#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
23#[serde(rename_all = "snake_case")]
24pub enum DaemonMode {
25    /// Host-level execution: all caps, can write cgroup root, can create overlay.
26    Full,
27    /// Inside a container: scoped to a sub-cgroup; some caps may be present.
28    NestedAdaptive,
29    /// Missing privileges required for any meaningful container creation.
30    Degraded,
31}
32
33/// Snapshot of the daemon's effective capabilities and execution environment.
34///
35/// Construct via [`DaemonCapabilities::probe`]. Cheap to call repeatedly.
36///
37/// The struct intentionally exposes independent capability bits as separate
38/// booleans rather than collapsing them into an enum — each bit corresponds to
39/// an orthogonal kernel feature (cgroup write, `CAP_NET_ADMIN`, TUN access,
40/// root-ness) and downstream code wants to inspect them independently when
41/// deciding what to gate.
42#[allow(clippy::struct_excessive_bools)]
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct DaemonCapabilities {
45    /// `true` if the process is running as uid 0.
46    pub is_root: bool,
47    /// `true` if the process appears to be inside a container (non-root cgroup
48    /// v2 path).
49    pub is_nested: bool,
50    /// The cgroup v2 path of the current process, if any (e.g.
51    /// `/system.slice/zlayer.service`). `None` on the cgroup root, on
52    /// cgroup-v1-only hosts, on non-Linux, or on read errors.
53    pub cgroup_parent: Option<String>,
54    /// `true` if the cgroup root's `cgroup.subtree_control` has the
55    /// owner-write bit set. Coarse, non-destructive signal — does not
56    /// guarantee an actual write will succeed.
57    pub can_write_cgroup_root: bool,
58    /// `true` if `CAP_NET_ADMIN` is present in the process's *effective* set
59    /// (Linux only).
60    pub has_cap_net_admin: bool,
61    /// `true` if `/dev/net/tun` can be opened r/w in non-blocking mode without
62    /// EACCES/EPERM/ENOENT/ENXIO. The fd is dropped immediately.
63    pub tun_device_available: bool,
64    /// Coarse classification derived from the above fields.
65    pub effective_mode: DaemonMode,
66}
67
68/// Process-wide memoised capability survey. Seeded by the first call to
69/// [`DaemonCapabilities::get`] or [`DaemonCapabilities::seed`].
70static CAPS: OnceLock<DaemonCapabilities> = OnceLock::new();
71
72impl DaemonCapabilities {
73    /// Returns the process-wide capability snapshot, probing on first call.
74    ///
75    /// Subsequent calls return the same memoised instance — capabilities of a
76    /// running daemon do not change at runtime, so re-probing would be wasted
77    /// syscalls and could create the illusion that the daemon's behaviour can
78    /// shift mid-flight.
79    pub fn get() -> &'static Self {
80        CAPS.get_or_init(Self::probe)
81    }
82
83    /// Eagerly seed the memoised survey with an explicit probe result.
84    ///
85    /// Useful at daemon startup to force the probe to happen at a known point
86    /// (so the banner log appears in the expected place). Returns the stored
87    /// instance — if the cache was already seeded, the existing value wins
88    /// and the passed-in `caps` is dropped (probe is pure, so this is fine).
89    ///
90    /// # Panics
91    ///
92    /// In practice this never panics — `OnceLock::set` either stores the
93    /// value or rejects it because the cell is already filled, and in both
94    /// cases the subsequent `get()` returns `Some`. The `expect` exists only
95    /// to satisfy the type system.
96    pub fn seed(caps: Self) -> &'static Self {
97        let _ = CAPS.set(caps);
98        CAPS.get()
99            .expect("CAPS is filled after set or was already filled")
100    }
101
102    /// Probe the running daemon's effective capabilities.
103    ///
104    /// Cheap — a handful of syscalls and no resource allocation. Prefer
105    /// [`DaemonCapabilities::get`] when you want the process-wide memoised
106    /// value; call this directly only when you intentionally want a fresh
107    /// snapshot (e.g. tests).
108    #[must_use]
109    pub fn probe() -> Self {
110        let is_root = zlayer_paths::is_root();
111        let cgroup_parent = current_cgroup_v2_path();
112        let is_nested = cgroup_parent.is_some();
113        let can_write_cgroup_root = probe_can_write_cgroup_root();
114        let has_cap_net_admin = probe_has_cap_net_admin();
115        let tun_device_available = probe_tun_device_available();
116
117        let effective_mode =
118            if !is_nested && can_write_cgroup_root && has_cap_net_admin && tun_device_available {
119                DaemonMode::Full
120            } else if can_write_cgroup_root || cgroup_parent.is_some() {
121                DaemonMode::NestedAdaptive
122            } else {
123                DaemonMode::Degraded
124            };
125
126        Self {
127            is_root,
128            is_nested,
129            cgroup_parent,
130            can_write_cgroup_root,
131            has_cap_net_admin,
132            tun_device_available,
133            effective_mode,
134        }
135    }
136}
137
/// Extracts the cgroup-v2 path from the text of `/proc/self/cgroup`.
///
/// Only the first line starting with `0::` is considered; the remainder of
/// that line, with surrounding whitespace trimmed, is the path. Yields
/// `None` when:
/// - no line starts with `0::` (cgroup-v1-only host, or empty input), or
/// - the trimmed path is empty or exactly `/` (the cgroup root).
#[cfg(target_os = "linux")]
fn parse_cgroup_v2_line(content: &str) -> Option<String> {
    content
        .lines()
        .find_map(|entry| entry.strip_prefix("0::"))
        .map(str::trim)
        .filter(|path| !path.is_empty() && *path != "/")
        .map(str::to_owned)
}
158
159/// Returns the current process's cgroup-v2 path, if any.
160///
161/// On Linux reads `/proc/self/cgroup` and delegates to `parse_cgroup_v2_line`.
162/// On non-Linux always returns `None`. Returns `None` on any read error or
163/// when the process is at the cgroup-v2 root (bare-metal case).
164#[cfg(target_os = "linux")]
165#[must_use]
166pub fn current_cgroup_v2_path() -> Option<String> {
167    let content = std::fs::read_to_string("/proc/self/cgroup").ok()?;
168    parse_cgroup_v2_line(&content)
169}
170
/// Non-Linux fallback: there is no `/proc/self/cgroup` to consult, so the
/// current process never has a cgroup-v2 path and the result is always
/// `None`.
#[cfg(not(target_os = "linux"))]
#[must_use]
pub fn current_cgroup_v2_path() -> Option<String> {
    None
}
176
/// Pure path computation: given a cgroup-v2 scope reported by
/// `/proc/self/cgroup`, return the sibling `<scope>/containers` parent that
/// should be used for new container cgroups.
///
/// If `scope` already ends with `/init` (the daemon has already been migrated
/// into the `init` leaf by a previous call), the `/init` suffix is stripped
/// and the result anchored at the real scope. This makes
/// [`ensure_daemon_leaf_and_container_parent`] idempotent.
///
/// Trailing slashes are ignored on both forms, so `/a/b/`, `/a/b/init` and
/// `/a/b/init/` all map to `/a/b/containers`.
#[cfg(target_os = "linux")]
fn compute_target_parent(scope: &str) -> String {
    // Trim first so a trailing slash cannot hide the `/init` suffix.
    let scope = scope.trim_end_matches('/');
    let base = scope
        .strip_suffix("/init")
        .unwrap_or(scope)
        // Stripping `/init` can expose another trailing slash (`/a//init`).
        .trim_end_matches('/');
    format!("{base}/containers")
}
191
192/// Migrate the current daemon process into a `<scope>/init` sub-cgroup and
193/// return the sibling `<scope>/containers` path as the parent for future
194/// container cgroups. Idempotent — safe to call multiple times.
195///
196/// Returns `None` on non-Linux, when `/proc/self/cgroup` can't be parsed,
197/// when `/sys/fs/cgroup` is read-only, or when the mkdir/PID-write fails.
198/// Callers should fall back to the raw `current_cgroup_v2_path()` value in
199/// those cases (the auto-detect path will surface the underlying error).
200#[cfg(target_os = "linux")]
201#[must_use]
202pub fn ensure_daemon_leaf_and_container_parent() -> Option<String> {
203    let scope = current_cgroup_v2_path()?;
204    let containers = compute_target_parent(&scope);
205    // Idempotency: if we're already in `<base>/init`, just return the sibling.
206    if scope.ends_with("/init") {
207        let containers_fs = format!("/sys/fs/cgroup{containers}");
208        match std::fs::create_dir_all(&containers_fs) {
209            Ok(()) => {}
210            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
211            Err(_) => return None,
212        }
213        return Some(containers);
214    }
215
216    let scope = scope.trim_end_matches('/').to_string();
217    let mount = "/sys/fs/cgroup";
218    let init_dir = format!("{mount}{scope}/init");
219
220    match std::fs::create_dir_all(&init_dir) {
221        Ok(()) => {}
222        Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
223        Err(_) => return None,
224    }
225
226    let pid_path = format!("{init_dir}/cgroup.procs");
227    let pid_str = format!("{}", std::process::id());
228    if std::fs::write(&pid_path, &pid_str).is_err() {
229        // Already migrated? Re-check /proc/self/cgroup before giving up.
230        let now = current_cgroup_v2_path()?;
231        if now != format!("{scope}/init") {
232            return None;
233        }
234    }
235
236    // Verify the migration actually moved us into <scope>/init.
237    let after = current_cgroup_v2_path()?;
238    if after != format!("{scope}/init") {
239        return None;
240    }
241
242    let containers_dir = format!("{mount}{containers}");
243    match std::fs::create_dir_all(&containers_dir) {
244        Ok(()) => {}
245        Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {}
246        Err(_) => return None,
247    }
248
249    Some(containers)
250}
251
/// Non-Linux fallback: no cgroup migration is attempted and no container
/// parent is available, so the result is always `None`.
#[cfg(not(target_os = "linux"))]
#[must_use]
pub fn ensure_daemon_leaf_and_container_parent() -> Option<String> {
    None
}
257
258#[cfg(target_os = "linux")]
259fn probe_can_write_cgroup_root() -> bool {
260    use std::ffi::CString;
261
262    let Ok(path) = CString::new("/sys/fs/cgroup/cgroup.subtree_control") else {
263        return false;
264    };
265    // SAFETY: access(2) is a read-only syscall that takes a pointer to a
266    // NUL-terminated C string. The kernel does not retain the pointer.
267    #[allow(unsafe_code)]
268    let rc = unsafe { libc::access(path.as_ptr(), libc::W_OK) };
269    rc == 0
270}
271
/// Non-Linux fallback: there is no cgroup root to write to, so this is
/// always `false`.
#[cfg(not(target_os = "linux"))]
fn probe_can_write_cgroup_root() -> bool {
    false
}
276
/// Reports whether `CAP_NET_ADMIN` is in the process's *effective*
/// capability set, according to the `CapEff:` line of `/proc/self/status`.
///
/// Returns `false` when the file cannot be read, when no `CapEff:` line is
/// present, or when its value is not valid hexadecimal.
#[cfg(target_os = "linux")]
fn probe_has_cap_net_admin() -> bool {
    // CAP_NET_ADMIN is capability number 12, i.e. bit 12 of the mask.
    // The EFFECTIVE set (`CapEff`) is what matters here, not the bounding
    // set (`CapBnd`): a regular user process has a full CapBnd by default but
    // empty CapPrm/CapEff, so a PR_CAPBSET_READ-style check would wrongly
    // claim the daemon can create TUN/WG interfaces.
    const NET_ADMIN_MASK: u64 = 1 << 12;

    let status = match std::fs::read_to_string("/proc/self/status") {
        Ok(text) => text,
        Err(_) => return false,
    };

    status
        .lines()
        .find_map(|entry| entry.strip_prefix("CapEff:"))
        .and_then(|hex| u64::from_str_radix(hex.trim(), 16).ok())
        .is_some_and(|effective| effective & NET_ADMIN_MASK != 0)
}
299
/// Non-Linux fallback: Linux capabilities do not exist here, so this is
/// always `false`.
#[cfg(not(target_os = "linux"))]
fn probe_has_cap_net_admin() -> bool {
    false
}
304
305#[cfg(target_os = "linux")]
306fn probe_tun_device_available() -> bool {
307    use std::os::unix::fs::OpenOptionsExt;
308
309    // Opening /dev/net/tun without any ioctls is benign and does not allocate
310    // a TUN interface. The fd is dropped immediately when this scope ends.
311    // Any open error — missing device, no perms, kernel module not loaded,
312    // FD exhaustion — means we can't actually use TUN. Treat as unavailable.
313    std::fs::OpenOptions::new()
314        .read(true)
315        .write(true)
316        .custom_flags(libc::O_NONBLOCK)
317        .open("/dev/net/tun")
318        .is_ok()
319}
320
/// Non-Linux fallback: `/dev/net/tun` is not probed, so this is always
/// `false`.
#[cfg(not(target_os = "linux"))]
fn probe_tun_device_available() -> bool {
    false
}
325
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: probing must succeed on any host, and the `is_nested`
    /// flag must mirror the presence of a cgroup parent.
    #[test]
    fn probe_does_not_panic_and_is_nested_agrees_with_cgroup_parent() {
        let snapshot = DaemonCapabilities::probe();
        assert_eq!(snapshot.is_nested, snapshot.cgroup_parent.is_some());
    }

    /// The actual capability state depends on how the test is run (regular
    /// user vs root vs setcap'd binary); what must hold regardless is that
    /// the probe agrees with the `CapEff` line of `/proc/self/status`.
    #[cfg(target_os = "linux")]
    #[test]
    fn probe_has_cap_net_admin_matches_cap_eff() {
        let proc_status = std::fs::read_to_string("/proc/self/status").unwrap();
        let raw_line = proc_status
            .lines()
            .find(|candidate| candidate.starts_with("CapEff:"))
            .expect("CapEff: present in /proc/self/status");
        let digits = raw_line.trim_start_matches("CapEff:").trim();
        let mask = u64::from_str_radix(digits, 16).unwrap();
        let net_admin_effective = (mask & (1u64 << 12)) != 0;
        assert_eq!(super::probe_has_cap_net_admin(), net_admin_effective);
    }

    /// Stand-alone copy of the mode decision made in `probe()`, so the
    /// tables below can exercise it without depending on the capabilities
    /// of the machine running the tests.
    #[allow(clippy::fn_params_excessive_bools)]
    fn classify(
        is_nested: bool,
        can_write_cgroup_root: bool,
        has_cap_net_admin: bool,
        tun_device_available: bool,
        cgroup_parent_is_some: bool,
    ) -> DaemonMode {
        let everything_available =
            !is_nested && can_write_cgroup_root && has_cap_net_admin && tun_device_available;
        let has_cgroup_anchor = can_write_cgroup_root || cgroup_parent_is_some;
        match (everything_available, has_cgroup_anchor) {
            (true, _) => DaemonMode::Full,
            (false, true) => DaemonMode::NestedAdaptive,
            (false, false) => DaemonMode::Degraded,
        }
    }

    #[test]
    fn effective_mode_full_requires_all_four_signals() {
        // Full: every signal must be set the right way.
        assert_eq!(
            classify(false, true, true, true, false),
            DaemonMode::Full,
            "all four signals set should be Full"
        );

        // Drop any single signal and Full must no longer apply.
        let one_signal_dropped = [
            (true, true, true, true, true),
            (false, false, true, true, false),
            (false, true, false, true, false),
            (false, true, true, false, false),
        ];
        for (nested, cgroup_rw, net_admin, tun, has_parent) in one_signal_dropped {
            assert_ne!(
                classify(nested, cgroup_rw, net_admin, tun, has_parent),
                DaemonMode::Full
            );
        }
    }

    #[test]
    fn effective_mode_nested_adaptive_when_writable_or_has_parent() {
        let adaptive_cases = [
            // Writable root but missing other Full signals.
            (false, true, false, false, false),
            // Nested under a parent cgroup, no other signals.
            (true, false, false, false, true),
        ];
        for (nested, cgroup_rw, net_admin, tun, has_parent) in adaptive_cases {
            assert_eq!(
                classify(nested, cgroup_rw, net_admin, tun, has_parent),
                DaemonMode::NestedAdaptive
            );
        }
    }

    #[test]
    fn effective_mode_degraded_when_no_writable_path() {
        let degraded_cases = [
            // No root write, no parent, nothing usable.
            (false, false, false, false, false),
            // is_nested=true but no parent and no root write — still
            // Degraded: the is_nested signal alone, without a resolved
            // parent, gives no writable cgroup to anchor under.
            (true, false, false, false, false),
        ];
        for (nested, cgroup_rw, net_admin, tun, has_parent) in degraded_cases {
            assert_eq!(
                classify(nested, cgroup_rw, net_admin, tun, has_parent),
                DaemonMode::Degraded
            );
        }
    }

    #[test]
    fn serializes_round_trip_via_serde_json() {
        let original = DaemonCapabilities::probe();
        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded: DaemonCapabilities = serde_json::from_str(&encoded).expect("deserialize");

        assert_eq!(decoded.is_root, original.is_root);
        assert_eq!(decoded.is_nested, original.is_nested);
        assert_eq!(decoded.cgroup_parent, original.cgroup_parent);
        assert_eq!(decoded.can_write_cgroup_root, original.can_write_cgroup_root);
        assert_eq!(decoded.has_cap_net_admin, original.has_cap_net_admin);
        assert_eq!(decoded.tun_device_available, original.tun_device_available);
        assert_eq!(decoded.effective_mode, original.effective_mode);
    }

    #[test]
    fn daemon_mode_serde_uses_snake_case() {
        let wire_forms = [
            (DaemonMode::Full, "\"full\""),
            (DaemonMode::NestedAdaptive, "\"nested_adaptive\""),
            (DaemonMode::Degraded, "\"degraded\""),
        ];
        for (mode, expected_json) in wire_forms {
            assert_eq!(serde_json::to_string(&mode).unwrap(), expected_json);
        }
    }

    #[cfg(target_os = "linux")]
    mod target_parent {
        use super::super::compute_target_parent;

        #[test]
        fn idempotent_when_already_under_init() {
            let cases = [
                // Pre-fix path: scope is the systemd-run scope itself.
                (
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope",
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope/containers",
                ),
                // Already migrated: scope ends with /init — strip and re-anchor.
                (
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope/init",
                    "/user.slice/user-1000.slice/user@1000.service/app.slice/run-p123.scope/containers",
                ),
                // Trailing slash on the plain scope form.
                ("/foo/bar/", "/foo/bar/containers"),
                // Short `/init` form.
                ("/foo/bar/init", "/foo/bar/containers"),
            ];
            for (scope, expected_parent) in cases {
                assert_eq!(compute_target_parent(scope), expected_parent);
            }
        }
    }

    #[cfg(target_os = "linux")]
    mod cgroup_parser {
        use super::super::parse_cgroup_v2_line;

        #[test]
        fn parse_cgroup_v2_root_returns_none() {
            let parsed = parse_cgroup_v2_line("0::/\n");
            assert_eq!(parsed, None);
        }

        #[test]
        fn parse_cgroup_v2_path_returns_some() {
            let parsed = parse_cgroup_v2_line("0::/system.slice/forgejo-runner.service\n");
            assert_eq!(
                parsed,
                Some("/system.slice/forgejo-runner.service".to_string())
            );
        }

        #[test]
        fn parse_cgroup_v2_hybrid_finds_v2_line() {
            let hybrid = "12:devices:/user.slice\n11:memory:/user.slice\n0::/foo\n";
            let parsed = parse_cgroup_v2_line(hybrid);
            assert_eq!(parsed, Some("/foo".to_string()));
        }

        #[test]
        fn parse_cgroup_v2_no_newline() {
            let parsed = parse_cgroup_v2_line("0::/bar");
            assert_eq!(parsed, Some("/bar".to_string()));
        }

        #[test]
        fn parse_cgroup_v2_missing_returns_none() {
            let parsed = parse_cgroup_v2_line("");
            assert_eq!(parsed, None);
        }
    }
}