Skip to main content

solti_exec/utils/
security.rs

1//! # Security: process-level hardening for subprocess runners.
2//!
3//! [`SecurityConfig`] restricts the privilege set of child processes spawned by subprocess runners.
4//!
5//! **Linux:**
6//! - Drop all process capabilities in one batch (`capget` → mask → `capset`)
7//! - Zero heap allocation in the child (closure captures only `Copy` types)
8//! - Raise kept caps in the ambient set for unprivileged `execve`
9//! - Keep an optional allowlist of caps via [`LinuxCapability`]
10//! - Set `no_new_privs` to block suid/sgid escalation
11//!
12//! **Other platforms:**
13//! - `tracing::warn` and no-op.
14//!
15//! ## Also
16//!
17//! - [`SubprocessBackendConfig`](crate::subprocess::SubprocessBackendConfig) builder that consumes `SecurityConfig`.
18//! - [`LinuxCapability`](super::LinuxCapability) capability identifiers for the keep list.
19//!
20//! ## What happens when a subprocess spawns
21//! ```text
22//!                        parent process
23//!                             │
24//!                           fork()
25//!                             │
26//!          ┌──────────────────┼───────────────────┐
27//!          │            child process             │
28//!          │                                      │
29//!          │  ┌── pre_exec hook ───────────────┐  │
30//!          │  │  1. clear ambient caps         │  │
31//!          │  │  2. capget current caps        │  │
32//!          │  │  3. mask &= keep_mask          │  │
33//!          │  │  4. capset (one syscall)       │  │
34//!          │  │  5. raise kept in ambient      │  │
35//!          │  │  6. set no_new_privs           │  │
36//!          │  └────────────────────────────────┘  │
37//!          │                                      │
38//!          │  execve("echo", ["hello"])           │
39//!          │  (runs with minimal caps)            │
40//!          └──────────────────────────────────────┘
41//! ```
42//!
43//! ## How attach_security works
44//! ```text
45//! attach_security(&mut cmd, &config)
46//!     ├──► config.is_empty()? → return early, no hook
47//!     │
48//!     ├──► Linux:
49//!     │     ├──► build KeepMask from config.keep_caps
50//!     │     │     └──► Vec<LinuxCapability> → [u32; 2] bitmask (Copy, stack-only)
51//!     │     │
52//!     │     └──► install pre_exec closure on Command
//!     │           └──► captures: drop_all_caps, no_new_privs, fail_on_cap_error (bools), keep_mask ([u32; 2])
54//!     │                zero heap: all Copy types
55//!     │
56//!     └──► non-Linux:
//!           └──► warn!("security settings ignored on {os}") → return (no hook installed)
58//! ```
59//!
60//! ## Capability drop: step by step
61//! ```text
62//! drop_capabilities_batch(keep_mask)
63//!     │
64//!     ├──► prctl(PR_CAP_AMBIENT, CLEAR_ALL)
65//!     │     └──► EINVAL? kernel < 4.3, no ambient - Ok, continue
66//!     │
67//!     ├──► capget() → read current caps into CapUserData[2]
68//!     │    ┌────────────────────────────────────────────────┐
69//!     │    │  before mask             after mask            │
70//!     │    │  effective:  1111        effective:  0010      │
71//!     │    │  permitted:  1111        permitted:  0010      │
72//!     │    │  inheritable:1111        inheritable:0010      │
73//!     │    │                                                │
74//!     │    │  keep_mask = 0010 (only CAP_NET_BIND_SERVICE)  │
75//!     │    └────────────────────────────────────────────────┘
76//!     │
77//!     ├──► capset() ← one syscall writes all caps
78//!     │
79//!     └──► for each cap set in keep_mask:
80//!           └──► prctl(PR_CAP_AMBIENT, RAISE, cap)
81//!                EINVAL | EPERM → Ok (best-effort, older kernel or no permission)
82//! ```
83//!
84//! ## KeepMask layout
85//! ```text
86//! Linux capability v3 format: CapUserData[2] = 2 × u32 = 64 bits
87//!
88//!   bits[0]                          bits[1]
89//!   ┌─────────────────────────────┐  ┌─────────────────────────────┐
90//!   │ cap 0  cap 1 ... cap 31     │  │ cap 32  cap 33 ... cap 63   │
91//!   └─────────────────────────────┘  └─────────────────────────────┘
92//!
93//!   CAP_LAST_CAP = 63 - this is NOT a guess, it's the v3 ABI limit.
94//!   If kernel ever adds cap > 63, that requires a v4 format with new structs and new syscall signatures - this whole module would need updating anyway.
95//! ```
96//!
97//! ## Configuration
98//!
99//! | Field               | What it does                          | Needs privileges? | If it fails                                    |
100//! |---------------------|---------------------------------------|-------------------|------------------------------------------------|
101//! | `drop_all_caps`     | strip all caps except `keep_caps`     | `CAP_SETPCAP`     | logs warning, go on (or abort if strict)       |
102//! | `keep_caps`         | allowlist: caps to preserve           | `CAP_SETPCAP`     | logs warning, go on (or abort if strict)       |
103//! | `fail_on_cap_error` | strict mode: abort spawn on cap error | —                 | —                                              |
104//! | `no_new_privs`      | block suid/sgid privilege escalation  | none (any user)   | **always aborts spawn**                        |
105//!
106//! ## Async-signal safety
107//!
108//! Everything inside the `pre_exec` closure runs **between `fork()` and `execve()`**.
109//! POSIX says only async-signal-safe functions are allowed there.
110//!
111//! | What we call                 | Why it's safe                              |
112//! |------------------------------|--------------------------------------------|
113//! | `prctl()`                    | direct syscall                             |
114//! | `capget()` / `capset()`      | direct syscalls                            |
115//! | `libc::write(STDERR)`        | async-signal-safe per POSIX                |
116//! | `io::Error::last_os_error()` | reads `errno`, no heap (Rust ≥ 1.74)       |
117//!
//! The closure captures **only `Copy` types** (3 bools + `[u32; 2]`).
119//! No `Vec`, no `String`, no `Arc`: zero heap allocation in the child.
120//!
121//! ## Rules
122//! - Capability drop failures are **non-fatal** by default (logged via `pre_exec_log`, continues)
123//! - Set `fail_on_cap_error = true` to make capability drop failures **fatal** (aborts spawn)
124//! - Non-Linux: all knobs are no-op, warning emitted via `tracing::warn`
125//! - `no_new_privs` failure is **always fatal** (returns `Err`, `Command::spawn` fails)
126//! - `KeepMask` is built **before** fork (safe to iterate `Vec<LinuxCapability>`)
127//! - `SecurityConfig::is_empty()` → no hook installed, zero overhead
128use tokio::process::Command;
129
130use crate::utils::LinuxCapability;
131
132#[cfg(not(target_os = "linux"))]
133use tracing::warn;
134
/// Declarative security policy.
///
/// Passed by reference to [`attach_security`], which installs a `pre_exec` hook on Linux and
/// only emits a warning (settings ignored) on every other OS.
/// The derived `Default` leaves every knob off, which [`SecurityConfig::is_empty`] reports as
/// "nothing to do".
#[derive(Debug, Clone, Default)]
pub struct SecurityConfig {
    /// Drop all capabilities before exec.
    ///
    /// Note: capability operations require CAP_SETPCAP or root.
    /// If the process lacks these privileges, the operation will log a warning and continue (unless `fail_on_cap_error` is set).
    pub drop_all_caps: bool,
    /// Optional allowlist of capabilities to keep after `drop_all_caps`.
    ///
    /// Only meaningful when `drop_all_caps = true`: the list is turned into a keep-mask, but the
    /// mask is only applied when the drop itself runs.
    /// A non-empty list still makes [`SecurityConfig::is_empty`] return `false`.
    pub keep_caps: Vec<LinuxCapability>,
    /// Enable `no_new_privs` for the child process.
    ///
    /// This flag works without root privileges.
    /// Failures to set this flag are always fatal (spawn will fail).
    pub no_new_privs: bool,
    /// When `true`, capability drop failures abort the spawn instead of logging and continuing.
    ///
    /// Default: `false` (best-effort - non-fatal).
    /// This is a modifier only: it is not consulted by [`SecurityConfig::is_empty`].
    pub fail_on_cap_error: bool,
}
157
158impl SecurityConfig {
159    /// Returns `true` if no security knobs are configured.
160    #[inline]
161    pub fn is_empty(&self) -> bool {
162        !self.drop_all_caps && self.keep_caps.is_empty() && !self.no_new_privs
163    }
164}
165
166/// Attach security policy to a `tokio::process::Command`.
167pub fn attach_security(cmd: &mut Command, config: &SecurityConfig) {
168    if config.is_empty() {
169        return;
170    }
171
172    #[cfg(target_os = "linux")]
173    {
174        linux_impl::attach(cmd, config);
175    }
176    #[cfg(not(target_os = "linux"))]
177    {
178        let _ = &cmd;
179        warn!(
180            ?config,
181            "security configuration is only enforced on Linux; current OS={}: settings will be ignored",
182            std::env::consts::OS,
183        );
184    }
185}
186
187#[cfg(target_os = "linux")]
188mod linux_impl {
189    use super::{KeepMask, SecurityConfig};
190
191    use crate::utils::log::{pre_exec_log, pre_exec_log_errno};
192    use std::io;
193    use tokio::process::Command;
194
195    const LINUX_CAPABILITY_VERSION_3: u32 = 0x2008_0522;
196    const PR_CAP_AMBIENT: libc::c_int = 47;
197    const PR_CAP_AMBIENT_RAISE: libc::c_ulong = 2;
198    const PR_CAP_AMBIENT_CLEAR_ALL: libc::c_ulong = 4;
199    const PR_SET_NO_NEW_PRIVS: libc::c_int = 38;
200    /// Upper bound of capability v3 bitmask: `CapUserData[2]` = 2 × 32 = 64 bits → caps 0..63.
201    /// This is a kernel ABI limit, not a guess. A v4 format would require new structs + syscall signatures.
202    const CAP_LAST_CAP: u32 = 63;
203
204    /// Install the `pre_exec` hook on the command.
205    ///
206    /// Caller (`attach_security`) already checked `!config.is_empty()`.
207    pub fn attach(cmd: &mut Command, config: &SecurityConfig) {
208        let keep_mask = KeepMask::from_caps(&config.keep_caps);
209        let fail_on_cap_error = config.fail_on_cap_error;
210        let drop_all_caps = config.drop_all_caps;
211        let no_new_privs = config.no_new_privs;
212
213        // SAFETY:
214        // The pre_exec closure runs between fork() and execve() in the child process.
215        //
216        // It calls prctl, capget/capset (async-signal-safe syscalls) and pre_exec_log (raw libc::write).
217        // Error paths use io::Error::last_os_error() which stores errno inline without heap allocation (Rust >= 1.74).
218        //
219        // The closure captures only Copy types (three bools + [u32; 2]): zero heap allocation.
220        unsafe {
221            cmd.pre_exec(move || {
222                if drop_all_caps
223                    && let Err(e) = drop_capabilities_batch(keep_mask)
224                    && fail_on_cap_error
225                {
226                    return Err(e);
227                }
228                if no_new_privs {
229                    apply_no_new_privs()?;
230                }
231                Ok(())
232            });
233        }
234    }
235
236    /// Drop all capabilities except those in `keep_mask`, using batch capget/capset.
237    ///
238    /// Each step logs a distinct prefix on failure so the operator can tell which syscall failed (clear_ambient / capget / capset).
239    fn drop_capabilities_batch(keep_mask: KeepMask) -> io::Result<()> {
240        if let Err(e) = clear_ambient_caps() {
241            pre_exec_log(b"solti-exec: clear_ambient_caps failed: ");
242            if let Some(code) = e.raw_os_error() {
243                pre_exec_log_errno(code);
244            }
245            return Err(e);
246        }
247
248        let mut header = CapUserHeader {
249            version: LINUX_CAPABILITY_VERSION_3,
250            pid: 0,
251        };
252        let mut data = [CapUserData::default(); 2];
253
254        // SAFETY:
255        // Header and data are valid stack-local #[repr(C)] structs matching the kernel's
256        // __user_cap_header_struct / __user_cap_data_struct layout.
257        if unsafe { capget(&mut header, data.as_mut_ptr()) } != 0 {
258            let e = io::Error::last_os_error();
259            pre_exec_log(b"solti-exec: capget failed: ");
260            if let Some(code) = e.raw_os_error() {
261                pre_exec_log_errno(code);
262            }
263            return Err(e);
264        }
265
266        data[0].effective &= keep_mask.bits[0];
267        data[0].permitted &= keep_mask.bits[0];
268        data[0].inheritable &= keep_mask.bits[0];
269        data[1].effective &= keep_mask.bits[1];
270        data[1].permitted &= keep_mask.bits[1];
271        data[1].inheritable &= keep_mask.bits[1];
272
273        // SAFETY:
274        // Same structs, modified in-place.
275        // Single capset writes the new state.
276        if unsafe { capset(&mut header, data.as_ptr()) } != 0 {
277            let e = io::Error::last_os_error();
278            pre_exec_log(b"solti-exec: capset failed: ");
279            if let Some(code) = e.raw_os_error() {
280                pre_exec_log_errno(code);
281            }
282            return Err(e);
283        }
284
285        for cap_value in 0..=CAP_LAST_CAP {
286            if keep_mask.is_set(cap_value) {
287                let _ = raise_ambient_cap(cap_value);
288            }
289        }
290
291        Ok(())
292    }
293
294    /// Clear all ambient capabilities.
295    fn clear_ambient_caps() -> io::Result<()> {
296        let rc = unsafe { libc::prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) };
297        if rc != 0 {
298            let err = io::Error::last_os_error();
299            if err.raw_os_error() != Some(libc::EINVAL) {
300                return Err(err);
301            }
302        }
303
304        Ok(())
305    }
306
307    /// Raise a capability in the ambient set (best-effort).
308    ///
309    /// Returns `Ok(())` for `EINVAL` and `EPERM` (expected on older kernels or when lacking `CAP_SETPCAP`).
310    /// Other errors propagate, but the caller ignores the result with `let _ =`.
311    fn raise_ambient_cap(cap: u32) -> io::Result<()> {
312        let rc = unsafe { libc::prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) };
313        if rc != 0 {
314            let err = io::Error::last_os_error();
315            match err.raw_os_error() {
316                Some(libc::EINVAL) | Some(libc::EPERM) => return Ok(()),
317                _ => return Err(err),
318            }
319        }
320        Ok(())
321    }
322
323    fn apply_no_new_privs() -> io::Result<()> {
324        let rc = unsafe { libc::prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
325        if rc != 0 {
326            Err(io::Error::last_os_error())
327        } else {
328            Ok(())
329        }
330    }
331
332    #[repr(C)]
333    struct CapUserHeader {
334        version: u32,
335        pid: libc::c_int,
336    }
337
338    #[repr(C)]
339    #[derive(Default, Clone, Copy)]
340    struct CapUserData {
341        effective: u32,
342        permitted: u32,
343        inheritable: u32,
344    }
345
346    unsafe extern "C" {
347        fn capset(hdrp: *mut CapUserHeader, datap: *const CapUserData) -> libc::c_int;
348        fn capget(hdrp: *mut CapUserHeader, datap: *mut CapUserData) -> libc::c_int;
349    }
350}
351
/// Bitmask of Linux capabilities to keep after a bulk drop.
///
/// Layout mirrors the kernel v3 capability format: two `u32` words covering caps 0..31 and 32..63 respectively.
///
/// The type is `Copy`, so the Linux `pre_exec` closure can capture it by value.
#[derive(Clone, Copy)]
// Outside Linux only the unit tests use this type, hence the dead-code allowance there.
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
struct KeepMask {
    /// `bits[0]` covers caps 0..31, `bits[1]` covers caps 32..63.
    bits: [u32; 2],
}
361
362#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
363impl KeepMask {
364    /// Build a keep-mask from a slice of capabilities.
365    fn from_caps(caps: &[LinuxCapability]) -> Self {
366        let mut bits = [0u32; 2];
367        for cap in caps {
368            let v = cap.to_cap_value();
369            let idx = (v / 32) as usize;
370            if idx < 2 {
371                bits[idx] |= 1u32 << (v % 32);
372            }
373        }
374        Self { bits }
375    }
376
377    /// Returns `true` if the given capability number is set in the mask.
378    fn is_set(self, cap: u32) -> bool {
379        let idx = (cap / 32) as usize;
380        if idx >= 2 {
381            return false;
382        }
383        (self.bits[idx] & (1u32 << (cap % 32))) != 0
384    }
385}
386
/// Unit tests for the policy type, the attach entry point and the `KeepMask` bit layout.
#[cfg(test)]
mod tests {
    use super::*;
    use tokio::process::Command;

    /// A default policy is empty, and attaching it must not panic.
    #[test]
    fn empty_config_is_noop() {
        let cfg = SecurityConfig::default();
        assert!(cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    /// Smoke test: attaching a non-empty policy on Linux must not panic (nothing is spawned).
    #[cfg(target_os = "linux")]
    #[test]
    fn non_empty_config_attaches_pre_exec_hook_on_linux() {
        let cfg = SecurityConfig {
            drop_all_caps: true,
            keep_caps: vec![LinuxCapability::NetAdmin, LinuxCapability::NetBindService],
            no_new_privs: true,
            ..Default::default()
        };

        assert!(!cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    /// Smoke test: on non-Linux targets a non-empty policy is accepted and only warned about.
    #[cfg(not(target_os = "linux"))]
    #[test]
    fn non_empty_config_is_ignored_on_non_linux() {
        let cfg = SecurityConfig {
            drop_all_caps: true,
            keep_caps: vec![LinuxCapability::NetAdmin],
            no_new_privs: true,
            ..Default::default()
        };

        assert!(!cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    #[test]
    fn capability_names_are_correct() {
        assert_eq!(LinuxCapability::NetAdmin.name(), "NET_ADMIN");
        assert_eq!(LinuxCapability::SysAdmin.name(), "SYS_ADMIN");
        assert_eq!(LinuxCapability::Chown.name(), "CHOWN");
    }

    /// End-to-end: spawns `true` with only `no_new_privs` set and expects a clean exit.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    async fn no_new_privs_can_be_set_without_root() {
        let cfg = SecurityConfig {
            no_new_privs: true,
            ..Default::default()
        };
        let mut cmd = Command::new("true");
        attach_security(&mut cmd, &cfg);

        let result = cmd.status().await;
        assert!(result.is_ok(), "no_new_privs should work without root");
        assert!(result.unwrap().success());
    }

    #[test]
    fn keep_mask_empty_caps_all_zero() {
        let m = KeepMask::from_caps(&[]);
        assert_eq!(m.bits, [0, 0]);
        for cap in 0..=63 {
            assert!(!m.is_set(cap), "cap {cap} should not be set");
        }
    }

    #[test]
    fn keep_mask_single_low_cap() {
        let m = KeepMask::from_caps(&[LinuxCapability::Chown]);
        assert!(m.is_set(0));
        assert!(!m.is_set(1));
        assert_eq!(m.bits[0], 1);
        assert_eq!(m.bits[1], 0);
    }

    /// Caps 19 and 31 both live in `bits[0]`; 31 is the top bit of the first word and must
    /// not spill into the second one.
    #[test]
    fn keep_mask_high_bit_of_first_word() {
        let m = KeepMask::from_caps(&[LinuxCapability::SetFCap, LinuxCapability::SysPtrace]);
        assert!(m.is_set(31));
        assert!(m.is_set(19));
        assert!(!m.is_set(0));
        assert_eq!(m.bits[0], (1u32 << 31) | (1u32 << 19));
        assert_eq!(m.bits[1], 0);
    }

    /// Exercises `bits[1]` (caps 32..63). The mask is built directly so the test does not
    /// depend on which capabilities above 31 `LinuxCapability` defines.
    #[test]
    fn keep_mask_cap_in_second_word() {
        let m = KeepMask {
            bits: [0, 1u32 | (1u32 << 31)],
        };
        assert!(m.is_set(32));
        assert!(m.is_set(63));
        assert!(!m.is_set(33));
        assert!(!m.is_set(62));
        assert!(!m.is_set(0));
        assert!(!m.is_set(31));
    }

    #[test]
    fn keep_mask_multiple_caps() {
        let caps = [
            LinuxCapability::Chown,          // 0
            LinuxCapability::NetBindService, // 10
            LinuxCapability::NetAdmin,       // 12
            LinuxCapability::SysAdmin,       // 21
        ];
        let m = KeepMask::from_caps(&caps);
        assert!(m.is_set(0));
        assert!(m.is_set(10));
        assert!(m.is_set(12));
        assert!(m.is_set(21));
        assert!(!m.is_set(1));
        assert!(!m.is_set(11));
        assert!(!m.is_set(63));
    }

    #[test]
    fn keep_mask_duplicate_caps_idempotent() {
        let m1 = KeepMask::from_caps(&[LinuxCapability::Kill]);
        let m2 = KeepMask::from_caps(&[LinuxCapability::Kill, LinuxCapability::Kill]);
        assert_eq!(m1.bits, m2.bits);
    }

    #[test]
    fn keep_mask_out_of_range_returns_false() {
        let m = KeepMask::from_caps(&[LinuxCapability::Chown]);
        assert!(!m.is_set(64));
        assert!(!m.is_set(100));
        assert!(!m.is_set(u32::MAX));
    }
}