Skip to main content

solti_exec/utils/
security.rs

1//! # Security: process-level hardening for subprocess runners.
2//!
3//! [`SecurityConfig`] restricts the privilege set of child processes spawned by subprocess runners.
4//!
5//! **Linux:**
6//! - Drop all process capabilities in one batch (`capget` → mask → `capset`)
7//! - Zero heap allocation in the child (closure captures only `Copy` types)
8//! - Raise kept caps in the ambient set for unprivileged `execve`
9//! - Keep an optional allowlist of caps via [`LinuxCapability`]
10//! - Set `no_new_privs` to block suid/sgid escalation
11//!
12//! **Other platforms:**
13//! - `tracing::warn` and no-op.
14//!
15//! ## Also
16//!
17//! - [`SubprocessBackendConfig`](crate::subprocess::SubprocessBackendConfig) builder that consumes `SecurityConfig`.
18//! - [`LinuxCapability`](super::LinuxCapability) capability identifiers for the keep list.
19//!
20//! ## What happens when a subprocess spawns
21//! ```text
22//!                        parent process
23//!                             │
24//!                           fork()
25//!                             │
26//!          ┌──────────────────┼───────────────────┐
27//!          │            child process             │
28//!          │                                      │
29//!          │  ┌── pre_exec hook ───────────────┐  │
30//!          │  │  1. clear ambient caps         │  │
31//!          │  │  2. capget current caps        │  │
32//!          │  │  3. mask &= keep_mask          │  │
33//!          │  │  4. capset (one syscall)       │  │
34//!          │  │  5. raise kept in ambient      │  │
35//!          │  │  6. set no_new_privs           │  │
36//!          │  └────────────────────────────────┘  │
37//!          │                                      │
38//!          │  execve("echo", ["hello"])           │
39//!          │  (runs with minimal caps)            │
40//!          └──────────────────────────────────────┘
41//! ```
42//!
43//! ## How attach_security works
44//! ```text
45//! attach_security(&mut cmd, &config)
46//!     ├──► config.is_empty()? → return early, no hook
47//!     │
48//!     ├──► Linux:
49//!     │     ├──► build KeepMask from config.keep_caps
50//!     │     │     └──► Vec<LinuxCapability> → [u32; 2] bitmask (Copy, stack-only)
51//!     │     │
52//!     │     └──► install pre_exec closure on Command
//!     │           └──► captures: drop_all_caps, no_new_privs, fail_on_cap_error (bools), keep_mask ([u32; 2])
54//!     │                zero heap: all Copy types
55//!     │
56//!     └──► non-Linux:
//!           └──► warn!("security settings ignored on {os}") → return (no hook installed)
58//! ```
59//!
60//! ## Capability drop: step by step
61//! ```text
62//! drop_capabilities_batch(keep_mask)
63//!     │
64//!     ├──► prctl(PR_CAP_AMBIENT, CLEAR_ALL)
65//!     │     └──► EINVAL? kernel < 4.3, no ambient — Ok, continue
66//!     │
67//!     ├──► capget() → read current caps into CapUserData[2]
68//!     │    ┌────────────────────────────────────────────────┐
69//!     │    │  before mask             after mask            │
70//!     │    │  effective:  1111        effective:  0010      │
71//!     │    │  permitted:  1111        permitted:  0010      │
72//!     │    │  inheritable:1111        inheritable:0010      │
73//!     │    │                                                │
74//!     │    │  keep_mask = 0010 (only CAP_NET_BIND_SERVICE)  │
75//!     │    └────────────────────────────────────────────────┘
76//!     │
77//!     ├──► capset() ← one syscall writes all caps
78//!     │
79//!     └──► for each cap set in keep_mask:
80//!           └──► prctl(PR_CAP_AMBIENT, RAISE, cap)
81//!                EINVAL | EPERM → Ok (best-effort, older kernel or no permission)
82//! ```
83//!
84//! ## KeepMask layout
85//! ```text
86//! Linux capability v3 format: CapUserData[2] = 2 × u32 = 64 bits
87//!
88//!   bits[0]                          bits[1]
89//!   ┌─────────────────────────────┐  ┌─────────────────────────────┐
90//!   │ cap 0  cap 1 ... cap 31     │  │ cap 32  cap 33 ... cap 63   │
91//!   └─────────────────────────────┘  └─────────────────────────────┘
92//!
93//!   CAP_LAST_CAP = 63 — this is NOT a guess, it's the v3 ABI limit.
94//!   If kernel ever adds cap > 63, that requires a v4 format with
95//!   new structs and new syscall signatures — this whole module
96//!   would need updating anyway.
97//! ```
98//!
99//! ## Configuration
100//!
101//! | Field               | What it does                          | Needs privileges? | If it fails                                    |
102//! |---------------------|---------------------------------------|-------------------|------------------------------------------------|
103//! | `drop_all_caps`     | strip all caps except `keep_caps`     | `CAP_SETPCAP`     | logs warning, go on (or abort if strict)       |
104//! | `keep_caps`         | allowlist: caps to preserve           | `CAP_SETPCAP`     | logs warning, go on (or abort if strict)       |
105//! | `fail_on_cap_error` | strict mode: abort spawn on cap error | —                 | —                                              |
106//! | `no_new_privs`      | block suid/sgid privilege escalation  | none (any user)   | **always aborts spawn**                        |
107//!
108//! ## Async-signal safety
109//!
110//! Everything inside the `pre_exec` closure runs **between `fork()` and `execve()`**.
111//! POSIX says only async-signal-safe functions are allowed there.
112//!
113//! | What we call                 | Why it's safe                              |
114//! |------------------------------|--------------------------------------------|
115//! | `prctl()`                    | direct syscall                             |
116//! | `capget()` / `capset()`      | direct syscalls                            |
117//! | `libc::write(STDERR)`        | async-signal-safe per POSIX                |
118//! | `io::Error::last_os_error()` | reads `errno`, no heap (Rust ≥ 1.74)       |
119//!
//! The closure captures **only `Copy` types** (3 bools + `[u32; 2]`).
121//! No `Vec`, no `String`, no `Arc`: zero heap allocation in the child.
122//!
123//! ## Rules
124//! - Capability drop failures are **non-fatal** by default (logged via `pre_exec_log`, continues)
125//! - Set `fail_on_cap_error = true` to make capability drop failures **fatal** (aborts spawn)
126//! - Non-Linux: all knobs are no-op, warning emitted via `tracing::warn`
127//! - `no_new_privs` failure is **always fatal** (returns `Err`, `Command::spawn` fails)
128//! - `KeepMask` is built **before** fork (safe to iterate `Vec<LinuxCapability>`)
129//! - `SecurityConfig::is_empty()` → no hook installed, zero overhead
130use tokio::process::Command;
131
132use crate::utils::LinuxCapability;
133
134#[cfg(not(target_os = "linux"))]
135use tracing::warn;
136
137/// Declarative security policy.
138#[derive(Debug, Clone, Default)]
139pub struct SecurityConfig {
140    /// Drop all capabilities before exec.
141    ///
142    /// Note: capability operations require CAP_SETPCAP or root.
143    /// If the process lacks these privileges, the operation will log a warning and continue (unless `fail_on_cap_error` is set).
144    pub drop_all_caps: bool,
145    /// Optional allowlist of capabilities to keep after `drop_all_caps`.
146    ///
147    /// Only meaningful when `drop_all_caps = true`.
148    pub keep_caps: Vec<LinuxCapability>,
149    /// Enable `no_new_privs` for the child process.
150    ///
151    /// This flag works without root privileges.
152    /// Failures to set this flag are always fatal (spawn will fail).
153    pub no_new_privs: bool,
154    /// When `true`, capability drop failures abort the spawn instead of logging and continuing.
155    ///
156    /// Default: `false` (best-effort — non-fatal).
157    pub fail_on_cap_error: bool,
158}
159
160impl SecurityConfig {
161    /// Returns `true` if no security knobs are configured.
162    #[inline]
163    pub fn is_empty(&self) -> bool {
164        !self.drop_all_caps && self.keep_caps.is_empty() && !self.no_new_privs
165    }
166}
167
168/// Attach security policy to a `tokio::process::Command`.
169pub fn attach_security(cmd: &mut Command, config: &SecurityConfig) {
170    if config.is_empty() {
171        return;
172    }
173
174    #[cfg(target_os = "linux")]
175    {
176        linux_impl::attach(cmd, config);
177    }
178    #[cfg(not(target_os = "linux"))]
179    {
180        let _ = &cmd;
181        warn!(
182            ?config,
183            "security configuration is only enforced on Linux; current OS={}: settings will be ignored",
184            std::env::consts::OS,
185        );
186    }
187}
188
189#[cfg(target_os = "linux")]
190mod linux_impl {
191    use super::{KeepMask, SecurityConfig};
192
193    use crate::utils::log::{pre_exec_log, pre_exec_log_errno};
194    use std::io;
195    use tokio::process::Command;
196
197    const LINUX_CAPABILITY_VERSION_3: u32 = 0x2008_0522;
198    const PR_CAP_AMBIENT: libc::c_int = 47;
199    const PR_CAP_AMBIENT_RAISE: libc::c_ulong = 2;
200    const PR_CAP_AMBIENT_CLEAR_ALL: libc::c_ulong = 4;
201    const PR_SET_NO_NEW_PRIVS: libc::c_int = 38;
202    /// Upper bound of capability v3 bitmask: `CapUserData[2]` = 2 × 32 = 64 bits → caps 0..63.
203    /// This is a kernel ABI limit, not a guess. A v4 format would require new structs + syscall signatures.
204    const CAP_LAST_CAP: u32 = 63;
205
206    /// Install the `pre_exec` hook on the command.
207    ///
208    /// Caller (`attach_security`) already checked `!config.is_empty()`.
209    pub fn attach(cmd: &mut Command, config: &SecurityConfig) {
210        let keep_mask = KeepMask::from_caps(&config.keep_caps);
211        let fail_on_cap_error = config.fail_on_cap_error;
212        let drop_all_caps = config.drop_all_caps;
213        let no_new_privs = config.no_new_privs;
214
215        // SAFETY:
216        // The pre_exec closure runs between fork() and execve() in the child process.
217        //
218        // It calls prctl, capget/capset (async-signal-safe syscalls) and pre_exec_log (raw libc::write).
219        // Error paths use io::Error::last_os_error() which stores errno inline without heap allocation (Rust >= 1.74).
220        //
221        // The closure captures only Copy types (three bools + [u32; 2]): zero heap allocation.
222        unsafe {
223            cmd.pre_exec(move || {
224                if drop_all_caps
225                    && let Err(e) = drop_capabilities_batch(keep_mask)
226                    && fail_on_cap_error
227                {
228                    return Err(e);
229                }
230                if no_new_privs {
231                    apply_no_new_privs()?;
232                }
233                Ok(())
234            });
235        }
236    }
237
238    /// Drop all capabilities except those in `keep_mask`, using batch capget/capset.
239    ///
240    /// Each step logs a distinct prefix on failure so the operator can tell which syscall failed (clear_ambient / capget / capset).
241    fn drop_capabilities_batch(keep_mask: KeepMask) -> io::Result<()> {
242        if let Err(e) = clear_ambient_caps() {
243            pre_exec_log(b"solti-exec: clear_ambient_caps failed: ");
244            if let Some(code) = e.raw_os_error() {
245                pre_exec_log_errno(code);
246            }
247            return Err(e);
248        }
249
250        let mut header = CapUserHeader {
251            version: LINUX_CAPABILITY_VERSION_3,
252            pid: 0,
253        };
254        let mut data = [CapUserData::default(); 2];
255
256        // SAFETY:
257        // Header and data are valid stack-local #[repr(C)] structs matching the kernel's
258        // __user_cap_header_struct / __user_cap_data_struct layout.
259        if unsafe { capget(&mut header, data.as_mut_ptr()) } != 0 {
260            let e = io::Error::last_os_error();
261            pre_exec_log(b"solti-exec: capget failed: ");
262            if let Some(code) = e.raw_os_error() {
263                pre_exec_log_errno(code);
264            }
265            return Err(e);
266        }
267
268        data[0].effective &= keep_mask.bits[0];
269        data[0].permitted &= keep_mask.bits[0];
270        data[0].inheritable &= keep_mask.bits[0];
271        data[1].effective &= keep_mask.bits[1];
272        data[1].permitted &= keep_mask.bits[1];
273        data[1].inheritable &= keep_mask.bits[1];
274
275        // SAFETY:
276        // Same structs, modified in-place.
277        // Single capset writes the new state.
278        if unsafe { capset(&mut header, data.as_ptr()) } != 0 {
279            let e = io::Error::last_os_error();
280            pre_exec_log(b"solti-exec: capset failed: ");
281            if let Some(code) = e.raw_os_error() {
282                pre_exec_log_errno(code);
283            }
284            return Err(e);
285        }
286
287        for cap_value in 0..=CAP_LAST_CAP {
288            if keep_mask.is_set(cap_value) {
289                let _ = raise_ambient_cap(cap_value);
290            }
291        }
292
293        Ok(())
294    }
295
296    /// Clear all ambient capabilities.
297    fn clear_ambient_caps() -> io::Result<()> {
298        let rc = unsafe { libc::prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) };
299        if rc != 0 {
300            let err = io::Error::last_os_error();
301            if err.raw_os_error() != Some(libc::EINVAL) {
302                return Err(err);
303            }
304        }
305
306        Ok(())
307    }
308
309    /// Raise a capability in the ambient set (best-effort).
310    ///
311    /// Returns `Ok(())` for `EINVAL` and `EPERM` (expected on older kernels or when lacking `CAP_SETPCAP`).
312    /// Other errors propagate, but the caller ignores the result with `let _ =`.
313    fn raise_ambient_cap(cap: u32) -> io::Result<()> {
314        let rc = unsafe { libc::prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) };
315        if rc != 0 {
316            let err = io::Error::last_os_error();
317            match err.raw_os_error() {
318                Some(libc::EINVAL) | Some(libc::EPERM) => return Ok(()),
319                _ => return Err(err),
320            }
321        }
322        Ok(())
323    }
324
325    fn apply_no_new_privs() -> io::Result<()> {
326        let rc = unsafe { libc::prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
327        if rc != 0 {
328            Err(io::Error::last_os_error())
329        } else {
330            Ok(())
331        }
332    }
333
334    #[repr(C)]
335    struct CapUserHeader {
336        version: u32,
337        pid: libc::c_int,
338    }
339
340    #[repr(C)]
341    #[derive(Default, Clone, Copy)]
342    struct CapUserData {
343        effective: u32,
344        permitted: u32,
345        inheritable: u32,
346    }
347
348    unsafe extern "C" {
349        fn capset(hdrp: *mut CapUserHeader, datap: *const CapUserData) -> libc::c_int;
350        fn capget(hdrp: *mut CapUserHeader, datap: *mut CapUserData) -> libc::c_int;
351    }
352}
353
/// Copyable set of capability numbers that must survive a bulk capability drop.
///
/// Stored as two 32-bit words, the same split the kernel's v3 capability format uses:
/// the first word holds caps 0..31, the second caps 32..63.
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
#[derive(Clone, Copy)]
struct KeepMask {
    /// Bit `n % 32` of word `n / 32` stands for capability number `n`.
    bits: [u32; 2],
}
363
364#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
365impl KeepMask {
366    /// Build a keep-mask from a slice of capabilities.
367    fn from_caps(caps: &[LinuxCapability]) -> Self {
368        let mut bits = [0u32; 2];
369        for cap in caps {
370            let v = cap.to_cap_value();
371            let idx = (v / 32) as usize;
372            if idx < 2 {
373                bits[idx] |= 1u32 << (v % 32);
374            }
375        }
376        Self { bits }
377    }
378
379    /// Returns `true` if the given capability number is set in the mask.
380    fn is_set(self, cap: u32) -> bool {
381        let idx = (cap / 32) as usize;
382        if idx >= 2 {
383            return false;
384        }
385        (self.bits[idx] & (1u32 << (cap % 32))) != 0
386    }
387}
388
/// Unit tests for the public attach path and for the `KeepMask` bit layout.
#[cfg(test)]
mod tests {
    use super::*;
    use tokio::process::Command;

    /// An all-default config is empty and attaching it must not panic.
    #[test]
    fn empty_config_is_noop() {
        let cfg = SecurityConfig::default();
        assert!(cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    /// Attaching a fully populated config on Linux must not panic (the command is not spawned).
    #[cfg(target_os = "linux")]
    #[test]
    fn non_empty_config_attaches_pre_exec_hook_on_linux() {
        let cfg = SecurityConfig {
            drop_all_caps: true,
            keep_caps: vec![LinuxCapability::NetAdmin, LinuxCapability::NetBindService],
            no_new_privs: true,
            ..Default::default()
        };

        assert!(!cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    /// Off Linux a non-empty config is accepted and only produces a warning.
    #[cfg(not(target_os = "linux"))]
    #[test]
    fn non_empty_config_is_ignored_on_non_linux() {
        let cfg = SecurityConfig {
            drop_all_caps: true,
            keep_caps: vec![LinuxCapability::NetAdmin],
            no_new_privs: true,
            ..Default::default()
        };

        assert!(!cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    #[test]
    fn capability_names_are_correct() {
        assert_eq!(LinuxCapability::NetAdmin.name(), "NET_ADMIN");
        assert_eq!(LinuxCapability::SysAdmin.name(), "SYS_ADMIN");
        assert_eq!(LinuxCapability::Chown.name(), "CHOWN");
    }

    /// End-to-end: spawning `true` with only `no_new_privs` set must succeed.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    async fn no_new_privs_can_be_set_without_root() {
        let cfg = SecurityConfig {
            no_new_privs: true,
            ..Default::default()
        };
        let mut cmd = Command::new("true");
        attach_security(&mut cmd, &cfg);

        let result = cmd.status().await;
        assert!(result.is_ok(), "no_new_privs should work without root");
        assert!(result.unwrap().success());
    }

    #[test]
    fn keep_mask_empty_caps_all_zero() {
        let m = KeepMask::from_caps(&[]);
        assert_eq!(m.bits, [0, 0]);
        for cap in 0..=63 {
            assert!(!m.is_set(cap), "cap {cap} should not be set");
        }
    }

    #[test]
    fn keep_mask_single_low_cap() {
        let m = KeepMask::from_caps(&[LinuxCapability::Chown]);
        assert!(m.is_set(0));
        assert!(!m.is_set(1));
        assert_eq!(m.bits[0], 1);
        assert_eq!(m.bits[1], 0);
    }

    /// Caps 19 and 31 both live in the first word; 31 is its highest bit.
    /// The second word must stay untouched.
    #[test]
    fn keep_mask_top_bit_of_first_word() {
        let m = KeepMask::from_caps(&[LinuxCapability::SetFCap, LinuxCapability::SysPtrace]);
        assert!(m.is_set(31));
        assert!(m.is_set(19));
        assert!(!m.is_set(0));
        assert!(!m.is_set(32));
        assert_eq!(m.bits[1], 0);
    }

    /// Exercises lookups in the second word (caps 32..63).
    ///
    /// The mask is built from raw bits so the test does not depend on which
    /// high-numbered variants `LinuxCapability` exposes.
    #[test]
    fn keep_mask_cap_in_second_word() {
        let m = KeepMask {
            bits: [0, 0b101 | (1u32 << 31)],
        };
        assert!(m.is_set(32));
        assert!(!m.is_set(33));
        assert!(m.is_set(34));
        assert!(m.is_set(63));
        assert!(!m.is_set(62));
        // Nothing leaks into the first word.
        assert!(!m.is_set(0));
        assert!(!m.is_set(2));
        assert!(!m.is_set(31));
    }

    #[test]
    fn keep_mask_multiple_caps() {
        let caps = [
            LinuxCapability::Chown,          // 0
            LinuxCapability::NetBindService, // 10
            LinuxCapability::NetAdmin,       // 12
            LinuxCapability::SysAdmin,       // 21
        ];
        let m = KeepMask::from_caps(&caps);
        assert!(m.is_set(0));
        assert!(m.is_set(10));
        assert!(m.is_set(12));
        assert!(m.is_set(21));
        assert!(!m.is_set(1));
        assert!(!m.is_set(11));
        assert!(!m.is_set(63));
    }

    #[test]
    fn keep_mask_duplicate_caps_idempotent() {
        let m1 = KeepMask::from_caps(&[LinuxCapability::Kill]);
        let m2 = KeepMask::from_caps(&[LinuxCapability::Kill, LinuxCapability::Kill]);
        assert_eq!(m1.bits, m2.bits);
    }

    #[test]
    fn keep_mask_out_of_range_returns_false() {
        let m = KeepMask::from_caps(&[LinuxCapability::Chown]);
        assert!(!m.is_set(64));
        assert!(!m.is_set(100));
        assert!(!m.is_set(u32::MAX));
    }
}