solti_exec/utils/security.rs
1//! # Security: process-level hardening for subprocess runners.
2//!
3//! [`SecurityConfig`] restricts the privilege set of child processes spawned by subprocess runners.
4//!
5//! **Linux:**
6//! - Drop all process capabilities in one batch (`capget` → mask → `capset`)
7//! - Zero heap allocation in the child (closure captures only `Copy` types)
8//! - Raise kept caps in the ambient set for unprivileged `execve`
9//! - Keep an optional allowlist of caps via [`LinuxCapability`]
10//! - Set `no_new_privs` to block suid/sgid escalation
11//!
12//! **Other platforms:**
13//! - `tracing::warn` and no-op.
14//!
15//! ## Also
16//!
17//! - [`SubprocessBackendConfig`](crate::subprocess::SubprocessBackendConfig) builder that consumes `SecurityConfig`.
18//! - [`LinuxCapability`](super::LinuxCapability) capability identifiers for the keep list.
19//!
20//! ## What happens when a subprocess spawns
21//! ```text
22//! parent process
23//! │
24//! fork()
25//! │
26//! ┌──────────────────┼───────────────────┐
27//! │ child process │
28//! │ │
29//! │ ┌── pre_exec hook ───────────────┐ │
30//! │ │ 1. clear ambient caps │ │
31//! │ │ 2. capget current caps │ │
32//! │ │ 3. mask &= keep_mask │ │
33//! │ │ 4. capset (one syscall) │ │
34//! │ │ 5. raise kept in ambient │ │
35//! │ │ 6. set no_new_privs │ │
36//! │ └────────────────────────────────┘ │
37//! │ │
38//! │ execve("echo", ["hello"]) │
39//! │ (runs with minimal caps) │
40//! └──────────────────────────────────────┘
41//! ```
42//!
43//! ## How attach_security works
44//! ```text
45//! attach_security(&mut cmd, &config)
46//! ├──► config.is_empty()? → return early, no hook
47//! │
48//! ├──► Linux:
49//! │ ├──► build KeepMask from config.keep_caps
50//! │ │ └──► Vec<LinuxCapability> → [u32; 2] bitmask (Copy, stack-only)
51//! │ │
52//! │ └──► install pre_exec closure on Command
//! │ └──► captures: drop_all_caps (bool), no_new_privs (bool), fail_on_cap_error (bool), keep_mask ([u32; 2])
54//! │ zero heap: all Copy types
55//! │
56//! └──► non-Linux:
//! └──► warn!("security settings ignored on {os}") → return (no hook installed)
58//! ```
59//!
60//! ## Capability drop: step by step
61//! ```text
62//! drop_capabilities_batch(keep_mask)
63//! │
64//! ├──► prctl(PR_CAP_AMBIENT, CLEAR_ALL)
65//! │ └──► EINVAL? kernel < 4.3, no ambient — Ok, continue
66//! │
67//! ├──► capget() → read current caps into CapUserData[2]
68//! │ ┌────────────────────────────────────────────────┐
69//! │ │ before mask after mask │
70//! │ │ effective: 1111 effective: 0010 │
71//! │ │ permitted: 1111 permitted: 0010 │
72//! │ │ inheritable:1111 inheritable:0010 │
73//! │ │ │
74//! │ │ keep_mask = 0010 (only CAP_NET_BIND_SERVICE) │
75//! │ └────────────────────────────────────────────────┘
76//! │
77//! ├──► capset() ← one syscall writes all caps
78//! │
79//! └──► for each cap set in keep_mask:
80//! └──► prctl(PR_CAP_AMBIENT, RAISE, cap)
81//! EINVAL | EPERM → Ok (best-effort, older kernel or no permission)
82//! ```
83//!
84//! ## KeepMask layout
85//! ```text
86//! Linux capability v3 format: CapUserData[2] = 2 × u32 = 64 bits
87//!
88//! bits[0] bits[1]
89//! ┌─────────────────────────────┐ ┌─────────────────────────────┐
90//! │ cap 0 cap 1 ... cap 31 │ │ cap 32 cap 33 ... cap 63 │
91//! └─────────────────────────────┘ └─────────────────────────────┘
92//!
93//! CAP_LAST_CAP = 63 — this is NOT a guess, it's the v3 ABI limit.
94//! If kernel ever adds cap > 63, that requires a v4 format with
95//! new structs and new syscall signatures — this whole module
96//! would need updating anyway.
97//! ```
98//!
99//! ## Configuration
100//!
101//! | Field | What it does | Needs privileges? | If it fails |
102//! |---------------------|---------------------------------------|-------------------|------------------------------------------------|
103//! | `drop_all_caps` | strip all caps except `keep_caps` | `CAP_SETPCAP` | logs warning, go on (or abort if strict) |
104//! | `keep_caps` | allowlist: caps to preserve | `CAP_SETPCAP` | logs warning, go on (or abort if strict) |
105//! | `fail_on_cap_error` | strict mode: abort spawn on cap error | — | — |
106//! | `no_new_privs` | block suid/sgid privilege escalation | none (any user) | **always aborts spawn** |
107//!
108//! ## Async-signal safety
109//!
110//! Everything inside the `pre_exec` closure runs **between `fork()` and `execve()`**.
111//! POSIX says only async-signal-safe functions are allowed there.
112//!
113//! | What we call | Why it's safe |
114//! |------------------------------|--------------------------------------------|
115//! | `prctl()` | direct syscall |
116//! | `capget()` / `capset()` | direct syscalls |
117//! | `libc::write(STDERR)` | async-signal-safe per POSIX |
118//! | `io::Error::last_os_error()` | reads `errno`, no heap (Rust ≥ 1.74) |
119//!
//! The closure captures **only `Copy` types** (3 bools + `[u32; 2]`).
121//! No `Vec`, no `String`, no `Arc`: zero heap allocation in the child.
122//!
123//! ## Rules
124//! - Capability drop failures are **non-fatal** by default (logged via `pre_exec_log`, continues)
125//! - Set `fail_on_cap_error = true` to make capability drop failures **fatal** (aborts spawn)
126//! - Non-Linux: all knobs are no-op, warning emitted via `tracing::warn`
127//! - `no_new_privs` failure is **always fatal** (returns `Err`, `Command::spawn` fails)
128//! - `KeepMask` is built **before** fork (safe to iterate `Vec<LinuxCapability>`)
129//! - `SecurityConfig::is_empty()` → no hook installed, zero overhead
130use tokio::process::Command;
131
132use crate::utils::LinuxCapability;
133
134#[cfg(not(target_os = "linux"))]
135use tracing::warn;
136
137/// Declarative security policy.
138#[derive(Debug, Clone, Default)]
139pub struct SecurityConfig {
140 /// Drop all capabilities before exec.
141 ///
142 /// Note: capability operations require CAP_SETPCAP or root.
143 /// If the process lacks these privileges, the operation will log a warning and continue (unless `fail_on_cap_error` is set).
144 pub drop_all_caps: bool,
145 /// Optional allowlist of capabilities to keep after `drop_all_caps`.
146 ///
147 /// Only meaningful when `drop_all_caps = true`.
148 pub keep_caps: Vec<LinuxCapability>,
149 /// Enable `no_new_privs` for the child process.
150 ///
151 /// This flag works without root privileges.
152 /// Failures to set this flag are always fatal (spawn will fail).
153 pub no_new_privs: bool,
154 /// When `true`, capability drop failures abort the spawn instead of logging and continuing.
155 ///
156 /// Default: `false` (best-effort — non-fatal).
157 pub fail_on_cap_error: bool,
158}
159
160impl SecurityConfig {
161 /// Returns `true` if no security knobs are configured.
162 #[inline]
163 pub fn is_empty(&self) -> bool {
164 !self.drop_all_caps && self.keep_caps.is_empty() && !self.no_new_privs
165 }
166}
167
168/// Attach security policy to a `tokio::process::Command`.
169pub fn attach_security(cmd: &mut Command, config: &SecurityConfig) {
170 if config.is_empty() {
171 return;
172 }
173
174 #[cfg(target_os = "linux")]
175 {
176 linux_impl::attach(cmd, config);
177 }
178 #[cfg(not(target_os = "linux"))]
179 {
180 let _ = &cmd;
181 warn!(
182 ?config,
183 "security configuration is only enforced on Linux; current OS={}: settings will be ignored",
184 std::env::consts::OS,
185 );
186 }
187}
188
189#[cfg(target_os = "linux")]
190mod linux_impl {
191 use super::{KeepMask, SecurityConfig};
192
193 use crate::utils::log::{pre_exec_log, pre_exec_log_errno};
194 use std::io;
195 use tokio::process::Command;
196
197 const LINUX_CAPABILITY_VERSION_3: u32 = 0x2008_0522;
198 const PR_CAP_AMBIENT: libc::c_int = 47;
199 const PR_CAP_AMBIENT_RAISE: libc::c_ulong = 2;
200 const PR_CAP_AMBIENT_CLEAR_ALL: libc::c_ulong = 4;
201 const PR_SET_NO_NEW_PRIVS: libc::c_int = 38;
202 /// Upper bound of capability v3 bitmask: `CapUserData[2]` = 2 × 32 = 64 bits → caps 0..63.
203 /// This is a kernel ABI limit, not a guess. A v4 format would require new structs + syscall signatures.
204 const CAP_LAST_CAP: u32 = 63;
205
206 /// Install the `pre_exec` hook on the command.
207 ///
208 /// Caller (`attach_security`) already checked `!config.is_empty()`.
209 pub fn attach(cmd: &mut Command, config: &SecurityConfig) {
210 let keep_mask = KeepMask::from_caps(&config.keep_caps);
211 let fail_on_cap_error = config.fail_on_cap_error;
212 let drop_all_caps = config.drop_all_caps;
213 let no_new_privs = config.no_new_privs;
214
215 // SAFETY:
216 // The pre_exec closure runs between fork() and execve() in the child process.
217 //
218 // It calls prctl, capget/capset (async-signal-safe syscalls) and pre_exec_log (raw libc::write).
219 // Error paths use io::Error::last_os_error() which stores errno inline without heap allocation (Rust >= 1.74).
220 //
221 // The closure captures only Copy types (three bools + [u32; 2]): zero heap allocation.
222 unsafe {
223 cmd.pre_exec(move || {
224 if drop_all_caps
225 && let Err(e) = drop_capabilities_batch(keep_mask)
226 && fail_on_cap_error
227 {
228 return Err(e);
229 }
230 if no_new_privs {
231 apply_no_new_privs()?;
232 }
233 Ok(())
234 });
235 }
236 }
237
238 /// Drop all capabilities except those in `keep_mask`, using batch capget/capset.
239 ///
240 /// Each step logs a distinct prefix on failure so the operator can tell which syscall failed (clear_ambient / capget / capset).
241 fn drop_capabilities_batch(keep_mask: KeepMask) -> io::Result<()> {
242 if let Err(e) = clear_ambient_caps() {
243 pre_exec_log(b"solti-exec: clear_ambient_caps failed: ");
244 if let Some(code) = e.raw_os_error() {
245 pre_exec_log_errno(code);
246 }
247 return Err(e);
248 }
249
250 let mut header = CapUserHeader {
251 version: LINUX_CAPABILITY_VERSION_3,
252 pid: 0,
253 };
254 let mut data = [CapUserData::default(); 2];
255
256 // SAFETY:
257 // Header and data are valid stack-local #[repr(C)] structs matching the kernel's
258 // __user_cap_header_struct / __user_cap_data_struct layout.
259 if unsafe { capget(&mut header, data.as_mut_ptr()) } != 0 {
260 let e = io::Error::last_os_error();
261 pre_exec_log(b"solti-exec: capget failed: ");
262 if let Some(code) = e.raw_os_error() {
263 pre_exec_log_errno(code);
264 }
265 return Err(e);
266 }
267
268 data[0].effective &= keep_mask.bits[0];
269 data[0].permitted &= keep_mask.bits[0];
270 data[0].inheritable &= keep_mask.bits[0];
271 data[1].effective &= keep_mask.bits[1];
272 data[1].permitted &= keep_mask.bits[1];
273 data[1].inheritable &= keep_mask.bits[1];
274
275 // SAFETY:
276 // Same structs, modified in-place.
277 // Single capset writes the new state.
278 if unsafe { capset(&mut header, data.as_ptr()) } != 0 {
279 let e = io::Error::last_os_error();
280 pre_exec_log(b"solti-exec: capset failed: ");
281 if let Some(code) = e.raw_os_error() {
282 pre_exec_log_errno(code);
283 }
284 return Err(e);
285 }
286
287 for cap_value in 0..=CAP_LAST_CAP {
288 if keep_mask.is_set(cap_value) {
289 let _ = raise_ambient_cap(cap_value);
290 }
291 }
292
293 Ok(())
294 }
295
296 /// Clear all ambient capabilities.
297 fn clear_ambient_caps() -> io::Result<()> {
298 let rc = unsafe { libc::prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) };
299 if rc != 0 {
300 let err = io::Error::last_os_error();
301 if err.raw_os_error() != Some(libc::EINVAL) {
302 return Err(err);
303 }
304 }
305
306 Ok(())
307 }
308
309 /// Raise a capability in the ambient set (best-effort).
310 ///
311 /// Returns `Ok(())` for `EINVAL` and `EPERM` (expected on older kernels or when lacking `CAP_SETPCAP`).
312 /// Other errors propagate, but the caller ignores the result with `let _ =`.
313 fn raise_ambient_cap(cap: u32) -> io::Result<()> {
314 let rc = unsafe { libc::prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) };
315 if rc != 0 {
316 let err = io::Error::last_os_error();
317 match err.raw_os_error() {
318 Some(libc::EINVAL) | Some(libc::EPERM) => return Ok(()),
319 _ => return Err(err),
320 }
321 }
322 Ok(())
323 }
324
325 fn apply_no_new_privs() -> io::Result<()> {
326 let rc = unsafe { libc::prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
327 if rc != 0 {
328 Err(io::Error::last_os_error())
329 } else {
330 Ok(())
331 }
332 }
333
334 #[repr(C)]
335 struct CapUserHeader {
336 version: u32,
337 pid: libc::c_int,
338 }
339
340 #[repr(C)]
341 #[derive(Default, Clone, Copy)]
342 struct CapUserData {
343 effective: u32,
344 permitted: u32,
345 inheritable: u32,
346 }
347
348 unsafe extern "C" {
349 fn capset(hdrp: *mut CapUserHeader, datap: *const CapUserData) -> libc::c_int;
350 fn capget(hdrp: *mut CapUserHeader, datap: *mut CapUserData) -> libc::c_int;
351 }
352}
353
/// Allowlist of Linux capabilities, stored as a fixed-size bitmask.
///
/// The two `u32` words follow the kernel v3 capability layout: the first word holds
/// caps 0..31, the second caps 32..63. The type is `Copy`, so it can be moved into a
/// closure without any heap allocation.
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
#[derive(Clone, Copy)]
struct KeepMask {
    /// Word 0 → caps 0..31, word 1 → caps 32..63; bit `n % 32` of word `n / 32` is cap `n`.
    bits: [u32; 2],
}
363
364#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
365impl KeepMask {
366 /// Build a keep-mask from a slice of capabilities.
367 fn from_caps(caps: &[LinuxCapability]) -> Self {
368 let mut bits = [0u32; 2];
369 for cap in caps {
370 let v = cap.to_cap_value();
371 let idx = (v / 32) as usize;
372 if idx < 2 {
373 bits[idx] |= 1u32 << (v % 32);
374 }
375 }
376 Self { bits }
377 }
378
379 /// Returns `true` if the given capability number is set in the mask.
380 fn is_set(self, cap: u32) -> bool {
381 let idx = (cap / 32) as usize;
382 if idx >= 2 {
383 return false;
384 }
385 (self.bits[idx] & (1u32 << (cap % 32))) != 0
386 }
387}
388
#[cfg(test)]
mod tests {
    use super::*;
    use tokio::process::Command;

    /// A default config is empty and attaching it must not panic.
    #[test]
    fn empty_config_is_noop() {
        let cfg = SecurityConfig::default();
        assert!(cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    /// On Linux a non-empty config goes through the hook-installing path without panicking.
    #[cfg(target_os = "linux")]
    #[test]
    fn non_empty_config_attaches_pre_exec_hook_on_linux() {
        let cfg = SecurityConfig {
            drop_all_caps: true,
            keep_caps: vec![LinuxCapability::NetAdmin, LinuxCapability::NetBindService],
            no_new_privs: true,
            ..Default::default()
        };

        assert!(!cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    /// Off Linux a non-empty config takes the warn-only path without panicking.
    #[cfg(not(target_os = "linux"))]
    #[test]
    fn non_empty_config_is_ignored_on_non_linux() {
        let cfg = SecurityConfig {
            drop_all_caps: true,
            keep_caps: vec![LinuxCapability::NetAdmin],
            no_new_privs: true,
            ..Default::default()
        };

        assert!(!cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    #[test]
    fn capability_names_are_correct() {
        assert_eq!(LinuxCapability::NetAdmin.name(), "NET_ADMIN");
        assert_eq!(LinuxCapability::SysAdmin.name(), "SYS_ADMIN");
        assert_eq!(LinuxCapability::Chown.name(), "CHOWN");
    }

    /// End-to-end: spawning `true` with only `no_new_privs` set must succeed unprivileged.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    async fn no_new_privs_can_be_set_without_root() {
        let cfg = SecurityConfig {
            no_new_privs: true,
            ..Default::default()
        };
        let mut cmd = Command::new("true");
        attach_security(&mut cmd, &cfg);

        let result = cmd.status().await;
        assert!(result.is_ok(), "no_new_privs should work without root");
        assert!(result.unwrap().success());
    }

    #[test]
    fn keep_mask_empty_caps_all_zero() {
        let m = KeepMask::from_caps(&[]);
        assert_eq!(m.bits, [0, 0]);
        for cap in 0..=63 {
            assert!(!m.is_set(cap), "cap {cap} should not be set");
        }
    }

    #[test]
    fn keep_mask_single_low_cap() {
        let m = KeepMask::from_caps(&[LinuxCapability::Chown]);
        assert!(m.is_set(0));
        assert!(!m.is_set(1));
        assert_eq!(m.bits[0], 1);
        assert_eq!(m.bits[1], 0);
    }

    /// Caps 19 and 31 both live in the first word; the second word must stay untouched.
    #[test]
    fn keep_mask_highest_cap_in_first_word() {
        let m = KeepMask::from_caps(&[LinuxCapability::SetFCap, LinuxCapability::SysPtrace]);
        assert!(m.is_set(31));
        assert!(m.is_set(19));
        assert!(!m.is_set(0));
        assert_eq!(m.bits[1], 0);
    }

    /// Caps 32..63 are read from `bits[1]`.
    ///
    /// The mask is built directly so the test does not depend on which high-numbered
    /// variants `LinuxCapability` offers.
    #[test]
    fn keep_mask_cap_in_second_word() {
        let m = KeepMask {
            bits: [0, (1 << 2) | (1 << 31)],
        };
        assert!(m.is_set(34));
        assert!(m.is_set(63));
        assert!(!m.is_set(32));
        // The same bit positions in the first word must not be reported.
        assert!(!m.is_set(2));
        assert!(!m.is_set(31));
    }

    #[test]
    fn keep_mask_multiple_caps() {
        let caps = [
            LinuxCapability::Chown,          // 0
            LinuxCapability::NetBindService, // 10
            LinuxCapability::NetAdmin,       // 12
            LinuxCapability::SysAdmin,       // 21
        ];
        let m = KeepMask::from_caps(&caps);
        assert!(m.is_set(0));
        assert!(m.is_set(10));
        assert!(m.is_set(12));
        assert!(m.is_set(21));
        assert!(!m.is_set(1));
        assert!(!m.is_set(11));
        assert!(!m.is_set(63));
    }

    #[test]
    fn keep_mask_duplicate_caps_idempotent() {
        let m1 = KeepMask::from_caps(&[LinuxCapability::Kill]);
        let m2 = KeepMask::from_caps(&[LinuxCapability::Kill, LinuxCapability::Kill]);
        assert_eq!(m1.bits, m2.bits);
    }

    #[test]
    fn keep_mask_out_of_range_returns_false() {
        let m = KeepMask::from_caps(&[LinuxCapability::Chown]);
        assert!(!m.is_set(64));
        assert!(!m.is_set(100));
        assert!(!m.is_set(u32::MAX));
    }
}