solti_exec/utils/security.rs
1//! # Security: process-level hardening for subprocess runners.
2//!
3//! [`SecurityConfig`] restricts the privilege set of child processes spawned by subprocess runners.
4//!
5//! **Linux:**
6//! - Drop all process capabilities in one batch (`capget` → mask → `capset`)
7//! - Zero heap allocation in the child (closure captures only `Copy` types)
8//! - Raise kept caps in the ambient set for unprivileged `execve`
9//! - Keep an optional allowlist of caps via [`LinuxCapability`]
10//! - Set `no_new_privs` to block suid/sgid escalation
11//!
12//! **Other platforms:**
13//! - `tracing::warn` and no-op.
14//!
15//! ## Also
16//!
17//! - [`SubprocessBackendConfig`](crate::subprocess::SubprocessBackendConfig) builder that consumes `SecurityConfig`.
18//! - [`LinuxCapability`](super::LinuxCapability) capability identifiers for the keep list.
19//!
20//! ## What happens when a subprocess spawns
21//! ```text
22//! parent process
23//! │
24//! fork()
25//! │
26//! ┌──────────────────┼───────────────────┐
27//! │ child process │
28//! │ │
29//! │ ┌── pre_exec hook ───────────────┐ │
30//! │ │ 1. clear ambient caps │ │
31//! │ │ 2. capget current caps │ │
32//! │ │ 3. mask &= keep_mask │ │
33//! │ │ 4. capset (one syscall) │ │
34//! │ │ 5. raise kept in ambient │ │
35//! │ │ 6. set no_new_privs │ │
36//! │ └────────────────────────────────┘ │
37//! │ │
38//! │ execve("echo", ["hello"]) │
39//! │ (runs with minimal caps) │
40//! └──────────────────────────────────────┘
41//! ```
42//!
43//! ## How attach_security works
44//! ```text
45//! attach_security(&mut cmd, &config)
46//! ├──► config.is_empty()? → return early, no hook
47//! │
48//! ├──► Linux:
49//! │ ├──► build KeepMask from config.keep_caps
50//! │ │ └──► Vec<LinuxCapability> → [u32; 2] bitmask (Copy, stack-only)
51//! │ │
52//! │ └──► install pre_exec closure on Command
//! │ └──► captures: drop_all_caps (bool), no_new_privs (bool), fail_on_cap_error (bool), keep_mask ([u32; 2])
54//! │ zero heap: all Copy types
55//! │
56//! └──► non-Linux:
57//! └──► warn!("security settings ignored on {os}") → Ok(())
58//! ```
59//!
60//! ## Capability drop: step by step
61//! ```text
62//! drop_capabilities_batch(keep_mask)
63//! │
64//! ├──► prctl(PR_CAP_AMBIENT, CLEAR_ALL)
65//! │ └──► EINVAL? kernel < 4.3, no ambient - Ok, continue
66//! │
67//! ├──► capget() → read current caps into CapUserData[2]
68//! │ ┌────────────────────────────────────────────────┐
69//! │ │ before mask after mask │
70//! │ │ effective: 1111 effective: 0010 │
71//! │ │ permitted: 1111 permitted: 0010 │
72//! │ │ inheritable:1111 inheritable:0010 │
73//! │ │ │
74//! │ │ keep_mask = 0010 (only CAP_NET_BIND_SERVICE) │
75//! │ └────────────────────────────────────────────────┘
76//! │
77//! ├──► capset() ← one syscall writes all caps
78//! │
79//! └──► for each cap set in keep_mask:
80//! └──► prctl(PR_CAP_AMBIENT, RAISE, cap)
81//! EINVAL | EPERM → Ok (best-effort, older kernel or no permission)
82//! ```
83//!
84//! ## KeepMask layout
85//! ```text
86//! Linux capability v3 format: CapUserData[2] = 2 × u32 = 64 bits
87//!
88//! bits[0] bits[1]
89//! ┌─────────────────────────────┐ ┌─────────────────────────────┐
90//! │ cap 0 cap 1 ... cap 31 │ │ cap 32 cap 33 ... cap 63 │
91//! └─────────────────────────────┘ └─────────────────────────────┘
92//!
93//! CAP_LAST_CAP = 63 - this is NOT a guess, it's the v3 ABI limit.
94//! If kernel ever adds cap > 63, that requires a v4 format with new structs and new syscall signatures - this whole module would need updating anyway.
95//! ```
96//!
97//! ## Configuration
98//!
99//! | Field | What it does | Needs privileges? | If it fails |
100//! |---------------------|---------------------------------------|-------------------|------------------------------------------------|
101//! | `drop_all_caps` | strip all caps except `keep_caps` | `CAP_SETPCAP` | logs warning, go on (or abort if strict) |
102//! | `keep_caps` | allowlist: caps to preserve | `CAP_SETPCAP` | logs warning, go on (or abort if strict) |
103//! | `fail_on_cap_error` | strict mode: abort spawn on cap error | — | — |
104//! | `no_new_privs` | block suid/sgid privilege escalation | none (any user) | **always aborts spawn** |
105//!
106//! ## Async-signal safety
107//!
108//! Everything inside the `pre_exec` closure runs **between `fork()` and `execve()`**.
109//! POSIX says only async-signal-safe functions are allowed there.
110//!
111//! | What we call | Why it's safe |
112//! |------------------------------|--------------------------------------------|
113//! | `prctl()` | direct syscall |
114//! | `capget()` / `capset()` | direct syscalls |
115//! | `libc::write(STDERR)` | async-signal-safe per POSIX |
116//! | `io::Error::last_os_error()` | reads `errno`, no heap (Rust ≥ 1.74) |
117//!
//! The closure captures **only `Copy` types** (3 bools + `[u32; 2]`).
119//! No `Vec`, no `String`, no `Arc`: zero heap allocation in the child.
120//!
121//! ## Rules
122//! - Capability drop failures are **non-fatal** by default (logged via `pre_exec_log`, continues)
123//! - Set `fail_on_cap_error = true` to make capability drop failures **fatal** (aborts spawn)
124//! - Non-Linux: all knobs are no-op, warning emitted via `tracing::warn`
125//! - `no_new_privs` failure is **always fatal** (returns `Err`, `Command::spawn` fails)
126//! - `KeepMask` is built **before** fork (safe to iterate `Vec<LinuxCapability>`)
127//! - `SecurityConfig::is_empty()` → no hook installed, zero overhead
128use tokio::process::Command;
129
130use crate::utils::LinuxCapability;
131
132#[cfg(not(target_os = "linux"))]
133use tracing::warn;
134
135/// Declarative security policy.
136#[derive(Debug, Clone, Default)]
137pub struct SecurityConfig {
138 /// Drop all capabilities before exec.
139 ///
140 /// Note: capability operations require CAP_SETPCAP or root.
141 /// If the process lacks these privileges, the operation will log a warning and continue (unless `fail_on_cap_error` is set).
142 pub drop_all_caps: bool,
143 /// Optional allowlist of capabilities to keep after `drop_all_caps`.
144 ///
145 /// Only meaningful when `drop_all_caps = true`.
146 pub keep_caps: Vec<LinuxCapability>,
147 /// Enable `no_new_privs` for the child process.
148 ///
149 /// This flag works without root privileges.
150 /// Failures to set this flag are always fatal (spawn will fail).
151 pub no_new_privs: bool,
152 /// When `true`, capability drop failures abort the spawn instead of logging and continuing.
153 ///
154 /// Default: `false` (best-effort - non-fatal).
155 pub fail_on_cap_error: bool,
156}
157
158impl SecurityConfig {
159 /// Returns `true` if no security knobs are configured.
160 #[inline]
161 pub fn is_empty(&self) -> bool {
162 !self.drop_all_caps && self.keep_caps.is_empty() && !self.no_new_privs
163 }
164}
165
166/// Attach security policy to a `tokio::process::Command`.
167pub fn attach_security(cmd: &mut Command, config: &SecurityConfig) {
168 if config.is_empty() {
169 return;
170 }
171
172 #[cfg(target_os = "linux")]
173 {
174 linux_impl::attach(cmd, config);
175 }
176 #[cfg(not(target_os = "linux"))]
177 {
178 let _ = &cmd;
179 warn!(
180 ?config,
181 "security configuration is only enforced on Linux; current OS={}: settings will be ignored",
182 std::env::consts::OS,
183 );
184 }
185}
186
187#[cfg(target_os = "linux")]
188mod linux_impl {
189 use super::{KeepMask, SecurityConfig};
190
191 use crate::utils::log::{pre_exec_log, pre_exec_log_errno};
192 use std::io;
193 use tokio::process::Command;
194
195 const LINUX_CAPABILITY_VERSION_3: u32 = 0x2008_0522;
196 const PR_CAP_AMBIENT: libc::c_int = 47;
197 const PR_CAP_AMBIENT_RAISE: libc::c_ulong = 2;
198 const PR_CAP_AMBIENT_CLEAR_ALL: libc::c_ulong = 4;
199 const PR_SET_NO_NEW_PRIVS: libc::c_int = 38;
200 /// Upper bound of capability v3 bitmask: `CapUserData[2]` = 2 × 32 = 64 bits → caps 0..63.
201 /// This is a kernel ABI limit, not a guess. A v4 format would require new structs + syscall signatures.
202 const CAP_LAST_CAP: u32 = 63;
203
204 /// Install the `pre_exec` hook on the command.
205 ///
206 /// Caller (`attach_security`) already checked `!config.is_empty()`.
207 pub fn attach(cmd: &mut Command, config: &SecurityConfig) {
208 let keep_mask = KeepMask::from_caps(&config.keep_caps);
209 let fail_on_cap_error = config.fail_on_cap_error;
210 let drop_all_caps = config.drop_all_caps;
211 let no_new_privs = config.no_new_privs;
212
213 // SAFETY:
214 // The pre_exec closure runs between fork() and execve() in the child process.
215 //
216 // It calls prctl, capget/capset (async-signal-safe syscalls) and pre_exec_log (raw libc::write).
217 // Error paths use io::Error::last_os_error() which stores errno inline without heap allocation (Rust >= 1.74).
218 //
219 // The closure captures only Copy types (three bools + [u32; 2]): zero heap allocation.
220 unsafe {
221 cmd.pre_exec(move || {
222 if drop_all_caps
223 && let Err(e) = drop_capabilities_batch(keep_mask)
224 && fail_on_cap_error
225 {
226 return Err(e);
227 }
228 if no_new_privs {
229 apply_no_new_privs()?;
230 }
231 Ok(())
232 });
233 }
234 }
235
236 /// Drop all capabilities except those in `keep_mask`, using batch capget/capset.
237 ///
238 /// Each step logs a distinct prefix on failure so the operator can tell which syscall failed (clear_ambient / capget / capset).
239 fn drop_capabilities_batch(keep_mask: KeepMask) -> io::Result<()> {
240 if let Err(e) = clear_ambient_caps() {
241 pre_exec_log(b"solti-exec: clear_ambient_caps failed: ");
242 if let Some(code) = e.raw_os_error() {
243 pre_exec_log_errno(code);
244 }
245 return Err(e);
246 }
247
248 let mut header = CapUserHeader {
249 version: LINUX_CAPABILITY_VERSION_3,
250 pid: 0,
251 };
252 let mut data = [CapUserData::default(); 2];
253
254 // SAFETY:
255 // Header and data are valid stack-local #[repr(C)] structs matching the kernel's
256 // __user_cap_header_struct / __user_cap_data_struct layout.
257 if unsafe { capget(&mut header, data.as_mut_ptr()) } != 0 {
258 let e = io::Error::last_os_error();
259 pre_exec_log(b"solti-exec: capget failed: ");
260 if let Some(code) = e.raw_os_error() {
261 pre_exec_log_errno(code);
262 }
263 return Err(e);
264 }
265
266 data[0].effective &= keep_mask.bits[0];
267 data[0].permitted &= keep_mask.bits[0];
268 data[0].inheritable &= keep_mask.bits[0];
269 data[1].effective &= keep_mask.bits[1];
270 data[1].permitted &= keep_mask.bits[1];
271 data[1].inheritable &= keep_mask.bits[1];
272
273 // SAFETY:
274 // Same structs, modified in-place.
275 // Single capset writes the new state.
276 if unsafe { capset(&mut header, data.as_ptr()) } != 0 {
277 let e = io::Error::last_os_error();
278 pre_exec_log(b"solti-exec: capset failed: ");
279 if let Some(code) = e.raw_os_error() {
280 pre_exec_log_errno(code);
281 }
282 return Err(e);
283 }
284
285 for cap_value in 0..=CAP_LAST_CAP {
286 if keep_mask.is_set(cap_value) {
287 let _ = raise_ambient_cap(cap_value);
288 }
289 }
290
291 Ok(())
292 }
293
294 /// Clear all ambient capabilities.
295 fn clear_ambient_caps() -> io::Result<()> {
296 let rc = unsafe { libc::prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) };
297 if rc != 0 {
298 let err = io::Error::last_os_error();
299 if err.raw_os_error() != Some(libc::EINVAL) {
300 return Err(err);
301 }
302 }
303
304 Ok(())
305 }
306
307 /// Raise a capability in the ambient set (best-effort).
308 ///
309 /// Returns `Ok(())` for `EINVAL` and `EPERM` (expected on older kernels or when lacking `CAP_SETPCAP`).
310 /// Other errors propagate, but the caller ignores the result with `let _ =`.
311 fn raise_ambient_cap(cap: u32) -> io::Result<()> {
312 let rc = unsafe { libc::prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0) };
313 if rc != 0 {
314 let err = io::Error::last_os_error();
315 match err.raw_os_error() {
316 Some(libc::EINVAL) | Some(libc::EPERM) => return Ok(()),
317 _ => return Err(err),
318 }
319 }
320 Ok(())
321 }
322
323 fn apply_no_new_privs() -> io::Result<()> {
324 let rc = unsafe { libc::prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
325 if rc != 0 {
326 Err(io::Error::last_os_error())
327 } else {
328 Ok(())
329 }
330 }
331
332 #[repr(C)]
333 struct CapUserHeader {
334 version: u32,
335 pid: libc::c_int,
336 }
337
338 #[repr(C)]
339 #[derive(Default, Clone, Copy)]
340 struct CapUserData {
341 effective: u32,
342 permitted: u32,
343 inheritable: u32,
344 }
345
346 unsafe extern "C" {
347 fn capset(hdrp: *mut CapUserHeader, datap: *const CapUserData) -> libc::c_int;
348 fn capget(hdrp: *mut CapUserHeader, datap: *mut CapUserData) -> libc::c_int;
349 }
350}
351
352/// Bitmask of Linux capabilities to keep after a bulk drop.
353///
354/// Layout mirrors the kernel v3 capability format: two `u32` words covering caps 0..31 and 32..63 respectively.
355#[derive(Clone, Copy)]
356#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
357struct KeepMask {
358 /// `bits[0]` covers caps 0..31, `bits[1]` covers caps 32..63.
359 bits: [u32; 2],
360}
361
362#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
363impl KeepMask {
364 /// Build a keep-mask from a slice of capabilities.
365 fn from_caps(caps: &[LinuxCapability]) -> Self {
366 let mut bits = [0u32; 2];
367 for cap in caps {
368 let v = cap.to_cap_value();
369 let idx = (v / 32) as usize;
370 if idx < 2 {
371 bits[idx] |= 1u32 << (v % 32);
372 }
373 }
374 Self { bits }
375 }
376
377 /// Returns `true` if the given capability number is set in the mask.
378 fn is_set(self, cap: u32) -> bool {
379 let idx = (cap / 32) as usize;
380 if idx >= 2 {
381 return false;
382 }
383 (self.bits[idx] & (1u32 << (cap % 32))) != 0
384 }
385}
386
#[cfg(test)]
mod tests {
    use super::*;
    use tokio::process::Command;

    /// A default config is empty and attaching it must not panic.
    #[test]
    fn empty_config_is_noop() {
        let cfg = SecurityConfig::default();
        assert!(cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    /// On Linux a non-empty config goes through the hook-installing path without panicking.
    #[cfg(target_os = "linux")]
    #[test]
    fn non_empty_config_attaches_pre_exec_hook_on_linux() {
        let cfg = SecurityConfig {
            drop_all_caps: true,
            keep_caps: vec![LinuxCapability::NetAdmin, LinuxCapability::NetBindService],
            no_new_privs: true,
            ..Default::default()
        };

        assert!(!cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    /// Off Linux a non-empty config takes the warn-and-ignore path without panicking.
    #[cfg(not(target_os = "linux"))]
    #[test]
    fn non_empty_config_is_ignored_on_non_linux() {
        let cfg = SecurityConfig {
            drop_all_caps: true,
            keep_caps: vec![LinuxCapability::NetAdmin],
            no_new_privs: true,
            ..Default::default()
        };

        assert!(!cfg.is_empty());

        let mut cmd = Command::new("sh");
        attach_security(&mut cmd, &cfg);
    }

    #[test]
    fn capability_names_are_correct() {
        assert_eq!(LinuxCapability::NetAdmin.name(), "NET_ADMIN");
        assert_eq!(LinuxCapability::SysAdmin.name(), "SYS_ADMIN");
        assert_eq!(LinuxCapability::Chown.name(), "CHOWN");
    }

    /// Spawns `true` with only `no_new_privs` requested; must succeed for an unprivileged user.
    #[cfg(target_os = "linux")]
    #[tokio::test]
    async fn no_new_privs_can_be_set_without_root() {
        let cfg = SecurityConfig {
            no_new_privs: true,
            ..Default::default()
        };
        let mut cmd = Command::new("true");
        attach_security(&mut cmd, &cfg);

        let result = cmd.status().await;
        assert!(result.is_ok(), "no_new_privs should work without root");
        assert!(result.unwrap().success());
    }

    #[test]
    fn keep_mask_empty_caps_all_zero() {
        let m = KeepMask::from_caps(&[]);
        assert_eq!(m.bits, [0, 0]);
        for cap in 0..=63 {
            assert!(!m.is_set(cap), "cap {cap} should not be set");
        }
    }

    #[test]
    fn keep_mask_single_low_cap() {
        let m = KeepMask::from_caps(&[LinuxCapability::Chown]);
        assert!(m.is_set(0));
        assert!(!m.is_set(1));
        assert_eq!(m.bits[0], 1);
        assert_eq!(m.bits[1], 0);
    }

    /// Caps 19 and 31 both belong to the first word, including the word's top
    /// bit; nothing may leak into the second word.
    #[test]
    fn keep_mask_high_caps_stay_in_first_word() {
        let m = KeepMask::from_caps(&[LinuxCapability::SetFCap, LinuxCapability::SysPtrace]);
        assert!(m.is_set(31));
        assert!(m.is_set(19));
        assert!(!m.is_set(0));
        assert_eq!(m.bits[1], 0);
    }

    /// Exercises the second word (caps 32..63). The mask is built directly so
    /// the test does not depend on which high-numbered `LinuxCapability`
    /// variants exist.
    #[test]
    fn keep_mask_is_set_reads_second_word() {
        let m = KeepMask {
            bits: [0, 1 | (1 << 8)],
        };
        assert!(m.is_set(32));
        assert!(m.is_set(40));
        assert!(!m.is_set(0));
        assert!(!m.is_set(8));
        assert!(!m.is_set(33));
        assert!(!m.is_set(63));
    }

    #[test]
    fn keep_mask_multiple_caps() {
        let caps = [
            LinuxCapability::Chown,          // 0
            LinuxCapability::NetBindService, // 10
            LinuxCapability::NetAdmin,       // 12
            LinuxCapability::SysAdmin,       // 21
        ];
        let m = KeepMask::from_caps(&caps);
        assert!(m.is_set(0));
        assert!(m.is_set(10));
        assert!(m.is_set(12));
        assert!(m.is_set(21));
        assert!(!m.is_set(1));
        assert!(!m.is_set(11));
        assert!(!m.is_set(63));
    }

    #[test]
    fn keep_mask_duplicate_caps_idempotent() {
        let m1 = KeepMask::from_caps(&[LinuxCapability::Kill]);
        let m2 = KeepMask::from_caps(&[LinuxCapability::Kill, LinuxCapability::Kill]);
        assert_eq!(m1.bits, m2.bits);
    }

    #[test]
    fn keep_mask_out_of_range_returns_false() {
        let m = KeepMask::from_caps(&[LinuxCapability::Chown]);
        assert!(!m.is_set(64));
        assert!(!m.is_set(100));
        assert!(!m.is_set(u32::MAX));
    }
}
514}