sandlock_core/policy_fn.rs
1//! Dynamic policy — live policy modification via syscall event callbacks.
2//!
3//! Allows a user-provided callback to inspect syscall events and adjust
4//! sandbox permissions at runtime (grant, restrict, per-PID overrides).
5//!
//! ```ignore
//! let policy = Policy::builder()
//!     .fs_read("/usr").fs_read("/lib")
//!     .net_allow_host("127.0.0.1")
//!     .policy_fn(|event, ctx| {
//!         if event.syscall == "connect" && event.host == Some("10.0.0.5".parse().unwrap()) {
//!             return Verdict::Deny;
//!         }
//!         Verdict::Allow
//!     })
//!     .build()?;
//! ```
18//!
//! # TOCTOU and string-typed fields
//!
//! Path strings, which the kernel re-reads after a `Continue` response
//! (per `seccomp_unotify(2)`), are not exposed on `SyscallEvent`.
//! Path-based access control belongs in static Landlock rules
//! (`fs_read`/`fs_write`/`fs_deny`), which the kernel enforces directly
//! and which are not subject to user-memory races. `argv` is exposed for
//! `execve`/`execveat` only; the `SyscallEvent` documentation explains why
//! that read is TOCTOU-safe. Network fields (`host`, `port`) are
//! TOCTOU-safe because the supervisor performs `connect`/`sendto`/`bind`
//! on-behalf via `pidfd_getfd` and the kernel never re-reads child memory
//! for those syscalls.
29
30use std::collections::{HashMap, HashSet};
31use std::net::IpAddr;
32use std::sync::{Arc, RwLock};
33
34// ============================================================
35// SyscallCategory
36// ============================================================
37
/// Coarse grouping used to classify an intercepted syscall.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SyscallCategory {
    /// Filesystem syscalls such as `openat`, `unlinkat` and `mkdirat`.
    File,
    /// Network syscalls such as `connect`, `sendto` and `bind`.
    Network,
    /// Process-lifecycle syscalls such as `clone`, `execve` and `vfork`.
    Process,
    /// Memory-management syscalls such as `mmap`, `munmap` and `brk`.
    Memory,
}
50
51// ============================================================
52// SyscallEvent
53// ============================================================
54
55/// An intercepted syscall event observed by the seccomp supervisor.
56///
57/// # TOCTOU and string-typed fields
58///
59/// Path strings are deliberately absent. Per `seccomp_unotify(2)`, the
60/// kernel re-reads user-memory pointers after a `Continue` response, so
61/// any path-string-based decision is racy in a multi-threaded child.
62/// Path-based access control belongs in static Landlock rules
63/// (`fs_read` / `fs_write` / `fs_deny`); see issue #27.
64///
65/// `argv` *is* exposed for `execve`/`execveat` and is TOCTOU-safe by
66/// construction: with `policy_fn` active, fork-like syscalls are traced
67/// for one ptrace creation event, so children are registered in
68/// `ProcessIndex` before they can run user code. Before the supervisor
69/// exposes `argv` to `policy_fn` or returns `Continue` for an execve, it
70/// then `PTRACE_SEIZE`+`PTRACE_INTERRUPT`s every task that could write
71/// the memory — both sibling threads of the calling tid (same TGID, share
72/// `mm_struct`) and peer threads in other TGIDs that may alias argv
73/// pages via `MAP_SHARED` mappings or share `mm_struct` via
74/// `clone(CLONE_VM)`. The kernel's post-Continue re-read therefore
75/// sees the same memory the supervisor inspected. Siblings are killed
76/// by the kernel during execve's `de_thread` step; peer threads are
77/// detached after `NOTIF_SEND` and resume normally. See
78/// `crate::freeze`.
79///
80/// Network fields (`host`, `port`) are TOCTOU-safe because the
81/// supervisor performs `connect`/`sendto`/`bind` on-behalf via
82/// `pidfd_getfd` and the kernel never re-reads child memory for those.
83#[derive(Debug, Clone)]
84pub struct SyscallEvent {
85 /// Syscall name (e.g., "connect", "openat", "execve", "clone").
86 pub syscall: String,
87 /// High-level category.
88 pub category: SyscallCategory,
89 /// PID of the process that made the syscall.
90 pub pid: u32,
91 /// Parent PID (read from /proc/{pid}/stat).
92 pub parent_pid: Option<u32>,
93 /// Destination IP address (for connect, sendto). TOCTOU-safe.
94 pub host: Option<IpAddr>,
95 /// Destination port (for connect, sendto, bind). TOCTOU-safe.
96 pub port: Option<u16>,
97 /// Size argument (for mmap, brk).
98 pub size: Option<u64>,
99 /// Command arguments for execve/execveat. TOCTOU-safe: every task
100 /// in `ProcessIndex` (caller's siblings and peer processes) is
101 /// frozen before argv is read for this event and before the kernel
102 /// re-reads argv from child memory; fork-like syscalls register
103 /// children before they can run user code while `policy_fn` is
104 /// active.
105 pub argv: Option<Vec<String>>,
106 /// Whether the supervisor denied this syscall.
107 pub denied: bool,
108}
109
110impl SyscallEvent {
111 /// Returns true if any argv element contains the given substring.
112 /// Only meaningful for execve/execveat events (where argv is populated).
113 pub fn argv_contains(&self, s: &str) -> bool {
114 self.argv.as_ref().map_or(false, |args| args.iter().any(|a| a.contains(s)))
115 }
116}
117
118// ============================================================
119// LivePolicy — atomically swappable runtime policy
120// ============================================================
121
/// The part of the sandbox policy that the policy callback may change while
/// the sandbox is running.
///
/// Kept apart from the static `Policy`: only fields that can be adjusted
/// dynamically at runtime live here.
#[derive(Clone, Debug)]
pub struct LivePolicy {
    /// Destination IPs that outbound connections may target.
    pub allowed_ips: HashSet<IpAddr>,
    /// Memory cap in bytes; `0` means unlimited.
    pub max_memory_bytes: u64,
    /// Cap on the number of forks.
    pub max_processes: u32,
}
135
136// ============================================================
137// PolicyContext
138// ============================================================
139
140/// Context passed to the policy callback for inspecting and modifying policy.
141///
142/// - `grant()`: expand permissions up to the ceiling (reversible)
143/// - `restrict()`: permanently shrink permissions (irreversible)
144/// - `restrict_pid()`: apply per-PID network overrides
145pub struct PolicyContext {
146 live: Arc<RwLock<LivePolicy>>,
147 ceiling: LivePolicy,
148 restricted: HashSet<&'static str>,
149 pid_overrides: Arc<RwLock<HashMap<u32, HashSet<IpAddr>>>>,
150 denied_paths: Arc<RwLock<HashSet<String>>>,
151}
152
153impl PolicyContext {
154 pub(crate) fn new(
155 live: Arc<RwLock<LivePolicy>>,
156 ceiling: LivePolicy,
157 pid_overrides: Arc<RwLock<HashMap<u32, HashSet<IpAddr>>>>,
158 denied_paths: Arc<RwLock<HashSet<String>>>,
159 ) -> Self {
160 Self {
161 live,
162 ceiling,
163 restricted: HashSet::new(),
164 pid_overrides,
165 denied_paths,
166 }
167 }
168
169 /// Current effective policy (snapshot).
170 pub fn current(&self) -> LivePolicy {
171 self.live.read().unwrap().clone()
172 }
173
174 /// Maximum permissions (immutable ceiling).
175 pub fn ceiling(&self) -> &LivePolicy {
176 &self.ceiling
177 }
178
179 // ---- Grant (expand within ceiling) ----
180
181 /// Expand allowed IPs. Cannot exceed ceiling. Fails if restricted.
182 pub fn grant_network(&mut self, ips: &[IpAddr]) -> Result<(), PolicyFnError> {
183 self.check_not_restricted("allowed_ips")?;
184 let mut live = self.live.write().unwrap();
185 for ip in ips {
186 if self.ceiling.allowed_ips.contains(ip) {
187 live.allowed_ips.insert(*ip);
188 }
189 }
190 Ok(())
191 }
192
193 /// Expand max memory. Cannot exceed ceiling. Fails if restricted.
194 pub fn grant_max_memory(&mut self, bytes: u64) -> Result<(), PolicyFnError> {
195 self.check_not_restricted("max_memory_bytes")?;
196 let mut live = self.live.write().unwrap();
197 live.max_memory_bytes = bytes.min(self.ceiling.max_memory_bytes);
198 Ok(())
199 }
200
201 /// Expand max processes. Cannot exceed ceiling. Fails if restricted.
202 pub fn grant_max_processes(&mut self, n: u32) -> Result<(), PolicyFnError> {
203 self.check_not_restricted("max_processes")?;
204 let mut live = self.live.write().unwrap();
205 live.max_processes = n.min(self.ceiling.max_processes);
206 Ok(())
207 }
208
209 // ---- Restrict (permanent shrink) ----
210
211 /// Permanently restrict allowed IPs. Cannot be granted back.
212 pub fn restrict_network(&mut self, ips: &[IpAddr]) {
213 self.restricted.insert("allowed_ips");
214 let mut live = self.live.write().unwrap();
215 live.allowed_ips = ips.iter().copied().collect();
216 }
217
218 /// Permanently restrict max memory. Cannot be granted back.
219 pub fn restrict_max_memory(&mut self, bytes: u64) {
220 self.restricted.insert("max_memory_bytes");
221 let mut live = self.live.write().unwrap();
222 live.max_memory_bytes = bytes;
223 }
224
225 /// Permanently restrict max processes. Cannot be granted back.
226 pub fn restrict_max_processes(&mut self, n: u32) {
227 self.restricted.insert("max_processes");
228 let mut live = self.live.write().unwrap();
229 live.max_processes = n;
230 }
231
232 // ---- Per-PID overrides ----
233
234 /// Restrict network for a specific PID (tighter than global policy).
235 pub fn restrict_pid_network(&self, pid: u32, ips: &[IpAddr]) {
236 let mut overrides = self.pid_overrides.write().unwrap();
237 overrides.insert(pid, ips.iter().copied().collect());
238 }
239
240 /// Remove per-PID override, falling back to global policy.
241 pub fn clear_pid_override(&self, pid: u32) {
242 let mut overrides = self.pid_overrides.write().unwrap();
243 overrides.remove(&pid);
244 }
245
246 // ---- Filesystem restriction ----
247
248 /// Deny access to a path (and all children). Checked by the supervisor
249 /// on openat/stat/access syscalls. Takes effect immediately.
250 pub fn deny_path(&self, path: &str) {
251 let mut denied = self.denied_paths.write().unwrap();
252 denied.insert(path.to_string());
253 }
254
255 /// Remove a previously denied path.
256 pub fn allow_path(&self, path: &str) {
257 let mut denied = self.denied_paths.write().unwrap();
258 denied.remove(path);
259 }
260
261 // ---- Internal ----
262
263 fn check_not_restricted(&self, field: &str) -> Result<(), PolicyFnError> {
264 if self.restricted.contains(field) {
265 Err(PolicyFnError::FieldRestricted(field.to_string()))
266 } else {
267 Ok(())
268 }
269 }
270}
271
272// ============================================================
273// Error type
274// ============================================================
275
276/// Errors from policy callback operations.
277#[derive(Debug, thiserror::Error)]
278pub enum PolicyFnError {
279 #[error("cannot grant restricted field: {0}")]
280 FieldRestricted(String),
281}
282
283// ============================================================
284// PolicyCallback type
285// ============================================================
286
/// Verdict returned by the policy callback for the current syscall.
///
/// `Verdict::default()` is [`Verdict::Allow`]; the `Default` impl is derived
/// through the `#[default]` marker instead of being written by hand.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum Verdict {
    /// Allow the syscall to proceed (default).
    #[default]
    Allow,
    /// Allow the syscall, but flag it for audit logging.
    Audit,
    /// Deny the syscall with EPERM.
    Deny,
    /// Deny the syscall with a specific errno.
    DenyWith(i32),
}
303
304/// A callback function invoked for each intercepted syscall.
305///
306/// Called synchronously on a dedicated thread. For `execve` syscalls,
307/// the child process is held until the callback returns.
308///
309/// Return `Verdict::Deny` to block the current syscall. Only effective
310/// for held syscalls (execve/execveat) and network syscalls (connect/sendto).
311///
312/// Wrapped in `Arc` so that `Policy` remains `Clone`.
313pub type PolicyCallback = Arc<dyn Fn(SyscallEvent, &mut PolicyContext) -> Verdict + Send + Sync + 'static>;
314
315// ============================================================
316// Event channel types (used by supervisor integration)
317// ============================================================
318
319/// An event sent from the supervisor to the policy callback thread.
320pub struct PolicyEvent {
321 pub event: SyscallEvent,
322 /// If Some, the supervisor blocks until this is signaled.
323 /// Used for execve to allow pre-execution policy changes.
324 /// The Verdict is sent back to control allow/deny.
325 pub gate: Option<tokio::sync::oneshot::Sender<Verdict>>,
326}
327
328// ============================================================
329// Policy callback runner
330// ============================================================
331
332/// Spawn a thread that receives syscall events and calls the policy callback.
333///
334/// Returns a sender for the supervisor to push events into.
335pub(crate) fn spawn_policy_fn(
336 callback: PolicyCallback,
337 live: Arc<RwLock<LivePolicy>>,
338 ceiling: LivePolicy,
339 pid_overrides: Arc<RwLock<HashMap<u32, HashSet<IpAddr>>>>,
340 denied_paths: Arc<RwLock<HashSet<String>>>,
341) -> tokio::sync::mpsc::UnboundedSender<PolicyEvent> {
342 let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<PolicyEvent>();
343
344 std::thread::Builder::new()
345 .name("sandlock-policy-fn".to_string())
346 .spawn(move || {
347 let mut ctx = PolicyContext::new(live, ceiling, pid_overrides, denied_paths);
348
349 while let Some(pe) = rx.blocking_recv() {
350 let verdict = callback(pe.event, &mut ctx);
351
352 // Signal the supervisor with the verdict.
353 // For execve, this unblocks the child.
354 if let Some(gate) = pe.gate {
355 let _ = gate.send(verdict);
356 }
357 }
358 })
359 .expect("failed to spawn policy-fn thread");
360
361 tx
362}
363
364// ============================================================
365// Tests
366// ============================================================
367
#[cfg(test)]
mod tests {
    use super::*;

    type SharedOverrides = Arc<RwLock<HashMap<u32, HashSet<IpAddr>>>>;

    /// A `PolicyContext` together with the shared handles the tests inspect.
    struct Fixture {
        live: Arc<RwLock<LivePolicy>>,
        overrides: SharedOverrides,
        ctx: PolicyContext,
    }

    /// Policy used as the ceiling everywhere and as the initial live policy
    /// in most tests.
    fn test_live() -> LivePolicy {
        let mut allowed_ips = HashSet::new();
        for literal in ["127.0.0.1", "10.0.0.1"].iter() {
            allowed_ips.insert(literal.parse().unwrap());
        }
        LivePolicy {
            allowed_ips,
            max_memory_bytes: 1024 * 1024 * 1024,
            max_processes: 64,
        }
    }

    /// Live policy with no IPs and zeroed limits.
    fn empty_live() -> LivePolicy {
        LivePolicy {
            allowed_ips: HashSet::new(),
            max_memory_bytes: 0,
            max_processes: 0,
        }
    }

    /// Builds a context over `initial`, with `test_live()` as the ceiling.
    fn fixture(initial: LivePolicy) -> Fixture {
        let live = Arc::new(RwLock::new(initial));
        let overrides: SharedOverrides = Arc::new(RwLock::new(HashMap::new()));
        let denied_paths = Arc::new(RwLock::new(HashSet::new()));
        let ctx = PolicyContext::new(
            Arc::clone(&live),
            test_live(),
            Arc::clone(&overrides),
            denied_paths,
        );
        Fixture { live, overrides, ctx }
    }

    /// Event with only the fields the argv tests care about filled in.
    fn event(
        syscall: &str,
        category: SyscallCategory,
        parent_pid: Option<u32>,
        argv: Option<Vec<String>>,
    ) -> SyscallEvent {
        SyscallEvent {
            syscall: syscall.to_string(),
            category,
            pid: 1,
            parent_pid,
            host: None,
            port: None,
            size: None,
            argv,
            denied: false,
        }
    }

    #[test]
    fn test_grant_within_ceiling() {
        let Fixture { live, mut ctx, .. } = fixture(empty_live());

        let ip: IpAddr = "127.0.0.1".parse().unwrap();
        ctx.grant_network(&[ip]).unwrap();
        assert!(live.read().unwrap().allowed_ips.contains(&ip));
    }

    #[test]
    fn test_grant_capped_to_ceiling() {
        let Fixture { live, mut ctx, .. } = fixture(empty_live());

        // An IP outside the ceiling must be silently ignored.
        let foreign: IpAddr = "8.8.8.8".parse().unwrap();
        ctx.grant_network(&[foreign]).unwrap();
        assert!(!live.read().unwrap().allowed_ips.contains(&foreign));
    }

    #[test]
    fn test_restrict_then_grant_fails() {
        let Fixture { mut ctx, .. } = fixture(test_live());

        ctx.restrict_network(&[]);
        let ip: IpAddr = "127.0.0.1".parse().unwrap();
        assert!(ctx.grant_network(&[ip]).is_err());
    }

    #[test]
    fn test_restrict_max_memory() {
        let Fixture { live, mut ctx, .. } = fixture(test_live());

        ctx.restrict_max_memory(256 * 1024 * 1024);
        assert_eq!(live.read().unwrap().max_memory_bytes, 256 * 1024 * 1024);
    }

    #[test]
    fn test_pid_override() {
        let Fixture { overrides, ctx, .. } = fixture(test_live());

        let localhost: IpAddr = "127.0.0.1".parse().unwrap();
        ctx.restrict_pid_network(1234, &[localhost]);

        let stored = overrides.read().unwrap();
        let pid_ips = stored.get(&1234).unwrap();
        assert!(pid_ips.contains(&localhost));
        assert_eq!(pid_ips.len(), 1);
    }

    #[test]
    fn test_clear_pid_override() {
        let Fixture { overrides, ctx, .. } = fixture(test_live());

        let localhost: IpAddr = "127.0.0.1".parse().unwrap();
        ctx.restrict_pid_network(1234, &[localhost]);
        ctx.clear_pid_override(1234);
        assert!(!overrides.read().unwrap().contains_key(&1234));
    }

    #[test]
    fn test_event_argv_contains() {
        let argv = vec!["python3".into(), "-c".into(), "print(1)".into()];
        let event = event("execve", SyscallCategory::Process, Some(0), Some(argv));

        assert!(event.argv_contains("python3"));
        assert!(event.argv_contains("-c"));
        assert!(!event.argv_contains("ruby"));
        assert_eq!(event.category, SyscallCategory::Process);
    }

    #[test]
    fn test_event_argv_contains_none() {
        let event = event("openat", SyscallCategory::File, None, None);
        assert!(!event.argv_contains("anything"));
    }
}