Skip to main content

sandlock_core/
policy_fn.rs

1//! Dynamic policy — live policy modification via syscall event callbacks.
2//!
3//! Allows a user-provided callback to inspect syscall events and adjust
4//! sandbox permissions at runtime (grant, restrict, per-PID overrides).
5//!
6//! ```ignore
7//! let policy = Policy::builder()
8//!     .fs_read("/usr").fs_read("/lib")
9//!     .net_allow_host("127.0.0.1")
10//!     .policy_fn(|event, ctx| {
11//!         if event.syscall == "connect" && event.host == Some("10.0.0.5".parse().unwrap()) {
12//!             return Verdict::Deny;
13//!         }
14//!         Verdict::Allow
15//!     })
16//!     .build()?;
17//! ```
18//!
19//! # TOCTOU and string-typed fields
20//!
//! Path strings, which the kernel re-reads after a `Continue` response
//! (per `seccomp_unotify(2)`), are not exposed on the event. `argv` is
//! exposed only for `execve`/`execveat`, where the supervisor freezes
//! every task that could rewrite it before reading it (see
//! `SyscallEvent`). Path-based access control belongs in static Landlock
//! rules (`fs_read`/`fs_write`/`fs_deny`), which the kernel enforces
//! directly and which are not subject to user-memory races. Network
//! fields (`host`, `port`) are TOCTOU-safe because the supervisor
//! performs `connect`/`sendto`/`bind` on-behalf via `pidfd_getfd` and the
//! kernel never re-reads child memory for those syscalls.
29
30use std::collections::{HashMap, HashSet};
31use std::net::IpAddr;
32use std::sync::{Arc, RwLock};
33
34// ============================================================
35// SyscallCategory
36// ============================================================
37
/// Coarse classification of an intercepted syscall, carried in
/// `SyscallEvent::category`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum SyscallCategory {
    /// Filesystem syscalls such as `openat`, `unlinkat` and `mkdirat`.
    File,
    /// Networking syscalls such as `connect`, `sendto` and `bind`.
    Network,
    /// Process-lifecycle syscalls such as `clone`, `execve` and `vfork`.
    Process,
    /// Memory-management syscalls such as `mmap`, `munmap` and `brk`.
    Memory,
}
50
51// ============================================================
52// SyscallEvent
53// ============================================================
54
55/// An intercepted syscall event observed by the seccomp supervisor.
56///
57/// # TOCTOU and string-typed fields
58///
59/// Path strings are deliberately absent. Per `seccomp_unotify(2)`, the
60/// kernel re-reads user-memory pointers after a `Continue` response, so
61/// any path-string-based decision is racy in a multi-threaded child.
62/// Path-based access control belongs in static Landlock rules
63/// (`fs_read` / `fs_write` / `fs_deny`); see issue #27.
64///
65/// `argv` *is* exposed for `execve`/`execveat` and is TOCTOU-safe by
66/// construction: with `policy_fn` active, fork-like syscalls are traced
67/// for one ptrace creation event, so children are registered in
68/// `ProcessIndex` before they can run user code. Before the supervisor
69/// exposes `argv` to `policy_fn` or returns `Continue` for an execve, it
70/// then `PTRACE_SEIZE`+`PTRACE_INTERRUPT`s every task that could write
71/// the memory — both sibling threads of the calling tid (same TGID, share
72/// `mm_struct`) and peer threads in other TGIDs that may alias argv
73/// pages via `MAP_SHARED` mappings or share `mm_struct` via
74/// `clone(CLONE_VM)`. The kernel's post-Continue re-read therefore
75/// sees the same memory the supervisor inspected. Siblings are killed
76/// by the kernel during execve's `de_thread` step; peer threads are
77/// detached after `NOTIF_SEND` and resume normally. See
78/// `crate::freeze`.
79///
80/// Network fields (`host`, `port`) are TOCTOU-safe because the
81/// supervisor performs `connect`/`sendto`/`bind` on-behalf via
82/// `pidfd_getfd` and the kernel never re-reads child memory for those.
83#[derive(Debug, Clone)]
84pub struct SyscallEvent {
85    /// Syscall name (e.g., "connect", "openat", "execve", "clone").
86    pub syscall: String,
87    /// High-level category.
88    pub category: SyscallCategory,
89    /// PID of the process that made the syscall.
90    pub pid: u32,
91    /// Parent PID (read from /proc/{pid}/stat).
92    pub parent_pid: Option<u32>,
93    /// Destination IP address (for connect, sendto). TOCTOU-safe.
94    pub host: Option<IpAddr>,
95    /// Destination port (for connect, sendto, bind). TOCTOU-safe.
96    pub port: Option<u16>,
97    /// Size argument (for mmap, brk).
98    pub size: Option<u64>,
99    /// Command arguments for execve/execveat. TOCTOU-safe: every task
100    /// in `ProcessIndex` (caller's siblings and peer processes) is
101    /// frozen before argv is read for this event and before the kernel
102    /// re-reads argv from child memory; fork-like syscalls register
103    /// children before they can run user code while `policy_fn` is
104    /// active.
105    pub argv: Option<Vec<String>>,
106    /// Whether the supervisor denied this syscall.
107    pub denied: bool,
108}
109
110impl SyscallEvent {
111    /// Returns true if any argv element contains the given substring.
112    /// Only meaningful for execve/execveat events (where argv is populated).
113    pub fn argv_contains(&self, s: &str) -> bool {
114        self.argv.as_ref().map_or(false, |args| args.iter().any(|a| a.contains(s)))
115    }
116}
117
118// ============================================================
119// LivePolicy — atomically swappable runtime policy
120// ============================================================
121
/// The part of the sandbox policy that a policy callback may change
/// while the sandbox is running.
///
/// Kept apart from the static `Policy`: only fields that can be adjusted
/// dynamically at runtime live here.
#[derive(Clone, Debug)]
pub struct LivePolicy {
    /// Destination IP addresses that outbound connections may target.
    pub allowed_ips: HashSet<IpAddr>,
    /// Memory cap in bytes; `0` means unlimited.
    pub max_memory_bytes: u64,
    /// Cap on the number of forks.
    pub max_processes: u32,
}
135
136// ============================================================
137// PolicyContext
138// ============================================================
139
140/// Context passed to the policy callback for inspecting and modifying policy.
141///
142/// - `grant()`: expand permissions up to the ceiling (reversible)
143/// - `restrict()`: permanently shrink permissions (irreversible)
144/// - `restrict_pid()`: apply per-PID network overrides
145pub struct PolicyContext {
146    live: Arc<RwLock<LivePolicy>>,
147    ceiling: LivePolicy,
148    restricted: HashSet<&'static str>,
149    pid_overrides: Arc<RwLock<HashMap<u32, HashSet<IpAddr>>>>,
150    denied_paths: Arc<RwLock<HashSet<String>>>,
151}
152
153impl PolicyContext {
154    pub(crate) fn new(
155        live: Arc<RwLock<LivePolicy>>,
156        ceiling: LivePolicy,
157        pid_overrides: Arc<RwLock<HashMap<u32, HashSet<IpAddr>>>>,
158        denied_paths: Arc<RwLock<HashSet<String>>>,
159    ) -> Self {
160        Self {
161            live,
162            ceiling,
163            restricted: HashSet::new(),
164            pid_overrides,
165            denied_paths,
166        }
167    }
168
169    /// Current effective policy (snapshot).
170    pub fn current(&self) -> LivePolicy {
171        self.live.read().unwrap().clone()
172    }
173
174    /// Maximum permissions (immutable ceiling).
175    pub fn ceiling(&self) -> &LivePolicy {
176        &self.ceiling
177    }
178
179    // ---- Grant (expand within ceiling) ----
180
181    /// Expand allowed IPs. Cannot exceed ceiling. Fails if restricted.
182    pub fn grant_network(&mut self, ips: &[IpAddr]) -> Result<(), PolicyFnError> {
183        self.check_not_restricted("allowed_ips")?;
184        let mut live = self.live.write().unwrap();
185        for ip in ips {
186            if self.ceiling.allowed_ips.contains(ip) {
187                live.allowed_ips.insert(*ip);
188            }
189        }
190        Ok(())
191    }
192
193    /// Expand max memory. Cannot exceed ceiling. Fails if restricted.
194    pub fn grant_max_memory(&mut self, bytes: u64) -> Result<(), PolicyFnError> {
195        self.check_not_restricted("max_memory_bytes")?;
196        let mut live = self.live.write().unwrap();
197        live.max_memory_bytes = bytes.min(self.ceiling.max_memory_bytes);
198        Ok(())
199    }
200
201    /// Expand max processes. Cannot exceed ceiling. Fails if restricted.
202    pub fn grant_max_processes(&mut self, n: u32) -> Result<(), PolicyFnError> {
203        self.check_not_restricted("max_processes")?;
204        let mut live = self.live.write().unwrap();
205        live.max_processes = n.min(self.ceiling.max_processes);
206        Ok(())
207    }
208
209    // ---- Restrict (permanent shrink) ----
210
211    /// Permanently restrict allowed IPs. Cannot be granted back.
212    pub fn restrict_network(&mut self, ips: &[IpAddr]) {
213        self.restricted.insert("allowed_ips");
214        let mut live = self.live.write().unwrap();
215        live.allowed_ips = ips.iter().copied().collect();
216    }
217
218    /// Permanently restrict max memory. Cannot be granted back.
219    pub fn restrict_max_memory(&mut self, bytes: u64) {
220        self.restricted.insert("max_memory_bytes");
221        let mut live = self.live.write().unwrap();
222        live.max_memory_bytes = bytes;
223    }
224
225    /// Permanently restrict max processes. Cannot be granted back.
226    pub fn restrict_max_processes(&mut self, n: u32) {
227        self.restricted.insert("max_processes");
228        let mut live = self.live.write().unwrap();
229        live.max_processes = n;
230    }
231
232    // ---- Per-PID overrides ----
233
234    /// Restrict network for a specific PID (tighter than global policy).
235    pub fn restrict_pid_network(&self, pid: u32, ips: &[IpAddr]) {
236        let mut overrides = self.pid_overrides.write().unwrap();
237        overrides.insert(pid, ips.iter().copied().collect());
238    }
239
240    /// Remove per-PID override, falling back to global policy.
241    pub fn clear_pid_override(&self, pid: u32) {
242        let mut overrides = self.pid_overrides.write().unwrap();
243        overrides.remove(&pid);
244    }
245
246    // ---- Filesystem restriction ----
247
248    /// Deny access to a path (and all children). Checked by the supervisor
249    /// on openat/stat/access syscalls. Takes effect immediately.
250    pub fn deny_path(&self, path: &str) {
251        let mut denied = self.denied_paths.write().unwrap();
252        denied.insert(path.to_string());
253    }
254
255    /// Remove a previously denied path.
256    pub fn allow_path(&self, path: &str) {
257        let mut denied = self.denied_paths.write().unwrap();
258        denied.remove(path);
259    }
260
261    // ---- Internal ----
262
263    fn check_not_restricted(&self, field: &str) -> Result<(), PolicyFnError> {
264        if self.restricted.contains(field) {
265            Err(PolicyFnError::FieldRestricted(field.to_string()))
266        } else {
267            Ok(())
268        }
269    }
270}
271
272// ============================================================
273// Error type
274// ============================================================
275
276/// Errors from policy callback operations.
277#[derive(Debug, thiserror::Error)]
278pub enum PolicyFnError {
279    #[error("cannot grant restricted field: {0}")]
280    FieldRestricted(String),
281}
282
283// ============================================================
284// PolicyCallback type
285// ============================================================
286
/// Decision a policy callback returns for the syscall it was shown.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Verdict {
    /// Let the syscall proceed. This is the default verdict.
    Allow,
    /// Let the syscall proceed, but flag it for audit logging.
    Audit,
    /// Reject the syscall with `EPERM`.
    Deny,
    /// Reject the syscall with the given errno.
    DenyWith(i32),
}

impl Default for Verdict {
    /// Defaults to `Verdict::Allow`.
    fn default() -> Self {
        Self::Allow
    }
}
303
304/// A callback function invoked for each intercepted syscall.
305///
306/// Called synchronously on a dedicated thread. For `execve` syscalls,
307/// the child process is held until the callback returns.
308///
309/// Return `Verdict::Deny` to block the current syscall. Only effective
310/// for held syscalls (execve/execveat) and network syscalls (connect/sendto).
311///
312/// Wrapped in `Arc` so that `Policy` remains `Clone`.
313pub type PolicyCallback = Arc<dyn Fn(SyscallEvent, &mut PolicyContext) -> Verdict + Send + Sync + 'static>;
314
315// ============================================================
316// Event channel types (used by supervisor integration)
317// ============================================================
318
319/// An event sent from the supervisor to the policy callback thread.
320pub struct PolicyEvent {
321    pub event: SyscallEvent,
322    /// If Some, the supervisor blocks until this is signaled.
323    /// Used for execve to allow pre-execution policy changes.
324    /// The Verdict is sent back to control allow/deny.
325    pub gate: Option<tokio::sync::oneshot::Sender<Verdict>>,
326}
327
328// ============================================================
329// Policy callback runner
330// ============================================================
331
332/// Spawn a thread that receives syscall events and calls the policy callback.
333///
334/// Returns a sender for the supervisor to push events into.
335pub(crate) fn spawn_policy_fn(
336    callback: PolicyCallback,
337    live: Arc<RwLock<LivePolicy>>,
338    ceiling: LivePolicy,
339    pid_overrides: Arc<RwLock<HashMap<u32, HashSet<IpAddr>>>>,
340    denied_paths: Arc<RwLock<HashSet<String>>>,
341) -> tokio::sync::mpsc::UnboundedSender<PolicyEvent> {
342    let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<PolicyEvent>();
343
344    std::thread::Builder::new()
345        .name("sandlock-policy-fn".to_string())
346        .spawn(move || {
347            let mut ctx = PolicyContext::new(live, ceiling, pid_overrides, denied_paths);
348
349            while let Some(pe) = rx.blocking_recv() {
350                let verdict = callback(pe.event, &mut ctx);
351
352                // Signal the supervisor with the verdict.
353                // For execve, this unblocks the child.
354                if let Some(gate) = pe.gate {
355                    let _ = gate.send(verdict);
356                }
357            }
358        })
359        .expect("failed to spawn policy-fn thread");
360
361    tx
362}
363
364// ============================================================
365// Tests
366// ============================================================
367
#[cfg(test)]
mod tests {
    use super::*;

    type SharedLive = Arc<RwLock<LivePolicy>>;
    type SharedOverrides = Arc<RwLock<HashMap<u32, HashSet<IpAddr>>>>;

    /// Policy used as the ceiling in every test and as the starting live
    /// policy in most of them.
    fn test_live() -> LivePolicy {
        let allowed_ips: HashSet<IpAddr> = ["127.0.0.1", "10.0.0.1"]
            .iter()
            .map(|addr| addr.parse().unwrap())
            .collect();
        LivePolicy {
            allowed_ips,
            max_memory_bytes: 1024 * 1024 * 1024,
            max_processes: 64,
        }
    }

    /// Live policy with nothing granted yet.
    fn empty_live() -> LivePolicy {
        LivePolicy {
            allowed_ips: HashSet::new(),
            max_memory_bytes: 0,
            max_processes: 0,
        }
    }

    /// Builds a context over `initial` with `test_live()` as ceiling, and
    /// returns it together with handles to the shared state it mutates.
    fn harness(initial: LivePolicy) -> (PolicyContext, SharedLive, SharedOverrides) {
        let live: SharedLive = Arc::new(RwLock::new(initial));
        let pid_overrides: SharedOverrides = Arc::new(RwLock::new(HashMap::new()));
        let denied_paths = Arc::new(RwLock::new(HashSet::new()));
        let ctx = PolicyContext::new(
            live.clone(),
            test_live(),
            pid_overrides.clone(),
            denied_paths,
        );
        (ctx, live, pid_overrides)
    }

    fn localhost() -> IpAddr {
        "127.0.0.1".parse().unwrap()
    }

    /// Event with no network or size information.
    fn event(
        syscall: &str,
        category: SyscallCategory,
        parent_pid: Option<u32>,
        argv: Option<Vec<String>>,
    ) -> SyscallEvent {
        SyscallEvent {
            syscall: syscall.to_string(),
            category,
            pid: 1,
            parent_pid,
            host: None,
            port: None,
            size: None,
            argv,
            denied: false,
        }
    }

    #[test]
    fn test_grant_within_ceiling() {
        let (mut ctx, live, _) = harness(empty_live());

        let ip = localhost();
        ctx.grant_network(&[ip]).unwrap();
        assert!(live.read().unwrap().allowed_ips.contains(&ip));
    }

    #[test]
    fn test_grant_capped_to_ceiling() {
        let (mut ctx, live, _) = harness(empty_live());

        // An IP outside the ceiling is silently ignored rather than rejected.
        let foreign: IpAddr = "8.8.8.8".parse().unwrap();
        ctx.grant_network(&[foreign]).unwrap();
        assert!(!live.read().unwrap().allowed_ips.contains(&foreign));
    }

    #[test]
    fn test_restrict_then_grant_fails() {
        let (mut ctx, _, _) = harness(test_live());

        ctx.restrict_network(&[]);
        assert!(ctx.grant_network(&[localhost()]).is_err());
    }

    #[test]
    fn test_restrict_max_memory() {
        let (mut ctx, live, _) = harness(test_live());

        ctx.restrict_max_memory(256 * 1024 * 1024);
        assert_eq!(live.read().unwrap().max_memory_bytes, 256 * 1024 * 1024);
    }

    #[test]
    fn test_pid_override() {
        let (ctx, _, pid_overrides) = harness(test_live());

        let ip = localhost();
        ctx.restrict_pid_network(1234, &[ip]);

        let overrides = pid_overrides.read().unwrap();
        let pid_ips = overrides.get(&1234).unwrap();
        assert!(pid_ips.contains(&ip));
        assert_eq!(pid_ips.len(), 1);
    }

    #[test]
    fn test_clear_pid_override() {
        let (ctx, _, pid_overrides) = harness(test_live());

        ctx.restrict_pid_network(1234, &[localhost()]);
        ctx.clear_pid_override(1234);
        assert!(!pid_overrides.read().unwrap().contains_key(&1234));
    }

    #[test]
    fn test_event_argv_contains() {
        let argv = vec!["python3".into(), "-c".into(), "print(1)".into()];
        let exec = event("execve", SyscallCategory::Process, Some(0), Some(argv));

        assert!(exec.argv_contains("python3"));
        assert!(exec.argv_contains("-c"));
        assert!(!exec.argv_contains("ruby"));
        assert_eq!(exec.category, SyscallCategory::Process);
    }

    #[test]
    fn test_event_argv_contains_none() {
        let open = event("openat", SyscallCategory::File, None, None);
        assert!(!open.argv_contains("anything"));
    }
}