// sandlock_core/seccomp/state.rs

1// Domain-specific state structs — each domain is locked independently so
2// handlers only contend on the state they actually need. Per-process
3// state is bundled into a single `PerProcessState` owned by
4// `ProcessIndex`; cleanup on exit is just dropping the entry's `Arc`.
5
6use std::collections::{HashMap, HashSet};
7use std::sync::Arc;
8use tokio::sync::Mutex as AsyncMutex;
9
/// Resource-limit runtime state shared across notification handlers.
/// Locked independently of the other domain states (see module docs),
/// so handlers that only touch resources don't contend elsewhere.
pub struct ResourceState {
    /// Live concurrent process count — incremented on fork, decremented on wait.
    pub proc_count: u32,
    /// Maximum allowed concurrent processes.
    pub max_processes: u32,
    /// Estimated anonymous memory usage (bytes).
    pub mem_used: u64,
    /// Maximum allowed anonymous memory (bytes).
    pub max_memory_bytes: u64,
    /// Whether fork notifications should be held (checkpoint/freeze).
    pub hold_forks: bool,
    /// Notification IDs held during a checkpoint freeze.
    pub held_notif_ids: Vec<u64>,
    /// Exponentially-weighted load average.
    pub load_avg: crate::procfs::LoadAvg,
    /// Instant when the supervisor started (for uptime reporting).
    pub start_instant: std::time::Instant,
}
29
30impl ResourceState {
31    /// Create a new resource state with the given limits.
32    pub fn new(max_memory_bytes: u64, max_processes: u32) -> Self {
33        Self {
34            proc_count: 0,
35            max_processes,
36            mem_used: 0,
37            max_memory_bytes,
38            hold_forks: false,
39            held_notif_ids: Vec::new(),
40            load_avg: crate::procfs::LoadAvg::new(),
41            start_instant: std::time::Instant::now(),
42        }
43    }
44}
45
46// ============================================================
47// ProcfsState — /proc virtualization state
48// ============================================================
49
/// /proc virtualization runtime state. Sandbox membership lives in
/// `ProcessIndex`; per-process getdents caches live in
/// `PerProcessState::procfs_dir_cache`. This struct only holds
/// truly global virtualization state.
pub struct ProcfsState {
    /// Base address of the last vDSO we patched (0 = not yet patched).
    pub vdso_patched_addr: u64,
}
58
59impl ProcfsState {
60    pub fn new() -> Self {
61        Self {
62            vdso_patched_addr: 0,
63        }
64    }
65}
66
67// ============================================================
68// PidKey — stable per-process identity
69// ============================================================
70
/// Stable process identity. Numeric pid plus the start_time that
/// distinguishes a specific process instance from any future recycle
/// of the same pid slot.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct PidKey {
    /// Numeric PID observed by seccomp notification.
    pub pid: i32,
    /// Process start time from /proc/<pid>/stat field 22 (clock ticks
    /// since boot — unique per instance of a given pid).
    pub start_time: u64,
}
81
/// Read the process start time (field 22 of /proc/<pid>/stat) for `pid`.
/// Returns None if the process is gone or /proc is not readable.
pub(crate) fn read_pid_start_time(pid: i32) -> Option<u64> {
    let stat_path = format!("/proc/{}/stat", pid);
    let stat = std::fs::read_to_string(stat_path).ok()?;
    // The line starts with "pid (comm)". comm may itself contain spaces
    // and parens, so split on the LAST ") " — everything after it is the
    // space-separated fields starting at field 3 (state).
    let (_, fields) = stat.rsplit_once(") ")?;
    // Fields start at 3 here, so field 22 (starttime) is the 20th token.
    fields.split_whitespace().nth(19)?.parse().ok()
}
92
93// ============================================================
94// PerProcessState — bundled per-process supervisor state
95// ============================================================
96
/// All per-process supervisor state for one tracked child. One
/// instance lives per `PidKey`, owned by `ProcessIndex` behind an
/// `Arc<AsyncMutex<…>>`. Cleanup on process exit is one operation:
/// `ProcessIndex::unregister` drops the index's `Arc`, and the
/// supervisor's per-handler clones drop along with their tasks.
#[derive(Default)]
pub struct PerProcessState {
    /// Logical cwd while the process is chdir'd into a COW-only
    /// directory. None means "use kernel-reported cwd".
    pub virtual_cwd: Option<String>,
    /// Recorded brk base for memory accounting. None until first brk.
    pub brk_base: Option<u64>,
    /// COW directory dirent cache. Keyed by child's fd; value is
    /// (host target path, sorted dirent bytes left to return).
    /// Entries are invalidated when the fd is reused for a different
    /// directory.
    pub cow_dir_cache: HashMap<u32, (String, Vec<Vec<u8>>)>,
    /// /proc directory dirent cache. Keyed by (child fd, target
    /// path); same drain-on-EOF semantics as cow_dir_cache.
    pub procfs_dir_cache: HashMap<(u32, String), Vec<Vec<u8>>>,
}
118
119// ============================================================
120// ProcessIndex — sandbox membership + per-process state
121// ============================================================
122
/// Source-of-truth registry for processes inside the sandbox.
///
/// Maps the kernel's numeric `pid` (the value that arrives in seccomp
/// notifications) to the canonical `PidKey` plus an
/// `Arc<AsyncMutex<PerProcessState>>` holding everything per-process.
/// Held behind an internal `std::sync::RwLock` so the read-mostly hot
/// paths (`key_for`, `contains`, `entry_for`, `/proc` virtualization)
/// avoid an async mutex on every notification, and so `ProcessIndex`
/// doesn't need its own outer wrapper in `SupervisorCtx`. Lock guards
/// are `!Send` and the compiler will reject holding one across an
/// `.await`, which keeps callers honest.
///
/// Ownership of each child's pidfd lives with the per-child watcher
/// task, not with this index. That keeps the kernel fd alive for as
/// long as the `AsyncFd` registration in the tokio IO driver does,
/// and avoids a race where dropping the fd from the index could
/// deregister a recycled fd from epoll.
pub struct ProcessIndex {
    /// pid → entry map. RwLock'd for the read-mostly access pattern
    /// described above; poisoned-lock reads degrade to "not found".
    inner: std::sync::RwLock<HashMap<i32, ProcessEntry>>,
}
143
/// One index entry: canonical identity plus the shared per-process
/// state handle. Cloning is cheap (Copy key + Arc refcount bump).
#[derive(Clone)]
struct ProcessEntry {
    /// Canonical (pid, start_time) identity for this registration.
    key: PidKey,
    /// Shared handle to all per-process supervisor state.
    state: Arc<AsyncMutex<PerProcessState>>,
}
149
150impl ProcessIndex {
151    pub fn new() -> Self {
152        Self {
153            inner: std::sync::RwLock::new(HashMap::new()),
154        }
155    }
156
157    /// Register a process by reading its start_time once and
158    /// allocating its `PerProcessState`. Returns the canonical key,
159    /// or None if the process is already gone. The caller is
160    /// responsible for keeping the pidfd alive — the per-child
161    /// watcher task does this via `AsyncFd<OwnedFd>`.
162    pub fn register(&self, pid: i32) -> Option<PidKey> {
163        let start_time = read_pid_start_time(pid)?;
164        let key = PidKey { pid, start_time };
165        let entry = ProcessEntry {
166            key,
167            state: Arc::new(AsyncMutex::new(PerProcessState::default())),
168        };
169        self.inner.write().ok()?.insert(pid, entry);
170        Some(key)
171    }
172
173    /// Look up the canonical PidKey for a notification's raw pid.
174    /// Returns None if this pid was never registered (e.g. pidfd_open
175    /// failed at fork) — callers should fall back to a no-op.
176    pub fn key_for(&self, pid: i32) -> Option<PidKey> {
177        self.inner.read().ok()?.get(&pid).map(|e| e.key)
178    }
179
180    /// Look up both the PidKey and the per-process state handle for
181    /// `pid`. Returns None if the pid isn't tracked. The caller locks
182    /// the returned `Arc<AsyncMutex<…>>` to read or mutate.
183    pub fn entry_for(&self, pid: i32) -> Option<(PidKey, Arc<AsyncMutex<PerProcessState>>)> {
184        self.inner
185            .read()
186            .ok()?
187            .get(&pid)
188            .map(|e| (e.key, Arc::clone(&e.state)))
189    }
190
191    /// Cheap membership test — used by /proc virtualization to gate
192    /// access to `/proc/<pid>/...` paths and by getdents filtering.
193    pub fn contains(&self, pid: i32) -> bool {
194        self.inner
195            .read()
196            .map(|g| g.contains_key(&pid))
197            .unwrap_or(false)
198    }
199
200    /// Number of tracked processes (for /proc/loadavg total).
201    pub fn len(&self) -> usize {
202        self.inner.read().map(|g| g.len()).unwrap_or(0)
203    }
204
205    /// Largest tracked pid (for /proc/loadavg last_pid).
206    pub fn max_pid(&self) -> Option<i32> {
207        self.inner.read().ok()?.keys().copied().max()
208    }
209
210    /// Snapshot the set of tracked pids. Used by getdents filtering
211    /// where the caller needs O(1) lookups inside a loop and would
212    /// otherwise have to re-acquire the read lock per entry.
213    pub fn pids_snapshot(&self) -> HashSet<i32> {
214        self.inner
215            .read()
216            .map(|g| g.keys().copied().collect())
217            .unwrap_or_default()
218    }
219
220    /// Remove a process from the index. The per-process state's
221    /// `Arc` reference held by the index drops here; remaining clones
222    /// (e.g. a handler that's mid-execution for that pid) will drop
223    /// when they go out of scope, and the inner `PerProcessState`
224    /// frees automatically.
225    pub fn unregister(&self, key: PidKey) {
226        if let Ok(mut g) = self.inner.write() {
227            // Only clear if the entry still points at this key. A PID
228            // recycled with a fresh start_time may already have
229            // overwritten the entry via register(); we must not stomp it.
230            if g.get(&key.pid).map(|e| e.key) == Some(key) {
231                g.remove(&key.pid);
232            }
233        }
234    }
235
236    /// Defensive sweep: drop entries whose process is gone (or whose
237    /// start_time has changed). Called from a low-frequency backstop
238    /// task in case a pidfd watcher failed to spawn or the kernel
239    /// didn't deliver the readability event.
240    pub fn prune_dead(&self) {
241        let candidates: Vec<(i32, PidKey)> = match self.inner.read() {
242            Ok(g) => g.iter().map(|(p, e)| (*p, e.key)).collect(),
243            Err(_) => return,
244        };
245        let mut dead = Vec::new();
246        for (pid, key) in candidates {
247            match read_pid_start_time(pid) {
248                Some(st) if st == key.start_time => continue,
249                _ => dead.push(key),
250            }
251        }
252        if dead.is_empty() {
253            return;
254        }
255        if let Ok(mut g) = self.inner.write() {
256            for key in dead {
257                if g.get(&key.pid).map(|e| e.key) == Some(key) {
258                    g.remove(&key.pid);
259                }
260            }
261        }
262    }
263}
264
265impl Default for ProcessIndex {
266    fn default() -> Self {
267        Self::new()
268    }
269}
270
271// ============================================================
272// CowState — copy-on-write filesystem state (global only)
273// ============================================================
274
/// Global COW state. Per-process COW state (virtual cwd, dir cache)
/// lives in `PerProcessState`.
pub struct CowState {
    /// Seccomp-based COW branch (None if COW disabled).
    pub branch: Option<crate::cow::seccomp::SeccompCowBranch>,
}
281
282impl CowState {
283    pub fn new() -> Self {
284        Self { branch: None }
285    }
286}
287
288// ============================================================
289// NetworkState — network policy and port remapping state
290// ============================================================
291
/// Network policy and port-remapping state.
pub struct NetworkState {
    /// Global network policy: unrestricted or limited to a set of IPs.
    pub network_policy: crate::seccomp::notif::NetworkPolicy,
    /// Port binding and remapping tracker.
    pub port_map: crate::port_remap::PortMap,
    /// Per-PID network overrides from policy_fn. Keyed by raw numeric
    /// pid (u32, as delivered in policy events).
    pub pid_ip_overrides: std::sync::Arc<std::sync::RwLock<HashMap<u32, HashSet<std::net::IpAddr>>>>,
    /// HTTP ACL proxy address (None if HTTP ACL not active).
    pub http_acl_addr: Option<std::net::SocketAddr>,
    /// TCP ports to intercept and redirect to the HTTP ACL proxy.
    pub http_acl_ports: HashSet<u16>,
    /// Shared map for recording original destination IPs on proxy redirect.
    pub http_acl_orig_dest: Option<crate::http_acl::OrigDestMap>,
}
307
308impl NetworkState {
309    pub fn new() -> Self {
310        Self {
311            network_policy: crate::seccomp::notif::NetworkPolicy::Unrestricted,
312            port_map: crate::port_remap::PortMap::new(),
313            pid_ip_overrides: std::sync::Arc::new(std::sync::RwLock::new(HashMap::new())),
314            http_acl_addr: None,
315            http_acl_ports: HashSet::new(),
316            http_acl_orig_dest: None,
317        }
318    }
319
320    /// Get the effective network policy for a PID.
321    ///
322    /// Priority: per-PID override > live policy (from PolicyFnState) > global network_policy.
323    /// The `live_policy` parameter allows checking the live policy without needing
324    /// to lock the PolicyFnState mutex.
325    pub fn effective_network_policy(
326        &self,
327        pid: u32,
328        live_policy: Option<&std::sync::Arc<std::sync::RwLock<crate::policy_fn::LivePolicy>>>,
329    ) -> crate::seccomp::notif::NetworkPolicy {
330        if let Ok(overrides) = self.pid_ip_overrides.read() {
331            if let Some(ips) = overrides.get(&pid) {
332                return crate::seccomp::notif::NetworkPolicy::AllowList(ips.clone());
333            }
334        }
335        if let Some(lp) = live_policy {
336            if let Ok(live) = lp.read() {
337                if !live.allowed_ips.is_empty() {
338                    return crate::seccomp::notif::NetworkPolicy::AllowList(live.allowed_ips.clone());
339                }
340            }
341        }
342        self.network_policy.clone()
343    }
344}
345
346// ============================================================
347// TimeRandomState — deterministic time/random state
348// ============================================================
349
/// Time offset and deterministic random state.
pub struct TimeRandomState {
    /// Clock offset for time virtualization. None = no offset applied.
    pub time_offset: Option<i64>,
    /// Deterministic PRNG state (seeded from policy). None = real randomness.
    pub random_state: Option<rand_chacha::ChaCha8Rng>,
}
357
358impl TimeRandomState {
359    pub fn new(time_offset: Option<i64>, random_state: Option<rand_chacha::ChaCha8Rng>) -> Self {
360        Self { time_offset, random_state }
361    }
362}
363
364// ============================================================
365// PolicyFnState — dynamic policy callback state
366// ============================================================
367
/// Dynamic policy callback state.
pub struct PolicyFnState {
    /// Event sender for dynamic policy callback (None if no policy_fn).
    pub event_tx: Option<tokio::sync::mpsc::UnboundedSender<crate::policy_fn::PolicyEvent>>,
    /// Shared live policy for dynamic updates (None if no policy_fn).
    pub live_policy: Option<std::sync::Arc<std::sync::RwLock<crate::policy_fn::LivePolicy>>>,
    /// Dynamically denied paths from policy_fn. Shared (`Arc`) so the
    /// policy task can add entries while handlers read them.
    pub denied_paths: std::sync::Arc<std::sync::RwLock<HashSet<String>>>,
}
377
378impl PolicyFnState {
379    pub fn new() -> Self {
380        Self {
381            event_tx: None,
382            live_policy: None,
383            denied_paths: std::sync::Arc::new(std::sync::RwLock::new(HashSet::new())),
384        }
385    }
386
387    /// Check if a path is dynamically denied.
388    pub fn is_path_denied(&self, path: &str) -> bool {
389        if let Ok(denied) = self.denied_paths.read() {
390            let path = std::path::Path::new(path);
391            denied.iter().any(|d| path.starts_with(std::path::Path::new(d)))
392        } else {
393            false
394        }
395    }
396}
397
398// ============================================================
399// ChrootState — chroot-specific runtime state
400// ============================================================
401
/// Chroot-specific runtime state.
pub struct ChrootState {
    /// Virtual exe path for chroot (set by handle_chroot_exec when memfd patching
    /// rewrites PT_INTERP, since /proc/self/exe would otherwise show the memfd path).
    pub chroot_exe: Option<std::path::PathBuf>,
}
408
409impl ChrootState {
410    pub fn new() -> Self {
411        Self { chroot_exe: None }
412    }
413}
414
#[cfg(test)]
mod tests {
    use super::*;

    /// Current process pid as i32 — `std::process::id()` replaces the
    /// previous `unsafe { libc::getpid() }` (no unsafe, no libc needed).
    fn own_pid() -> i32 {
        std::process::id() as i32
    }

    #[test]
    fn process_index_register_lookup_unregister() {
        let self_pid = own_pid();
        let idx = ProcessIndex::new();
        let key = idx
            .register(self_pid)
            .expect("register should succeed for live pid");
        assert_eq!(key.pid, self_pid);

        assert_eq!(idx.key_for(self_pid), Some(key));
        assert!(idx.contains(self_pid));
        assert_eq!(idx.key_for(self_pid + 999_999), None);
        assert!(!idx.contains(self_pid + 999_999));
        assert_eq!(idx.len(), 1);
        assert_eq!(idx.max_pid(), Some(self_pid));

        idx.unregister(key);
        assert_eq!(idx.key_for(self_pid), None);
        assert!(!idx.contains(self_pid));
        assert_eq!(idx.len(), 0);
        assert_eq!(idx.max_pid(), None);
    }

    #[test]
    fn process_index_register_overwrites_stale_entry_for_recycled_pid() {
        let self_pid = own_pid();
        let idx = ProcessIndex::new();
        // Forge a stale entry by direct insertion under the lock.
        {
            let stale_key = PidKey { pid: self_pid, start_time: 0 };
            let stale = ProcessEntry {
                key: stale_key,
                state: Arc::new(AsyncMutex::new(PerProcessState::default())),
            };
            idx.inner.write().unwrap().insert(self_pid, stale);
        }

        let new_key = idx.register(self_pid).unwrap();
        assert_ne!(new_key.start_time, 0);
        assert_eq!(idx.key_for(self_pid), Some(new_key));

        // Unregistering by the stale key must NOT clobber the fresh
        // registration; only an exact-match unregister wins.
        let stale_key = PidKey { pid: self_pid, start_time: 0 };
        idx.unregister(stale_key);
        assert_eq!(idx.key_for(self_pid), Some(new_key));
    }

    #[tokio::test]
    async fn process_index_entry_for_returns_shared_handle() {
        let self_pid = own_pid();
        let idx = ProcessIndex::new();
        let key = idx.register(self_pid).unwrap();

        let (k1, s1) = idx.entry_for(self_pid).unwrap();
        let (k2, s2) = idx.entry_for(self_pid).unwrap();
        assert_eq!(k1, key);
        assert_eq!(k2, key);

        // Two clones of the same Arc — writes through one are visible
        // through the other.
        s1.lock().await.brk_base = Some(0xdead_beef);
        assert_eq!(s2.lock().await.brk_base, Some(0xdead_beef));

        // After unregister, entry_for returns None but existing Arc
        // clones stay valid (kept alive by callers).
        idx.unregister(key);
        assert!(idx.entry_for(self_pid).is_none());
        assert_eq!(s1.lock().await.brk_base, Some(0xdead_beef));
    }

    #[test]
    fn process_index_pids_snapshot_is_independent() {
        let self_pid = own_pid();
        let idx = ProcessIndex::new();
        let key = idx.register(self_pid).unwrap();
        let snap = idx.pids_snapshot();
        idx.unregister(key);
        assert!(snap.contains(&self_pid));
        assert!(!idx.contains(self_pid));
    }

    #[test]
    fn process_index_prune_dead_drops_recycled_entries() {
        let self_pid = own_pid();
        let idx = ProcessIndex::new();
        // Insert a stale entry for self with a wrong start_time.
        let stale_key = PidKey { pid: self_pid, start_time: 0 };
        let stale = ProcessEntry {
            key: stale_key,
            state: Arc::new(AsyncMutex::new(PerProcessState::default())),
        };
        idx.inner.write().unwrap().insert(self_pid, stale);

        idx.prune_dead();
        assert!(!idx.contains(self_pid));
    }

    #[test]
    fn process_index_prune_dead_keeps_live_entries() {
        let self_pid = own_pid();
        let idx = ProcessIndex::new();
        let key = idx.register(self_pid).unwrap();
        idx.prune_dead();
        assert_eq!(idx.key_for(self_pid), Some(key));
    }
}