// sandlock_core/seccomp/state.rs
// Domain-specific state structs — each domain is locked independently so
// handlers only contend on the state they actually need. Per-process
// state is bundled into a single `PerProcessState` owned by
// `ProcessIndex`; cleanup on exit is just dropping the entry's `Arc`.

6use std::collections::{HashMap, HashSet};
7use std::sync::Arc;
8use tokio::sync::Mutex as AsyncMutex;
9
/// Resource-limit runtime state shared across notification handlers.
/// Locked as one unit — see the module comment on per-domain locking.
pub struct ResourceState {
    /// Live concurrent process count — incremented on fork, decremented on wait.
    pub proc_count: u32,
    /// Maximum allowed concurrent processes.
    pub max_processes: u32,
    /// Estimated anonymous memory usage (bytes).
    pub mem_used: u64,
    /// Maximum allowed anonymous memory (bytes).
    pub max_memory_bytes: u64,
    /// Whether fork notifications should be held (checkpoint/freeze).
    pub hold_forks: bool,
    /// Notification IDs held during a checkpoint freeze.
    pub held_notif_ids: Vec<u64>,
    /// Exponentially-weighted load average.
    pub load_avg: crate::procfs::LoadAvg,
    /// Instant when the supervisor started (for uptime reporting).
    pub start_instant: std::time::Instant,
}
29
30impl ResourceState {
31 /// Create a new resource state with the given limits.
32 pub fn new(max_memory_bytes: u64, max_processes: u32) -> Self {
33 Self {
34 proc_count: 0,
35 max_processes,
36 mem_used: 0,
37 max_memory_bytes,
38 hold_forks: false,
39 held_notif_ids: Vec::new(),
40 load_avg: crate::procfs::LoadAvg::new(),
41 start_instant: std::time::Instant::now(),
42 }
43 }
44}
45
// ============================================================
// ProcfsState — /proc virtualization state
// ============================================================

/// /proc virtualization runtime state. Sandbox membership lives in
/// `ProcessIndex`; per-process getdents caches live in
/// `PerProcessState::procfs_dir_cache`. This struct only holds
/// truly global virtualization state.
///
/// Derives `Default` (zeroed, i.e. "nothing patched yet") so it
/// matches `new()` and is consistent with `ProcessIndex::default`.
#[derive(Default)]
pub struct ProcfsState {
    /// Base address of the last vDSO we patched (0 = not yet patched).
    pub vdso_patched_addr: u64,
}

impl ProcfsState {
    /// Fresh state: no vDSO patched yet.
    pub fn new() -> Self {
        Self {
            vdso_patched_addr: 0,
        }
    }
}
66
// ============================================================
// PidKey — stable per-process identity
// ============================================================

/// Stable process identity. Numeric pid plus the start_time that
/// distinguishes a specific process instance from any future recycle
/// of the same pid slot.
///
/// `Copy + Eq + Hash` so keys can be passed by value and used as
/// map keys.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct PidKey {
    /// Numeric PID observed by seccomp notification.
    pub pid: i32,
    /// Process start time from /proc/<pid>/stat field 22.
    pub start_time: u64,
}
81
/// Read the process start time (field 22 of /proc/<pid>/stat) for `pid`.
/// Returns None if the process is gone or /proc is not readable.
pub(crate) fn read_pid_start_time(pid: i32) -> Option<u64> {
    let contents = std::fs::read_to_string(format!("/proc/{}/stat", pid)).ok()?;
    // The stat line is "pid (comm) field3 field4 …". comm may itself
    // contain spaces and parens, so scan from the right: the last ") "
    // is the one that terminates the comm field.
    let (_, after_comm) = contents.rsplit_once(") ")?;
    // `after_comm` begins at field 3 (state), so field 22 is nth(19).
    after_comm.split_whitespace().nth(19)?.parse().ok()
}
92
// ============================================================
// PerProcessState — bundled per-process supervisor state
// ============================================================

/// All per-process supervisor state for one tracked child. One
/// instance lives per `PidKey`, owned by `ProcessIndex` behind an
/// `Arc<AsyncMutex<…>>`. Cleanup on process exit is one operation:
/// `ProcessIndex::unregister` drops the index's `Arc`, and the
/// supervisor's per-handler clones drop along with their tasks.
///
/// `Default` yields the "nothing virtualized yet" state: no virtual
/// cwd, no brk recorded, empty dirent caches.
#[derive(Default)]
pub struct PerProcessState {
    /// Logical cwd while the process is chdir'd into a COW-only
    /// directory. None means "use kernel-reported cwd".
    pub virtual_cwd: Option<String>,
    /// Recorded brk base for memory accounting. None until first brk.
    pub brk_base: Option<u64>,
    /// COW directory dirent cache. Keyed by child's fd; value is
    /// (host target path, sorted dirent bytes left to return).
    /// Entries are invalidated when the fd is reused for a different
    /// directory.
    pub cow_dir_cache: HashMap<u32, (String, Vec<Vec<u8>>)>,
    /// /proc directory dirent cache. Keyed by (child fd, target
    /// path); same drain-on-EOF semantics as cow_dir_cache.
    pub procfs_dir_cache: HashMap<(u32, String), Vec<Vec<u8>>>,
}
118
// ============================================================
// ProcessIndex — sandbox membership + per-process state
// ============================================================

/// Source-of-truth registry for processes inside the sandbox.
///
/// Maps the kernel's numeric `pid` (the value that arrives in seccomp
/// notifications) to the canonical `PidKey` plus an
/// `Arc<AsyncMutex<PerProcessState>>` holding everything per-process.
/// Held behind an internal `std::sync::RwLock` so the read-mostly hot
/// paths (`key_for`, `contains`, `entry_for`, `/proc` virtualization)
/// avoid an async mutex on every notification, and so `ProcessIndex`
/// doesn't need its own outer wrapper in `SupervisorCtx`. Lock guards
/// are `!Send` and the compiler will reject holding one across an
/// `.await`, which keeps callers honest.
///
/// Ownership of each child's pidfd lives with the per-child watcher
/// task, not with this index. That keeps the kernel fd alive for as
/// long as the `AsyncFd` registration in the tokio IO driver does,
/// and avoids a race where dropping the fd from the index could
/// deregister a recycled fd from epoll.
pub struct ProcessIndex {
    /// pid → entry. Sync RwLock by design — see the type-level
    /// comment for why this is not an async lock.
    inner: std::sync::RwLock<HashMap<i32, ProcessEntry>>,
}

/// One tracked process: canonical identity plus the shared
/// per-process state handle. `Clone` is cheap (a `Copy` key plus an
/// `Arc` refcount bump).
#[derive(Clone)]
struct ProcessEntry {
    key: PidKey,
    state: Arc<AsyncMutex<PerProcessState>>,
}
149
150impl ProcessIndex {
151 pub fn new() -> Self {
152 Self {
153 inner: std::sync::RwLock::new(HashMap::new()),
154 }
155 }
156
157 /// Register a process by reading its start_time once and
158 /// allocating its `PerProcessState`. Returns the canonical key,
159 /// or None if the process is already gone. The caller is
160 /// responsible for keeping the pidfd alive — the per-child
161 /// watcher task does this via `AsyncFd<OwnedFd>`.
162 pub fn register(&self, pid: i32) -> Option<PidKey> {
163 let start_time = read_pid_start_time(pid)?;
164 let key = PidKey { pid, start_time };
165 let entry = ProcessEntry {
166 key,
167 state: Arc::new(AsyncMutex::new(PerProcessState::default())),
168 };
169 self.inner.write().ok()?.insert(pid, entry);
170 Some(key)
171 }
172
173 /// Look up the canonical PidKey for a notification's raw pid.
174 /// Returns None if this pid was never registered (e.g. pidfd_open
175 /// failed at fork) — callers should fall back to a no-op.
176 pub fn key_for(&self, pid: i32) -> Option<PidKey> {
177 self.inner.read().ok()?.get(&pid).map(|e| e.key)
178 }
179
180 /// Look up both the PidKey and the per-process state handle for
181 /// `pid`. Returns None if the pid isn't tracked. The caller locks
182 /// the returned `Arc<AsyncMutex<…>>` to read or mutate.
183 pub fn entry_for(&self, pid: i32) -> Option<(PidKey, Arc<AsyncMutex<PerProcessState>>)> {
184 self.inner
185 .read()
186 .ok()?
187 .get(&pid)
188 .map(|e| (e.key, Arc::clone(&e.state)))
189 }
190
191 /// Cheap membership test — used by /proc virtualization to gate
192 /// access to `/proc/<pid>/...` paths and by getdents filtering.
193 pub fn contains(&self, pid: i32) -> bool {
194 self.inner
195 .read()
196 .map(|g| g.contains_key(&pid))
197 .unwrap_or(false)
198 }
199
200 /// Number of tracked processes (for /proc/loadavg total).
201 pub fn len(&self) -> usize {
202 self.inner.read().map(|g| g.len()).unwrap_or(0)
203 }
204
205 /// Largest tracked pid (for /proc/loadavg last_pid).
206 pub fn max_pid(&self) -> Option<i32> {
207 self.inner.read().ok()?.keys().copied().max()
208 }
209
210 /// Snapshot the set of tracked pids. Used by getdents filtering
211 /// where the caller needs O(1) lookups inside a loop and would
212 /// otherwise have to re-acquire the read lock per entry.
213 pub fn pids_snapshot(&self) -> HashSet<i32> {
214 self.inner
215 .read()
216 .map(|g| g.keys().copied().collect())
217 .unwrap_or_default()
218 }
219
220 /// Remove a process from the index. The per-process state's
221 /// `Arc` reference held by the index drops here; remaining clones
222 /// (e.g. a handler that's mid-execution for that pid) will drop
223 /// when they go out of scope, and the inner `PerProcessState`
224 /// frees automatically.
225 pub fn unregister(&self, key: PidKey) {
226 if let Ok(mut g) = self.inner.write() {
227 // Only clear if the entry still points at this key. A PID
228 // recycled with a fresh start_time may already have
229 // overwritten the entry via register(); we must not stomp it.
230 if g.get(&key.pid).map(|e| e.key) == Some(key) {
231 g.remove(&key.pid);
232 }
233 }
234 }
235
236 /// Defensive sweep: drop entries whose process is gone (or whose
237 /// start_time has changed). Called from a low-frequency backstop
238 /// task in case a pidfd watcher failed to spawn or the kernel
239 /// didn't deliver the readability event.
240 pub fn prune_dead(&self) {
241 let candidates: Vec<(i32, PidKey)> = match self.inner.read() {
242 Ok(g) => g.iter().map(|(p, e)| (*p, e.key)).collect(),
243 Err(_) => return,
244 };
245 let mut dead = Vec::new();
246 for (pid, key) in candidates {
247 match read_pid_start_time(pid) {
248 Some(st) if st == key.start_time => continue,
249 _ => dead.push(key),
250 }
251 }
252 if dead.is_empty() {
253 return;
254 }
255 if let Ok(mut g) = self.inner.write() {
256 for key in dead {
257 if g.get(&key.pid).map(|e| e.key) == Some(key) {
258 g.remove(&key.pid);
259 }
260 }
261 }
262 }
263}
264
impl Default for ProcessIndex {
    /// Equivalent to [`ProcessIndex::new`] — an empty index.
    fn default() -> Self {
        Self::new()
    }
}
270
271// ============================================================
272// CowState — copy-on-write filesystem state (global only)
273// ============================================================
274
275/// Global COW state. Per-process COW state (virtual cwd, dir cache)
276/// lives in `PerProcessState`.
277pub struct CowState {
278 /// Seccomp-based COW branch (None if COW disabled).
279 pub branch: Option<crate::cow::seccomp::SeccompCowBranch>,
280}
281
282impl CowState {
283 pub fn new() -> Self {
284 Self { branch: None }
285 }
286}
287
// ============================================================
// NetworkState — network policy and port remapping state
// ============================================================

/// Network policy and port-remapping state.
pub struct NetworkState {
    /// Global network policy: unrestricted or limited to a set of IPs.
    pub network_policy: crate::seccomp::notif::NetworkPolicy,
    /// Port binding and remapping tracker.
    pub port_map: crate::port_remap::PortMap,
    /// Per-PID network overrides from policy_fn. Behind its own
    /// `Arc<RwLock<…>>` so the handle can be cloned and updated
    /// independently of this struct.
    pub pid_ip_overrides: std::sync::Arc<std::sync::RwLock<HashMap<u32, HashSet<std::net::IpAddr>>>>,
    /// HTTP ACL proxy address (None if HTTP ACL not active).
    pub http_acl_addr: Option<std::net::SocketAddr>,
    /// TCP ports to intercept and redirect to the HTTP ACL proxy.
    pub http_acl_ports: HashSet<u16>,
    /// Shared map for recording original destination IPs on proxy redirect.
    pub http_acl_orig_dest: Option<crate::http_acl::OrigDestMap>,
}
307
308impl NetworkState {
309 pub fn new() -> Self {
310 Self {
311 network_policy: crate::seccomp::notif::NetworkPolicy::Unrestricted,
312 port_map: crate::port_remap::PortMap::new(),
313 pid_ip_overrides: std::sync::Arc::new(std::sync::RwLock::new(HashMap::new())),
314 http_acl_addr: None,
315 http_acl_ports: HashSet::new(),
316 http_acl_orig_dest: None,
317 }
318 }
319
320 /// Get the effective network policy for a PID.
321 ///
322 /// Priority: per-PID override > live policy (from PolicyFnState) > global network_policy.
323 /// The `live_policy` parameter allows checking the live policy without needing
324 /// to lock the PolicyFnState mutex.
325 pub fn effective_network_policy(
326 &self,
327 pid: u32,
328 live_policy: Option<&std::sync::Arc<std::sync::RwLock<crate::policy_fn::LivePolicy>>>,
329 ) -> crate::seccomp::notif::NetworkPolicy {
330 if let Ok(overrides) = self.pid_ip_overrides.read() {
331 if let Some(ips) = overrides.get(&pid) {
332 return crate::seccomp::notif::NetworkPolicy::AllowList(ips.clone());
333 }
334 }
335 if let Some(lp) = live_policy {
336 if let Ok(live) = lp.read() {
337 if !live.allowed_ips.is_empty() {
338 return crate::seccomp::notif::NetworkPolicy::AllowList(live.allowed_ips.clone());
339 }
340 }
341 }
342 self.network_policy.clone()
343 }
344}
345
346// ============================================================
347// TimeRandomState — deterministic time/random state
348// ============================================================
349
350/// Time offset and deterministic random state.
351pub struct TimeRandomState {
352 /// Clock offset for time virtualization.
353 pub time_offset: Option<i64>,
354 /// Deterministic PRNG state (seeded from policy).
355 pub random_state: Option<rand_chacha::ChaCha8Rng>,
356}
357
358impl TimeRandomState {
359 pub fn new(time_offset: Option<i64>, random_state: Option<rand_chacha::ChaCha8Rng>) -> Self {
360 Self { time_offset, random_state }
361 }
362}
363
// ============================================================
// PolicyFnState — dynamic policy callback state
// ============================================================

/// Dynamic policy callback state.
pub struct PolicyFnState {
    /// Event sender for dynamic policy callback (None if no policy_fn).
    pub event_tx: Option<tokio::sync::mpsc::UnboundedSender<crate::policy_fn::PolicyEvent>>,
    /// Shared live policy for dynamic updates (None if no policy_fn).
    pub live_policy: Option<std::sync::Arc<std::sync::RwLock<crate::policy_fn::LivePolicy>>>,
    /// Dynamically denied paths from policy_fn. Always allocated
    /// (starts empty) even when no policy_fn is configured.
    pub denied_paths: std::sync::Arc<std::sync::RwLock<HashSet<String>>>,
}
377
378impl PolicyFnState {
379 pub fn new() -> Self {
380 Self {
381 event_tx: None,
382 live_policy: None,
383 denied_paths: std::sync::Arc::new(std::sync::RwLock::new(HashSet::new())),
384 }
385 }
386
387 /// Check if a path is dynamically denied.
388 pub fn is_path_denied(&self, path: &str) -> bool {
389 if let Ok(denied) = self.denied_paths.read() {
390 let path = std::path::Path::new(path);
391 denied.iter().any(|d| path.starts_with(std::path::Path::new(d)))
392 } else {
393 false
394 }
395 }
396}
397
// ============================================================
// ChrootState — chroot-specific runtime state
// ============================================================

/// Chroot-specific runtime state.
///
/// Derives `Default` (no virtual exe path), matching `new()` and
/// consistent with `ProcessIndex::default`.
#[derive(Default)]
pub struct ChrootState {
    /// Virtual exe path for chroot (set by handle_chroot_exec when memfd patching
    /// rewrites PT_INTERP, since /proc/self/exe would otherwise show the memfd path).
    pub chroot_exe: Option<std::path::PathBuf>,
}

impl ChrootState {
    /// Fresh state with no virtual exe path recorded.
    pub fn new() -> Self {
        Self { chroot_exe: None }
    }
}
414
415#[cfg(test)]
416mod tests {
417 use super::*;
418
    // Full register → lookup → unregister round-trip against our own
    // (always-live) pid.
    #[test]
    fn process_index_register_lookup_unregister() {
        let self_pid = unsafe { libc::getpid() };
        let idx = ProcessIndex::new();
        let key = idx
            .register(self_pid)
            .expect("register should succeed for live pid");
        assert_eq!(key.pid, self_pid);

        assert_eq!(idx.key_for(self_pid), Some(key));
        assert!(idx.contains(self_pid));
        // A pid far above ours was never registered.
        assert_eq!(idx.key_for(self_pid + 999_999), None);
        assert!(!idx.contains(self_pid + 999_999));
        assert_eq!(idx.len(), 1);
        assert_eq!(idx.max_pid(), Some(self_pid));

        idx.unregister(key);
        assert_eq!(idx.key_for(self_pid), None);
        assert!(!idx.contains(self_pid));
        assert_eq!(idx.len(), 0);
        assert_eq!(idx.max_pid(), None);
    }
441
    // A recycled pid gets a fresh start_time: register() must replace
    // the stale entry, and a stale-key unregister must not remove the
    // fresh one.
    #[test]
    fn process_index_register_overwrites_stale_entry_for_recycled_pid() {
        let self_pid = unsafe { libc::getpid() };
        let idx = ProcessIndex::new();
        // Forge a stale entry by direct insertion under the lock.
        {
            let stale_key = PidKey { pid: self_pid, start_time: 0 };
            let stale = ProcessEntry {
                key: stale_key,
                state: Arc::new(AsyncMutex::new(PerProcessState::default())),
            };
            idx.inner.write().unwrap().insert(self_pid, stale);
        }

        let new_key = idx.register(self_pid).unwrap();
        assert_ne!(new_key.start_time, 0);
        assert_eq!(idx.key_for(self_pid), Some(new_key));

        // Unregistering by the stale key must NOT clobber the fresh
        // registration; only an exact-match unregister wins.
        let stale_key = PidKey { pid: self_pid, start_time: 0 };
        idx.unregister(stale_key);
        assert_eq!(idx.key_for(self_pid), Some(new_key));
    }
466
    // entry_for hands out clones of one shared Arc: writes through one
    // handle are visible through the other, and handles outlive
    // unregistration.
    #[tokio::test]
    async fn process_index_entry_for_returns_shared_handle() {
        let self_pid = unsafe { libc::getpid() };
        let idx = ProcessIndex::new();
        let key = idx.register(self_pid).unwrap();

        let (k1, s1) = idx.entry_for(self_pid).unwrap();
        let (k2, s2) = idx.entry_for(self_pid).unwrap();
        assert_eq!(k1, key);
        assert_eq!(k2, key);

        // Two clones of the same Arc — writes through one are visible
        // through the other.
        s1.lock().await.brk_base = Some(0xdead_beef);
        assert_eq!(s2.lock().await.brk_base, Some(0xdead_beef));

        // After unregister, entry_for returns None but existing Arc
        // clones stay valid (kept alive by callers).
        idx.unregister(key);
        assert!(idx.entry_for(self_pid).is_none());
        assert_eq!(s1.lock().await.brk_base, Some(0xdead_beef));
    }
489
    // pids_snapshot returns a detached copy: unregistering afterwards
    // does not mutate the snapshot.
    #[test]
    fn process_index_pids_snapshot_is_independent() {
        let self_pid = unsafe { libc::getpid() };
        let idx = ProcessIndex::new();
        let key = idx.register(self_pid).unwrap();
        let snap = idx.pids_snapshot();
        idx.unregister(key);
        assert!(snap.contains(&self_pid));
        assert!(!idx.contains(self_pid));
    }
500
    // An entry whose start_time no longer matches the live process
    // (i.e. a recycled pid slot) is swept by prune_dead.
    #[test]
    fn process_index_prune_dead_drops_recycled_entries() {
        let self_pid = unsafe { libc::getpid() };
        let idx = ProcessIndex::new();
        // Insert a stale entry for self with a wrong start_time.
        let stale_key = PidKey { pid: self_pid, start_time: 0 };
        let stale = ProcessEntry {
            key: stale_key,
            state: Arc::new(AsyncMutex::new(PerProcessState::default())),
        };
        idx.inner.write().unwrap().insert(self_pid, stale);

        idx.prune_dead();
        assert!(!idx.contains(self_pid));
    }
516
517 #[test]
518 fn process_index_prune_dead_keeps_live_entries() {
519 let self_pid = unsafe { libc::getpid() };
520 let idx = ProcessIndex::new();
521 let key = idx.register(self_pid).unwrap();
522 idx.prune_dead();
523 assert_eq!(idx.key_for(self_pid), Some(key));
524 }
525}