Skip to main content

sandbox_seccomp/
profile.rs

1//! Seccomp filter building and management
2
3use sandbox_core::{Result, SandboxError};
4use std::collections::HashSet;
5
/// Seccomp filter profile.
///
/// Profiles are cumulative: each one includes all syscalls of the profiles
/// below it, in this order:
/// `Essential < Minimal < IoHeavy < Compute < Network < Unrestricted`
///
/// The derived `PartialOrd`/`Ord` follow the declaration order of the
/// variants, so a comparison such as `profile >= SeccompProfile::Network`
/// mirrors the hierarchy above. Keep the variants declared from least to most
/// permissive, otherwise the ordering silently changes.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum SeccompProfile {
    /// Essential — only the ~40 syscalls needed for process bootstrap (linker, glibc init, exit)
    Essential,
    /// Minimal — Essential + signals, pipes, timers, process control (~110 total)
    Minimal,
    /// IO-heavy — Minimal + file manipulation (mkdir, chmod, rename, fsync, …)
    IoHeavy,
    /// Compute — IoHeavy + advanced scheduling and NUMA (sched_setscheduler, mbind, …)
    Compute,
    /// Network — Compute + sockets (socket, bind, listen, connect, …)
    Network,
    /// Unrestricted — Network + privileged ops (ptrace, mount, bpf, setuid, …)
    Unrestricted,
}
25
26impl SeccompProfile {
27    /// Get all profiles
28    pub fn all() -> Vec<Self> {
29        vec![
30            SeccompProfile::Essential,
31            SeccompProfile::Minimal,
32            SeccompProfile::IoHeavy,
33            SeccompProfile::Compute,
34            SeccompProfile::Network,
35            SeccompProfile::Unrestricted,
36        ]
37    }
38
39    /// Get description of profile
40    pub fn description(&self) -> &'static str {
41        match self {
42            SeccompProfile::Essential => "Process bootstrap only (~40 syscalls)",
43            SeccompProfile::Minimal => "Essential + signals, pipes, timers, process control",
44            SeccompProfile::IoHeavy => "Minimal + file manipulation (mkdir, chmod, rename, …)",
45            SeccompProfile::Compute => "IoHeavy + advanced scheduling/NUMA",
46            SeccompProfile::Network => "Compute + socket operations",
47            SeccompProfile::Unrestricted => "Network + privileged operations",
48        }
49    }
50}
51
/// Seccomp filter builder.
///
/// Holds an allow-list of syscall names plus a block-list that overrides it:
/// a syscall is reported as allowed only when it is in `allowed` and not in
/// `blocked`.
#[derive(Debug, Clone)]
pub struct SeccompFilter {
    /// Syscall names on the allow-list.
    allowed: HashSet<String>,
    /// Syscall names denied even when they also appear in `allowed`.
    blocked: HashSet<String>,
    /// Kill-on-violation flag.
    /// NOTE(review): enforcement is not visible in this file — presumably the
    /// offending process is killed when this is `true`; confirm with the caller.
    kill_on_violation: bool,
    /// Profile this filter was created from.
    profile: SeccompProfile,
}
60
61impl SeccompFilter {
62    /// Create filter from profile
63    pub fn from_profile(profile: SeccompProfile) -> Self {
64        let allowed = Self::syscalls_for_profile(&profile);
65        Self {
66            allowed,
67            blocked: HashSet::new(),
68            kill_on_violation: true,
69            profile,
70        }
71    }
72
73    /// Create minimal filter
74    pub fn minimal() -> Self {
75        Self::from_profile(SeccompProfile::Minimal)
76    }
77
78    /// Syscalls needed for process bootstrap (linker, glibc init, exit).
79    fn essential_syscalls() -> Vec<&'static str> {
80        vec![
81            // Lifecycle
82            "exit",
83            "exit_group",
84            // Exec
85            "execve",
86            "execveat",
87            // Memory (linker)
88            "brk",
89            "mmap",
90            "munmap",
91            "mprotect",
92            "madvise",
93            // File (linker)
94            "openat",
95            "open",
96            "read",
97            "write",
98            "close",
99            "close_range",
100            // Stat
101            "fstat",
102            "stat",
103            "lstat",
104            "newfstatat",
105            "statx",
106            // Access
107            "access",
108            "faccessat",
109            "faccessat2",
110            // Seek
111            "lseek",
112            // Links
113            "readlink",
114            "readlinkat",
115            // glibc init
116            "arch_prctl",
117            "set_tid_address",
118            "set_robust_list",
119            "futex",
120            "getrandom",
121            "rseq",
122            "prlimit64",
123            "prctl",
124            // CWD
125            "getcwd",
126            // Identity
127            "getpid",
128            "gettid",
129            "getuid",
130            "geteuid",
131            "getgid",
132            "getegid",
133            // FD
134            "fcntl",
135        ]
136    }
137
138    /// Extra syscalls for a typical program (signals, pipes, timers, etc).
139    fn minimal_extras() -> Vec<&'static str> {
140        vec![
141            // Signals
142            "rt_sigaction",
143            "rt_sigprocmask",
144            "rt_sigpending",
145            "rt_sigtimedwait",
146            "rt_sigqueueinfo",
147            "rt_sigreturn",
148            "sigaltstack",
149            "kill",
150            "tkill",
151            "tgkill",
152            // Processes
153            "clone",
154            "clone3",
155            "fork",
156            "vfork",
157            "wait4",
158            "waitpid",
159            "waitid",
160            // I/O avançado
161            "readv",
162            "writev",
163            "pread64",
164            "pwrite64",
165            "ioctl",
166            "flock",
167            // FDs
168            "dup",
169            "dup2",
170            "dup3",
171            "pipe",
172            "pipe2",
173            "eventfd2",
174            // Time
175            "clock_gettime",
176            "clock_getres",
177            "gettimeofday",
178            "time",
179            "nanosleep",
180            "clock_nanosleep",
181            // Timers
182            "timer_create",
183            "timer_settime",
184            "timer_gettime",
185            "timer_getoverrun",
186            "timer_delete",
187            // Info
188            "getppid",
189            "getresuid",
190            "getresgid",
191            "uname",
192            "umask",
193            "sysinfo",
194            "getpgrp",
195            "getpgid",
196            "setpgid",
197            "getsid",
198            "setsid",
199            // Scheduling
200            "sched_getaffinity",
201            "sched_yield",
202            // Limits
203            "getrlimit",
204            "setrlimit",
205            "getrusage",
206            // Polling
207            "pselect6",
208            "ppoll",
209            "epoll_create1",
210            "epoll_ctl",
211            "epoll_wait",
212            "poll",
213            "select",
214            // Dir
215            "chdir",
216            "fchdir",
217            "getdents",
218            "getdents64",
219            // Memory
220            "mremap",
221            "mlock",
222            "munlock",
223            "mlockall",
224            "munlockall",
225            "memfd_create",
226            // Misc
227            "get_robust_list",
228        ]
229    }
230
231    /// Extra syscalls for file manipulation.
232    fn io_heavy_extras() -> Vec<&'static str> {
233        vec![
234            "mkdir",
235            "mkdirat",
236            "rmdir",
237            "unlink",
238            "unlinkat",
239            "rename",
240            "renameat",
241            "link",
242            "linkat",
243            "symlink",
244            "symlinkat",
245            "chmod",
246            "fchmod",
247            "fchmodat",
248            "chown",
249            "fchown",
250            "fchownat",
251            "lchown",
252            "utimes",
253            "futimesat",
254            "utime",
255            "utimensat",
256            "truncate",
257            "ftruncate",
258            "fallocate",
259            "sendfile",
260            "splice",
261            "tee",
262            "vmsplice",
263            "statfs",
264            "fstatfs",
265            "fsync",
266            "fdatasync",
267        ]
268    }
269
270    /// Extra syscalls for compute-intensive workloads.
271    fn compute_extras() -> Vec<&'static str> {
272        vec![
273            "sched_getscheduler",
274            "sched_setscheduler",
275            "sched_getparam",
276            "sched_setparam",
277            "sched_get_priority_max",
278            "sched_get_priority_min",
279            "sched_rr_get_interval",
280            "sched_setaffinity",
281            "mbind",
282            "get_mempolicy",
283            "set_mempolicy",
284            "migrate_pages",
285            "move_pages",
286            "membarrier",
287        ]
288    }
289
290    /// Extra syscalls for networking.
291    fn network_extras() -> Vec<&'static str> {
292        vec![
293            "socket",
294            "socketpair",
295            "bind",
296            "listen",
297            "accept",
298            "accept4",
299            "connect",
300            "shutdown",
301            "sendto",
302            "recvfrom",
303            "sendmsg",
304            "recvmsg",
305            "sendmmsg",
306            "recvmmsg",
307            "setsockopt",
308            "getsockopt",
309            "getsockname",
310            "getpeername",
311        ]
312    }
313
314    /// Extra syscalls for unrestricted / privileged mode.
315    fn unrestricted_extras() -> Vec<&'static str> {
316        vec![
317            "ptrace",
318            "process_vm_readv",
319            "process_vm_writev",
320            "perf_event_open",
321            "bpf",
322            "seccomp",
323            "mount",
324            "umount2",
325            "pivot_root",
326            "capget",
327            "capset",
328            "setuid",
329            "setgid",
330            "setreuid",
331            "setregid",
332            "setresuid",
333            "setresgid",
334            "getgroups",
335            "setgroups",
336            "setfsgid",
337            "setfsuid",
338        ]
339    }
340
341    /// Get syscalls for a profile (cumulative).
342    fn syscalls_for_profile(profile: &SeccompProfile) -> HashSet<String> {
343        let mut syscalls = HashSet::new();
344
345        let mut add = |list: Vec<&str>| {
346            for s in list {
347                syscalls.insert(s.to_string());
348            }
349        };
350
351        // Cumulative: each level includes all levels below it
352        add(Self::essential_syscalls());
353
354        if matches!(
355            profile,
356            SeccompProfile::Minimal
357                | SeccompProfile::IoHeavy
358                | SeccompProfile::Compute
359                | SeccompProfile::Network
360                | SeccompProfile::Unrestricted
361        ) {
362            add(Self::minimal_extras());
363        }
364
365        if matches!(
366            profile,
367            SeccompProfile::IoHeavy
368                | SeccompProfile::Compute
369                | SeccompProfile::Network
370                | SeccompProfile::Unrestricted
371        ) {
372            add(Self::io_heavy_extras());
373        }
374
375        if matches!(
376            profile,
377            SeccompProfile::Compute | SeccompProfile::Network | SeccompProfile::Unrestricted
378        ) {
379            add(Self::compute_extras());
380        }
381
382        if matches!(
383            profile,
384            SeccompProfile::Network | SeccompProfile::Unrestricted
385        ) {
386            add(Self::network_extras());
387        }
388
389        if matches!(profile, SeccompProfile::Unrestricted) {
390            add(Self::unrestricted_extras());
391        }
392
393        syscalls
394    }
395
396    /// Add syscall to whitelist
397    pub fn allow_syscall(&mut self, name: impl Into<String>) {
398        self.allowed.insert(name.into());
399    }
400
401    /// Block a syscall (deny even if in whitelist)
402    pub fn block_syscall(&mut self, name: impl Into<String>) {
403        self.blocked.insert(name.into());
404    }
405
406    /// Check if syscall is allowed
407    pub fn is_allowed(&self, name: &str) -> bool {
408        if self.blocked.contains(name) {
409            return false;
410        }
411        self.allowed.contains(name)
412    }
413
414    /// Get allowed syscalls
415    pub fn allowed_syscalls(&self) -> &HashSet<String> {
416        &self.allowed
417    }
418
419    /// Get blocked syscalls
420    pub fn blocked_syscalls(&self) -> &HashSet<String> {
421        &self.blocked
422    }
423
424    /// Count allowed syscalls
425    pub fn allowed_count(&self) -> usize {
426        self.allowed.len() - self.blocked.len()
427    }
428
429    /// Check if killing on violation
430    pub fn is_kill_on_violation(&self) -> bool {
431        self.kill_on_violation
432    }
433
434    /// Set kill on violation
435    pub fn set_kill_on_violation(&mut self, kill: bool) {
436        self.kill_on_violation = kill;
437    }
438
439    /// Get the profile used to create this filter
440    pub fn profile(&self) -> SeccompProfile {
441        self.profile.clone()
442    }
443
444    /// Validate that filter is correct
445    pub fn validate(&self) -> Result<()> {
446        if self.allowed.is_empty() && self.profile != SeccompProfile::Unrestricted {
447            return Err(SandboxError::Seccomp(
448                "Filter has no allowed syscalls".to_string(),
449            ));
450        }
451        Ok(())
452    }
453
454    /// Export as BPF program (simplified - just returns syscall names)
455    pub fn export(&self) -> Result<Vec<String>> {
456        self.validate()?;
457        let mut list: Vec<_> = self.allowed.iter().cloned().collect();
458        list.sort();
459        Ok(list)
460    }
461}
462
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every name in `names` passes `is_allowed`.
    fn assert_allows(filter: &SeccompFilter, names: &[&str]) {
        for name in names {
            assert!(filter.is_allowed(name));
        }
    }

    /// Asserts that no name in `names` passes `is_allowed`.
    fn assert_denies(filter: &SeccompFilter, names: &[&str]) {
        for name in names {
            assert!(!filter.is_allowed(name));
        }
    }

    /// Builds a filter with empty allow/block lists for the given profile.
    fn empty_filter(profile: SeccompProfile) -> SeccompFilter {
        SeccompFilter {
            allowed: HashSet::new(),
            blocked: HashSet::new(),
            kill_on_violation: true,
            profile,
        }
    }

    #[test]
    fn test_seccomp_profile_all() {
        assert_eq!(SeccompProfile::all().len(), 6);
    }

    #[test]
    fn test_seccomp_profile_description() {
        let essential = SeccompProfile::Essential.description();
        let minimal = SeccompProfile::Minimal.description();
        let network = SeccompProfile::Network.description();

        assert!(!essential.is_empty());
        assert!(!minimal.is_empty());
        assert_ne!(essential, minimal);
        assert_ne!(minimal, network);
    }

    #[test]
    fn test_seccomp_filter_essential() {
        let filter = SeccompFilter::from_profile(SeccompProfile::Essential);

        // Bootstrap syscalls must be present.
        assert_allows(
            &filter,
            &[
                "read",
                "write",
                "exit",
                "execve",
                "mmap",
                "brk",
                "openat",
                "close",
                "arch_prctl",
                "futex",
                "getpid",
                "gettid",
                "lseek",
                "fcntl",
            ],
        );

        // Anything from the higher tiers must be absent.
        assert_denies(
            &filter,
            &[
                "clone",
                "rt_sigaction",
                "nanosleep",
                "socket",
                "ptrace",
                "mkdir",
            ],
        );

        let count = filter.allowed_count();
        assert!(
            (35..=50).contains(&count),
            "Essential profile should have ~40 syscalls, got {}",
            count
        );
    }

    #[test]
    fn test_seccomp_filter_minimal() {
        let filter = SeccompFilter::minimal();

        assert_allows(
            &filter,
            &[
                "read",
                "write",
                "exit",
                "clone3",
                "lseek",
                "sched_getaffinity",
                "nanosleep",
                "gettid",
                "rt_sigaction",
            ],
        );
        assert_denies(&filter, &["ptrace", "mkdir", "socket"]);

        assert!(
            filter.allowed_count() > 100,
            "Minimal profile should have > 100 syscalls for runtime compatibility, got {}",
            filter.allowed_count()
        );
    }

    #[test]
    fn test_seccomp_filter_io_heavy() {
        let filter = SeccompFilter::from_profile(SeccompProfile::IoHeavy);

        // Own extras plus the Minimal extras it inherits.
        assert_allows(
            &filter,
            &["read", "mkdir", "unlink", "clone", "rt_sigaction"],
        );

        let minimal_count = SeccompFilter::minimal().allowed_count();
        assert!(filter.allowed_count() > minimal_count);
    }

    #[test]
    fn test_seccomp_filter_network() {
        let filter = SeccompFilter::from_profile(SeccompProfile::Network);

        assert_allows(&filter, &["socket", "connect", "bind"]);
        // Inherited from IoHeavy and Compute respectively.
        assert_allows(&filter, &["mkdir", "sched_setscheduler"]);
    }

    #[test]
    fn test_seccomp_filter_allow_syscall() {
        let mut filter = SeccompFilter::minimal();
        filter.allow_syscall("custom_syscall");
        assert_allows(&filter, &["custom_syscall"]);
    }

    #[test]
    fn test_seccomp_filter_block_syscall() {
        let mut filter = SeccompFilter::minimal();
        filter.block_syscall("read");
        assert_denies(&filter, &["read"]);
    }

    #[test]
    fn test_seccomp_filter_block_overrides_allow() {
        let mut filter = SeccompFilter::minimal();
        assert_allows(&filter, &["write"]);

        filter.block_syscall("write");
        assert_denies(&filter, &["write"]);
    }

    #[test]
    fn test_seccomp_filter_validate() {
        assert!(SeccompFilter::minimal().validate().is_ok());
        assert!(empty_filter(SeccompProfile::Minimal).validate().is_err());
    }

    #[test]
    fn test_seccomp_filter_export() {
        let exported = SeccompFilter::minimal().export().unwrap();
        assert!(!exported.is_empty());
        assert!(exported.iter().any(|name| name == "read"));

        // The export must already be in sorted order.
        let mut expected = exported.clone();
        expected.sort();
        assert_eq!(exported, expected);
    }

    #[test]
    fn test_seccomp_kill_on_violation() {
        let mut filter = SeccompFilter::minimal();
        assert!(filter.is_kill_on_violation());

        filter.set_kill_on_violation(false);
        assert!(!filter.is_kill_on_violation());
    }

    #[test]
    fn test_validate_unrestricted_with_no_allowed() {
        assert!(empty_filter(SeccompProfile::Unrestricted)
            .validate()
            .is_ok());
    }

    #[test]
    fn test_profiles_are_cumulative() {
        let ladder: Vec<(&str, SeccompFilter)> = vec![
            ("Essential", SeccompProfile::Essential),
            ("Minimal", SeccompProfile::Minimal),
            ("IoHeavy", SeccompProfile::IoHeavy),
            ("Compute", SeccompProfile::Compute),
            ("Network", SeccompProfile::Network),
            ("Unrestricted", SeccompProfile::Unrestricted),
        ]
        .into_iter()
        .map(|(label, profile)| (label, SeccompFilter::from_profile(profile)))
        .collect();

        // Each profile must be a superset of the one directly below it.
        for pair in ladder.windows(2) {
            let (lower_label, lower) = &pair[0];
            let (upper_label, upper) = &pair[1];
            assert!(
                lower.allowed_syscalls().is_subset(upper.allowed_syscalls()),
                "{} should be a subset of {}",
                lower_label,
                upper_label
            );
        }

        // And each level must add at least one syscall.
        for pair in ladder.windows(2) {
            assert!(pair[1].1.allowed_count() > pair[0].1.allowed_count());
        }
    }
}