sandbox_rs/isolation/
seccomp_bpf.rs

1//! Seccomp BPF filter compilation and loading
2
3use super::seccomp::SeccompFilter;
4use crate::errors::{Result, SandboxError};
5
/// One classic-BPF instruction as produced by the filter compiler.
///
/// The struct is `#[repr(C)]` so that the field order below is also the
/// in-memory order (2 + 1 + 1 + 4 = 8 bytes).
#[derive(Clone, Copy, Debug)]
#[repr(C)]
pub struct BpfInstr {
    /// Opcode of the instruction (load, conditional jump, return, ...).
    pub code: u16,
    /// Number of instructions to skip when a conditional jump's test succeeds.
    pub jt: u8,
    /// Number of instructions to skip when a conditional jump's test fails.
    pub jf: u8,
    /// Generic operand: a load offset, a comparison constant or a return
    /// action, depending on `code`.
    pub k: u32,
}
15
/// Seccomp filter return actions (the value a filter's `RET` yields).
pub mod actions {
    /// Terminate the calling thread without executing the syscall
    /// (the kernel also names this value `SECCOMP_RET_KILL_THREAD`).
    pub const SECCOMP_RET_KILL: u32 = 0x0000_0000;
    /// Deliver `SIGSYS` to the caller instead of executing the syscall.
    pub const SECCOMP_RET_TRAP: u32 = 0x0003_0000;
    /// Fail the syscall; the low 16 bits of the return value carry the errno.
    pub const SECCOMP_RET_ERRNO: u32 = 0x0005_0000;
    /// Hand the decision to an attached ptrace tracer (not recommended).
    pub const SECCOMP_RET_TRACE: u32 = 0x7ff0_0000;
    /// Execute the syscall normally.
    pub const SECCOMP_RET_ALLOW: u32 = 0x7fff_0000;
}
29
/// `AUDIT_ARCH_*` identifiers, compared against the `arch` field of
/// `seccomp_data` by the compiled filter.
pub mod arch {
    /// 64-bit x86.
    pub const AUDIT_ARCH_X86_64: u32 = 0xc000_003e;
    /// 32-bit x86.
    pub const AUDIT_ARCH_I386: u32 = 0x4000_0003;
    /// 32-bit ARM.
    pub const AUDIT_ARCH_ARM: u32 = 0x4000_0028;
    /// 64-bit ARM.
    pub const AUDIT_ARCH_AARCH64: u32 = 0xc000_00b7;
}
37
38/// Get current architecture code
39pub fn get_arch() -> u32 {
40    #[cfg(target_arch = "x86_64")]
41    {
42        arch::AUDIT_ARCH_X86_64
43    }
44    #[cfg(target_arch = "x86")]
45    {
46        arch::AUDIT_ARCH_I386
47    }
48    #[cfg(target_arch = "arm")]
49    {
50        arch::AUDIT_ARCH_ARM
51    }
52    #[cfg(target_arch = "aarch64")]
53    {
54        arch::AUDIT_ARCH_AARCH64
55    }
56    #[cfg(not(any(
57        target_arch = "x86_64",
58        target_arch = "x86",
59        target_arch = "arm",
60        target_arch = "aarch64"
61    )))]
62    {
63        0
64    }
65}
66
/// Syscall number mapping for x86_64
///
/// Thin newtype around the raw syscall number so that it cannot be confused
/// with other `u32` values (offsets, actions, ...).
#[derive(Debug, Clone, Copy)]
pub struct SyscallNumber(pub u32);

impl SyscallNumber {
    /// Get syscall number by name (x86_64)
    ///
    /// Returns `None` for names that are not in the table. The numbers are
    /// those of the x86_64 syscall ABI; they are NOT valid for any other
    /// architecture.
    ///
    /// A few names in the table are libc functions rather than real x86_64
    /// syscalls; they are kept for compatibility and resolve to the syscall
    /// that libc issues on their behalf (see the inline notes).
    pub fn from_name(name: &str) -> Option<Self> {
        let num: u32 = match name {
            // Process management
            "exit" => 60,
            "exit_group" => 231,
            "clone" => 56,
            "fork" => 57,
            "vfork" => 58,
            // Signal handling
            "rt_sigaction" => 13,
            "rt_sigprocmask" => 14,
            "rt_sigpending" => 127,
            "rt_sigtimedwait" => 128,
            "rt_sigqueueinfo" => 129,
            "rt_sigreturn" => 15,
            "kill" => 62,
            "tkill" => 200,
            // Was 268, which is fchmodat; tgkill is 234.
            "tgkill" => 234,
            "sigaltstack" => 131,
            // Basic I/O
            "read" => 0,
            "write" => 1,
            "readv" => 19,
            "writev" => 20,
            "pread64" => 17,
            "pwrite64" => 18,
            // File operations
            "open" => 2,
            "openat" => 257,
            "close" => 3,
            "stat" => 4,
            "fstat" => 5,
            "lstat" => 6,
            "fcntl" => 72,
            "ioctl" => 16,
            // Memory
            "mmap" => 9,
            "munmap" => 11,
            "mremap" => 25,
            "mprotect" => 10,
            "madvise" => 28,
            "brk" => 12,
            "mlock" => 149,
            "munlock" => 150,
            "mlockall" => 151,
            "munlockall" => 152,
            // Process execution
            "execve" => 59,
            "execveat" => 322,
            // Waiting
            // Was 114, which is setregid; wait4 is 61. x86_64 has no waitpid
            // syscall (libc implements it with wait4), so it is an alias.
            "wait4" | "waitpid" => 61,
            "waitid" => 247,
            // File descriptors
            "dup" => 32,
            "dup2" => 33,
            "dup3" => 292,
            // Getting time
            "clock_gettime" => 228,
            "clock_getres" => 229,
            "gettimeofday" => 96,
            "time" => 201,
            // Process info
            "getpid" => 39,
            "getppid" => 110,
            "getuid" => 102,
            "geteuid" => 107,
            "getgid" => 104,
            "getegid" => 108,
            "getpgrp" => 111,
            "getpgid" => 121,
            "getsid" => 124,
            // Limits
            "getrlimit" => 97,
            "setrlimit" => 160,
            "getrusage" => 98,
            // Misc
            "futex" => 202,
            "set_tid_address" => 218,
            "set_robust_list" => 273,
            "get_robust_list" => 274,
            "pselect6" => 270,
            "ppoll" => 271,
            "epoll_create1" => 291,
            "epoll_ctl" => 233,
            "epoll_wait" => 232,
            "poll" => 7,
            "select" => 23,
            "getcwd" => 79,
            "chdir" => 80,
            "fchdir" => 81,
            "getdents" => 78,
            "getdents64" => 217,
            "prctl" => 157,
            "arch_prctl" => 158,
            // File operations (IO heavy)
            "mkdir" => 83,
            "mkdirat" => 258,
            "rmdir" => 84,
            "unlink" => 87,
            "unlinkat" => 263,
            "rename" => 82,
            "renameat" => 264,
            "link" => 86,
            "linkat" => 265,
            "symlink" => 88,
            "symlinkat" => 266,
            "readlink" => 89,
            "readlinkat" => 267,
            "chmod" => 90,
            "fchmod" => 91,
            "fchmodat" => 268,
            "chown" => 92,
            "fchown" => 93,
            "fchownat" => 260,
            "lchown" => 94,
            "utimes" => 235,
            "utime" => 132,
            // Was futimes => 271, which is ppoll. x86_64 has no futimes
            // syscall; libc is expected to issue utimensat for it, so the
            // name is kept as an alias. NOTE(review): confirm against the
            // libc the sandboxed programs link to.
            "utimensat" | "futimes" => 280,
            "truncate" => 76,
            "ftruncate" => 77,
            "fallocate" => 285,
            "access" => 21,
            "faccessat" => 269,
            "sendfile" => 40,
            "splice" => 275,
            "tee" => 276,
            "vmsplice" => 278,
            "statfs" => 137,
            "fstatfs" => 138,
            "fsync" => 74,
            "fdatasync" => 75,
            // Network
            "socket" => 41,
            "socketpair" => 53,
            "bind" => 49,
            "listen" => 50,
            "accept" => 43,
            "accept4" => 288,
            "connect" => 42,
            "shutdown" => 48,
            "sendto" => 44,
            "recvfrom" => 45,
            "sendmsg" => 46,
            "recvmsg" => 47,
            "sendmmsg" => 307,
            "recvmmsg" => 299,
            "setsockopt" => 54,
            "getsockopt" => 55,
            // "setsockname" used to be listed here as 106. No such syscall
            // exists and 106 is setgid, so allowing it silently allowed
            // setgid. The name now resolves to `None` like any unknown name.
            "getsockname" => 51,
            "getpeername" => 52,
            // Dangerous (unrestricted)
            "ptrace" => 101,
            "process_vm_readv" => 310,
            "process_vm_writev" => 311,
            "perf_event_open" => 298,
            "bpf" => 321,
            "seccomp" => 317,
            "mount" => 165,
            "umount2" => 166,
            "pivot_root" => 155,
            "capget" => 125,
            "capset" => 126,
            "setuid" => 105,
            "setgid" => 106,
            "setreuid" => 113,
            "setregid" => 114,
            // Was 164, which is settimeofday; setresuid is 117.
            "setresuid" => 117,
            // Was 170, which is sethostname; setresgid is 119.
            "setresgid" => 119,
            "getgroups" => 115,
            "setgroups" => 116,
            "setfsgid" => 123,
            "setfsuid" => 122,
            _ => return None,
        };
        Some(SyscallNumber(num))
    }
}
253
254/// BPF filter compiler
255pub struct SeccompCompiler;
256
257impl SeccompCompiler {
258    /// Compile a filter to BPF instructions
259    pub fn compile(filter: &SeccompFilter) -> Result<Vec<BpfInstr>> {
260        let mut instrs = Vec::new();
261
262        // Check architecture
263        instrs.push(BpfInstr {
264            code: 0x20, // LD.W M[0] (load word from memory offset 0)
265            jt: 0,
266            jf: 0,
267            k: 4, // offset of arch in seccomp_data
268        });
269
270        let arch = get_arch();
271        instrs.push(BpfInstr {
272            code: 0x15, // JEQ (jump if equal)
273            jt: 1,      // jump if true
274            jf: 0,      // jump if false (skip next)
275            k: arch,
276        });
277
278        // Reject if wrong architecture
279        instrs.push(BpfInstr {
280            code: 0x06, // RET
281            jt: 0,
282            jf: 0,
283            k: actions::SECCOMP_RET_KILL,
284        });
285
286        // Load syscall number
287        instrs.push(BpfInstr {
288            code: 0x20, // LD.W
289            jt: 0,
290            jf: 0,
291            k: 0, // offset of syscall number in seccomp_data
292        });
293
294        // Build jump table for allowed syscalls
295        let allowed = filter.allowed_syscalls();
296        let blocked = filter.blocked_syscalls();
297
298        for syscall_name in allowed.iter() {
299            if blocked.contains(syscall_name) {
300                continue; // Skip blocked syscalls
301            }
302
303            if let Some(SyscallNumber(num)) = SyscallNumber::from_name(syscall_name) {
304                instrs.push(BpfInstr {
305                    code: 0x15, // JEQ
306                    jt: 1,      // jump if equal
307                    jf: 0,      // default
308                    k: num,
309                });
310
311                // Allow this syscall
312                instrs.push(BpfInstr {
313                    code: 0x06, // RET
314                    jt: 0,
315                    jf: 0,
316                    k: actions::SECCOMP_RET_ALLOW,
317                });
318            }
319        }
320
321        // Default: reject if no match
322        if filter.is_kill_on_violation() {
323            instrs.push(BpfInstr {
324                code: 0x06, // RET
325                jt: 0,
326                jf: 0,
327                k: actions::SECCOMP_RET_KILL,
328            });
329        } else {
330            instrs.push(BpfInstr {
331                code: 0x06, // RET
332                jt: 0,
333                jf: 0,
334                k: actions::SECCOMP_RET_TRAP,
335            });
336        }
337
338        Ok(instrs)
339    }
340
341    /// Load BPF filter via prctl
342    pub fn load(filter: &SeccompFilter) -> Result<()> {
343        let instrs = Self::compile(filter)?;
344
345        // Convert to raw format for prctl (keep native_instrs alive while loading)
346        let (_native_instrs, prog) = instrs_to_sock_fprog(&instrs);
347
348        unsafe {
349            // Kernel requires NO_NEW_PRIVS before enabling seccomp filters when unprivileged
350            if libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0 {
351                return Err(SandboxError::Seccomp(format!(
352                    "Failed to set PR_SET_NO_NEW_PRIVS: {}",
353                    std::io::Error::last_os_error()
354                )));
355            }
356
357            let ret = libc::prctl(
358                libc::PR_SET_SECCOMP,
359                libc::SECCOMP_MODE_FILTER,
360                &prog as *const _,
361            );
362
363            if ret != 0 {
364                return Err(SandboxError::Seccomp(format!(
365                    "Failed to load seccomp filter: {}",
366                    std::io::Error::last_os_error()
367                )));
368            }
369        }
370
371        Ok(())
372    }
373}
374
375/// Convert BpfInstr to sock_fprog format
376fn instrs_to_sock_fprog(instrs: &[BpfInstr]) -> (Vec<bpf_insn>, sockfprog) {
377    let native_instrs: Vec<bpf_insn> = instrs.iter().copied().map(bpf_insn::from).collect();
378    let prog = sockfprog {
379        len: native_instrs.len() as u16,
380        filter: native_instrs.as_ptr() as *mut bpf_insn,
381    };
382    (native_instrs, prog)
383}
384
385/// BPF instruction struct (same as kernel)
386#[repr(C)]
387#[derive(Clone, Copy)]
388pub struct bpf_insn {
389    pub code: u16,
390    pub jt: u8,
391    pub jf: u8,
392    pub k: u32,
393}
394
395impl From<BpfInstr> for bpf_insn {
396    fn from(instr: BpfInstr) -> Self {
397        bpf_insn {
398            code: instr.code,
399            jt: instr.jt,
400            jf: instr.jf,
401            k: instr.k,
402        }
403    }
404}
405
406/// Socket filter program (for prctl)
407#[repr(C)]
408struct sockfprog {
409    len: u16,
410    filter: *mut bpf_insn,
411}
412
#[cfg(test)]
mod tests {
    use super::super::seccomp::{SeccompFilter, SeccompProfile};
    use super::*;

    /// Resolves `name` to its raw syscall number, if the table knows it.
    fn lookup(name: &str) -> Option<u32> {
        SyscallNumber::from_name(name).map(|SyscallNumber(nr)| nr)
    }

    #[test]
    fn test_get_arch() {
        let detected = get_arch();
        #[cfg(target_arch = "x86_64")]
        assert_eq!(detected, arch::AUDIT_ARCH_X86_64);
        // On other targets the call itself is the whole test.
        #[cfg(not(target_arch = "x86_64"))]
        let _ = detected;
    }

    #[test]
    fn test_syscall_number_read() {
        assert_eq!(lookup("read"), Some(0));
    }

    #[test]
    fn test_syscall_number_write() {
        assert_eq!(lookup("write"), Some(1));
    }

    #[test]
    fn test_syscall_number_invalid() {
        assert!(lookup("invalid_syscall").is_none());
    }

    #[test]
    fn test_syscall_number_exit() {
        assert_eq!(lookup("exit"), Some(60));
    }

    #[test]
    fn test_syscall_number_execve() {
        assert_eq!(lookup("execve"), Some(59));
    }

    #[test]
    fn test_compile_minimal_filter() {
        let compiled = SeccompCompiler::compile(&SeccompFilter::minimal());
        assert!(compiled.is_ok());
        assert!(!compiled.unwrap().is_empty());
    }

    #[test]
    fn test_compile_io_heavy_filter() {
        let profile_filter = SeccompFilter::from_profile(SeccompProfile::IoHeavy);
        let compiled = SeccompCompiler::compile(&profile_filter);
        assert!(compiled.is_ok());
        assert!(compiled.unwrap().len() > 5);
    }

    #[test]
    fn test_bpf_instr_creation() {
        let allow = BpfInstr {
            code: 0x06,
            jt: 0,
            jf: 0,
            k: actions::SECCOMP_RET_ALLOW,
        };

        assert_eq!(allow.code, 0x06);
        assert_eq!(allow.k, actions::SECCOMP_RET_ALLOW);
    }

    #[test]
    fn test_actions_values() {
        assert_eq!(actions::SECCOMP_RET_KILL, 0x00000000);
        assert_eq!(actions::SECCOMP_RET_ALLOW, 0x7fff0000);
    }

    #[test]
    fn test_multiple_syscall_numbers() {
        for syscall in ["read", "write", "open", "close", "fork"] {
            assert!(
                SyscallNumber::from_name(syscall).is_some(),
                "Failed to get number for {}",
                syscall
            );
        }
    }
}