Skip to main content

sandlock_core/
context.rs

1// Fork + confinement sequence: child-side Landlock + seccomp application
2// and parent-child pipe synchronization.
3
4use std::ffi::CString;
5use std::io;
6use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
7use std::path::PathBuf;
8
9use crate::policy::{FsIsolation, Policy};
10use crate::seccomp::bpf::{self, stmt, jump};
11use crate::sys::structs::{
12    AF_INET, AF_INET6, AF_NETLINK,
13    BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
14    CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM, NETLINK_SOCK_DIAG, SECCOMP_RET_ERRNO,
15    SOCK_DGRAM, SOCK_RAW, SOCK_TYPE_MASK, TIOCLINUX, TIOCSTI,
16    PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER,
17    OFFSET_ARGS0_LO, OFFSET_ARGS1_LO, OFFSET_ARGS2_LO, OFFSET_NR,
18    SockFilter,
19};
20
21// ============================================================
22// Pipe pair for parent-child synchronization
23// ============================================================
24
25/// Pipes for parent-child communication after fork().
26pub struct PipePair {
27    /// Parent reads the notif fd number written by the child.
28    pub notif_r: OwnedFd,
29    /// Child writes the notif fd number to the parent.
30    pub notif_w: OwnedFd,
31    /// Child reads the "supervisor ready" signal from the parent.
32    pub ready_r: OwnedFd,
33    /// Parent writes the "supervisor ready" signal to the child.
34    pub ready_w: OwnedFd,
35}
36
37impl PipePair {
38    /// Create two pipe pairs using `pipe2(O_CLOEXEC)`.
39    pub fn new() -> io::Result<Self> {
40        let mut notif_fds = [0i32; 2];
41        let mut ready_fds = [0i32; 2];
42
43        // SAFETY: pipe2 with valid pointers and O_CLOEXEC
44        let ret = unsafe { libc::pipe2(notif_fds.as_mut_ptr(), libc::O_CLOEXEC) };
45        if ret < 0 {
46            return Err(io::Error::last_os_error());
47        }
48
49        let ret = unsafe { libc::pipe2(ready_fds.as_mut_ptr(), libc::O_CLOEXEC) };
50        if ret < 0 {
51            // Close the first pair on failure
52            unsafe {
53                libc::close(notif_fds[0]);
54                libc::close(notif_fds[1]);
55            }
56            return Err(io::Error::last_os_error());
57        }
58
59        // SAFETY: pipe2 returned valid fds
60        Ok(PipePair {
61            notif_r: unsafe { OwnedFd::from_raw_fd(notif_fds[0]) },
62            notif_w: unsafe { OwnedFd::from_raw_fd(notif_fds[1]) },
63            ready_r: unsafe { OwnedFd::from_raw_fd(ready_fds[0]) },
64            ready_w: unsafe { OwnedFd::from_raw_fd(ready_fds[1]) },
65        })
66    }
67}
68
69// ============================================================
70// Pipe I/O helpers
71// ============================================================
72
73/// Write a `u32` as 4 little-endian bytes to a raw fd.
74pub(crate) fn write_u32_fd(fd: RawFd, val: u32) -> io::Result<()> {
75    let buf = val.to_le_bytes();
76    let mut written = 0usize;
77    while written < 4 {
78        let ret = unsafe {
79            libc::write(
80                fd,
81                buf[written..].as_ptr() as *const libc::c_void,
82                4 - written,
83            )
84        };
85        if ret < 0 {
86            return Err(io::Error::last_os_error());
87        }
88        written += ret as usize;
89    }
90    Ok(())
91}
92
93/// Read a `u32` (4 little-endian bytes, blocking) from a raw fd.
94pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result<u32> {
95    let mut buf = [0u8; 4];
96    let mut total = 0usize;
97    while total < 4 {
98        let ret = unsafe {
99            libc::read(
100                fd,
101                buf[total..].as_mut_ptr() as *mut libc::c_void,
102                4 - total,
103            )
104        };
105        if ret < 0 {
106            return Err(io::Error::last_os_error());
107        }
108        if ret == 0 {
109            return Err(io::Error::new(
110                io::ErrorKind::UnexpectedEof,
111                "pipe closed before 4 bytes read",
112            ));
113        }
114        total += ret as usize;
115    }
116    Ok(u32::from_le_bytes(buf))
117}
118
119// ============================================================
120// Syscall name → number mapping
121// ============================================================
122
123/// Map a syscall name to its `libc::SYS_*` number.
124///
125/// Covers all names in `DEFAULT_DENY_SYSCALLS` plus extras needed for
126/// notif and arg-filter lists.
127pub fn syscall_name_to_nr(name: &str) -> Option<u32> {
128    let nr: i64 = match name {
129        "mount" => libc::SYS_mount,
130        "umount2" => libc::SYS_umount2,
131        "pivot_root" => libc::SYS_pivot_root,
132        "swapon" => libc::SYS_swapon,
133        "swapoff" => libc::SYS_swapoff,
134        "reboot" => libc::SYS_reboot,
135        "sethostname" => libc::SYS_sethostname,
136        "setdomainname" => libc::SYS_setdomainname,
137        "kexec_load" => libc::SYS_kexec_load,
138        "init_module" => libc::SYS_init_module,
139        "finit_module" => libc::SYS_finit_module,
140        "delete_module" => libc::SYS_delete_module,
141        "unshare" => libc::SYS_unshare,
142        "setns" => libc::SYS_setns,
143        "perf_event_open" => libc::SYS_perf_event_open,
144        "bpf" => libc::SYS_bpf,
145        "userfaultfd" => libc::SYS_userfaultfd,
146        "keyctl" => libc::SYS_keyctl,
147        "add_key" => libc::SYS_add_key,
148        "request_key" => libc::SYS_request_key,
149        "ptrace" => libc::SYS_ptrace,
150        "process_vm_readv" => libc::SYS_process_vm_readv,
151        "process_vm_writev" => libc::SYS_process_vm_writev,
152        "open_by_handle_at" => libc::SYS_open_by_handle_at,
153        "name_to_handle_at" => libc::SYS_name_to_handle_at,
154        "ioperm" => libc::SYS_ioperm,
155        "iopl" => libc::SYS_iopl,
156        "quotactl" => libc::SYS_quotactl,
157        "acct" => libc::SYS_acct,
158        "lookup_dcookie" => libc::SYS_lookup_dcookie,
159        // nfsservctl was removed in Linux 3.1; no libc constant — skip
160        "io_uring_setup" => libc::SYS_io_uring_setup,
161        "io_uring_enter" => libc::SYS_io_uring_enter,
162        "io_uring_register" => libc::SYS_io_uring_register,
163        // Additional syscalls for notif/arg filters
164        "clone" => libc::SYS_clone,
165        "clone3" => libc::SYS_clone3,
166        "vfork" => libc::SYS_vfork,
167        "mmap" => libc::SYS_mmap,
168        "munmap" => libc::SYS_munmap,
169        "brk" => libc::SYS_brk,
170        "mremap" => libc::SYS_mremap,
171        "connect" => libc::SYS_connect,
172        "sendto" => libc::SYS_sendto,
173        "sendmsg" => libc::SYS_sendmsg,
174        "ioctl" => libc::SYS_ioctl,
175        "socket" => libc::SYS_socket,
176        "prctl" => libc::SYS_prctl,
177        "getrandom" => libc::SYS_getrandom,
178        "openat" => libc::SYS_openat,
179        "open" => libc::SYS_open,
180        "getdents64" => libc::SYS_getdents64,
181        "getdents" => libc::SYS_getdents,
182        "bind" => libc::SYS_bind,
183        "getsockname" => libc::SYS_getsockname,
184        "clock_gettime" => libc::SYS_clock_gettime,
185        "gettimeofday" => libc::SYS_gettimeofday,
186        "time" => libc::SYS_time,
187        "clock_nanosleep" => libc::SYS_clock_nanosleep,
188        "timerfd_settime" => libc::SYS_timerfd_settime,
189        "timer_settime" => libc::SYS_timer_settime,
190        "execve" => libc::SYS_execve,
191        "execveat" => libc::SYS_execveat,
192        // COW filesystem syscalls
193        "unlinkat" => libc::SYS_unlinkat,
194        "mkdirat" => libc::SYS_mkdirat,
195        "renameat2" => libc::SYS_renameat2,
196        "newfstatat" => libc::SYS_newfstatat,
197        "statx" => libc::SYS_statx,
198        "faccessat" => libc::SYS_faccessat,
199        "symlinkat" => libc::SYS_symlinkat,
200        "linkat" => libc::SYS_linkat,
201        "fchmodat" => libc::SYS_fchmodat,
202        "fchownat" => libc::SYS_fchownat,
203        "readlinkat" => libc::SYS_readlinkat,
204        "truncate" => libc::SYS_truncate,
205        "utimensat" => libc::SYS_utimensat,
206        "unlink" => libc::SYS_unlink,
207        "rmdir" => libc::SYS_rmdir,
208        "mkdir" => libc::SYS_mkdir,
209        "rename" => libc::SYS_rename,
210        "stat" => libc::SYS_stat,
211        "lstat" => libc::SYS_lstat,
212        "access" => libc::SYS_access,
213        "symlink" => libc::SYS_symlink,
214        "link" => libc::SYS_link,
215        "chmod" => libc::SYS_chmod,
216        "chown" => libc::SYS_chown,
217        "lchown" => libc::SYS_lchown,
218        "readlink" => libc::SYS_readlink,
219        "futimesat" => libc::SYS_futimesat,
220        "fork" => libc::SYS_fork,
221        _ => return None,
222    };
223    Some(nr as u32)
224}
225
226// ============================================================
227// Policy → syscall lists
228// ============================================================
229
230/// Determine which syscalls need `SECCOMP_RET_USER_NOTIF`.
231pub fn notif_syscalls(policy: &Policy) -> Vec<u32> {
232    let mut nrs = vec![
233        libc::SYS_clone as u32,
234        libc::SYS_clone3 as u32,
235        libc::SYS_vfork as u32,
236    ];
237
238    if policy.max_memory.is_some() {
239        nrs.push(libc::SYS_mmap as u32);
240        nrs.push(libc::SYS_munmap as u32);
241        nrs.push(libc::SYS_brk as u32);
242        nrs.push(libc::SYS_mremap as u32);
243        nrs.push(libc::SYS_shmget as u32);
244    }
245
246    if !policy.net_allow_hosts.is_empty() || policy.policy_fn.is_some() {
247        nrs.push(libc::SYS_connect as u32);
248        nrs.push(libc::SYS_sendto as u32);
249        nrs.push(libc::SYS_sendmsg as u32);
250        nrs.push(libc::SYS_bind as u32);
251    }
252
253    if policy.random_seed.is_some() {
254        nrs.push(libc::SYS_getrandom as u32);
255        // Also intercept openat so the supervisor can re-patch vDSO after exec.
256        nrs.push(libc::SYS_openat as u32);
257    }
258
259    if policy.time_start.is_some() {
260        nrs.extend_from_slice(&[
261            libc::SYS_clock_nanosleep as u32,
262            libc::SYS_timerfd_settime as u32,
263            libc::SYS_timer_settime as u32,
264        ]);
265        // Also intercept openat so the supervisor gets a notification after exec
266        // and can re-patch the vDSO (exec replaces vDSO with a fresh copy).
267        nrs.push(libc::SYS_openat as u32);
268    }
269
270    // /proc virtualization needs openat interception
271    if policy.num_cpus.is_some() || policy.max_memory.is_some() || policy.isolate_pids || policy.port_remap {
272        nrs.push(libc::SYS_openat as u32);
273    }
274    // Virtualize sched_getaffinity so nproc/sysconf agree with /proc/cpuinfo
275    if policy.num_cpus.is_some() {
276        nrs.push(libc::SYS_sched_getaffinity as u32);
277    }
278    if policy.isolate_pids || policy.deterministic_dirs {
279        nrs.extend_from_slice(&[
280            libc::SYS_getdents64 as u32,
281            libc::SYS_getdents as u32,
282        ]);
283    }
284    if policy.hostname.is_some() {
285        nrs.push(libc::SYS_uname as u32);
286        nrs.push(libc::SYS_openat as u32);
287    }
288
289    // COW filesystem interception (seccomp-based, unprivileged)
290    if policy.workdir.is_some() && policy.fs_isolation == FsIsolation::None {
291        nrs.extend_from_slice(&[
292            libc::SYS_openat as u32,
293            libc::SYS_unlinkat as u32,
294            libc::SYS_mkdirat as u32,
295            libc::SYS_renameat2 as u32,
296            libc::SYS_symlinkat as u32,
297            libc::SYS_linkat as u32,
298            libc::SYS_fchmodat as u32,
299            libc::SYS_fchownat as u32,
300            libc::SYS_truncate as u32,
301            libc::SYS_newfstatat as u32,
302            libc::SYS_statx as u32,
303            libc::SYS_faccessat as u32,
304            libc::SYS_readlinkat as u32,
305            libc::SYS_getdents64 as u32,
306            libc::SYS_getdents as u32,
307        ]);
308    }
309
310    // Chroot path interception
311    if policy.chroot.is_some() {
312        nrs.extend_from_slice(&[
313            libc::SYS_openat as u32,
314            libc::SYS_execve as u32,
315            libc::SYS_execveat as u32,
316            libc::SYS_unlinkat as u32,
317            libc::SYS_mkdirat as u32,
318            libc::SYS_renameat2 as u32,
319            libc::SYS_symlinkat as u32,
320            libc::SYS_linkat as u32,
321            libc::SYS_fchmodat as u32,
322            libc::SYS_fchownat as u32,
323            libc::SYS_truncate as u32,
324            libc::SYS_newfstatat as u32,
325            libc::SYS_statx as u32,
326            libc::SYS_faccessat as u32,
327            libc::SYS_readlinkat as u32,
328            libc::SYS_getdents64 as u32,
329            libc::SYS_getdents as u32,
330            libc::SYS_chdir as u32,
331            libc::SYS_getcwd as u32,
332            libc::SYS_statfs as u32,
333            libc::SYS_utimensat as u32,
334        ]);
335    }
336
337    // Dynamic policy callback — intercept key syscalls for event emission
338    if policy.policy_fn.is_some() {
339        nrs.extend_from_slice(&[
340            libc::SYS_openat as u32,
341            libc::SYS_connect as u32,
342            libc::SYS_sendto as u32,
343            libc::SYS_bind as u32,
344            libc::SYS_execve as u32,
345            libc::SYS_execveat as u32,
346        ]);
347    }
348
349    // Port remapping
350    if policy.port_remap {
351        nrs.extend_from_slice(&[
352            libc::SYS_bind as u32,
353            libc::SYS_getsockname as u32,
354        ]);
355    }
356
357    nrs.sort_unstable();
358    nrs.dedup();
359    nrs
360}
361
362/// Resolve `deny_syscalls` names to numbers.
363///
364/// If both `deny_syscalls` and `allow_syscalls` are `None`, returns the
365/// numbers for `DEFAULT_DENY_SYSCALLS`.
366pub fn deny_syscall_numbers(policy: &Policy) -> Vec<u32> {
367    if let Some(ref names) = policy.deny_syscalls {
368        names
369            .iter()
370            .filter_map(|n| syscall_name_to_nr(n))
371            .collect()
372    } else if policy.allow_syscalls.is_none() {
373        DEFAULT_DENY_SYSCALLS
374            .iter()
375            .filter_map(|n| syscall_name_to_nr(n))
376            .collect()
377    } else {
378        // allow_syscalls is set — no deny list
379        Vec::new()
380    }
381}
382
383/// Build argument-level seccomp filter instructions matching the Python
384/// `_build_arg_filters()` exactly.
385///
386/// Returns a `Vec<SockFilter>` containing self-contained BPF blocks for:
387///   - clone: block namespace creation flags
388///   - ioctl: block TIOCSTI, TIOCLINUX
389///   - prctl: block PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER
390///   - socket: block NETLINK_SOCK_DIAG (with AF_NETLINK domain check)
391///   - socket: block SOCK_RAW/SOCK_DGRAM on AF_INET/AF_INET6 (with type mask)
392pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
393    let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
394    let nr_clone = libc::SYS_clone as u32;
395    let nr_ioctl = libc::SYS_ioctl as u32;
396    let nr_prctl = libc::SYS_prctl as u32;
397    let nr_socket = libc::SYS_socket as u32;
398
399    let mut insns: Vec<SockFilter> = Vec::new();
400
401    // --- clone: block namespace creation flags ---
402    // 5 instructions:
403    //   LD NR
404    //   JEQ clone → +0, skip 3
405    //   LD arg0
406    //   JSET NS_FLAGS → +0, skip 1
407    //   RET ERRNO
408    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
409    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_clone, 0, 3));
410    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
411    insns.push(jump(BPF_JMP | BPF_JSET | BPF_K, CLONE_NS_FLAGS as u32, 0, 1));
412    insns.push(stmt(BPF_RET | BPF_K, ret_errno));
413
414    // --- ioctl: block dangerous commands (TIOCSTI, TIOCLINUX) ---
415    // Layout: LD NR, JEQ ioctl (skip 1 + N*2), LD arg1, [JEQ cmd, RET ERRNO] * N
416    let dangerous_ioctls: &[u32] = &[TIOCSTI as u32, TIOCLINUX as u32];
417    let n_ioctls = dangerous_ioctls.len();
418    let skip_count = (1 + n_ioctls * 2) as u8;
419    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
420    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_ioctl, 0, skip_count));
421    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
422    for &cmd in dangerous_ioctls {
423        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, cmd, 0, 1));
424        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
425    }
426
427    // --- prctl: block dangerous options ---
428    // Layout: LD NR, JEQ prctl (skip 1 + N*2), LD arg0, [JEQ op, RET ERRNO] * N
429    let dangerous_prctl_ops: &[u32] = &[PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER];
430    let n_ops = dangerous_prctl_ops.len();
431    let skip_count = (1 + n_ops * 2) as u8;
432    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
433    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_prctl, 0, skip_count));
434    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
435    for &op in dangerous_prctl_ops {
436        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, op, 0, 1));
437        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
438    }
439
440    // --- socket: block NETLINK_SOCK_DIAG (only on AF_NETLINK domain) ---
441    // 7 instructions:
442    //   LD NR
443    //   JEQ socket → +0, skip 5
444    //   LD arg0 (domain)
445    //   JEQ AF_NETLINK → +0, skip 3
446    //   LD arg2 (protocol)
447    //   JEQ NETLINK_SOCK_DIAG → +0, skip 1
448    //   RET ERRNO
449    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
450    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, 5));
451    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
452    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_NETLINK, 0, 3));
453    insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS2_LO));
454    insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, NETLINK_SOCK_DIAG, 0, 1));
455    insns.push(stmt(BPF_RET | BPF_K, ret_errno));
456
457    // --- socket: block SOCK_RAW and/or SOCK_DGRAM on AF_INET/AF_INET6 ---
458    let mut blocked_types: Vec<u32> = Vec::new();
459    if policy.no_raw_sockets {
460        blocked_types.push(SOCK_RAW);
461    }
462    if policy.no_udp {
463        blocked_types.push(SOCK_DGRAM);
464    }
465
466    if !blocked_types.is_empty() {
467        let n = blocked_types.len();
468        // Instructions after domain checks: 2 (load+AND) + N (JEQs) + 1 (RET)
469        let after_domain = 2 + n + 1;
470        // Total after NR check: 3 (load domain + 2 JEQs) + after_domain
471        let skip_all = (3 + after_domain) as u8;
472
473        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
474        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, skip_all));
475        // Load domain (arg0)
476        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
477        // AF_INET → skip to type check (jump over AF_INET6 check)
478        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET, 1, 0));
479        // AF_INET6 → type check; else skip everything remaining
480        insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_INET6, 0, after_domain as u8));
481        // Load type (arg1) and mask off SOCK_NONBLOCK|SOCK_CLOEXEC
482        insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS1_LO));
483        insns.push(stmt(BPF_ALU | BPF_AND | BPF_K, SOCK_TYPE_MASK));
484        // Check each blocked type
485        for (i, &sock_type) in blocked_types.iter().enumerate() {
486            let remaining = n - i - 1;
487            // Match → jump to RET ERRNO (skip 'remaining' JEQs ahead)
488            // No match on last type → skip past RET ERRNO (jf=1)
489            // No match on non-last → check next type (jf=0)
490            let jf: u8 = if remaining == 0 { 1 } else { 0 };
491            insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, sock_type, remaining as u8, jf));
492        }
493        // Deny return (reached by any matching JEQ)
494        insns.push(stmt(BPF_RET | BPF_K, ret_errno));
495    }
496
497    insns
498}
499
500// ============================================================
501// Close fds above threshold
502// ============================================================
503
504/// Close all file descriptors above `min_fd`, except those in `keep`.
505fn close_fds_above(min_fd: RawFd, keep: &[RawFd]) {
506    // Read /proc/self/fd to enumerate open fds.
507    // Collect all fd numbers first, then close them after dropping the directory
508    // iterator. This avoids closing the directory fd during iteration.
509    let fds_to_close: Vec<RawFd> = {
510        let dir = match std::fs::read_dir("/proc/self/fd") {
511            Ok(d) => d,
512            Err(_) => return,
513        };
514        dir.flatten()
515            .filter_map(|entry| {
516                entry.file_name().into_string().ok()
517                    .and_then(|name| name.parse::<RawFd>().ok())
518            })
519            .filter(|&fd| fd > min_fd && !keep.contains(&fd))
520            .collect()
521    };
522    // The directory is now closed; safe to close the collected fds.
523    for fd in fds_to_close {
524        unsafe { libc::close(fd) };
525    }
526}
527
528// ============================================================
529// COW filesystem config passed from parent to child
530// ============================================================
531
/// Overlay mount configuration for the child process.
///
/// `confine_child` turns this into a single `overlay` mount with the options
/// `lowerdir=<lowers joined by ':'>,upperdir=<upper>,workdir=<work>`,
/// mounted on `merged`.
pub(crate) struct CowConfig {
    /// Mount point: the target directory of the overlay mount.
    pub merged: PathBuf,
    /// Directory passed as the overlay `upperdir=` option.
    pub upper: PathBuf,
    /// Directory passed as the overlay `workdir=` option.
    pub work: PathBuf,
    /// Directories joined with `:` (in this order) to form `lowerdir=`.
    pub lowers: Vec<PathBuf>,
}
539
/// Populate the uid/gid maps of an unprivileged user namespace so that id 0
/// inside maps to `real_uid` / `real_gid` outside.
///
/// `real_uid`/`real_gid` must be captured *before* unshare(CLONE_NEWUSER),
/// since getuid()/getgid() return the overflow id (65534) after unshare.
///
/// The files are written in the order `uid_map`, `setgroups`, `gid_map`
/// (`setgroups` is set to `deny` before the gid map is written). Every write
/// is best-effort: failures are ignored.
fn write_id_maps(real_uid: u32, real_gid: u32) {
    let steps = [
        ("/proc/self/uid_map", format!("0 {} 1\n", real_uid)),
        ("/proc/self/setgroups", String::from("deny\n")),
        ("/proc/self/gid_map", format!("0 {} 1\n", real_gid)),
    ];
    for (path, contents) in &steps {
        let _ = std::fs::write(path, contents);
    }
}
548
549/// Write uid/gid maps using the post-unshare overflow uid (65534).
550/// Used by the OverlayFS COW path which relies on this specific mapping.
551fn write_id_maps_overflow() {
552    let uid = unsafe { libc::getuid() };
553    let gid = unsafe { libc::getgid() };
554    write_id_maps(uid, gid);
555}
556
557// ============================================================
558// Child-side confinement (never returns)
559// ============================================================
560
561/// Apply irreversible confinement (Landlock + seccomp) then exec the command.
562///
563/// This function **never returns**: it calls `execvp` on success or
564/// `_exit(127)` on any error.
565pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair, cow_config: Option<&CowConfig>, nested: bool) -> ! {
566    // Helper: abort child on error. Includes the OS error automatically.
567    macro_rules! fail {
568        ($msg:expr) => {{
569            let err = std::io::Error::last_os_error();
570            let _ = write!(std::io::stderr(), "sandlock child: {}: {}\n", $msg, err);
571            unsafe { libc::_exit(127) };
572        }};
573    }
574
575    use std::io::Write;
576
577    // 1. New process group
578    if unsafe { libc::setpgid(0, 0) } != 0 {
579        fail!("setpgid");
580    }
581
582    // 1b. If stdin is a terminal, become the foreground process group
583    //     so interactive shells can read from the TTY.
584    //     Must ignore SIGTTOU first — a background pgrp calling tcsetpgrp
585    //     gets stopped by SIGTTOU otherwise.
586    if unsafe { libc::isatty(0) } == 1 {
587        unsafe {
588            libc::signal(libc::SIGTTOU, libc::SIG_IGN);
589            libc::tcsetpgrp(0, libc::getpgrp());
590            libc::signal(libc::SIGTTOU, libc::SIG_DFL);
591        }
592    }
593
594    // 2. Die if parent exits
595    if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } != 0 {
596        fail!("prctl(PR_SET_PDEATHSIG)");
597    }
598
599    // 3. Check parent didn't die between fork and prctl
600    if unsafe { libc::getppid() } == 1 {
601        fail!("parent died before confinement");
602    }
603
604    // 4. Optional: disable ASLR
605    if policy.no_randomize_memory {
606        const ADDR_NO_RANDOMIZE: u64 = 0x0040000;
607        if unsafe { libc::personality(ADDR_NO_RANDOMIZE as libc::c_ulong) } == -1 {
608            fail!("personality(ADDR_NO_RANDOMIZE)");
609        }
610    }
611
612    // 4b. Optional: CPU core binding
613    if let Some(ref cores) = policy.cpu_cores {
614        if !cores.is_empty() {
615            let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
616            unsafe { libc::CPU_ZERO(&mut set) };
617            for &core in cores {
618                unsafe { libc::CPU_SET(core as usize, &mut set) };
619            }
620            if unsafe {
621                libc::sched_setaffinity(
622                    0,
623                    std::mem::size_of::<libc::cpu_set_t>(),
624                    &set,
625                )
626            } != 0
627            {
628                fail!("sched_setaffinity");
629            }
630        }
631    }
632
633    // 5. Optional: disable THP
634    if policy.no_huge_pages {
635        if unsafe { libc::prctl(libc::PR_SET_THP_DISABLE, 1, 0, 0, 0) } != 0 {
636            fail!("prctl(PR_SET_THP_DISABLE)");
637        }
638    }
639
640    // Capture real uid/gid before any unshare (after unshare they become 65534)
641    let real_uid = unsafe { libc::getuid() };
642    let real_gid = unsafe { libc::getgid() };
643
644    // 5b. User namespace for privileged mode (fake root) or OverlayFS COW
645    if policy.privileged && cow_config.is_none() {
646        if unsafe { libc::unshare(libc::CLONE_NEWUSER) } != 0 {
647            fail!("unshare(CLONE_NEWUSER)");
648        }
649        write_id_maps(real_uid, real_gid);
650    }
651
652    // 5c. User + mount namespace for OverlayFS COW (includes CLONE_NEWUSER)
653    if let Some(ref cow) = cow_config {
654        // unshare user + mount namespaces (unprivileged)
655        if unsafe { libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) } != 0 {
656            fail!("unshare(CLONE_NEWUSER | CLONE_NEWNS)");
657        }
658
659        // Write uid/gid maps using overflow uid (preserves existing COW behavior)
660        write_id_maps_overflow();
661
662        // Mount the overlay filesystem
663        let lowerdir = cow.lowers.iter()
664            .map(|p| p.display().to_string())
665            .collect::<Vec<_>>()
666            .join(":");
667        let opts = format!(
668            "lowerdir={},upperdir={},workdir={}",
669            lowerdir,
670            cow.upper.display(),
671            cow.work.display(),
672        );
673
674        let merged_cstr = match CString::new(cow.merged.to_str().unwrap_or("")) {
675            Ok(c) => c,
676            Err(_) => fail!("invalid merged path"),
677        };
678        let overlay_cstr = CString::new("overlay").unwrap();
679        let opts_cstr = match CString::new(opts) {
680            Ok(c) => c,
681            Err(_) => fail!("invalid overlay opts"),
682        };
683
684        let ret = unsafe {
685            libc::mount(
686                overlay_cstr.as_ptr(),
687                merged_cstr.as_ptr(),
688                overlay_cstr.as_ptr(),
689                0,
690                opts_cstr.as_ptr() as *const libc::c_void,
691            )
692        };
693        if ret != 0 {
694            fail!("mount overlay");
695        }
696    }
697
698    // 6. Optional: change working directory
699    // When chroot is set, default to the chroot root if no workdir specified
700    let effective_workdir = if let Some(ref workdir) = policy.workdir {
701        if let Some(ref chroot_root) = policy.chroot {
702            // Workdir is virtual (child-visible), translate to host path
703            Some(chroot_root.join(workdir.strip_prefix("/").unwrap_or(workdir)))
704        } else {
705            Some(workdir.clone())
706        }
707    } else if let Some(ref chroot_root) = policy.chroot {
708        // Default to chroot root
709        Some(chroot_root.clone())
710    } else {
711        None
712    };
713
714    if let Some(ref workdir) = effective_workdir {
715        let c_path = match CString::new(workdir.as_os_str().as_encoded_bytes()) {
716            Ok(p) => p,
717            Err(_) => fail!("invalid workdir path"),
718        };
719        if unsafe { libc::chdir(c_path.as_ptr()) } != 0 {
720            fail!("chdir");
721        }
722    }
723
724    // 7. Set NO_NEW_PRIVS (required for both Landlock and seccomp without CAP_SYS_ADMIN)
725    if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 {
726        fail!("prctl(PR_SET_NO_NEW_PRIVS)");
727    }
728
729    // 8. Apply Landlock confinement (IRREVERSIBLE)
730    if let Err(e) = crate::landlock::confine(policy) {
731        fail!(format!("landlock: {}", e));
732    }
733
734    // 9. Assemble and install seccomp filter (IRREVERSIBLE)
735    let deny = deny_syscall_numbers(policy);
736    let args = arg_filters(policy);
737    let mut keep_fd: i32 = -1;
738
739    if nested {
740        // Nested sandbox: deny-only filter (no supervisor — parent handles it).
741        // BPF filters are ANDed by the kernel, so each level can only tighten.
742        let filter = bpf::assemble_filter(&[], &deny, &args);
743        if let Err(e) = bpf::install_deny_filter(&filter) {
744            fail!(format!("seccomp deny filter: {}", e));
745        }
746        // Signal nested mode to parent (fd=0 means no supervisor needed)
747        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), 0) {
748            fail!(format!("write nested signal: {}", e));
749        }
750    } else {
751        // First-level sandbox: notif + deny filter with NEW_LISTENER.
752        let notif = notif_syscalls(policy);
753        let filter = bpf::assemble_filter(&notif, &deny, &args);
754        let notif_fd = match bpf::install_filter(&filter) {
755            Ok(fd) => fd,
756            Err(e) => fail!(format!("seccomp install: {}", e)),
757        };
758        keep_fd = notif_fd.as_raw_fd();
759        if let Err(e) = write_u32_fd(pipes.notif_w.as_raw_fd(), keep_fd as u32) {
760            fail!(format!("write notif fd: {}", e));
761        }
762        std::mem::forget(notif_fd);
763    }
764
765    // Mark this process as confined for in-process nesting detection
766    crate::sandbox::CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
767
768    // 10. Wait for parent to signal ready
769    match read_u32_fd(pipes.ready_r.as_raw_fd()) {
770        Ok(_) => {}
771        Err(e) => fail!(format!("read ready signal: {}", e)),
772    }
773
774    // 12. Optional: close all fds above stderr
775    if policy.close_fds {
776        if keep_fd >= 0 {
777            close_fds_above(2, &[keep_fd]);
778        } else {
779            close_fds_above(2, &[]);
780        }
781    }
782
783    // 13. Apply environment
784    if policy.clean_env {
785        // Clear all env vars first
786        for (key, _) in std::env::vars_os() {
787            std::env::remove_var(&key);
788        }
789    }
790    for (key, value) in &policy.env {
791        std::env::set_var(key, value);
792    }
793
794    // 13b. GPU device visibility
795    if let Some(ref devices) = policy.gpu_devices {
796        if !devices.is_empty() {
797            let vis = devices.iter().map(|d| d.to_string()).collect::<Vec<_>>().join(",");
798            std::env::set_var("CUDA_VISIBLE_DEVICES", &vis);
799            std::env::set_var("ROCR_VISIBLE_DEVICES", &vis);
800        }
801        // Empty list = all GPUs visible, don't set env vars
802    }
803
804    // 14. execvp
805    debug_assert!(!cmd.is_empty(), "cmd must not be empty");
806    let argv_ptrs: Vec<*const libc::c_char> = cmd
807        .iter()
808        .map(|s| s.as_ptr())
809        .chain(std::iter::once(std::ptr::null()))
810        .collect();
811
812    unsafe { libc::execvp(argv_ptrs[0], argv_ptrs.as_ptr()) };
813
814    // If we get here, exec failed
815    fail!(format!("execvp '{}'", cmd[0].to_string_lossy()));
816}
817
818// ============================================================
819// Tests
820// ============================================================
821
#[cfg(test)]
mod tests {
    use super::*;

    /// Pushes `value` through the notif pipe of a fresh `PipePair` and
    /// returns whatever the read end yields.
    fn notif_pipe_round_trip(value: u32) -> u32 {
        let pair = PipePair::new().expect("pipe creation failed");
        write_u32_fd(pair.notif_w.as_raw_fd(), value).expect("write failed");
        read_u32_fd(pair.notif_r.as_raw_fd()).expect("read failed")
    }

    /// A new `PipePair` hands out four valid, pairwise-distinct descriptors.
    #[test]
    fn test_pipe_pair_creation() {
        let pair = PipePair::new().expect("pipe creation failed");

        // Valid descriptors are non-negative.
        assert!(pair.notif_r.as_raw_fd() >= 0);
        assert!(pair.notif_w.as_raw_fd() >= 0);
        assert!(pair.ready_r.as_raw_fd() >= 0);
        assert!(pair.ready_w.as_raw_fd() >= 0);

        // No two pipe ends may share a descriptor number.
        let raw = [
            pair.notif_r.as_raw_fd(),
            pair.notif_w.as_raw_fd(),
            pair.ready_r.as_raw_fd(),
            pair.ready_w.as_raw_fd(),
        ];
        for (idx, first) in raw.iter().enumerate() {
            for second in &raw[idx + 1..] {
                assert_ne!(first, second);
            }
        }
    }

    /// A small value survives a write/read cycle through the pipe.
    #[test]
    fn test_write_read_u32() {
        let sent = 42u32;
        assert_eq!(notif_pipe_round_trip(sent), sent);
    }

    /// A value with the high bit set survives a write/read cycle unchanged.
    #[test]
    fn test_write_read_u32_large() {
        let sent = 0xDEAD_BEEFu32;
        assert_eq!(notif_pipe_round_trip(sent), sent);
    }

    /// The default policy already routes the process-creation syscalls
    /// through the notif list.
    #[test]
    fn test_notif_syscalls_always_has_clone() {
        let policy = Policy::builder().build().unwrap();
        let notified = notif_syscalls(&policy);
        for nr in [libc::SYS_clone, libc::SYS_clone3, libc::SYS_vfork] {
            assert!(notified.contains(&(nr as u32)));
        }
    }

    /// Setting a memory limit adds the memory-management syscalls.
    #[test]
    fn test_notif_syscalls_memory() {
        let policy = Policy::builder()
            .max_memory(crate::policy::ByteSize::mib(256))
            .build()
            .unwrap();
        let notified = notif_syscalls(&policy);
        for nr in [
            libc::SYS_mmap,
            libc::SYS_munmap,
            libc::SYS_brk,
            libc::SYS_mremap,
            libc::SYS_shmget,
        ] {
            assert!(notified.contains(&(nr as u32)));
        }
    }

    /// Allowing a network host adds the outbound-traffic syscalls.
    #[test]
    fn test_notif_syscalls_net() {
        let policy = Policy::builder()
            .net_allow_host("example.com")
            .build()
            .unwrap();
        let notified = notif_syscalls(&policy);
        for nr in [libc::SYS_connect, libc::SYS_sendto, libc::SYS_sendmsg] {
            assert!(notified.contains(&(nr as u32)));
        }
    }

    /// The default deny list resolves to (at least) mount, ptrace and bpf.
    #[test]
    fn test_deny_syscall_numbers_default() {
        let policy = Policy::builder().build().unwrap();
        let denied = deny_syscall_numbers(&policy);
        // Should contain mount, ptrace, etc.
        for nr in [libc::SYS_mount, libc::SYS_ptrace, libc::SYS_bpf] {
            assert!(denied.contains(&(nr as u32)));
        }
        // nfsservctl has no libc constant, so it is skipped
        assert!(!denied.is_empty());
    }

    /// An explicit deny list yields exactly the syscalls that were named.
    #[test]
    fn test_deny_syscall_numbers_custom() {
        let policy = Policy::builder()
            .deny_syscalls(vec!["mount".into(), "ptrace".into()])
            .build()
            .unwrap();
        let denied = deny_syscall_numbers(&policy);
        assert_eq!(denied.len(), 2);
        for nr in [libc::SYS_mount, libc::SYS_ptrace] {
            assert!(denied.contains(&(nr as u32)));
        }
    }

    /// With an allow list configured, no deny numbers are produced.
    #[test]
    fn test_deny_syscall_numbers_empty_when_allow_set() {
        let policy = Policy::builder()
            .allow_syscalls(vec!["read".into(), "write".into()])
            .build()
            .unwrap();
        assert!(deny_syscall_numbers(&policy).is_empty());
    }

    /// The default argument filter carries the clone, ioctl, prctl and
    /// netlink-socket comparisons.
    #[test]
    fn test_arg_filters_has_clone_ioctl_prctl_socket() {
        let policy = Policy::builder().build().unwrap();
        let program = arg_filters(&policy);

        // True when the program holds a `JEQ #k` instruction.
        let jeq = BPF_JMP | BPF_JEQ | BPF_K;
        let has_jeq = |k: u32| program.iter().any(|insn| insn.code == jeq && insn.k == k);

        // clone syscall nr, followed by the namespace-flag bit test (JSET).
        assert!(has_jeq(libc::SYS_clone as u32));
        assert!(program.iter().any(|insn| {
            insn.code == (BPF_JMP | BPF_JSET | BPF_K) && insn.k == CLONE_NS_FLAGS as u32
        }));

        // ioctl syscall nr plus the TIOCSTI and TIOCLINUX requests.
        assert!(has_jeq(libc::SYS_ioctl as u32));
        assert!(has_jeq(TIOCSTI as u32));
        assert!(has_jeq(TIOCLINUX as u32));

        // prctl syscall nr plus PR_SET_DUMPABLE.
        assert!(has_jeq(libc::SYS_prctl as u32));
        assert!(has_jeq(PR_SET_DUMPABLE));

        // socket: AF_NETLINK + NETLINK_SOCK_DIAG.
        assert!(has_jeq(AF_NETLINK));
        assert!(has_jeq(NETLINK_SOCK_DIAG));
    }

    /// `no_raw_sockets` adds the address-family and SOCK_RAW comparisons.
    #[test]
    fn test_arg_filters_raw_sockets() {
        let policy = Policy::builder().no_raw_sockets(true).build().unwrap();
        let program = arg_filters(&policy);

        // True when the program holds a `JEQ #k` instruction.
        let jeq = BPF_JMP | BPF_JEQ | BPF_K;
        let has_jeq = |k: u32| program.iter().any(|insn| insn.code == jeq && insn.k == k);

        // Both IP address families are checked.
        assert!(has_jeq(AF_INET));
        assert!(has_jeq(AF_INET6));
        // The socket type is masked with SOCK_TYPE_MASK ...
        assert!(program.iter().any(|insn| {
            insn.code == (BPF_ALU | BPF_AND | BPF_K) && insn.k == SOCK_TYPE_MASK
        }));
        // ... and then compared against SOCK_RAW.
        assert!(has_jeq(SOCK_RAW));
    }

    /// `no_udp` adds a SOCK_DGRAM comparison.
    #[test]
    fn test_arg_filters_no_udp() {
        let policy = Policy::builder().no_udp(true).build().unwrap();
        let program = arg_filters(&policy);
        let jeq = BPF_JMP | BPF_JEQ | BPF_K;
        assert!(program
            .iter()
            .any(|insn| insn.code == jeq && insn.k == SOCK_DGRAM));
    }

    /// Every name in DEFAULT_DENY_SYSCALLS except nfsservctl should resolve.
    #[test]
    fn test_syscall_name_to_nr_covers_defaults() {
        let mut unresolved = 0;
        for name in DEFAULT_DENY_SYSCALLS {
            if syscall_name_to_nr(name).is_some() {
                continue;
            }
            assert_eq!(*name, "nfsservctl", "unexpected unresolved syscall: {}", name);
            unresolved += 1;
        }
        // only nfsservctl
        assert_eq!(unresolved, 1);
    }
}