Skip to main content

sandlock_core/
fork.rs

//! COW fork — create lightweight clones of a sandboxed process.
//!
//! The template process runs `init_cmd` to load expensive state, then
//! enters a fork-ready loop. The parent calls `fork(N)` to create N
//! COW clones that share memory pages with the template. Each clone
//! receives `CLONE_ID=0..N-1` and execs `work_cmd`.
//!
//! Uses a raw `fork()` syscall (NR 57 on x86_64) to bypass seccomp
//! notification — the BPF filter only intercepts `clone`/`clone3`.
10
11use std::os::unix::io::RawFd;
12
// ============================================================
// Raw fork (bypasses seccomp clone interception)
// ============================================================
16
17/// Raw fork() syscall — NR 57 on x86_64.
18/// Unlike clone/clone3, this is NOT intercepted by the seccomp notif filter.
19fn raw_fork() -> std::io::Result<i32> {
20    #[cfg(target_arch = "x86_64")]
21    const NR_FORK: i64 = 57;
22
23    #[cfg(target_arch = "x86_64")]
24    {
25        let pid = unsafe { libc::syscall(NR_FORK) };
26        if pid < 0 {
27            Err(std::io::Error::last_os_error())
28        } else {
29            Ok(pid as i32)
30        }
31    }
32
33    #[cfg(target_arch = "aarch64")]
34    {
35        // aarch64 doesn't have fork(2), use clone with SIGCHLD only
36        let pid = unsafe { libc::fork() };
37        if pid < 0 {
38            Err(std::io::Error::last_os_error())
39        } else {
40            Ok(pid)
41        }
42    }
43}
44
// ============================================================
// Child side: fork-ready loop
// ============================================================
48
49/// Fork N clones with per-clone stdout pipes.
50///
51/// `stdout_write_fds` contains the write ends of pipes created by the parent.
52/// Each clone's stdout is dup2'd to its corresponding write fd.
53///
54/// Wire protocol on ctrl_fd:
55///   N × 4 bytes: clone PIDs
56///   (after clones finish) N × 4 bytes: exit codes
57pub(crate) fn fork_ready_loop_fn(
58    ctrl_fd: RawFd,
59    n: u32,
60    work_fn: &dyn Fn(u32),
61    stdout_write_fds: &[RawFd],
62) {
63    let _ = unsafe { libc::fflush(std::ptr::null_mut()) };
64
65    let mut pids = Vec::with_capacity(n as usize);
66
67    for i in 0..n {
68        match raw_fork() {
69            Ok(0) => {
70                // === Clone child ===
71                unsafe { libc::close(ctrl_fd) };
72                // Redirect stdout to this clone's pipe
73                if (i as usize) < stdout_write_fds.len() && stdout_write_fds[i as usize] >= 0 {
74                    unsafe { libc::dup2(stdout_write_fds[i as usize], 1) };
75                }
76                // Close all write fds (belong to other clones)
77                for &wfd in stdout_write_fds {
78                    if wfd >= 0 { unsafe { libc::close(wfd) }; }
79                }
80                unsafe { libc::setpgid(0, 0) };
81                std::env::set_var("CLONE_ID", i.to_string());
82
83                work_fn(i);
84                unsafe { libc::fflush(std::ptr::null_mut()) };
85                unsafe { libc::_exit(0) };
86            }
87            Ok(pid) => {
88                pids.push(pid as u32);
89            }
90            Err(_) => {
91                pids.push(0);
92            }
93        }
94    }
95
96    // Close all write ends in template (parent has the read ends)
97    for &wfd in stdout_write_fds {
98        if wfd >= 0 { unsafe { libc::close(wfd) }; }
99    }
100
101    // Send PIDs
102    let pid_bytes: Vec<u8> = pids.iter().flat_map(|p| p.to_be_bytes()).collect();
103    unsafe { libc::write(ctrl_fd, pid_bytes.as_ptr() as *const _, pid_bytes.len()) };
104
105    // Wait for all clones and send exit codes
106    let mut exit_codes = Vec::with_capacity(pids.len());
107    for &pid in &pids {
108        if pid > 0 {
109            let mut status: i32 = 0;
110            unsafe { libc::waitpid(pid as i32, &mut status, 0) };
111            let code = if libc::WIFEXITED(status) { libc::WEXITSTATUS(status) } else { -1 };
112            exit_codes.push(code as i32);
113        } else {
114            exit_codes.push(-1);
115        }
116    }
117    let code_bytes: Vec<u8> = exit_codes.iter().flat_map(|c| c.to_be_bytes()).collect();
118    unsafe { libc::write(ctrl_fd, code_bytes.as_ptr() as *const _, code_bytes.len()) };
119}
120
// ============================================================
// Tests
// ============================================================
124
#[cfg(test)]
mod tests {
    use super::*;

    /// Forks once through `raw_fork`, has the child `_exit` with a known
    /// status, and checks that the parent observes that status via `waitpid`.
    #[test]
    fn test_raw_fork() {
        const CHILD_STATUS: i32 = 42;

        match raw_fork().unwrap() {
            // Child: leave immediately without running any test-harness teardown.
            0 => unsafe { libc::_exit(CHILD_STATUS) },
            // Parent: reap the child and inspect how it terminated.
            child => {
                let mut wstatus: i32 = 0;
                unsafe { libc::waitpid(child, &mut wstatus, 0) };
                assert!(libc::WIFEXITED(wstatus));
                assert_eq!(libc::WEXITSTATUS(wstatus), CHILD_STATUS);
            }
        }
    }
}