Skip to main content

sandlock_core/
fork.rs

1//! COW fork — create lightweight clones of a sandboxed process.
2//!
3//! The template process runs `init_cmd` to load expensive state, then
4//! enters a fork-ready loop. The parent calls `fork(N)` to create N
5//! COW clones that share memory pages with the template. Each clone
6//! receives `CLONE_ID=0..N-1` and execs `work_cmd`.
7//!
8//! Uses raw `fork()` syscall (NR 57 on x86_64) to bypass seccomp
9//! notification — the BPF filter only intercepts `clone`/`clone3`.
10
11use std::os::unix::io::RawFd;
12
13// ============================================================
14// Raw fork (bypasses seccomp clone interception)
15// ============================================================
16
17/// Raw fork() syscall — NR 57 on x86_64.
18/// Unlike clone/clone3, this is NOT intercepted by the seccomp notif filter.
19fn raw_fork() -> std::io::Result<i32> {
20    #[cfg(target_arch = "x86_64")]
21    const NR_FORK: i64 = 57;
22    #[cfg(target_arch = "aarch64")]
23    const NR_FORK: i64 = -1; // aarch64 has no fork — use clone with minimal flags
24
25    #[cfg(target_arch = "x86_64")]
26    {
27        let pid = unsafe { libc::syscall(NR_FORK) };
28        if pid < 0 {
29            Err(std::io::Error::last_os_error())
30        } else {
31            Ok(pid as i32)
32        }
33    }
34
35    #[cfg(target_arch = "aarch64")]
36    {
37        // aarch64 doesn't have fork(2), use clone with SIGCHLD only
38        let pid = unsafe { libc::fork() };
39        if pid < 0 {
40            Err(std::io::Error::last_os_error())
41        } else {
42            Ok(pid)
43        }
44    }
45}
46
47// ============================================================
48// Child side: fork-ready loop
49// ============================================================
50
51/// Fork N clones with per-clone stdout pipes.
52///
53/// `stdout_write_fds` contains the write ends of pipes created by the parent.
54/// Each clone's stdout is dup2'd to its corresponding write fd.
55///
56/// Wire protocol on ctrl_fd:
57///   N × 4 bytes: clone PIDs
58///   (after clones finish) N × 4 bytes: exit codes
59pub(crate) fn fork_ready_loop_fn(
60    ctrl_fd: RawFd,
61    n: u32,
62    work_fn: &dyn Fn(u32),
63    stdout_write_fds: &[RawFd],
64) {
65    let _ = unsafe { libc::fflush(std::ptr::null_mut()) };
66
67    let mut pids = Vec::with_capacity(n as usize);
68
69    for i in 0..n {
70        match raw_fork() {
71            Ok(0) => {
72                // === Clone child ===
73                unsafe { libc::close(ctrl_fd) };
74                // Redirect stdout to this clone's pipe
75                if (i as usize) < stdout_write_fds.len() && stdout_write_fds[i as usize] >= 0 {
76                    unsafe { libc::dup2(stdout_write_fds[i as usize], 1) };
77                }
78                // Close all write fds (belong to other clones)
79                for &wfd in stdout_write_fds {
80                    if wfd >= 0 { unsafe { libc::close(wfd) }; }
81                }
82                unsafe { libc::setpgid(0, 0) };
83                std::env::set_var("CLONE_ID", i.to_string());
84
85                work_fn(i);
86                unsafe { libc::fflush(std::ptr::null_mut()) };
87                unsafe { libc::_exit(0) };
88            }
89            Ok(pid) => {
90                pids.push(pid as u32);
91            }
92            Err(_) => {
93                pids.push(0);
94            }
95        }
96    }
97
98    // Close all write ends in template (parent has the read ends)
99    for &wfd in stdout_write_fds {
100        if wfd >= 0 { unsafe { libc::close(wfd) }; }
101    }
102
103    // Send PIDs
104    let pid_bytes: Vec<u8> = pids.iter().flat_map(|p| p.to_be_bytes()).collect();
105    unsafe { libc::write(ctrl_fd, pid_bytes.as_ptr() as *const _, pid_bytes.len()) };
106
107    // Wait for all clones and send exit codes
108    let mut exit_codes = Vec::with_capacity(pids.len());
109    for &pid in &pids {
110        if pid > 0 {
111            let mut status: i32 = 0;
112            unsafe { libc::waitpid(pid as i32, &mut status, 0) };
113            let code = if libc::WIFEXITED(status) { libc::WEXITSTATUS(status) } else { -1 };
114            exit_codes.push(code as i32);
115        } else {
116            exit_codes.push(-1);
117        }
118    }
119    let code_bytes: Vec<u8> = exit_codes.iter().flat_map(|c| c.to_be_bytes()).collect();
120    unsafe { libc::write(ctrl_fd, code_bytes.as_ptr() as *const _, code_bytes.len()) };
121}
122
123// ============================================================
124// Tests
125// ============================================================
126
#[cfg(test)]
mod tests {
    use super::*;

    /// `raw_fork` must yield a real child whose exit status the parent can
    /// reap: the child exits immediately with a distinctive code and the
    /// parent checks that exactly that code comes back.
    #[test]
    fn test_raw_fork() {
        const CHILD_EXIT_CODE: i32 = 42;

        let pid = raw_fork().unwrap();
        if pid == 0 {
            // child: leave at once, without running the test harness' exit
            // handlers in the forked copy
            unsafe { libc::_exit(CHILD_EXIT_CODE) };
        }

        // parent
        let mut status: i32 = 0;
        let reaped = unsafe { libc::waitpid(pid, &mut status, 0) };
        // Without this check a failed wait would leave `status` untouched and
        // surface as a confusing exit-code mismatch below.
        assert_eq!(reaped, pid, "waitpid failed to reap the forked child");
        assert!(libc::WIFEXITED(status));
        assert_eq!(libc::WEXITSTATUS(status), CHILD_EXIT_CODE);
    }
}