sandlock_core/fork.rs
1//! COW fork — create lightweight clones of a sandboxed process.
2//!
3//! The template process runs `init_cmd` to load expensive state, then
4//! enters a fork-ready loop. The parent calls `fork(N)` to create N
5//! COW clones that share memory pages with the template. Each clone
6//! receives `CLONE_ID=0..N-1` and execs `work_cmd`.
7//!
8//! Uses raw `fork()` syscall (NR 57 on x86_64) to bypass seccomp
9//! notification — the BPF filter only intercepts `clone`/`clone3`.
10
11use std::os::unix::io::RawFd;
12
13// ============================================================
14// Raw fork (bypasses seccomp clone interception)
15// ============================================================
16
17/// Raw fork() syscall — NR 57 on x86_64.
18/// Unlike clone/clone3, this is NOT intercepted by the seccomp notif filter.
19fn raw_fork() -> std::io::Result<i32> {
20 #[cfg(target_arch = "x86_64")]
21 const NR_FORK: i64 = 57;
22 #[cfg(target_arch = "aarch64")]
23 const NR_FORK: i64 = -1; // aarch64 has no fork — use clone with minimal flags
24
25 #[cfg(target_arch = "x86_64")]
26 {
27 let pid = unsafe { libc::syscall(NR_FORK) };
28 if pid < 0 {
29 Err(std::io::Error::last_os_error())
30 } else {
31 Ok(pid as i32)
32 }
33 }
34
35 #[cfg(target_arch = "aarch64")]
36 {
37 // aarch64 doesn't have fork(2), use clone with SIGCHLD only
38 let pid = unsafe { libc::fork() };
39 if pid < 0 {
40 Err(std::io::Error::last_os_error())
41 } else {
42 Ok(pid)
43 }
44 }
45}
46
47// ============================================================
48// Child side: fork-ready loop
49// ============================================================
50
51/// Fork N clones with per-clone stdout pipes.
52///
53/// `stdout_write_fds` contains the write ends of pipes created by the parent.
54/// Each clone's stdout is dup2'd to its corresponding write fd.
55///
56/// Wire protocol on ctrl_fd:
57/// N × 4 bytes: clone PIDs
58/// (after clones finish) N × 4 bytes: exit codes
59pub(crate) fn fork_ready_loop_fn(
60 ctrl_fd: RawFd,
61 n: u32,
62 work_fn: &dyn Fn(u32),
63 stdout_write_fds: &[RawFd],
64) {
65 let _ = unsafe { libc::fflush(std::ptr::null_mut()) };
66
67 let mut pids = Vec::with_capacity(n as usize);
68
69 for i in 0..n {
70 match raw_fork() {
71 Ok(0) => {
72 // === Clone child ===
73 unsafe { libc::close(ctrl_fd) };
74 // Redirect stdout to this clone's pipe
75 if (i as usize) < stdout_write_fds.len() && stdout_write_fds[i as usize] >= 0 {
76 unsafe { libc::dup2(stdout_write_fds[i as usize], 1) };
77 }
78 // Close all write fds (belong to other clones)
79 for &wfd in stdout_write_fds {
80 if wfd >= 0 { unsafe { libc::close(wfd) }; }
81 }
82 unsafe { libc::setpgid(0, 0) };
83 std::env::set_var("CLONE_ID", i.to_string());
84
85 work_fn(i);
86 unsafe { libc::fflush(std::ptr::null_mut()) };
87 unsafe { libc::_exit(0) };
88 }
89 Ok(pid) => {
90 pids.push(pid as u32);
91 }
92 Err(_) => {
93 pids.push(0);
94 }
95 }
96 }
97
98 // Close all write ends in template (parent has the read ends)
99 for &wfd in stdout_write_fds {
100 if wfd >= 0 { unsafe { libc::close(wfd) }; }
101 }
102
103 // Send PIDs
104 let pid_bytes: Vec<u8> = pids.iter().flat_map(|p| p.to_be_bytes()).collect();
105 unsafe { libc::write(ctrl_fd, pid_bytes.as_ptr() as *const _, pid_bytes.len()) };
106
107 // Wait for all clones and send exit codes
108 let mut exit_codes = Vec::with_capacity(pids.len());
109 for &pid in &pids {
110 if pid > 0 {
111 let mut status: i32 = 0;
112 unsafe { libc::waitpid(pid as i32, &mut status, 0) };
113 let code = if libc::WIFEXITED(status) { libc::WEXITSTATUS(status) } else { -1 };
114 exit_codes.push(code as i32);
115 } else {
116 exit_codes.push(-1);
117 }
118 }
119 let code_bytes: Vec<u8> = exit_codes.iter().flat_map(|c| c.to_be_bytes()).collect();
120 unsafe { libc::write(ctrl_fd, code_bytes.as_ptr() as *const _, code_bytes.len()) };
121}
122
123// ============================================================
124// Tests
125// ============================================================
126
#[cfg(test)]
mod tests {
    use super::*;

    /// A child created by `raw_fork` exits with a known status, and the parent
    /// must observe exactly that status when it reaps the child.
    #[test]
    fn test_raw_fork() {
        match raw_fork().unwrap() {
            // Child: leave immediately without running the rest of the harness.
            0 => unsafe { libc::_exit(42) },
            // Parent: reap the child and inspect how it terminated.
            child_pid => {
                let mut wait_status: i32 = 0;
                unsafe { libc::waitpid(child_pid, &mut wait_status, 0) };
                assert!(libc::WIFEXITED(wait_status));
                assert_eq!(libc::WEXITSTATUS(wait_status), 42);
            }
        }
    }
}