sandlock_core/fork.rs
1//! COW fork — create lightweight clones of a sandboxed process.
2//!
3//! The template process runs `init_cmd` to load expensive state, then
4//! enters a fork-ready loop. The parent calls `fork(N)` to create N
5//! COW clones that share memory pages with the template. Each clone
6//! receives `CLONE_ID=0..N-1` and execs `work_cmd`.
7//!
8//! Uses raw `fork()` syscall (NR 57 on x86_64) to bypass seccomp
9//! notification — the BPF filter only intercepts `clone`/`clone3`.
10
11use std::os::unix::io::RawFd;
12
13// ============================================================
14// Raw fork (bypasses seccomp clone interception)
15// ============================================================
16
17/// Raw fork() syscall — NR 57 on x86_64.
18/// Unlike clone/clone3, this is NOT intercepted by the seccomp notif filter.
19fn raw_fork() -> std::io::Result<i32> {
20 #[cfg(target_arch = "x86_64")]
21 const NR_FORK: i64 = 57;
22
23 #[cfg(target_arch = "x86_64")]
24 {
25 let pid = unsafe { libc::syscall(NR_FORK) };
26 if pid < 0 {
27 Err(std::io::Error::last_os_error())
28 } else {
29 Ok(pid as i32)
30 }
31 }
32
33 #[cfg(target_arch = "aarch64")]
34 {
35 // aarch64 doesn't have fork(2), use clone with SIGCHLD only
36 let pid = unsafe { libc::fork() };
37 if pid < 0 {
38 Err(std::io::Error::last_os_error())
39 } else {
40 Ok(pid)
41 }
42 }
43}
44
45// ============================================================
46// Child side: fork-ready loop
47// ============================================================
48
49/// Fork N clones with per-clone stdout pipes.
50///
51/// `stdout_write_fds` contains the write ends of pipes created by the parent.
52/// Each clone's stdout is dup2'd to its corresponding write fd.
53///
54/// Wire protocol on ctrl_fd:
55/// N × 4 bytes: clone PIDs
56/// (after clones finish) N × 4 bytes: exit codes
57pub(crate) fn fork_ready_loop_fn(
58 ctrl_fd: RawFd,
59 n: u32,
60 work_fn: &dyn Fn(u32),
61 stdout_write_fds: &[RawFd],
62) {
63 let _ = unsafe { libc::fflush(std::ptr::null_mut()) };
64
65 let mut pids = Vec::with_capacity(n as usize);
66
67 for i in 0..n {
68 match raw_fork() {
69 Ok(0) => {
70 // === Clone child ===
71 unsafe { libc::close(ctrl_fd) };
72 // Redirect stdout to this clone's pipe
73 if (i as usize) < stdout_write_fds.len() && stdout_write_fds[i as usize] >= 0 {
74 unsafe { libc::dup2(stdout_write_fds[i as usize], 1) };
75 }
76 // Close all write fds (belong to other clones)
77 for &wfd in stdout_write_fds {
78 if wfd >= 0 { unsafe { libc::close(wfd) }; }
79 }
80 unsafe { libc::setpgid(0, 0) };
81 std::env::set_var("CLONE_ID", i.to_string());
82
83 work_fn(i);
84 unsafe { libc::fflush(std::ptr::null_mut()) };
85 unsafe { libc::_exit(0) };
86 }
87 Ok(pid) => {
88 pids.push(pid as u32);
89 }
90 Err(_) => {
91 pids.push(0);
92 }
93 }
94 }
95
96 // Close all write ends in template (parent has the read ends)
97 for &wfd in stdout_write_fds {
98 if wfd >= 0 { unsafe { libc::close(wfd) }; }
99 }
100
101 // Send PIDs
102 let pid_bytes: Vec<u8> = pids.iter().flat_map(|p| p.to_be_bytes()).collect();
103 unsafe { libc::write(ctrl_fd, pid_bytes.as_ptr() as *const _, pid_bytes.len()) };
104
105 // Wait for all clones and send exit codes
106 let mut exit_codes = Vec::with_capacity(pids.len());
107 for &pid in &pids {
108 if pid > 0 {
109 let mut status: i32 = 0;
110 unsafe { libc::waitpid(pid as i32, &mut status, 0) };
111 let code = if libc::WIFEXITED(status) { libc::WEXITSTATUS(status) } else { -1 };
112 exit_codes.push(code as i32);
113 } else {
114 exit_codes.push(-1);
115 }
116 }
117 let code_bytes: Vec<u8> = exit_codes.iter().flat_map(|c| c.to_be_bytes()).collect();
118 unsafe { libc::write(ctrl_fd, code_bytes.as_ptr() as *const _, code_bytes.len()) };
119}
120
121// ============================================================
122// Tests
123// ============================================================
124
#[cfg(test)]
mod tests {
    use super::*;

    /// A child created by `raw_fork` exits with status 42 and the parent
    /// observes that status through `waitpid`.
    #[test]
    fn test_raw_fork() {
        match raw_fork().unwrap() {
            // Child branch: leave immediately with a recognisable status.
            0 => unsafe { libc::_exit(42) },
            // Parent branch: reap the child and inspect how it terminated.
            child => {
                let mut wait_status = 0i32;
                unsafe { libc::waitpid(child, &mut wait_status, 0) };
                assert!(libc::WIFEXITED(wait_status));
                assert_eq!(libc::WEXITSTATUS(wait_status), 42);
            }
        }
    }
}
142}