// sandlock_core/sandbox.rs

1// Sandbox orchestrator — public API that coordinates fork, confinement,
2// and async supervision of sandboxed child processes.
3
4use std::ffi::CString;
5use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd};
6use std::sync::Arc;
7use std::time::Duration;
8
9use tokio::sync::Mutex;
10use tokio::task::JoinHandle;
11
12use std::sync::atomic::{AtomicBool, Ordering};
13
14use crate::context::{self, CowConfig, PipePair, read_u32_fd, write_u32_fd};
15use crate::cow::{CowBranch, overlayfs::OverlayBranch, branchfs::BranchFsBranch};
16use crate::error::{SandboxError, SandlockError};
17use crate::network;
18use crate::policy::{BranchAction, FsIsolation, Policy};
19use crate::result::{ExitStatus, RunResult};
20use crate::seccomp::notif::{self, NotifPolicy, SupervisorState};
21use crate::sys::syscall;
22
23// ============================================================
24// Nesting detection
25// ============================================================
26
27/// Set after seccomp confinement in the child process.
28/// Any subsequent Sandbox in this process is nested.
29pub(crate) static CONFINED: AtomicBool = AtomicBool::new(false);
30
31/// Detect if this process is already inside a sandbox.
32///
33/// Checks both the in-process flag and /proc/self/status (Seccomp: 2)
34/// to catch cross-process nesting (e.g. `sandlock run -- python agent.py`
35/// where agent.py creates inner sandboxes).
36pub fn is_nested() -> bool {
37    if CONFINED.load(Ordering::Relaxed) {
38        return true;
39    }
40    // Check /proc/self/status for active seccomp filter
41    if let Ok(status) = std::fs::read_to_string("/proc/self/status") {
42        for line in status.lines() {
43            if line.starts_with("Seccomp:") {
44                return line.trim().ends_with('2');
45            }
46        }
47    }
48    false
49}
50
51// ============================================================
52// SandboxState
53// ============================================================
54
/// Lifecycle of a sandboxed child process.
enum SandboxState {
    /// No child spawned yet; `do_spawn` requires this state.
    Created,
    /// Child forked and believed alive.
    Running,
    /// Child's process group was SIGSTOPped via `pause()`.
    Paused,
    /// Child exited; carries the collected exit status.
    Stopped(ExitStatus),
}
61
62// ============================================================
63// Sandbox
64// ============================================================
65
/// The main user-facing sandbox API.
///
/// Orchestrates fork, confinement (Landlock + seccomp), and async
/// notification-based supervision of the sandboxed child process.
pub struct Sandbox {
    /// Policy this sandbox was created with (cloned at construction).
    policy: Policy,
    /// Lifecycle: Created -> Running -> (Paused <->) Stopped.
    state: SandboxState,
    /// PID of the forked child; set at spawn, never cleared.
    child_pid: Option<i32>,
    /// pidfd for the child (None on kernels without pidfd_open).
    pidfd: Option<OwnedFd>,
    /// Seccomp-notification supervisor task; aborted in `wait()`.
    notif_handle: Option<JoinHandle<()>>,
    /// Supervisor task aborted in `wait()` — presumably rate limiting;
    /// set outside this chunk (TODO(review): confirm purpose).
    throttle_handle: Option<JoinHandle<()>>,
    /// Capture pipe read ends — kept alive so the child doesn't get SIGPIPE.
    _stdout_read: Option<OwnedFd>,
    _stderr_read: Option<OwnedFd>,
    /// COW filesystem branch (OverlayFS or BranchFS).
    cow_branch: Option<Box<dyn CowBranch>>,
    /// Shared supervisor state for freeze/thaw support.
    supervisor_state: Option<Arc<Mutex<SupervisorState>>>,
    /// Control pipe for fork commands (parent end).
    ctrl_fd: Option<OwnedFd>,
    /// Stdout pipe read end (for fork clones — used by reduce).
    stdout_pipe: Option<OwnedFd>,
    /// Init function (runs once in child before fork).
    init_fn: Option<Box<dyn FnOnce() + Send + 'static>>,
    /// Work function (runs in each fork clone).
    work_fn: Option<Arc<dyn Fn(u32) + Send + Sync + 'static>>,
    /// Optional fd overrides for stdin/stdout/stderr (used by Pipeline).
    io_overrides: Option<(Option<i32>, Option<i32>, Option<i32>)>,
}
95
96impl Sandbox {
97    /// Create a new sandbox in the `Created` state.
98    pub fn new(policy: &Policy) -> Result<Self, SandlockError> {
99        Ok(Self::create(policy))
100    }
101
102    /// Create a sandbox with init and work functions for COW forking.
103    ///
104    /// `init_fn` runs once in the child to load expensive state.
105    /// `work_fn` runs in each COW clone created by `fork(N)`.
106    ///
107    /// ```ignore
108    /// let mut sb = Sandbox::new_with_fns(&policy,
109    ///     || { load_model(); },
110    ///     |clone_id| { rollout(clone_id); },
111    /// )?;
112    /// let clones = sb.fork(1000).await?;
113    /// ```
114    pub fn new_with_fns(
115        policy: &Policy,
116        init_fn: impl FnOnce() + Send + 'static,
117        work_fn: impl Fn(u32) + Send + Sync + 'static,
118    ) -> Result<Self, SandlockError> {
119        let mut sb = Self::create(policy);
120        sb.init_fn = Some(Box::new(init_fn));
121        sb.work_fn = Some(Arc::new(work_fn));
122        Ok(sb)
123    }
124
125    fn create(policy: &Policy) -> Self {
126        Self {
127            policy: policy.clone(),
128            state: SandboxState::Created,
129            child_pid: None,
130            pidfd: None,
131            notif_handle: None,
132            throttle_handle: None,
133            _stdout_read: None,
134            _stderr_read: None,
135            cow_branch: None,
136            supervisor_state: None,
137            ctrl_fd: None,
138            stdout_pipe: None,
139            init_fn: None,
140            work_fn: None,
141            io_overrides: None,
142        }
143    }
144
145    /// One-shot: spawn a sandboxed process, wait for it to exit, and return
146    /// the result. Stdout and stderr are captured.
147    pub async fn run(policy: &Policy, cmd: &[&str]) -> Result<RunResult, SandlockError> {
148        let mut sb = Self::new(policy)?;
149        sb.do_spawn(cmd, true).await?;
150        sb.wait().await
151    }
152
153    /// Run a sandboxed process with inherited stdio (interactive mode).
154    pub async fn run_interactive(policy: &Policy, cmd: &[&str]) -> Result<RunResult, SandlockError> {
155        let mut sb = Self::new(policy)?;
156        sb.do_spawn(cmd, false).await?;
157        sb.wait().await
158    }
159
160    /// Dry-run: spawn, wait, collect filesystem changes, then abort.
161    /// Returns the run result plus a list of changes that would have been
162    /// committed. The workdir is left unchanged.
163    pub async fn dry_run(policy: &Policy, cmd: &[&str]) -> Result<crate::dry_run::DryRunResult, SandlockError> {
164        let mut policy = policy.clone();
165        policy.on_exit = BranchAction::Keep;
166        policy.on_error = BranchAction::Keep;
167
168        let mut sb = Self::new(&policy)?;
169        sb.do_spawn(cmd, true).await?;
170        let run_result = sb.wait().await?;
171        let changes = sb.collect_changes().await;
172        sb.do_abort().await;
173        Ok(crate::dry_run::DryRunResult { run_result, changes })
174    }
175
176    /// Dry-run with inherited stdio (interactive mode).
177    pub async fn dry_run_interactive(policy: &Policy, cmd: &[&str]) -> Result<crate::dry_run::DryRunResult, SandlockError> {
178        let mut policy = policy.clone();
179        policy.on_exit = BranchAction::Keep;
180        policy.on_error = BranchAction::Keep;
181
182        let mut sb = Self::new(&policy)?;
183        sb.do_spawn(cmd, false).await?;
184        let run_result = sb.wait().await?;
185        let changes = sb.collect_changes().await;
186        sb.do_abort().await;
187        Ok(crate::dry_run::DryRunResult { run_result, changes })
188    }
189
190    /// Collect changes from whichever COW branch exists.
191    async fn collect_changes(&self) -> Vec<crate::dry_run::Change> {
192        // Check OverlayFS/BranchFS COW branch
193        if let Some(ref branch) = self.cow_branch {
194            return branch.changes().unwrap_or_default();
195        }
196
197        // Check seccomp-based COW branch
198        if let Some(ref state) = self.supervisor_state {
199            if let Ok(st) = state.try_lock() {
200                if let Some(ref cow) = st.cow_branch {
201                    return cow.changes().unwrap_or_default();
202                }
203            }
204        }
205
206        Vec::new()
207    }
208
209    /// Abort both COW branch types (used by dry_run to discard changes).
210    async fn do_abort(&mut self) {
211        if let Some(branch) = self.cow_branch.take() {
212            let _ = branch.abort();
213        }
214        if let Some(ref state) = self.supervisor_state {
215            if let Ok(mut st) = state.try_lock() {
216                if let Some(ref mut cow) = st.cow_branch {
217                    let _ = cow.abort();
218                }
219            }
220        }
221    }
222
    /// Create N COW clones of this sandbox, each running `work_fn(clone_id)`.
    ///
    /// Requires `new_with_fns()`. Forks a confined template child, runs
    /// `init_fn` in it, then forks N times using raw `fork()` (bypasses
    /// seccomp). Each clone gets `CLONE_ID=0..N-1` and runs
    /// `work_fn(clone_id)`.
    ///
    /// Memory pages from `init_fn` are shared copy-on-write across all
    /// clones — 1000 clones of a 50MB process use ~50MB total.
    ///
    /// Returns a Vec of Sandbox handles, one per live clone. Note that by
    /// the time this returns every clone has already exited (the template
    /// reports exit codes only after waiting for all clones), so each
    /// handle is in the `Stopped` state; its stdout pipe read end is
    /// attached for later consumption by `reduce()`.
    ///
    /// ```ignore
    /// let clones = sb.fork(4).await?;
    /// for mut c in clones { c.wait().await?; }
    /// ```
    pub async fn fork(&mut self, n: u32) -> Result<Vec<Sandbox>, SandlockError> {
        // Both closures are consumed — fork() can only be called once.
        let init_fn = self.init_fn.take()
            .ok_or_else(|| SandboxError::Child("fork() requires new_with_fns()".into()))?;
        let work_fn = self.work_fn.take()
            .ok_or_else(|| SandboxError::Child("fork() requires new_with_fns()".into()))?;

        let policy = self.policy.clone();

        // Control pipe: the template writes clone PIDs and exit codes,
        // the parent reads them.
        let mut ctrl_fds = [0i32; 2];
        if unsafe { libc::pipe2(ctrl_fds.as_mut_ptr(), 0) } < 0 {
            return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
        }
        let ctrl_parent = unsafe { OwnedFd::from_raw_fd(ctrl_fds[0]) };
        let ctrl_child_fd = ctrl_fds[1];

        // Create per-clone stdout pipes (parent keeps read ends).
        // On pipe failure, push -1 so indices keep lining up with clone ids;
        // that clone simply gets no stdout capture.
        let mut pipe_read_ends: Vec<OwnedFd> = Vec::with_capacity(n as usize);
        let mut pipe_write_fds: Vec<i32> = Vec::with_capacity(n as usize);
        for _ in 0..n {
            let mut pfds = [0i32; 2];
            if unsafe { libc::pipe(pfds.as_mut_ptr()) } >= 0 {
                pipe_read_ends.push(unsafe { OwnedFd::from_raw_fd(pfds[0]) });
                pipe_write_fds.push(pfds[1]);
            } else {
                pipe_write_fds.push(-1);
            }
        }

        // Fork the template child
        let pid = unsafe { libc::fork() };
        if pid < 0 {
            unsafe { libc::close(ctrl_child_fd) };
            return Err(SandboxError::Fork(std::io::Error::last_os_error()).into());
        }

        if pid == 0 {
            // ===== CHILD (template) =====
            // Close the parent's read end of the control pipe.
            drop(ctrl_parent);

            // Own process group (so killpg targets all clones), die with
            // the parent, and lock out privilege escalation before seccomp.
            unsafe { libc::setpgid(0, 0) };
            unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) };
            unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };

            // Filesystem confinement — errors ignored: the child has no
            // channel to report them here (TODO(review): confirm silent
            // Landlock failure is acceptable).
            let _ = crate::landlock::confine(&policy);

            // Install the seccomp deny-list derived from the policy.
            let deny = crate::context::deny_syscall_numbers(&policy);
            let args = crate::context::arg_filters(&policy);
            let filter = crate::seccomp::bpf::assemble_filter(&[], &deny, &args);
            let _ = crate::seccomp::bpf::install_deny_filter(&filter);

            // Mark this process as confined so nested sandboxes detect it.
            CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);

            // Run init (loads expensive state, shared via COW)
            init_fn();

            // Close read ends in template (parent owns them)
            drop(pipe_read_ends);

            // Fork N clones, send PIDs over the control pipe, wait for all,
            // then send exit codes (protocol lives in fork_ready_loop_fn).
            crate::fork::fork_ready_loop_fn(ctrl_child_fd, n, &*work_fn, &pipe_write_fds);
            unsafe { libc::_exit(0) };
        }

        // ===== PARENT =====
        unsafe { libc::close(ctrl_child_fd) };
        // Close write ends in parent (template/clones own them)
        for wfd in &pipe_write_fds {
            if *wfd >= 0 { unsafe { libc::close(*wfd) }; }
        }
        self.child_pid = Some(pid);
        self.state = SandboxState::Running;

        // Read N clone PIDs (one big-endian u32 each).
        let ctrl_fd = ctrl_parent.as_raw_fd();
        let mut pid_buf = vec![0u8; n as usize * 4];
        read_exact(ctrl_fd, &mut pid_buf);

        let clone_pids: Vec<i32> = pid_buf.chunks(4)
            .map(|c| u32::from_be_bytes(c.try_into().unwrap_or([0; 4])) as i32)
            .collect();
        // Entries <= 0 are treated as clones that failed to start.
        let live_count = clone_pids.iter().filter(|&&p| p > 0).count();

        // Read exit codes — this blocks until the template has waited for
        // every clone, so all clones have exited by the time we proceed.
        let mut code_buf = vec![0u8; live_count * 4];
        read_exact(ctrl_fd, &mut code_buf);
        self.ctrl_fd = Some(ctrl_parent);

        // Reap the template itself.
        let mut status = 0i32;
        unsafe { libc::waitpid(pid, &mut status, 0) };

        // Build one Stopped handle per live clone, pairing each with its
        // stdout pipe read end (consumed later by reduce()).
        let mut code_idx = 0;
        let mut clones = Vec::with_capacity(live_count);
        let mut pipe_iter = pipe_read_ends.into_iter();

        for &clone_pid in &clone_pids {
            // Advance the pipe iterator even for dead clones so pipes stay
            // paired with their clone ids.
            let pipe = pipe_iter.next();
            if clone_pid <= 0 { continue; }

            let code = i32::from_be_bytes(
                code_buf[code_idx * 4..(code_idx + 1) * 4].try_into().unwrap_or([0; 4])
            );
            code_idx += 1;

            let mut sb = Sandbox::create(&policy);
            sb.child_pid = Some(clone_pid);
            // Non-negative codes are exit codes; negative codes map to
            // Killed (presumably signal deaths — confirm against
            // fork_ready_loop_fn's encoding).
            sb.state = SandboxState::Stopped(if code == 0 {
                ExitStatus::Code(0)
            } else if code > 0 {
                ExitStatus::Code(code)
            } else {
                ExitStatus::Killed
            });
            sb.stdout_pipe = pipe;
            clones.push(sb);
        }

        Ok(clones)
    }
364
365    /// Reduce: wait for all clones, then run a reducer command.
366    ///
367    /// Waits for every clone to finish, then runs `cmd` in this sandbox.
368    /// The reducer can read clone results from shared files, tmpdir, etc.
369    ///
370    /// ```ignore
371    /// let clones = mapper.fork(4).await?;
372    /// let result = reducer.reduce(&["python3", "sum.py"], &mut clones).await?;
373    /// ```
374    pub async fn reduce(
375        &self,
376        cmd: &[&str],
377        clones: &mut [Sandbox],
378    ) -> Result<RunResult, SandlockError> {
379        // Read each clone's stdout pipe and concatenate
380        let mut combined = Vec::new();
381        for clone in clones.iter_mut() {
382            if let Some(pipe) = clone.stdout_pipe.take() {
383                combined.extend_from_slice(&read_fd_to_end(pipe));
384            }
385        }
386
387        // Create a pipe to feed combined data to reducer's stdin
388        let mut stdin_fds = [0i32; 2];
389        if unsafe { libc::pipe2(stdin_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
390            return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
391        }
392
393        // Write combined data in a blocking thread (avoid deadlock with large data)
394        let write_fd = stdin_fds[1];
395        let write_handle = tokio::task::spawn_blocking(move || {
396            unsafe {
397                libc::write(write_fd, combined.as_ptr() as *const _, combined.len());
398                libc::close(write_fd);
399            }
400        });
401
402        // Spawn reducer with stdin from pipe, capture stdout
403        let mut reducer = Sandbox::new(&self.policy)?;
404        reducer.io_overrides = Some((Some(stdin_fds[0]), None, None));
405        reducer.do_spawn(cmd, true).await?;
406        unsafe { libc::close(stdin_fds[0]) };
407
408        let _ = write_handle.await;
409        reducer.wait().await
410    }
411
412    /// Wait for the child process to exit.
413    pub async fn wait(&mut self) -> Result<RunResult, SandlockError> {
414        let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
415
416        if let SandboxState::Stopped(ref es) = self.state {
417            return Ok(RunResult {
418                exit_status: es.clone(),
419                stdout: None,
420                stderr: None,
421            });
422        }
423
424        // Blocking waitpid in a blocking thread so we don't block the tokio runtime.
425        let exit_status = tokio::task::spawn_blocking(move || -> ExitStatus {
426            let mut status: i32 = 0;
427            loop {
428                let ret = unsafe { libc::waitpid(pid, &mut status, 0) };
429                if ret < 0 {
430                    let err = std::io::Error::last_os_error();
431                    if err.raw_os_error() == Some(libc::EINTR) {
432                        continue;
433                    }
434                    // Child already reaped or invalid pid
435                    return ExitStatus::Killed;
436                }
437                break;
438            }
439            wait_status_to_exit(status)
440        })
441        .await
442        .unwrap_or(ExitStatus::Killed);
443
444        self.state = SandboxState::Stopped(exit_status.clone());
445
446        // Abort supervisor tasks now that the child is gone.
447        if let Some(h) = self.notif_handle.take() {
448            h.abort();
449        }
450        if let Some(h) = self.throttle_handle.take() {
451            h.abort();
452        }
453
454        // Drain captured stdout/stderr if available
455        let stdout = self._stdout_read.take().map(|fd| read_fd_to_end(fd));
456        let stderr = self._stderr_read.take().map(|fd| read_fd_to_end(fd));
457
458        Ok(RunResult {
459            exit_status,
460            stdout,
461            stderr,
462        })
463    }
464
465    /// Send SIGSTOP to the child's process group.
466    pub fn pause(&mut self) -> Result<(), SandlockError> {
467        let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
468        let ret = unsafe { libc::killpg(pid, libc::SIGSTOP) };
469        if ret < 0 {
470            return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
471        }
472        self.state = SandboxState::Paused;
473        Ok(())
474    }
475
476    /// Send SIGCONT to the child's process group.
477    pub fn resume(&mut self) -> Result<(), SandlockError> {
478        let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
479        let ret = unsafe { libc::killpg(pid, libc::SIGCONT) };
480        if ret < 0 {
481            return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
482        }
483        self.state = SandboxState::Running;
484        Ok(())
485    }
486
487    /// Send SIGKILL to the child's process group.
488    pub fn kill(&mut self) -> Result<(), SandlockError> {
489        let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
490        let ret = unsafe { libc::killpg(pid, libc::SIGKILL) };
491        if ret < 0 {
492            let err = std::io::Error::last_os_error();
493            // ESRCH means the process is already gone — not an error.
494            if err.raw_os_error() != Some(libc::ESRCH) {
495                return Err(SandboxError::Io(err).into());
496            }
497        }
498        Ok(())
499    }
500
    /// Return the child PID, if spawned.
    ///
    /// Set when the child is forked and never cleared, so it remains
    /// `Some` after the child has exited.
    pub fn pid(&self) -> Option<i32> {
        self.child_pid
    }
505
506    /// Return whether the child is currently running.
507    #[doc(hidden)]
508    pub fn is_running(&self) -> bool {
509        matches!(self.state, SandboxState::Running | SandboxState::Paused)
510    }
511
    /// Return a reference to the policy.
    ///
    /// This is the clone taken at construction; later changes to the
    /// caller's original `Policy` do not affect this sandbox.
    pub fn policy(&self) -> &Policy {
        &self.policy
    }
516
517    /// Commit COW writes to the original directory.
518    #[doc(hidden)]
519    pub async fn commit(&mut self) -> Result<(), SandlockError> {
520        if let Some(branch) = self.cow_branch.take() {
521            branch.commit().map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
522        }
523        Ok(())
524    }
525
526    /// Discard COW writes.
527    #[doc(hidden)]
528    pub async fn abort_branch(&mut self) -> Result<(), SandlockError> {
529        if let Some(branch) = self.cow_branch.take() {
530            branch.abort().map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
531        }
532        Ok(())
533    }
534
535    /// Freeze the sandbox: hold all fork notifications + SIGSTOP the process group.
536    pub(crate) async fn freeze(&self) -> Result<(), SandlockError> {
537        let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
538
539        // Set hold_forks in supervisor state
540        if let Some(ref state) = self.supervisor_state {
541            let mut st = state.lock().await;
542            st.hold_forks = true;
543        }
544
545        // SIGSTOP the process group
546        unsafe { libc::killpg(pid, libc::SIGSTOP); }
547        Ok(())
548    }
549
550    /// Thaw the sandbox: release held fork notifications + SIGCONT.
551    pub(crate) async fn thaw(&self) -> Result<(), SandlockError> {
552        let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
553
554        // Release held forks
555        if let Some(ref state) = self.supervisor_state {
556            let mut st = state.lock().await;
557            st.hold_forks = false;
558            st.held_notif_ids.clear();
559        }
560
561        // SIGCONT the process group
562        unsafe { libc::killpg(pid, libc::SIGCONT); }
563        Ok(())
564    }
565
    /// Spawn a sandboxed process without waiting for it to exit.
    /// Use `wait()` to collect the exit status when done.
    ///
    /// Stdio is inherited from the parent (capture disabled).
    #[doc(hidden)]
    pub async fn spawn(&mut self, cmd: &[&str]) -> Result<(), SandlockError> {
        self.do_spawn(cmd, false).await
    }
572
    /// Like `spawn` but captures stdout and stderr (available via `wait()`).
    /// Not part of the public API — used by the FFI crate.
    #[doc(hidden)]
    pub async fn spawn_captured(&mut self, cmd: &[&str]) -> Result<(), SandlockError> {
        self.do_spawn(cmd, true).await
    }
579
580    /// Spawn with explicit stdin/stdout/stderr fd redirection.
581    ///
582    /// Each `Option<RawFd>` overrides the corresponding fd in the child:
583    /// - `stdin_fd`: dup2'd to fd 0
584    /// - `stdout_fd`: dup2'd to fd 1
585    /// - `stderr_fd`: dup2'd to fd 2
586    ///
587    /// The caller is responsible for closing the fds after this call.
588    #[doc(hidden)]
589    pub async fn spawn_with_io(
590        &mut self,
591        cmd: &[&str],
592        stdin_fd: Option<std::os::unix::io::RawFd>,
593        stdout_fd: Option<std::os::unix::io::RawFd>,
594        stderr_fd: Option<std::os::unix::io::RawFd>,
595    ) -> Result<(), SandlockError> {
596        self.io_overrides = Some((stdin_fd, stdout_fd, stderr_fd));
597        self.do_spawn(cmd, false).await
598    }
599
600    /// Capture a checkpoint of the running sandbox.
601    pub async fn checkpoint(&self) -> Result<crate::checkpoint::Checkpoint, SandlockError> {
602        let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
603
604        // Freeze
605        self.freeze().await?;
606
607        // Capture state
608        let cp = crate::checkpoint::capture(pid, &self.policy);
609
610        // Thaw regardless of capture result
611        self.thaw().await?;
612
613        cp
614    }
615
616    // ============================================================
617    // Internal: do_spawn
618    // ============================================================
619
620    /// Fork a child, apply confinement, and start the supervisor.
621    async fn do_spawn(&mut self, cmd: &[&str], capture: bool) -> Result<(), SandlockError> {
622        // 1. Validate state
623        if !matches!(self.state, SandboxState::Created) {
624            return Err(SandboxError::Child("sandbox already spawned".into()).into());
625        }
626
627        if cmd.is_empty() {
628            return Err(SandboxError::Child("empty command".into()).into());
629        }
630
631        // 2. Convert cmd to Vec<CString>
632        let c_cmd: Vec<CString> = cmd
633            .iter()
634            .map(|s| CString::new(*s).map_err(|_| SandboxError::Child("invalid command string".into())))
635            .collect::<Result<Vec<_>, _>>()?;
636
637        // 3. Detect nesting (before fork, in parent)
638        let nested = is_nested();
639
640        // 4. Create synchronization pipes
641        let pipes = PipePair::new().map_err(SandboxError::Io)?;
642
643        // 4. Resolve net_allow_hosts to IPs (async, before fork)
644        let resolved_ips = if !self.policy.net_allow_hosts.is_empty() {
645            network::resolve_hosts(&self.policy.net_allow_hosts)
646                .await
647                .map_err(SandboxError::Io)?
648        } else {
649            std::collections::HashSet::new()
650        };
651
652        // 5. Create COW branch if requested
653        let cow_branch: Option<Box<dyn CowBranch>> = match self.policy.fs_isolation {
654            FsIsolation::OverlayFs => {
655                let workdir = self.policy.workdir.as_ref()
656                    .ok_or_else(|| SandlockError::Sandbox(SandboxError::Child("OverlayFs requires workdir".into())))?;
657                let storage = self.policy.fs_storage.as_ref()
658                    .cloned()
659                    .unwrap_or_else(|| std::env::temp_dir().join("sandlock-overlay"));
660                std::fs::create_dir_all(&storage)
661                    .map_err(|e| SandlockError::Sandbox(SandboxError::Io(e)))?;
662                let branch = OverlayBranch::create(workdir, &storage)
663                    .map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
664                Some(Box::new(branch))
665            }
666            FsIsolation::BranchFs => {
667                let workdir = self.policy.workdir.as_ref()
668                    .ok_or_else(|| SandlockError::Sandbox(SandboxError::Child("BranchFs requires workdir".into())))?;
669                let branch = BranchFsBranch::create(workdir)
670                    .map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
671                Some(Box::new(branch))
672            }
673            FsIsolation::None => None,
674        };
675
676        // Build CowConfig for child if OverlayFS
677        let cow_config = if let Some(ref branch) = cow_branch {
678            if self.policy.fs_isolation == FsIsolation::OverlayFs {
679                // Downcast to get overlay-specific paths
680                // The branch_path is the merged dir; we need upper/work/lowers too.
681                // We stored this info in the OverlayBranch; extract via CowConfig.
682                // Since we can't downcast easily, we'll build CowConfig from policy info.
683                let workdir = self.policy.workdir.as_ref().unwrap();
684                let merged = branch.branch_path().to_path_buf();
685                // Derive upper/work from merged's parent (storage/uuid/)
686                let branch_dir = merged.parent().unwrap();
687                let upper = branch_dir.join("upper");
688                let work = branch_dir.join("work");
689                Some(CowConfig {
690                    merged,
691                    upper,
692                    work,
693                    lowers: vec![workdir.clone()],
694                })
695            } else {
696                None
697            }
698        } else {
699            None
700        };
701
702        // 6. Create stdout/stderr capture pipes (if capture mode)
703        let (stdout_r, stderr_r) = if capture {
704            let mut stdout_fds = [0i32; 2];
705            let mut stderr_fds = [0i32; 2];
706            if unsafe { libc::pipe2(stdout_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
707                return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
708            }
709            if unsafe { libc::pipe2(stderr_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
710                unsafe {
711                    libc::close(stdout_fds[0]);
712                    libc::close(stdout_fds[1]);
713                }
714                return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
715            }
716            (
717                Some((
718                    unsafe { OwnedFd::from_raw_fd(stdout_fds[0]) },
719                    unsafe { OwnedFd::from_raw_fd(stdout_fds[1]) },
720                )),
721                Some((
722                    unsafe { OwnedFd::from_raw_fd(stderr_fds[0]) },
723                    unsafe { OwnedFd::from_raw_fd(stderr_fds[1]) },
724                )),
725            )
726        } else {
727            (None, None)
728        };
729
730        // 6. Fork
731        let pid = unsafe { libc::fork() };
732        if pid < 0 {
733            return Err(SandboxError::Fork(std::io::Error::last_os_error()).into());
734        }
735
736        if pid == 0 {
737            // ===== CHILD PROCESS =====
738            // Drop parent's pipe ends by leaking them (they are OwnedFd and would
739            // close the fd on drop, but we only want to close OUR ends).
740            // The child does not use notif_r or ready_w.
741            // We must forget them so that Drop doesn't close the raw fds that
742            // confine_child may still use.
743            //
744            // We use std::mem::forget on the read end of notif and write end of ready
745            // because confine_child uses notif_w and ready_r (via the PipePair reference).
746            // The parent's ends (notif_r, ready_w) need to be closed in the child.
747            // However, since PipePair owns all four fds and confine_child takes
748            // a reference to it, we pass the whole PipePair and let confine_child
749            // handle it. confine_child never returns.
750
751            // Apply io_overrides (from spawn_with_io / pipeline)
752            if let Some((stdin_fd, stdout_fd, stderr_fd)) = self.io_overrides {
753                if let Some(fd) = stdin_fd {
754                    unsafe { libc::dup2(fd, 0) };
755                }
756                if let Some(fd) = stdout_fd {
757                    unsafe { libc::dup2(fd, 1) };
758                }
759                if let Some(fd) = stderr_fd {
760                    unsafe { libc::dup2(fd, 2) };
761                }
762            }
763
764            // Redirect stdout/stderr if capturing
765            if let Some((_, ref stdout_w)) = stdout_r {
766                unsafe { libc::dup2(stdout_w.as_raw_fd(), 1) };
767            }
768            if let Some((_, ref stderr_w)) = stderr_r {
769                unsafe { libc::dup2(stderr_w.as_raw_fd(), 2) };
770            }
771            // Drop capture pipe read ends in child (they belong to parent).
772            // The write ends will be closed by O_CLOEXEC on exec.
773            drop(stdout_r);
774            drop(stderr_r);
775
776            // This never returns.
777            context::confine_child(&self.policy, &c_cmd, &pipes, cow_config.as_ref(), nested);
778        }
779
780        // ===== PARENT PROCESS =====
781
782        // Store COW branch in parent
783        self.cow_branch = cow_branch;
784
785        // 7. Close child's pipe ends
786        drop(pipes.notif_w);
787        drop(pipes.ready_r);
788
789        // Drop capture pipe write ends in parent (they belong to child).
790        // Store the read ends so the child doesn't get SIGPIPE.
791        self._stdout_read = stdout_r.map(|(r, _w)| r);
792        self._stderr_read = stderr_r.map(|(r, _w)| r);
793
794        // 8. Set child_pid, state=Running
795        self.child_pid = Some(pid);
796        self.state = SandboxState::Running;
797
798        // 9. Open pidfd via syscall::pidfd_open
799        let pidfd = match syscall::pidfd_open(pid as u32, 0) {
800            Ok(fd) => Some(fd),
801            Err(_) => None, // pidfd not available on older kernels — proceed without
802        };
803
804        // 10. Read notif fd number from pipe (what child wrote)
805        //     0 = nested mode (no supervisor needed)
806        let notif_fd_num = read_u32_fd(pipes.notif_r.as_raw_fd())
807            .map_err(|e| SandboxError::Child(format!("read notif fd from child: {}", e)))?;
808
809        let is_nested = notif_fd_num == 0;
810
811        // 11. Copy notif fd from child (skip if nested)
812        let notif_fd = if is_nested {
813            None
814        } else if let Some(ref pfd) = pidfd {
815            Some(syscall::pidfd_getfd(pfd, notif_fd_num as i32, 0)
816                .map_err(|e| SandboxError::Child(format!("pidfd_getfd: {}", e)))?)
817        } else {
818            let path = format!("/proc/{}/fd/{}", pid, notif_fd_num);
819            let cpath = CString::new(path).unwrap();
820            let raw = unsafe { libc::open(cpath.as_ptr(), libc::O_RDWR) };
821            if raw < 0 {
822                return Err(
823                    SandboxError::Child("failed to open notif fd from /proc".into()).into(),
824                );
825            }
826            Some(unsafe { OwnedFd::from_raw_fd(raw) })
827        };
828
829        // 11b–14. Supervisor setup (skip in nested mode)
830        if let Some(notif_fd) = notif_fd {
831            // vDSO patching for determinism
832            if self.policy.time_start.is_some() || self.policy.random_seed.is_some() {
833                let time_offset = self.policy.time_start.map(|t| crate::time::calculate_time_offset(t));
834                if let Err(e) = crate::vdso::patch(pid, time_offset, self.policy.random_seed.is_some()) {
835                    eprintln!("sandlock: pre-exec vDSO patching failed (will retry after exec): {}", e);
836                }
837            }
838
839            // Build NotifPolicy
840            let time_offset_val = self.policy.time_start
841                .map(|t| crate::time::calculate_time_offset(t))
842                .unwrap_or(0);
843
844            let notif_policy = NotifPolicy {
845                max_memory_bytes: self.policy.max_memory.map(|m| m.0).unwrap_or(0),
846                max_processes: self.policy.max_processes,
847                has_memory_limit: self.policy.max_memory.is_some(),
848                has_net_allowlist: !self.policy.net_allow_hosts.is_empty()
849                    || self.policy.policy_fn.is_some(),
850                has_random_seed: self.policy.random_seed.is_some(),
851                has_time_start: self.policy.time_start.is_some(),
852                time_offset: time_offset_val,
853                num_cpus: self.policy.num_cpus,
854                has_proc_virt: self.policy.num_cpus.is_some() || self.policy.max_memory.is_some() || self.policy.isolate_pids || self.policy.port_remap,
855                isolate_pids: self.policy.isolate_pids,
856                port_remap: self.policy.port_remap,
857                cow_enabled: self.policy.workdir.is_some() && self.policy.fs_isolation == FsIsolation::None,
858                chroot_root: self.policy.chroot.clone(),
859                chroot_readable: self.policy.fs_readable.clone(),
860                chroot_writable: self.policy.fs_writable.clone(),
861                deterministic_dirs: self.policy.deterministic_dirs,
862                hostname: self.policy.hostname.clone(),
863            };
864
865            // Create SupervisorState
866            use rand::SeedableRng;
867            use rand_chacha::ChaCha8Rng;
868
869            let random_state = self.policy.random_seed.map(|seed| ChaCha8Rng::seed_from_u64(seed));
870            let time_offset = self.policy.time_start.map(|t| crate::time::calculate_time_offset(t));
871
872            let mut sup_state = SupervisorState::new(
873                notif_policy.max_memory_bytes,
874                notif_policy.max_processes,
875                time_offset,
876                random_state,
877            );
878            sup_state.network_policy = if self.policy.net_allow_hosts.is_empty() {
879                crate::seccomp::notif::NetworkPolicy::Unrestricted
880            } else {
881                crate::seccomp::notif::NetworkPolicy::AllowList(resolved_ips)
882            };
883
884            if let Some(ref pfd) = pidfd {
885                use std::os::unix::io::AsRawFd;
886                sup_state.child_pidfd = Some(pfd.as_raw_fd());
887            }
888
889            // Seccomp COW branch
890            if self.policy.workdir.is_some() && self.policy.fs_isolation == FsIsolation::None {
891                let workdir = self.policy.workdir.as_ref().unwrap();
892                let storage = self.policy.fs_storage.as_deref();
893                match crate::cow::seccomp::SeccompCowBranch::create(workdir, storage) {
894                    Ok(branch) => { sup_state.cow_branch = Some(branch); }
895                    Err(e) => { eprintln!("sandlock: seccomp COW branch creation failed: {}", e); }
896                }
897            }
898
899            // Policy callback thread
900            if let Some(ref callback) = self.policy.policy_fn {
901                let live = crate::policy_fn::LivePolicy {
902                    allowed_ips: match &sup_state.network_policy {
903                        crate::seccomp::notif::NetworkPolicy::AllowList(ips) => ips.clone(),
904                        crate::seccomp::notif::NetworkPolicy::Unrestricted => std::collections::HashSet::new(),
905                    },
906                    max_memory_bytes: notif_policy.max_memory_bytes,
907                    max_processes: notif_policy.max_processes,
908                };
909                let ceiling = live.clone();
910                let live = std::sync::Arc::new(std::sync::RwLock::new(live));
911                let denied_paths = sup_state.denied_paths.clone();
912                let pid_overrides = sup_state.pid_ip_overrides.clone();
913                // Store live_policy reference so supervisor reads dynamic updates
914                sup_state.live_policy = Some(live.clone());
915                let tx = crate::policy_fn::spawn_policy_fn(
916                    callback.clone(), live, ceiling, pid_overrides, denied_paths,
917                );
918                sup_state.policy_event_tx = Some(tx);
919            }
920
921            let sup_state = Arc::new(Mutex::new(sup_state));
922            self.supervisor_state = Some(Arc::clone(&sup_state));
923
924            // Spawn notif supervisor
925            self.notif_handle = Some(tokio::spawn(
926                notif::supervisor(notif_fd, notif_policy, sup_state),
927            ));
928        }
929
930        // 15. Optionally spawn CPU throttle task
931        if let Some(cpu_pct) = self.policy.max_cpu {
932            if cpu_pct < 100 {
933                let child_pid = pid;
934                self.throttle_handle = Some(tokio::spawn(throttle_cpu(child_pid, cpu_pct)));
935            }
936        }
937
938        // 16. Signal child "ready" via pipe
939        write_u32_fd(pipes.ready_w.as_raw_fd(), 1)
940            .map_err(|e| SandboxError::Child(format!("write ready signal: {}", e)))?;
941
942        // 17. Store pidfd
943        self.pidfd = pidfd;
944
945        Ok(())
946    }
947}
948
949// ============================================================
950// Drop — kill and reap child if still running
951// ============================================================
952
953impl Drop for Sandbox {
954    fn drop(&mut self) {
955        if let Some(pid) = self.child_pid {
956            if matches!(self.state, SandboxState::Running | SandboxState::Paused) {
957                // Kill the entire process group
958                unsafe { libc::killpg(pid, libc::SIGKILL) };
959                // Reap the zombie
960                let mut status: i32 = 0;
961                unsafe { libc::waitpid(pid, &mut status, 0) };
962            }
963        }
964
965        if let Some(h) = self.notif_handle.take() {
966            h.abort();
967        }
968        if let Some(h) = self.throttle_handle.take() {
969            h.abort();
970        }
971
972        // COW cleanup based on exit status
973        if let Some(ref branch) = self.cow_branch {
974            let is_error = matches!(
975                self.state,
976                SandboxState::Stopped(ref s) if !matches!(s, ExitStatus::Code(0))
977            );
978            let action = if is_error {
979                &self.policy.on_error
980            } else {
981                &self.policy.on_exit
982            };
983            match action {
984                BranchAction::Commit => { let _ = branch.commit(); }
985                BranchAction::Abort => { let _ = branch.abort(); }
986                BranchAction::Keep => {} // leave COW layer in place
987            }
988        }
989
990        // Seccomp-based COW cleanup
991        if let Some(ref state) = self.supervisor_state {
992            let Ok(mut st) = state.try_lock() else { return; };
993            if let Some(ref mut cow) = st.cow_branch {
994                let is_error = matches!(
995                    self.state,
996                    SandboxState::Stopped(ref s) if !matches!(s, ExitStatus::Code(0))
997                );
998                let action = if is_error {
999                    &self.policy.on_error
1000                } else {
1001                    &self.policy.on_exit
1002                };
1003                match action {
1004                    BranchAction::Commit => { let _ = cow.commit(); }
1005                    BranchAction::Abort => { let _ = cow.abort(); }
1006                    BranchAction::Keep => {}
1007                }
1008            }
1009        }
1010    }
1011}
1012
1013// ============================================================
1014// CPU throttle
1015// ============================================================
1016
1017/// Periodically SIGSTOP/SIGCONT the child process group to throttle CPU usage.
1018async fn throttle_cpu(pid: i32, cpu_pct: u8) {
1019    let period = Duration::from_millis(100);
1020    let run_time = period * cpu_pct as u32 / 100;
1021    let stop_time = period - run_time;
1022
1023    loop {
1024        tokio::time::sleep(run_time).await;
1025        if unsafe { libc::killpg(pid, libc::SIGSTOP) } < 0 {
1026            break;
1027        }
1028        tokio::time::sleep(stop_time).await;
1029        if unsafe { libc::killpg(pid, libc::SIGCONT) } < 0 {
1030            break;
1031        }
1032    }
1033}
1034
1035// ============================================================
1036// Helpers
1037// ============================================================
1038
1039/// Convert a raw waitpid status to our ExitStatus enum.
1040/// Read all bytes from a file descriptor until EOF.
1041/// Read exactly `buf.len()` bytes from a raw fd.
1042fn read_exact(fd: i32, buf: &mut [u8]) {
1043    let mut off = 0;
1044    while off < buf.len() {
1045        let r = unsafe { libc::read(fd, buf[off..].as_mut_ptr() as *mut _, buf.len() - off) };
1046        if r <= 0 { break; }
1047        off += r as usize;
1048    }
1049}
1050
/// Consume an owned fd and read all remaining bytes from it until EOF.
///
/// Read errors are ignored; whatever was read before the error is returned.
fn read_fd_to_end(fd: OwnedFd) -> Vec<u8> {
    use std::io::Read;
    // Safe ownership transfer: File: From<OwnedFd> replaces the unsafe
    // into_raw_fd()/from_raw_fd() round-trip.
    let mut file = std::fs::File::from(fd);
    let mut buf = Vec::new();
    let _ = file.read_to_end(&mut buf);
    buf
}
1058
1059fn wait_status_to_exit(status: i32) -> ExitStatus {
1060    if libc::WIFEXITED(status) {
1061        ExitStatus::Code(libc::WEXITSTATUS(status))
1062    } else if libc::WIFSIGNALED(status) {
1063        let sig = libc::WTERMSIG(status);
1064        if sig == libc::SIGKILL {
1065            ExitStatus::Killed
1066        } else {
1067            ExitStatus::Signal(sig)
1068        }
1069    } else {
1070        ExitStatus::Killed
1071    }
1072}