// sandlock_core/sandbox.rs
1// Sandbox orchestrator — public API that coordinates fork, confinement,
2// and async supervision of sandboxed child processes.
3
4use std::ffi::CString;
5use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd};
6use std::sync::Arc;
7use std::time::Duration;
8
9use tokio::sync::Mutex;
10use tokio::task::JoinHandle;
11
12use std::sync::atomic::{AtomicBool, Ordering};
13
14use crate::context::{self, CowConfig, PipePair, read_u32_fd, write_u32_fd};
15use crate::cow::{CowBranch, overlayfs::OverlayBranch, branchfs::BranchFsBranch};
16use crate::error::{SandboxError, SandlockError};
17use crate::network;
18use crate::policy::{BranchAction, FsIsolation, Policy};
19use crate::result::{ExitStatus, RunResult};
20use crate::seccomp::notif::{self, NotifPolicy, SupervisorState};
21use crate::sys::syscall;
22
23// ============================================================
24// Nesting detection
25// ============================================================
26
/// Set after seccomp confinement in the child process.
/// Any subsequent Sandbox in this process is nested.
pub(crate) static CONFINED: AtomicBool = AtomicBool::new(false);

/// Detect if this process is already inside a sandbox.
///
/// Checks both the in-process flag and /proc/self/status (Seccomp: 2)
/// to catch cross-process nesting (e.g. `sandlock run -- python agent.py`
/// where agent.py creates inner sandboxes).
pub fn is_nested() -> bool {
    if CONFINED.load(Ordering::Relaxed) {
        return true;
    }
    // Fall back to the kernel's view: the first "Seccomp:" line in
    // /proc/self/status reports mode 2 when a seccomp filter is active.
    std::fs::read_to_string("/proc/self/status")
        .ok()
        .and_then(|status| {
            status
                .lines()
                .find(|line| line.starts_with("Seccomp:"))
                .map(|line| line.trim().ends_with('2'))
        })
        .unwrap_or(false)
}
50
51// ============================================================
52// SandboxState
53// ============================================================
54
/// Lifecycle of a sandboxed child, tracked by the parent handle.
enum SandboxState {
    /// Constructed but not yet spawned.
    Created,
    /// Child forked and running.
    Running,
    /// Child's process group stopped via SIGSTOP (see `pause`).
    Paused,
    /// Child exited; the final status is cached for repeat `wait()` calls.
    Stopped(ExitStatus),
}
61
62// ============================================================
63// Sandbox
64// ============================================================
65
/// The main user-facing sandbox API.
///
/// Orchestrates fork, confinement (Landlock + seccomp), and async
/// notification-based supervision of the sandboxed child process.
pub struct Sandbox {
    /// Policy snapshot taken at construction.
    policy: Policy,
    /// Lifecycle state: Created -> Running <-> Paused -> Stopped.
    state: SandboxState,
    /// PID of the forked child; also used as its process-group id by
    /// pause/resume/kill (which call killpg).
    child_pid: Option<i32>,
    /// pidfd for the child when `pidfd_open` succeeded (None on older kernels).
    pidfd: Option<OwnedFd>,
    /// Supervisor task handle; aborted in `wait()` once the child exits.
    notif_handle: Option<JoinHandle<()>>,
    /// Throttle task handle; aborted in `wait()` once the child exits.
    throttle_handle: Option<JoinHandle<()>>,
    /// Capture pipe read ends — kept alive so the child doesn't get SIGPIPE;
    /// drained in `wait()` when capture mode was used.
    _stdout_read: Option<OwnedFd>,
    _stderr_read: Option<OwnedFd>,
    /// COW filesystem branch (OverlayFS or BranchFS).
    cow_branch: Option<Box<dyn CowBranch>>,
    /// Shared supervisor state for freeze/thaw support.
    supervisor_state: Option<Arc<Mutex<SupervisorState>>>,
    /// Control pipe for fork commands (parent end).
    ctrl_fd: Option<OwnedFd>,
    /// Stdout pipe read end (for fork clones — used by reduce).
    stdout_pipe: Option<OwnedFd>,
    /// Init function (runs once in child before fork).
    init_fn: Option<Box<dyn FnOnce() + Send + 'static>>,
    /// Work function (runs in each fork clone).
    work_fn: Option<Arc<dyn Fn(u32) + Send + Sync + 'static>>,
    /// Optional fd overrides for stdin/stdout/stderr (used by Pipeline).
    io_overrides: Option<(Option<i32>, Option<i32>, Option<i32>)>,
}
95
96impl Sandbox {
97    /// Create a new sandbox in the `Created` state.
98    pub fn new(policy: &Policy) -> Result<Self, SandlockError> {
99        Ok(Self::create(policy))
100    }
101
102    /// Create a sandbox with init and work functions for COW forking.
103    ///
104    /// `init_fn` runs once in the child to load expensive state.
105    /// `work_fn` runs in each COW clone created by `fork(N)`.
106    ///
107    /// ```ignore
108    /// let mut sb = Sandbox::new_with_fns(&policy,
109    ///     || { load_model(); },
110    ///     |clone_id| { rollout(clone_id); },
111    /// )?;
112    /// let clones = sb.fork(1000).await?;
113    /// ```
114    pub fn new_with_fns(
115        policy: &Policy,
116        init_fn: impl FnOnce() + Send + 'static,
117        work_fn: impl Fn(u32) + Send + Sync + 'static,
118    ) -> Result<Self, SandlockError> {
119        let mut sb = Self::create(policy);
120        sb.init_fn = Some(Box::new(init_fn));
121        sb.work_fn = Some(Arc::new(work_fn));
122        Ok(sb)
123    }
124
125    fn create(policy: &Policy) -> Self {
126        Self {
127            policy: policy.clone(),
128            state: SandboxState::Created,
129            child_pid: None,
130            pidfd: None,
131            notif_handle: None,
132            throttle_handle: None,
133            _stdout_read: None,
134            _stderr_read: None,
135            cow_branch: None,
136            supervisor_state: None,
137            ctrl_fd: None,
138            stdout_pipe: None,
139            init_fn: None,
140            work_fn: None,
141            io_overrides: None,
142        }
143    }
144
145    /// One-shot: spawn a sandboxed process, wait for it to exit, and return
146    /// the result. Stdout and stderr are captured.
147    pub async fn run(policy: &Policy, cmd: &[&str]) -> Result<RunResult, SandlockError> {
148        let mut sb = Self::new(policy)?;
149        sb.do_spawn(cmd, true).await?;
150        sb.wait().await
151    }
152
153    /// Run a sandboxed process with inherited stdio (interactive mode).
154    pub async fn run_interactive(policy: &Policy, cmd: &[&str]) -> Result<RunResult, SandlockError> {
155        let mut sb = Self::new(policy)?;
156        sb.do_spawn(cmd, false).await?;
157        sb.wait().await
158    }
159
    /// Create N COW clones of this sandbox, each running `work_fn(clone_id)`.
    ///
    /// Requires `new_with_fns()`. Forks a confined template child, runs
    /// `init_fn` there, then forks N times using raw `fork()` (bypasses
    /// seccomp). Each clone gets `CLONE_ID=0..N-1` and runs `work_fn(clone_id)`.
    ///
    /// Memory pages from `init_fn` are shared copy-on-write across all
    /// clones — 1000 clones of a 50MB process use ~50MB total.
    ///
    /// Returns one `Sandbox` handle per live clone. The template waits for
    /// every clone before this function returns, so each handle is already
    /// `Stopped` with its exit status cached; `wait()` returns that cached
    /// status, and `stdout_pipe` holds the clone's captured output
    /// (consumed by `reduce`).
    ///
    /// ```ignore
    /// let clones = sb.fork(4).await?;
    /// for mut c in clones { c.wait().await?; }
    /// ```
    pub async fn fork(&mut self, n: u32) -> Result<Vec<Sandbox>, SandlockError> {
        // Both closures are consumed: fork() can only be called once per sandbox.
        let init_fn = self.init_fn.take()
            .ok_or_else(|| SandboxError::Child("fork() requires new_with_fns()".into()))?;
        let work_fn = self.work_fn.take()
            .ok_or_else(|| SandboxError::Child("fork() requires new_with_fns()".into()))?;

        let policy = self.policy.clone();


        // Create control pipe (child writes clone PIDs + exit codes, parent reads)
        let mut ctrl_fds = [0i32; 2];
        if unsafe { libc::pipe2(ctrl_fds.as_mut_ptr(), 0) } < 0 {
            return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
        }
        let ctrl_parent = unsafe { OwnedFd::from_raw_fd(ctrl_fds[0]) };
        let ctrl_child_fd = ctrl_fds[1];

        // Create per-clone stdout pipes (parent keeps read ends)
        let mut pipe_read_ends: Vec<OwnedFd> = Vec::with_capacity(n as usize);
        let mut pipe_write_fds: Vec<i32> = Vec::with_capacity(n as usize);
        for _ in 0..n {
            let mut pfds = [0i32; 2];
            if unsafe { libc::pipe(pfds.as_mut_ptr()) } >= 0 {
                pipe_read_ends.push(unsafe { OwnedFd::from_raw_fd(pfds[0]) });
                pipe_write_fds.push(pfds[1]);
            } else {
                // -1 placeholder keeps pipe_write_fds index-aligned with clone ids.
                // NOTE(review): pipe_read_ends gets no placeholder, so a failed
                // pipe() here shifts the read-end/clone pairing in the handle
                // loop below — confirm this is acceptable.
                pipe_write_fds.push(-1);
            }
        }

        // Fork the template child
        let pid = unsafe { libc::fork() };
        if pid < 0 {
            unsafe { libc::close(ctrl_child_fd) };
            return Err(SandboxError::Fork(std::io::Error::last_os_error()).into());
        }

        if pid == 0 {
            // ===== CHILD (template) =====
            drop(ctrl_parent);

            // New process group so the parent can signal all clones via killpg;
            // die with the parent; forbid privilege escalation before seccomp.
            unsafe { libc::setpgid(0, 0) };
            unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) };
            unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };

            let _ = crate::landlock::confine(&policy);

            // Install a deny-list seccomp filter derived from the policy.
            let deny = crate::context::deny_syscall_numbers(&policy);
            let args = crate::context::arg_filters(&policy);
            let filter = crate::seccomp::bpf::assemble_filter(&[], &deny, &args);
            let _ = crate::seccomp::bpf::install_deny_filter(&filter);

            CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);

            // Run init (loads expensive state, shared via COW)
            init_fn();

            // Close read ends in template (parent owns them)
            drop(pipe_read_ends);

            // Fork N clones, send PIDs, wait for all
            crate::fork::fork_ready_loop_fn(ctrl_child_fd, n, &*work_fn, &pipe_write_fds);
            unsafe { libc::_exit(0) };
        }

        // ===== PARENT =====
        unsafe { libc::close(ctrl_child_fd) };
        // Close write ends in parent (template/clones own them)
        for wfd in &pipe_write_fds {
            if *wfd >= 0 { unsafe { libc::close(*wfd) }; }
        }
        self.child_pid = Some(pid);
        self.state = SandboxState::Running;

        // Read N clone PIDs (big-endian u32 each; 0 marks a failed clone)
        let ctrl_fd = ctrl_parent.as_raw_fd();
        let mut pid_buf = vec![0u8; n as usize * 4];
        read_exact(ctrl_fd, &mut pid_buf);

        let clone_pids: Vec<i32> = pid_buf.chunks(4)
            .map(|c| u32::from_be_bytes(c.try_into().unwrap_or([0; 4])) as i32)
            .collect();
        let live_count = clone_pids.iter().filter(|&&p| p > 0).count();

        // Read exit codes (template waits for all clones first)
        let mut code_buf = vec![0u8; live_count * 4];
        read_exact(ctrl_fd, &mut code_buf);
        self.ctrl_fd = Some(ctrl_parent);

        // Wait for template to exit
        let mut status = 0i32;
        unsafe { libc::waitpid(pid, &mut status, 0) };

        // Create clone handles with stdout pipe read ends
        let mut code_idx = 0;
        let mut clones = Vec::with_capacity(live_count);
        let mut pipe_iter = pipe_read_ends.into_iter();

        for &clone_pid in &clone_pids {
            // Advance the pipe iterator even for dead clones to keep pairing.
            let pipe = pipe_iter.next();
            if clone_pid <= 0 { continue; }

            // Exit codes were only sent for live clones, hence code_idx.
            let code = i32::from_be_bytes(
                code_buf[code_idx * 4..(code_idx + 1) * 4].try_into().unwrap_or([0; 4])
            );
            code_idx += 1;

            let mut sb = Sandbox::create(&policy);
            sb.child_pid = Some(clone_pid);
            // Negative code means the clone was signal-killed.
            sb.state = SandboxState::Stopped(if code == 0 {
                ExitStatus::Code(0)
            } else if code > 0 {
                ExitStatus::Code(code)
            } else {
                ExitStatus::Killed
            });
            sb.stdout_pipe = pipe;
            clones.push(sb);
        }

        Ok(clones)
    }
301
302    /// Reduce: wait for all clones, then run a reducer command.
303    ///
304    /// Waits for every clone to finish, then runs `cmd` in this sandbox.
305    /// The reducer can read clone results from shared files, tmpdir, etc.
306    ///
307    /// ```ignore
308    /// let clones = mapper.fork(4).await?;
309    /// let result = reducer.reduce(&["python3", "sum.py"], &mut clones).await?;
310    /// ```
311    pub async fn reduce(
312        &self,
313        cmd: &[&str],
314        clones: &mut [Sandbox],
315    ) -> Result<RunResult, SandlockError> {
316        // Read each clone's stdout pipe and concatenate
317        let mut combined = Vec::new();
318        for clone in clones.iter_mut() {
319            if let Some(pipe) = clone.stdout_pipe.take() {
320                combined.extend_from_slice(&read_fd_to_end(pipe));
321            }
322        }
323
324        // Create a pipe to feed combined data to reducer's stdin
325        let mut stdin_fds = [0i32; 2];
326        if unsafe { libc::pipe2(stdin_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
327            return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
328        }
329
330        // Write combined data in a blocking thread (avoid deadlock with large data)
331        let write_fd = stdin_fds[1];
332        let write_handle = tokio::task::spawn_blocking(move || {
333            unsafe {
334                libc::write(write_fd, combined.as_ptr() as *const _, combined.len());
335                libc::close(write_fd);
336            }
337        });
338
339        // Spawn reducer with stdin from pipe, capture stdout
340        let mut reducer = Sandbox::new(&self.policy)?;
341        reducer.io_overrides = Some((Some(stdin_fds[0]), None, None));
342        reducer.do_spawn(cmd, true).await?;
343        unsafe { libc::close(stdin_fds[0]) };
344
345        let _ = write_handle.await;
346        reducer.wait().await
347    }
348
    /// Wait for the child process to exit and collect its result.
    ///
    /// Returns the exit status plus captured stdout/stderr when the sandbox
    /// was spawned in capture mode. If the sandbox is already `Stopped`
    /// (e.g. a clone handle from `fork()`, or a second `wait()` call), the
    /// cached status is returned immediately with no captured output.
    pub async fn wait(&mut self) -> Result<RunResult, SandlockError> {
        let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;

        // Fast path: child already reaped — return the cached status.
        if let SandboxState::Stopped(ref es) = self.state {
            return Ok(RunResult {
                exit_status: es.clone(),
                stdout: None,
                stderr: None,
            });
        }

        // Blocking waitpid in a blocking thread so we don't block the tokio runtime.
        let exit_status = tokio::task::spawn_blocking(move || -> ExitStatus {
            let mut status: i32 = 0;
            loop {
                let ret = unsafe { libc::waitpid(pid, &mut status, 0) };
                if ret < 0 {
                    let err = std::io::Error::last_os_error();
                    if err.raw_os_error() == Some(libc::EINTR) {
                        // Interrupted by a signal — retry.
                        continue;
                    }
                    // Child already reaped or invalid pid
                    return ExitStatus::Killed;
                }
                break;
            }
            wait_status_to_exit(status)
        })
        .await
        // JoinError (task panicked/cancelled) is folded into Killed.
        .unwrap_or(ExitStatus::Killed);

        self.state = SandboxState::Stopped(exit_status.clone());

        // Abort supervisor tasks now that the child is gone.
        if let Some(h) = self.notif_handle.take() {
            h.abort();
        }
        if let Some(h) = self.throttle_handle.take() {
            h.abort();
        }

        // Drain captured stdout/stderr if available (capture-mode spawns only;
        // the read ends are taken, so a later wait() returns None for both).
        let stdout = self._stdout_read.take().map(|fd| read_fd_to_end(fd));
        let stderr = self._stderr_read.take().map(|fd| read_fd_to_end(fd));

        Ok(RunResult {
            exit_status,
            stdout,
            stderr,
        })
    }
401
402    /// Send SIGSTOP to the child's process group.
403    pub fn pause(&mut self) -> Result<(), SandlockError> {
404        let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
405        let ret = unsafe { libc::killpg(pid, libc::SIGSTOP) };
406        if ret < 0 {
407            return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
408        }
409        self.state = SandboxState::Paused;
410        Ok(())
411    }
412
413    /// Send SIGCONT to the child's process group.
414    pub fn resume(&mut self) -> Result<(), SandlockError> {
415        let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
416        let ret = unsafe { libc::killpg(pid, libc::SIGCONT) };
417        if ret < 0 {
418            return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
419        }
420        self.state = SandboxState::Running;
421        Ok(())
422    }
423
424    /// Send SIGKILL to the child's process group.
425    pub fn kill(&mut self) -> Result<(), SandlockError> {
426        let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
427        let ret = unsafe { libc::killpg(pid, libc::SIGKILL) };
428        if ret < 0 {
429            let err = std::io::Error::last_os_error();
430            // ESRCH means the process is already gone — not an error.
431            if err.raw_os_error() != Some(libc::ESRCH) {
432                return Err(SandboxError::Io(err).into());
433            }
434        }
435        Ok(())
436    }
437
    /// Return the child PID, if spawned.
    pub fn pid(&self) -> Option<i32> {
        self.child_pid
    }

    /// Return whether the child is currently running (Paused counts as running,
    /// since the process still exists and can be resumed).
    #[doc(hidden)]
    pub fn is_running(&self) -> bool {
        matches!(self.state, SandboxState::Running | SandboxState::Paused)
    }

    /// Return a reference to the policy this sandbox was created with.
    pub fn policy(&self) -> &Policy {
        &self.policy
    }
453
454    /// Commit COW writes to the original directory.
455    #[doc(hidden)]
456    pub async fn commit(&mut self) -> Result<(), SandlockError> {
457        if let Some(branch) = self.cow_branch.take() {
458            branch.commit().map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
459        }
460        Ok(())
461    }
462
463    /// Discard COW writes.
464    #[doc(hidden)]
465    pub async fn abort_branch(&mut self) -> Result<(), SandlockError> {
466        if let Some(branch) = self.cow_branch.take() {
467            branch.abort().map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
468        }
469        Ok(())
470    }
471
    /// Freeze the sandbox: hold all fork notifications + SIGSTOP the process group.
    ///
    /// Used by `checkpoint()` to quiesce the child. The killpg result is
    /// deliberately ignored (best effort).
    pub(crate) async fn freeze(&self) -> Result<(), SandlockError> {
        let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;

        // Set hold_forks in supervisor state so in-flight fork notifications
        // are held rather than answered while frozen.
        if let Some(ref state) = self.supervisor_state {
            let mut st = state.lock().await;
            st.hold_forks = true;
        }

        // SIGSTOP the process group (best effort — result ignored)
        unsafe { libc::killpg(pid, libc::SIGSTOP); }
        Ok(())
    }

    /// Thaw the sandbox: release held fork notifications + SIGCONT.
    ///
    /// Inverse of `freeze()`; also drops any notification ids queued while frozen.
    pub(crate) async fn thaw(&self) -> Result<(), SandlockError> {
        let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;

        // Release held forks
        if let Some(ref state) = self.supervisor_state {
            let mut st = state.lock().await;
            st.hold_forks = false;
            st.held_notif_ids.clear();
        }

        // SIGCONT the process group (best effort — result ignored)
        unsafe { libc::killpg(pid, libc::SIGCONT); }
        Ok(())
    }
502
    /// Spawn a sandboxed process without waiting for it to exit.
    /// Stdio is inherited from the parent (no capture).
    /// Use `wait()` to collect the exit status when done.
    #[doc(hidden)]
    pub async fn spawn(&mut self, cmd: &[&str]) -> Result<(), SandlockError> {
        self.do_spawn(cmd, false).await
    }

    /// Like `spawn` but captures stdout and stderr (available via `wait()`).
    /// Not part of the public API — used by the FFI crate.
    #[doc(hidden)]
    pub async fn spawn_captured(&mut self, cmd: &[&str]) -> Result<(), SandlockError> {
        self.do_spawn(cmd, true).await
    }
516
    /// Spawn with explicit stdin/stdout/stderr fd redirection.
    ///
    /// Each `Option<RawFd>` overrides the corresponding fd in the child:
    /// - `stdin_fd`: dup2'd to fd 0
    /// - `stdout_fd`: dup2'd to fd 1
    /// - `stderr_fd`: dup2'd to fd 2
    ///
    /// The overrides are recorded on `self` and applied in the forked child.
    /// The caller is responsible for closing the fds after this call.
    #[doc(hidden)]
    pub async fn spawn_with_io(
        &mut self,
        cmd: &[&str],
        stdin_fd: Option<std::os::unix::io::RawFd>,
        stdout_fd: Option<std::os::unix::io::RawFd>,
        stderr_fd: Option<std::os::unix::io::RawFd>,
    ) -> Result<(), SandlockError> {
        self.io_overrides = Some((stdin_fd, stdout_fd, stderr_fd));
        self.do_spawn(cmd, false).await
    }
536
    /// Capture a checkpoint of the running sandbox.
    ///
    /// Freezes the process group, captures its state, then thaws it again —
    /// the thaw runs even when capture fails, so the child is never left
    /// stopped by a capture error.
    ///
    /// NOTE(review): if `thaw()` itself fails, its error is returned via `?`
    /// and the capture result (`cp`) is discarded — confirm that masking is
    /// intended.
    pub async fn checkpoint(&self) -> Result<crate::checkpoint::Checkpoint, SandlockError> {
        let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;

        // Freeze
        self.freeze().await?;

        // Capture state
        let cp = crate::checkpoint::capture(pid, &self.policy);

        // Thaw regardless of capture result
        self.thaw().await?;

        cp
    }
552
553    // ============================================================
554    // Internal: do_spawn
555    // ============================================================
556
557    /// Fork a child, apply confinement, and start the supervisor.
558    async fn do_spawn(&mut self, cmd: &[&str], capture: bool) -> Result<(), SandlockError> {
559        // 1. Validate state
560        if !matches!(self.state, SandboxState::Created) {
561            return Err(SandboxError::Child("sandbox already spawned".into()).into());
562        }
563
564        if cmd.is_empty() {
565            return Err(SandboxError::Child("empty command".into()).into());
566        }
567
568        // 2. Convert cmd to Vec<CString>
569        let c_cmd: Vec<CString> = cmd
570            .iter()
571            .map(|s| CString::new(*s).map_err(|_| SandboxError::Child("invalid command string".into())))
572            .collect::<Result<Vec<_>, _>>()?;
573
574        // 3. Detect nesting (before fork, in parent)
575        let nested = is_nested();
576
577        // 4. Create synchronization pipes
578        let pipes = PipePair::new().map_err(SandboxError::Io)?;
579
580        // 4. Resolve net_allow_hosts to IPs (async, before fork)
581        let resolved_ips = if !self.policy.net_allow_hosts.is_empty() {
582            network::resolve_hosts(&self.policy.net_allow_hosts)
583                .await
584                .map_err(SandboxError::Io)?
585        } else {
586            std::collections::HashSet::new()
587        };
588
589        // 5. Create COW branch if requested
590        let cow_branch: Option<Box<dyn CowBranch>> = match self.policy.fs_isolation {
591            FsIsolation::OverlayFs => {
592                let workdir = self.policy.workdir.as_ref()
593                    .ok_or_else(|| SandlockError::Sandbox(SandboxError::Child("OverlayFs requires workdir".into())))?;
594                let storage = self.policy.fs_storage.as_ref()
595                    .cloned()
596                    .unwrap_or_else(|| std::env::temp_dir().join("sandlock-overlay"));
597                std::fs::create_dir_all(&storage)
598                    .map_err(|e| SandlockError::Sandbox(SandboxError::Io(e)))?;
599                let branch = OverlayBranch::create(workdir, &storage)
600                    .map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
601                Some(Box::new(branch))
602            }
603            FsIsolation::BranchFs => {
604                let workdir = self.policy.workdir.as_ref()
605                    .ok_or_else(|| SandlockError::Sandbox(SandboxError::Child("BranchFs requires workdir".into())))?;
606                let branch = BranchFsBranch::create(workdir)
607                    .map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
608                Some(Box::new(branch))
609            }
610            FsIsolation::None => None,
611        };
612
613        // Build CowConfig for child if OverlayFS
614        let cow_config = if let Some(ref branch) = cow_branch {
615            if self.policy.fs_isolation == FsIsolation::OverlayFs {
616                // Downcast to get overlay-specific paths
617                // The branch_path is the merged dir; we need upper/work/lowers too.
618                // We stored this info in the OverlayBranch; extract via CowConfig.
619                // Since we can't downcast easily, we'll build CowConfig from policy info.
620                let workdir = self.policy.workdir.as_ref().unwrap();
621                let merged = branch.branch_path().to_path_buf();
622                // Derive upper/work from merged's parent (storage/uuid/)
623                let branch_dir = merged.parent().unwrap();
624                let upper = branch_dir.join("upper");
625                let work = branch_dir.join("work");
626                Some(CowConfig {
627                    merged,
628                    upper,
629                    work,
630                    lowers: vec![workdir.clone()],
631                })
632            } else {
633                None
634            }
635        } else {
636            None
637        };
638
639        // 6. Create stdout/stderr capture pipes (if capture mode)
640        let (stdout_r, stderr_r) = if capture {
641            let mut stdout_fds = [0i32; 2];
642            let mut stderr_fds = [0i32; 2];
643            if unsafe { libc::pipe2(stdout_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
644                return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
645            }
646            if unsafe { libc::pipe2(stderr_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
647                unsafe {
648                    libc::close(stdout_fds[0]);
649                    libc::close(stdout_fds[1]);
650                }
651                return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
652            }
653            (
654                Some((
655                    unsafe { OwnedFd::from_raw_fd(stdout_fds[0]) },
656                    unsafe { OwnedFd::from_raw_fd(stdout_fds[1]) },
657                )),
658                Some((
659                    unsafe { OwnedFd::from_raw_fd(stderr_fds[0]) },
660                    unsafe { OwnedFd::from_raw_fd(stderr_fds[1]) },
661                )),
662            )
663        } else {
664            (None, None)
665        };
666
667        // 6. Fork
668        let pid = unsafe { libc::fork() };
669        if pid < 0 {
670            return Err(SandboxError::Fork(std::io::Error::last_os_error()).into());
671        }
672
673        if pid == 0 {
674            // ===== CHILD PROCESS =====
675            // Drop parent's pipe ends by leaking them (they are OwnedFd and would
676            // close the fd on drop, but we only want to close OUR ends).
677            // The child does not use notif_r or ready_w.
678            // We must forget them so that Drop doesn't close the raw fds that
679            // confine_child may still use.
680            //
681            // We use std::mem::forget on the read end of notif and write end of ready
682            // because confine_child uses notif_w and ready_r (via the PipePair reference).
683            // The parent's ends (notif_r, ready_w) need to be closed in the child.
684            // However, since PipePair owns all four fds and confine_child takes
685            // a reference to it, we pass the whole PipePair and let confine_child
686            // handle it. confine_child never returns.
687
688            // Apply io_overrides (from spawn_with_io / pipeline)
689            if let Some((stdin_fd, stdout_fd, stderr_fd)) = self.io_overrides {
690                if let Some(fd) = stdin_fd {
691                    unsafe { libc::dup2(fd, 0) };
692                }
693                if let Some(fd) = stdout_fd {
694                    unsafe { libc::dup2(fd, 1) };
695                }
696                if let Some(fd) = stderr_fd {
697                    unsafe { libc::dup2(fd, 2) };
698                }
699            }
700
701            // Redirect stdout/stderr if capturing
702            if let Some((_, ref stdout_w)) = stdout_r {
703                unsafe { libc::dup2(stdout_w.as_raw_fd(), 1) };
704            }
705            if let Some((_, ref stderr_w)) = stderr_r {
706                unsafe { libc::dup2(stderr_w.as_raw_fd(), 2) };
707            }
708            // Drop capture pipe read ends in child (they belong to parent).
709            // The write ends will be closed by O_CLOEXEC on exec.
710            drop(stdout_r);
711            drop(stderr_r);
712
713            // This never returns.
714            context::confine_child(&self.policy, &c_cmd, &pipes, cow_config.as_ref(), nested);
715        }
716
717        // ===== PARENT PROCESS =====
718
719        // Store COW branch in parent
720        self.cow_branch = cow_branch;
721
722        // 7. Close child's pipe ends
723        drop(pipes.notif_w);
724        drop(pipes.ready_r);
725
726        // Drop capture pipe write ends in parent (they belong to child).
727        // Store the read ends so the child doesn't get SIGPIPE.
728        self._stdout_read = stdout_r.map(|(r, _w)| r);
729        self._stderr_read = stderr_r.map(|(r, _w)| r);
730
731        // 8. Set child_pid, state=Running
732        self.child_pid = Some(pid);
733        self.state = SandboxState::Running;
734
735        // 9. Open pidfd via syscall::pidfd_open
736        let pidfd = match syscall::pidfd_open(pid as u32, 0) {
737            Ok(fd) => Some(fd),
738            Err(_) => None, // pidfd not available on older kernels — proceed without
739        };
740
741        // 10. Read notif fd number from pipe (what child wrote)
742        //     0 = nested mode (no supervisor needed)
743        let notif_fd_num = read_u32_fd(pipes.notif_r.as_raw_fd())
744            .map_err(|e| SandboxError::Child(format!("read notif fd from child: {}", e)))?;
745
746        let is_nested = notif_fd_num == 0;
747
748        // 11. Copy notif fd from child (skip if nested)
749        let notif_fd = if is_nested {
750            None
751        } else if let Some(ref pfd) = pidfd {
752            Some(syscall::pidfd_getfd(pfd, notif_fd_num as i32, 0)
753                .map_err(|e| SandboxError::Child(format!("pidfd_getfd: {}", e)))?)
754        } else {
755            let path = format!("/proc/{}/fd/{}", pid, notif_fd_num);
756            let cpath = CString::new(path).unwrap();
757            let raw = unsafe { libc::open(cpath.as_ptr(), libc::O_RDWR) };
758            if raw < 0 {
759                return Err(
760                    SandboxError::Child("failed to open notif fd from /proc".into()).into(),
761                );
762            }
763            Some(unsafe { OwnedFd::from_raw_fd(raw) })
764        };
765
766        // 11b–14. Supervisor setup (skip in nested mode)
767        if let Some(notif_fd) = notif_fd {
768            // vDSO patching for determinism
769            if self.policy.time_start.is_some() || self.policy.random_seed.is_some() {
770                let time_offset = self.policy.time_start.map(|t| crate::time::calculate_time_offset(t));
771                if let Err(e) = crate::vdso::patch(pid, time_offset, self.policy.random_seed.is_some()) {
772                    eprintln!("sandlock: pre-exec vDSO patching failed (will retry after exec): {}", e);
773                }
774            }
775
776            // Build NotifPolicy
777            let time_offset_val = self.policy.time_start
778                .map(|t| crate::time::calculate_time_offset(t))
779                .unwrap_or(0);
780
781            let notif_policy = NotifPolicy {
782                max_memory_bytes: self.policy.max_memory.map(|m| m.0).unwrap_or(0),
783                max_processes: self.policy.max_processes,
784                has_memory_limit: self.policy.max_memory.is_some(),
785                has_net_allowlist: !self.policy.net_allow_hosts.is_empty()
786                    || self.policy.policy_fn.is_some(),
787                has_random_seed: self.policy.random_seed.is_some(),
788                has_time_start: self.policy.time_start.is_some(),
789                time_offset: time_offset_val,
790                num_cpus: self.policy.num_cpus,
791                has_proc_virt: self.policy.num_cpus.is_some() || self.policy.max_memory.is_some() || self.policy.isolate_pids || self.policy.port_remap,
792                isolate_pids: self.policy.isolate_pids,
793                port_remap: self.policy.port_remap,
794                cow_enabled: self.policy.workdir.is_some() && self.policy.fs_isolation == FsIsolation::None,
795                chroot_root: self.policy.chroot.clone(),
796                chroot_readable: self.policy.fs_readable.clone(),
797                chroot_writable: self.policy.fs_writable.clone(),
798                deterministic_dirs: self.policy.deterministic_dirs,
799                hostname: self.policy.hostname.clone(),
800            };
801
802            // Create SupervisorState
803            use rand::SeedableRng;
804            use rand_chacha::ChaCha8Rng;
805
806            let random_state = self.policy.random_seed.map(|seed| ChaCha8Rng::seed_from_u64(seed));
807            let time_offset = self.policy.time_start.map(|t| crate::time::calculate_time_offset(t));
808
809            let mut sup_state = SupervisorState::new(
810                notif_policy.max_memory_bytes,
811                notif_policy.max_processes,
812                time_offset,
813                random_state,
814            );
815            sup_state.network_policy = if self.policy.net_allow_hosts.is_empty() {
816                crate::seccomp::notif::NetworkPolicy::Unrestricted
817            } else {
818                crate::seccomp::notif::NetworkPolicy::AllowList(resolved_ips)
819            };
820
821            if let Some(ref pfd) = pidfd {
822                use std::os::unix::io::AsRawFd;
823                sup_state.child_pidfd = Some(pfd.as_raw_fd());
824            }
825
826            // Seccomp COW branch
827            if self.policy.workdir.is_some() && self.policy.fs_isolation == FsIsolation::None {
828                let workdir = self.policy.workdir.as_ref().unwrap();
829                let storage = self.policy.fs_storage.as_deref();
830                match crate::cow::seccomp::SeccompCowBranch::create(workdir, storage) {
831                    Ok(branch) => { sup_state.cow_branch = Some(branch); }
832                    Err(e) => { eprintln!("sandlock: seccomp COW branch creation failed: {}", e); }
833                }
834            }
835
836            // Policy callback thread
837            if let Some(ref callback) = self.policy.policy_fn {
838                let live = crate::policy_fn::LivePolicy {
839                    allowed_ips: match &sup_state.network_policy {
840                        crate::seccomp::notif::NetworkPolicy::AllowList(ips) => ips.clone(),
841                        crate::seccomp::notif::NetworkPolicy::Unrestricted => std::collections::HashSet::new(),
842                    },
843                    max_memory_bytes: notif_policy.max_memory_bytes,
844                    max_processes: notif_policy.max_processes,
845                };
846                let ceiling = live.clone();
847                let live = std::sync::Arc::new(std::sync::RwLock::new(live));
848                let denied_paths = sup_state.denied_paths.clone();
849                let pid_overrides = sup_state.pid_ip_overrides.clone();
850                // Store live_policy reference so supervisor reads dynamic updates
851                sup_state.live_policy = Some(live.clone());
852                let tx = crate::policy_fn::spawn_policy_fn(
853                    callback.clone(), live, ceiling, pid_overrides, denied_paths,
854                );
855                sup_state.policy_event_tx = Some(tx);
856            }
857
858            let sup_state = Arc::new(Mutex::new(sup_state));
859            self.supervisor_state = Some(Arc::clone(&sup_state));
860
861            // Spawn notif supervisor
862            self.notif_handle = Some(tokio::spawn(
863                notif::supervisor(notif_fd, notif_policy, sup_state),
864            ));
865        }
866
867        // 15. Optionally spawn CPU throttle task
868        if let Some(cpu_pct) = self.policy.max_cpu {
869            if cpu_pct < 100 {
870                let child_pid = pid;
871                self.throttle_handle = Some(tokio::spawn(throttle_cpu(child_pid, cpu_pct)));
872            }
873        }
874
875        // 16. Signal child "ready" via pipe
876        write_u32_fd(pipes.ready_w.as_raw_fd(), 1)
877            .map_err(|e| SandboxError::Child(format!("write ready signal: {}", e)))?;
878
879        // 17. Store pidfd
880        self.pidfd = pidfd;
881
882        Ok(())
883    }
884}
885
886// ============================================================
887// Drop — kill and reap child if still running
888// ============================================================
889
// Best-effort teardown: kill and reap the child if it is still running,
// cancel background supervision tasks, then resolve any copy-on-write
// branch according to the policy's `on_exit` / `on_error` action.
impl Drop for Sandbox {
    fn drop(&mut self) {
        if let Some(pid) = self.child_pid {
            // Only signal while the child may still be alive; a Stopped
            // sandbox has already been reaped elsewhere.
            if matches!(self.state, SandboxState::Running | SandboxState::Paused) {
                // Kill the entire process group so grandchildren die too.
                // killpg(pid, ...) assumes the child leads its own process
                // group (presumably set up at spawn) — TODO confirm.
                unsafe { libc::killpg(pid, libc::SIGKILL) };
                // Reap the zombie so it doesn't linger in the process table.
                let mut status: i32 = 0;
                unsafe { libc::waitpid(pid, &mut status, 0) };
            }
        }

        // Cancel the seccomp-notify supervisor and CPU-throttle tasks.
        // abort() is fire-and-forget, which is all a sync Drop can do.
        if let Some(h) = self.notif_handle.take() {
            h.abort();
        }
        if let Some(h) = self.throttle_handle.take() {
            h.abort();
        }

        // COW cleanup based on exit status: anything other than a clean
        // Code(0) exit selects the on_error action, a clean exit on_exit.
        // NOTE(review): a sandbox dropped while still Running/Paused does
        // not match Stopped(..), so it takes the on_exit path even though
        // we just SIGKILLed it — confirm that is intended.
        if let Some(ref branch) = self.cow_branch {
            let is_error = matches!(
                self.state,
                SandboxState::Stopped(ref s) if !matches!(s, ExitStatus::Code(0))
            );
            let action = if is_error {
                &self.policy.on_error
            } else {
                &self.policy.on_exit
            };
            match action {
                BranchAction::Commit => { let _ = branch.commit(); }
                BranchAction::Abort => { let _ = branch.abort(); }
                BranchAction::Keep => {} // leave COW layer in place
            }
        }

        // Seccomp-based COW cleanup — mirrors the block above for a branch
        // owned by the supervisor state.
        if let Some(ref state) = self.supervisor_state {
            // Drop cannot .await, so try_lock is the best we can do. If the
            // (already aborted) supervisor task still holds the lock we bail
            // out early — NOTE(review): the branch is then left in place
            // regardless of the configured action; confirm this best-effort
            // behavior is acceptable.
            let Ok(mut st) = state.try_lock() else { return; };
            if let Some(ref mut cow) = st.cow_branch {
                // Same clean-exit test as above.
                let is_error = matches!(
                    self.state,
                    SandboxState::Stopped(ref s) if !matches!(s, ExitStatus::Code(0))
                );
                let action = if is_error {
                    &self.policy.on_error
                } else {
                    &self.policy.on_exit
                };
                match action {
                    BranchAction::Commit => { let _ = cow.commit(); }
                    BranchAction::Abort => { let _ = cow.abort(); }
                    BranchAction::Keep => {}
                }
            }
        }
    }
}
949
950// ============================================================
951// CPU throttle
952// ============================================================
953
954/// Periodically SIGSTOP/SIGCONT the child process group to throttle CPU usage.
955async fn throttle_cpu(pid: i32, cpu_pct: u8) {
956    let period = Duration::from_millis(100);
957    let run_time = period * cpu_pct as u32 / 100;
958    let stop_time = period - run_time;
959
960    loop {
961        tokio::time::sleep(run_time).await;
962        if unsafe { libc::killpg(pid, libc::SIGSTOP) } < 0 {
963            break;
964        }
965        tokio::time::sleep(stop_time).await;
966        if unsafe { libc::killpg(pid, libc::SIGCONT) } < 0 {
967            break;
968        }
969    }
970}
971
972// ============================================================
973// Helpers
974// ============================================================
975
976/// Convert a raw waitpid status to our ExitStatus enum.
977/// Read all bytes from a file descriptor until EOF.
978/// Read exactly `buf.len()` bytes from a raw fd.
979fn read_exact(fd: i32, buf: &mut [u8]) {
980    let mut off = 0;
981    while off < buf.len() {
982        let r = unsafe { libc::read(fd, buf[off..].as_mut_ptr() as *mut _, buf.len() - off) };
983        if r <= 0 { break; }
984        off += r as usize;
985    }
986}
987
/// Read all bytes from a file descriptor until EOF.
///
/// Takes ownership of `fd`; the descriptor is closed when the function
/// returns. Read errors are ignored — the bytes read so far are returned.
fn read_fd_to_end(fd: OwnedFd) -> Vec<u8> {
    use std::io::Read;
    // `File` implements `From<OwnedFd>`, so the unsafe round-trip through
    // raw fds (`from_raw_fd(into_raw_fd())`) is unnecessary.
    let mut file = std::fs::File::from(fd);
    let mut buf = Vec::new();
    let _ = file.read_to_end(&mut buf);
    buf
}
995
996fn wait_status_to_exit(status: i32) -> ExitStatus {
997    if libc::WIFEXITED(status) {
998        ExitStatus::Code(libc::WEXITSTATUS(status))
999    } else if libc::WIFSIGNALED(status) {
1000        let sig = libc::WTERMSIG(status);
1001        if sig == libc::SIGKILL {
1002            ExitStatus::Killed
1003        } else {
1004            ExitStatus::Signal(sig)
1005        }
1006    } else {
1007        ExitStatus::Killed
1008    }
1009}