1use std::ffi::CString;
5use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd};
6use std::sync::Arc;
7use std::time::Duration;
8
9use tokio::sync::Mutex;
10use tokio::task::JoinHandle;
11
12use std::sync::atomic::{AtomicBool, Ordering};
13
14use crate::context::{self, CowConfig, PipePair, read_u32_fd, write_u32_fd};
15use crate::cow::{CowBranch, overlayfs::OverlayBranch, branchfs::BranchFsBranch};
16use crate::error::{SandboxError, SandlockError};
17use crate::network;
18use crate::policy::{BranchAction, FsIsolation, Policy};
19use crate::result::{ExitStatus, RunResult};
20use crate::seccomp::notif::{self, NotifPolicy, SupervisorState};
21use crate::sys::syscall;
22
/// Process-wide flag recording that this process has already gone through confinement.
///
/// Starts as `false`. The child created by `Sandbox::fork` stores `true` after making its
/// Landlock and seccomp confinement calls, and [`is_nested`] reads it as a fast path.
pub(crate) static CONFINED: AtomicBool = AtomicBool::new(false);
30
31pub fn is_nested() -> bool {
37 if CONFINED.load(Ordering::Relaxed) {
38 return true;
39 }
40 if let Ok(status) = std::fs::read_to_string("/proc/self/status") {
42 for line in status.lines() {
43 if line.starts_with("Seccomp:") {
44 return line.trim().ends_with('2');
45 }
46 }
47 }
48 false
49}
50
/// Lifecycle phase of a [`Sandbox`].
enum SandboxState {
    /// Constructed but not yet spawned; the only state `do_spawn` accepts.
    Created,
    /// A child process has been started.
    Running,
    /// Set by `pause()` after `SIGSTOP` was sent to the child's process group.
    Paused,
    /// Execution has finished; carries the recorded exit status.
    Stopped(ExitStatus),
}
61
/// Handle to one sandboxed child process and the resources supervising it.
pub struct Sandbox {
    /// Copy of the policy supplied at construction.
    policy: Policy,
    /// Current lifecycle phase.
    state: SandboxState,
    /// Pid of the forked child; `None` until `do_spawn` or `fork` has run.
    child_pid: Option<i32>,
    /// Result of `pidfd_open` on the child, when that call succeeded.
    pidfd: Option<OwnedFd>,
    /// Task running `notif::supervisor`; aborted by `wait()` and on drop.
    notif_handle: Option<JoinHandle<()>>,
    /// Task running `throttle_cpu`; aborted by `wait()` and on drop.
    throttle_handle: Option<JoinHandle<()>>,
    /// Read end of the stdout capture pipe; consumed by `wait()`.
    _stdout_read: Option<OwnedFd>,
    /// Read end of the stderr capture pipe; consumed by `wait()`.
    _stderr_read: Option<OwnedFd>,
    /// OverlayFs/BranchFs branch created by `do_spawn`, resolved on drop per the policy.
    cow_branch: Option<Box<dyn CowBranch>>,
    /// State shared with the supervisor task.
    supervisor_state: Option<Arc<Mutex<SupervisorState>>>,
    /// Parent end of the control pipe opened by `fork()`.
    ctrl_fd: Option<OwnedFd>,
    /// Read end of a clone's output pipe; set by `fork()`, consumed by `reduce()`.
    stdout_pipe: Option<OwnedFd>,
    /// One-shot initializer supplied to `new_with_fns`; consumed by `fork()`.
    init_fn: Option<Box<dyn FnOnce() + Send + 'static>>,
    /// Per-clone work function supplied to `new_with_fns`; consumed by `fork()`.
    work_fn: Option<Arc<dyn Fn(u32) + Send + Sync + 'static>>,
    /// Raw fds `(stdin, stdout, stderr)` that the child `dup2`s onto 0/1/2 before confinement.
    io_overrides: Option<(Option<i32>, Option<i32>, Option<i32>)>,
}
95
96impl Sandbox {
97 pub fn new(policy: &Policy) -> Result<Self, SandlockError> {
99 Ok(Self::create(policy))
100 }
101
102 pub fn new_with_fns(
115 policy: &Policy,
116 init_fn: impl FnOnce() + Send + 'static,
117 work_fn: impl Fn(u32) + Send + Sync + 'static,
118 ) -> Result<Self, SandlockError> {
119 let mut sb = Self::create(policy);
120 sb.init_fn = Some(Box::new(init_fn));
121 sb.work_fn = Some(Arc::new(work_fn));
122 Ok(sb)
123 }
124
125 fn create(policy: &Policy) -> Self {
126 Self {
127 policy: policy.clone(),
128 state: SandboxState::Created,
129 child_pid: None,
130 pidfd: None,
131 notif_handle: None,
132 throttle_handle: None,
133 _stdout_read: None,
134 _stderr_read: None,
135 cow_branch: None,
136 supervisor_state: None,
137 ctrl_fd: None,
138 stdout_pipe: None,
139 init_fn: None,
140 work_fn: None,
141 io_overrides: None,
142 }
143 }
144
145 pub async fn run(policy: &Policy, cmd: &[&str]) -> Result<RunResult, SandlockError> {
148 let mut sb = Self::new(policy)?;
149 sb.do_spawn(cmd, true).await?;
150 sb.wait().await
151 }
152
153 pub async fn run_interactive(policy: &Policy, cmd: &[&str]) -> Result<RunResult, SandlockError> {
155 let mut sb = Self::new(policy)?;
156 sb.do_spawn(cmd, false).await?;
157 sb.wait().await
158 }
159
160 pub async fn dry_run(policy: &Policy, cmd: &[&str]) -> Result<crate::dry_run::DryRunResult, SandlockError> {
164 let mut policy = policy.clone();
165 policy.on_exit = BranchAction::Keep;
166 policy.on_error = BranchAction::Keep;
167
168 let mut sb = Self::new(&policy)?;
169 sb.do_spawn(cmd, true).await?;
170 let run_result = sb.wait().await?;
171 let changes = sb.collect_changes().await;
172 sb.do_abort().await;
173 Ok(crate::dry_run::DryRunResult { run_result, changes })
174 }
175
176 pub async fn dry_run_interactive(policy: &Policy, cmd: &[&str]) -> Result<crate::dry_run::DryRunResult, SandlockError> {
178 let mut policy = policy.clone();
179 policy.on_exit = BranchAction::Keep;
180 policy.on_error = BranchAction::Keep;
181
182 let mut sb = Self::new(&policy)?;
183 sb.do_spawn(cmd, false).await?;
184 let run_result = sb.wait().await?;
185 let changes = sb.collect_changes().await;
186 sb.do_abort().await;
187 Ok(crate::dry_run::DryRunResult { run_result, changes })
188 }
189
190 async fn collect_changes(&self) -> Vec<crate::dry_run::Change> {
192 if let Some(ref branch) = self.cow_branch {
194 return branch.changes().unwrap_or_default();
195 }
196
197 if let Some(ref state) = self.supervisor_state {
199 if let Ok(st) = state.try_lock() {
200 if let Some(ref cow) = st.cow_branch {
201 return cow.changes().unwrap_or_default();
202 }
203 }
204 }
205
206 Vec::new()
207 }
208
209 async fn do_abort(&mut self) {
211 if let Some(branch) = self.cow_branch.take() {
212 let _ = branch.abort();
213 }
214 if let Some(ref state) = self.supervisor_state {
215 if let Ok(mut st) = state.try_lock() {
216 if let Some(ref mut cow) = st.cow_branch {
217 let _ = cow.abort();
218 }
219 }
220 }
221 }
222
223 pub async fn fork(&mut self, n: u32) -> Result<Vec<Sandbox>, SandlockError> {
243 let init_fn = self.init_fn.take()
244 .ok_or_else(|| SandboxError::Child("fork() requires new_with_fns()".into()))?;
245 let work_fn = self.work_fn.take()
246 .ok_or_else(|| SandboxError::Child("fork() requires new_with_fns()".into()))?;
247
248 let policy = self.policy.clone();
249
250
251 let mut ctrl_fds = [0i32; 2];
253 if unsafe { libc::pipe2(ctrl_fds.as_mut_ptr(), 0) } < 0 {
254 return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
255 }
256 let ctrl_parent = unsafe { OwnedFd::from_raw_fd(ctrl_fds[0]) };
257 let ctrl_child_fd = ctrl_fds[1];
258
259 let mut pipe_read_ends: Vec<OwnedFd> = Vec::with_capacity(n as usize);
261 let mut pipe_write_fds: Vec<i32> = Vec::with_capacity(n as usize);
262 for _ in 0..n {
263 let mut pfds = [0i32; 2];
264 if unsafe { libc::pipe(pfds.as_mut_ptr()) } >= 0 {
265 pipe_read_ends.push(unsafe { OwnedFd::from_raw_fd(pfds[0]) });
266 pipe_write_fds.push(pfds[1]);
267 } else {
268 pipe_write_fds.push(-1);
269 }
270 }
271
272 let pid = unsafe { libc::fork() };
274 if pid < 0 {
275 unsafe { libc::close(ctrl_child_fd) };
276 return Err(SandboxError::Fork(std::io::Error::last_os_error()).into());
277 }
278
279 if pid == 0 {
280 drop(ctrl_parent);
282
283 unsafe { libc::setpgid(0, 0) };
284 unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) };
285 unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
286
287 let _ = crate::landlock::confine(&policy);
288
289 let deny = crate::context::deny_syscall_numbers(&policy);
290 let args = crate::context::arg_filters(&policy);
291 let filter = crate::seccomp::bpf::assemble_filter(&[], &deny, &args);
292 let _ = crate::seccomp::bpf::install_deny_filter(&filter);
293
294 CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
295
296 init_fn();
298
299 drop(pipe_read_ends);
301
302 crate::fork::fork_ready_loop_fn(ctrl_child_fd, n, &*work_fn, &pipe_write_fds);
304 unsafe { libc::_exit(0) };
305 }
306
307 unsafe { libc::close(ctrl_child_fd) };
309 for wfd in &pipe_write_fds {
311 if *wfd >= 0 { unsafe { libc::close(*wfd) }; }
312 }
313 self.child_pid = Some(pid);
314 self.state = SandboxState::Running;
315
316 let ctrl_fd = ctrl_parent.as_raw_fd();
318 let mut pid_buf = vec![0u8; n as usize * 4];
319 read_exact(ctrl_fd, &mut pid_buf);
320
321 let clone_pids: Vec<i32> = pid_buf.chunks(4)
322 .map(|c| u32::from_be_bytes(c.try_into().unwrap_or([0; 4])) as i32)
323 .collect();
324 let live_count = clone_pids.iter().filter(|&&p| p > 0).count();
325
326 let mut code_buf = vec![0u8; live_count * 4];
328 read_exact(ctrl_fd, &mut code_buf);
329 self.ctrl_fd = Some(ctrl_parent);
330
331 let mut status = 0i32;
333 unsafe { libc::waitpid(pid, &mut status, 0) };
334
335 let mut code_idx = 0;
337 let mut clones = Vec::with_capacity(live_count);
338 let mut pipe_iter = pipe_read_ends.into_iter();
339
340 for &clone_pid in &clone_pids {
341 let pipe = pipe_iter.next();
342 if clone_pid <= 0 { continue; }
343
344 let code = i32::from_be_bytes(
345 code_buf[code_idx * 4..(code_idx + 1) * 4].try_into().unwrap_or([0; 4])
346 );
347 code_idx += 1;
348
349 let mut sb = Sandbox::create(&policy);
350 sb.child_pid = Some(clone_pid);
351 sb.state = SandboxState::Stopped(if code == 0 {
352 ExitStatus::Code(0)
353 } else if code > 0 {
354 ExitStatus::Code(code)
355 } else {
356 ExitStatus::Killed
357 });
358 sb.stdout_pipe = pipe;
359 clones.push(sb);
360 }
361
362 Ok(clones)
363 }
364
365 pub async fn reduce(
375 &self,
376 cmd: &[&str],
377 clones: &mut [Sandbox],
378 ) -> Result<RunResult, SandlockError> {
379 let mut combined = Vec::new();
381 for clone in clones.iter_mut() {
382 if let Some(pipe) = clone.stdout_pipe.take() {
383 combined.extend_from_slice(&read_fd_to_end(pipe));
384 }
385 }
386
387 let mut stdin_fds = [0i32; 2];
389 if unsafe { libc::pipe2(stdin_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
390 return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
391 }
392
393 let write_fd = stdin_fds[1];
395 let write_handle = tokio::task::spawn_blocking(move || {
396 unsafe {
397 libc::write(write_fd, combined.as_ptr() as *const _, combined.len());
398 libc::close(write_fd);
399 }
400 });
401
402 let mut reducer = Sandbox::new(&self.policy)?;
404 reducer.io_overrides = Some((Some(stdin_fds[0]), None, None));
405 reducer.do_spawn(cmd, true).await?;
406 unsafe { libc::close(stdin_fds[0]) };
407
408 let _ = write_handle.await;
409 reducer.wait().await
410 }
411
412 pub async fn wait(&mut self) -> Result<RunResult, SandlockError> {
414 let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
415
416 if let SandboxState::Stopped(ref es) = self.state {
417 return Ok(RunResult {
418 exit_status: es.clone(),
419 stdout: None,
420 stderr: None,
421 });
422 }
423
424 let exit_status = tokio::task::spawn_blocking(move || -> ExitStatus {
426 let mut status: i32 = 0;
427 loop {
428 let ret = unsafe { libc::waitpid(pid, &mut status, 0) };
429 if ret < 0 {
430 let err = std::io::Error::last_os_error();
431 if err.raw_os_error() == Some(libc::EINTR) {
432 continue;
433 }
434 return ExitStatus::Killed;
436 }
437 break;
438 }
439 wait_status_to_exit(status)
440 })
441 .await
442 .unwrap_or(ExitStatus::Killed);
443
444 self.state = SandboxState::Stopped(exit_status.clone());
445
446 if let Some(h) = self.notif_handle.take() {
448 h.abort();
449 }
450 if let Some(h) = self.throttle_handle.take() {
451 h.abort();
452 }
453
454 let stdout = self._stdout_read.take().map(|fd| read_fd_to_end(fd));
456 let stderr = self._stderr_read.take().map(|fd| read_fd_to_end(fd));
457
458 Ok(RunResult {
459 exit_status,
460 stdout,
461 stderr,
462 })
463 }
464
465 pub fn pause(&mut self) -> Result<(), SandlockError> {
467 let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
468 let ret = unsafe { libc::killpg(pid, libc::SIGSTOP) };
469 if ret < 0 {
470 return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
471 }
472 self.state = SandboxState::Paused;
473 Ok(())
474 }
475
476 pub fn resume(&mut self) -> Result<(), SandlockError> {
478 let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
479 let ret = unsafe { libc::killpg(pid, libc::SIGCONT) };
480 if ret < 0 {
481 return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
482 }
483 self.state = SandboxState::Running;
484 Ok(())
485 }
486
487 pub fn kill(&mut self) -> Result<(), SandlockError> {
489 let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
490 let ret = unsafe { libc::killpg(pid, libc::SIGKILL) };
491 if ret < 0 {
492 let err = std::io::Error::last_os_error();
493 if err.raw_os_error() != Some(libc::ESRCH) {
495 return Err(SandboxError::Io(err).into());
496 }
497 }
498 Ok(())
499 }
500
/// Returns the pid of the child process, or `None` if no child has been started.
pub fn pid(&self) -> Option<i32> {
    self.child_pid
}
505
506 #[doc(hidden)]
508 pub fn is_running(&self) -> bool {
509 matches!(self.state, SandboxState::Running | SandboxState::Paused)
510 }
511
/// Returns the policy this sandbox was created with.
pub fn policy(&self) -> &Policy {
    &self.policy
}
516
517 #[doc(hidden)]
519 pub async fn commit(&mut self) -> Result<(), SandlockError> {
520 if let Some(branch) = self.cow_branch.take() {
521 branch.commit().map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
522 }
523 Ok(())
524 }
525
526 #[doc(hidden)]
528 pub async fn abort_branch(&mut self) -> Result<(), SandlockError> {
529 if let Some(branch) = self.cow_branch.take() {
530 branch.abort().map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
531 }
532 Ok(())
533 }
534
535 pub(crate) async fn freeze(&self) -> Result<(), SandlockError> {
537 let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
538
539 if let Some(ref state) = self.supervisor_state {
541 let mut st = state.lock().await;
542 st.hold_forks = true;
543 }
544
545 unsafe { libc::killpg(pid, libc::SIGSTOP); }
547 Ok(())
548 }
549
550 pub(crate) async fn thaw(&self) -> Result<(), SandlockError> {
552 let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
553
554 if let Some(ref state) = self.supervisor_state {
556 let mut st = state.lock().await;
557 st.hold_forks = false;
558 st.held_notif_ids.clear();
559 }
560
561 unsafe { libc::killpg(pid, libc::SIGCONT); }
563 Ok(())
564 }
565
566 #[doc(hidden)]
569 pub async fn spawn(&mut self, cmd: &[&str]) -> Result<(), SandlockError> {
570 self.do_spawn(cmd, false).await
571 }
572
573 #[doc(hidden)]
576 pub async fn spawn_captured(&mut self, cmd: &[&str]) -> Result<(), SandlockError> {
577 self.do_spawn(cmd, true).await
578 }
579
580 #[doc(hidden)]
589 pub async fn spawn_with_io(
590 &mut self,
591 cmd: &[&str],
592 stdin_fd: Option<std::os::unix::io::RawFd>,
593 stdout_fd: Option<std::os::unix::io::RawFd>,
594 stderr_fd: Option<std::os::unix::io::RawFd>,
595 ) -> Result<(), SandlockError> {
596 self.io_overrides = Some((stdin_fd, stdout_fd, stderr_fd));
597 self.do_spawn(cmd, false).await
598 }
599
600 pub async fn checkpoint(&self) -> Result<crate::checkpoint::Checkpoint, SandlockError> {
602 let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
603
604 self.freeze().await?;
606
607 let cp = crate::checkpoint::capture(pid, &self.policy);
609
610 self.thaw().await?;
612
613 cp
614 }
615
/// Forks the sandboxed child for `cmd` and sets up supervision in the parent.
///
/// * `cmd` - program and arguments; must be non-empty and free of interior NUL bytes.
/// * `capture` - when `true`, the child's stdout and stderr are redirected into pipes whose
///   read ends are stored on `self`.
///
/// Returns once the parent has written the "ready" value to the child.
///
/// # Errors
/// `SandboxError::Child` when the sandbox is not in the `Created` state, when `cmd` is
/// empty or contains a NUL byte, when a workdir required by the isolation mode is missing,
/// or when the handshake with the child fails. `SandboxError::Io`, `SandboxError::Branch`
/// and `SandboxError::Fork` for the corresponding setup failures.
async fn do_spawn(&mut self, cmd: &[&str], capture: bool) -> Result<(), SandlockError> {
    // A sandbox can be spawned exactly once.
    if !matches!(self.state, SandboxState::Created) {
        return Err(SandboxError::Child("sandbox already spawned".into()).into());
    }

    if cmd.is_empty() {
        return Err(SandboxError::Child("empty command".into()).into());
    }

    // Convert the arguments to C strings up front, before forking.
    let c_cmd: Vec<CString> = cmd
        .iter()
        .map(|s| CString::new(*s).map_err(|_| SandboxError::Child("invalid command string".into())))
        .collect::<Result<Vec<_>, _>>()?;

    let nested = is_nested();

    // Pipes used for the parent/child handshake (notif fd number one way, ready signal back).
    let pipes = PipePair::new().map_err(SandboxError::Io)?;

    // Resolve the allow-listed hosts before forking; an empty list means no resolution.
    let resolved_ips = if !self.policy.net_allow_hosts.is_empty() {
        network::resolve_hosts(&self.policy.net_allow_hosts)
            .await
            .map_err(SandboxError::Io)?
    } else {
        std::collections::HashSet::new()
    };

    // Create the filesystem branch for the OverlayFs/BranchFs modes; both need a workdir.
    let cow_branch: Option<Box<dyn CowBranch>> = match self.policy.fs_isolation {
        FsIsolation::OverlayFs => {
            let workdir = self.policy.workdir.as_ref()
                .ok_or_else(|| SandlockError::Sandbox(SandboxError::Child("OverlayFs requires workdir".into())))?;
            // Storage defaults to "sandlock-overlay" under the system temp directory.
            let storage = self.policy.fs_storage.as_ref()
                .cloned()
                .unwrap_or_else(|| std::env::temp_dir().join("sandlock-overlay"));
            std::fs::create_dir_all(&storage)
                .map_err(|e| SandlockError::Sandbox(SandboxError::Io(e)))?;
            let branch = OverlayBranch::create(workdir, &storage)
                .map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
            Some(Box::new(branch))
        }
        FsIsolation::BranchFs => {
            let workdir = self.policy.workdir.as_ref()
                .ok_or_else(|| SandlockError::Sandbox(SandboxError::Child("BranchFs requires workdir".into())))?;
            let branch = BranchFsBranch::create(workdir)
                .map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
            Some(Box::new(branch))
        }
        FsIsolation::None => None,
    };

    // Only the OverlayFs mode passes a CowConfig to the child.
    let cow_config = if let Some(ref branch) = cow_branch {
        if self.policy.fs_isolation == FsIsolation::OverlayFs {
            // Cannot be None: the OverlayFs arm above already required a workdir.
            let workdir = self.policy.workdir.as_ref().unwrap();
            let merged = branch.branch_path().to_path_buf();
            // NOTE(review): panics if the branch path has no parent — presumably the
            // overlay branch always lives inside a per-branch directory; confirm.
            let branch_dir = merged.parent().unwrap();
            let upper = branch_dir.join("upper");
            let work = branch_dir.join("work");
            Some(CowConfig {
                merged,
                upper,
                work,
                lowers: vec![workdir.clone()],
            })
        } else {
            None
        }
    } else {
        None
    };

    // Capture pipes, each stored as (read end, write end).
    let (stdout_r, stderr_r) = if capture {
        let mut stdout_fds = [0i32; 2];
        let mut stderr_fds = [0i32; 2];
        if unsafe { libc::pipe2(stdout_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
            return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
        }
        if unsafe { libc::pipe2(stderr_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
            // Close the stdout pipe that was already created.
            unsafe {
                libc::close(stdout_fds[0]);
                libc::close(stdout_fds[1]);
            }
            return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
        }
        (
            Some((
                unsafe { OwnedFd::from_raw_fd(stdout_fds[0]) },
                unsafe { OwnedFd::from_raw_fd(stdout_fds[1]) },
            )),
            Some((
                unsafe { OwnedFd::from_raw_fd(stderr_fds[0]) },
                unsafe { OwnedFd::from_raw_fd(stderr_fds[1]) },
            )),
        )
    } else {
        (None, None)
    };

    let pid = unsafe { libc::fork() };
    if pid < 0 {
        return Err(SandboxError::Fork(std::io::Error::last_os_error()).into());
    }

    if pid == 0 {
        // ---- child ----
        // Caller-supplied descriptors are installed first ...
        if let Some((stdin_fd, stdout_fd, stderr_fd)) = self.io_overrides {
            if let Some(fd) = stdin_fd {
                unsafe { libc::dup2(fd, 0) };
            }
            if let Some(fd) = stdout_fd {
                unsafe { libc::dup2(fd, 1) };
            }
            if let Some(fd) = stderr_fd {
                unsafe { libc::dup2(fd, 2) };
            }
        }

        // ... then the capture pipes, which therefore win over stdout/stderr overrides.
        if let Some((_, ref stdout_w)) = stdout_r {
            unsafe { libc::dup2(stdout_w.as_raw_fd(), 1) };
        }
        if let Some((_, ref stderr_w)) = stderr_r {
            unsafe { libc::dup2(stderr_w.as_raw_fd(), 2) };
        }
        // Close the child's copies of both pipe ends; fds 1/2 keep the write ends alive.
        drop(stdout_r);
        drop(stderr_r);

        // NOTE(review): presumably never returns (confines the process and execs `cmd`);
        // the parent-side code below relies on that — confirm against `context`.
        context::confine_child(&self.policy, &c_cmd, &pipes, cow_config.as_ref(), nested);
    }

    // ---- parent ----
    self.cow_branch = cow_branch;

    // Close the pipe ends that belong to the child.
    drop(pipes.notif_w);
    drop(pipes.ready_r);

    // Keep only the read ends; dropping the write ends lets reads see EOF on child exit.
    self._stdout_read = stdout_r.map(|(r, _w)| r);
    self._stderr_read = stderr_r.map(|(r, _w)| r);

    // From here on, dropping the sandbox kills and reaps the child.
    self.child_pid = Some(pid);
    self.state = SandboxState::Running;

    // pidfd is optional; a /proc-based path is used below when it is unavailable.
    let pidfd = match syscall::pidfd_open(pid as u32, 0) {
        Ok(fd) => Some(fd),
        Err(_) => None,
    };

    // The child reports the number of its notification fd over the handshake pipe.
    let notif_fd_num = read_u32_fd(pipes.notif_r.as_raw_fd())
        .map_err(|e| SandboxError::Child(format!("read notif fd from child: {}", e)))?;

    // A reported fd number of 0 is treated as "no notification fd": no supervisor is started.
    let is_nested = notif_fd_num == 0;

    // Obtain a parent-side handle on the child's notification fd.
    let notif_fd = if is_nested {
        None
    } else if let Some(ref pfd) = pidfd {
        Some(syscall::pidfd_getfd(pfd, notif_fd_num as i32, 0)
            .map_err(|e| SandboxError::Child(format!("pidfd_getfd: {}", e)))?)
    } else {
        // No pidfd: open the descriptor through /proc/<pid>/fd/<n> instead.
        let path = format!("/proc/{}/fd/{}", pid, notif_fd_num);
        let cpath = CString::new(path).unwrap();
        let raw = unsafe { libc::open(cpath.as_ptr(), libc::O_RDWR) };
        if raw < 0 {
            return Err(
                SandboxError::Child("failed to open notif fd from /proc".into()).into(),
            );
        }
        Some(unsafe { OwnedFd::from_raw_fd(raw) })
    };

    if let Some(notif_fd) = notif_fd {
        // Time/random virtualisation: try to patch the vDSO now; failure is only logged.
        if self.policy.time_start.is_some() || self.policy.random_seed.is_some() {
            let time_offset = self.policy.time_start.map(|t| crate::time::calculate_time_offset(t));
            if let Err(e) = crate::vdso::patch(pid, time_offset, self.policy.random_seed.is_some()) {
                eprintln!("sandlock: pre-exec vDSO patching failed (will retry after exec): {}", e);
            }
        }

        let time_offset_val = self.policy.time_start
            .map(|t| crate::time::calculate_time_offset(t))
            .unwrap_or(0);

        // Immutable settings handed to the supervisor task, derived from the policy.
        let notif_policy = NotifPolicy {
            max_memory_bytes: self.policy.max_memory.map(|m| m.0).unwrap_or(0),
            max_processes: self.policy.max_processes,
            has_memory_limit: self.policy.max_memory.is_some(),
            has_net_allowlist: !self.policy.net_allow_hosts.is_empty()
                || self.policy.policy_fn.is_some(),
            has_random_seed: self.policy.random_seed.is_some(),
            has_time_start: self.policy.time_start.is_some(),
            time_offset: time_offset_val,
            num_cpus: self.policy.num_cpus,
            has_proc_virt: self.policy.num_cpus.is_some() || self.policy.max_memory.is_some() || self.policy.isolate_pids || self.policy.port_remap,
            isolate_pids: self.policy.isolate_pids,
            port_remap: self.policy.port_remap,
            // The seccomp-based branch is used only when a workdir is set without OverlayFs/BranchFs.
            cow_enabled: self.policy.workdir.is_some() && self.policy.fs_isolation == FsIsolation::None,
            chroot_root: self.policy.chroot.clone(),
            chroot_readable: self.policy.fs_readable.clone(),
            chroot_writable: self.policy.fs_writable.clone(),
            deterministic_dirs: self.policy.deterministic_dirs,
            hostname: self.policy.hostname.clone(),
        };

        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        // Seeded RNG, present only when the policy supplies a seed.
        let random_state = self.policy.random_seed.map(|seed| ChaCha8Rng::seed_from_u64(seed));
        let time_offset = self.policy.time_start.map(|t| crate::time::calculate_time_offset(t));

        // Mutable state shared between this handle and the supervisor task.
        let mut sup_state = SupervisorState::new(
            notif_policy.max_memory_bytes,
            notif_policy.max_processes,
            time_offset,
            random_state,
        );
        sup_state.network_policy = if self.policy.net_allow_hosts.is_empty() {
            crate::seccomp::notif::NetworkPolicy::Unrestricted
        } else {
            crate::seccomp::notif::NetworkPolicy::AllowList(resolved_ips)
        };

        // Only the raw fd number is stored; the owning pidfd is moved onto `self` at the end.
        if let Some(ref pfd) = pidfd {
            use std::os::unix::io::AsRawFd;
            sup_state.child_pidfd = Some(pfd.as_raw_fd());
        }

        // Same condition as `cow_enabled` above; a creation failure is only logged.
        if self.policy.workdir.is_some() && self.policy.fs_isolation == FsIsolation::None {
            let workdir = self.policy.workdir.as_ref().unwrap();
            let storage = self.policy.fs_storage.as_deref();
            match crate::cow::seccomp::SeccompCowBranch::create(workdir, storage) {
                Ok(branch) => { sup_state.cow_branch = Some(branch); }
                Err(e) => { eprintln!("sandlock: seccomp COW branch creation failed: {}", e); }
            }
        }

        // Optional policy callback: seed it with the initial limits, which also act as the ceiling.
        if let Some(ref callback) = self.policy.policy_fn {
            let live = crate::policy_fn::LivePolicy {
                allowed_ips: match &sup_state.network_policy {
                    crate::seccomp::notif::NetworkPolicy::AllowList(ips) => ips.clone(),
                    crate::seccomp::notif::NetworkPolicy::Unrestricted => std::collections::HashSet::new(),
                },
                max_memory_bytes: notif_policy.max_memory_bytes,
                max_processes: notif_policy.max_processes,
            };
            let ceiling = live.clone();
            let live = std::sync::Arc::new(std::sync::RwLock::new(live));
            let denied_paths = sup_state.denied_paths.clone();
            let pid_overrides = sup_state.pid_ip_overrides.clone();
            sup_state.live_policy = Some(live.clone());
            let tx = crate::policy_fn::spawn_policy_fn(
                callback.clone(), live, ceiling, pid_overrides, denied_paths,
            );
            sup_state.policy_event_tx = Some(tx);
        }

        let sup_state = Arc::new(Mutex::new(sup_state));
        self.supervisor_state = Some(Arc::clone(&sup_state));

        // Run the supervisor as a background task; it is aborted in wait() and on drop.
        self.notif_handle = Some(tokio::spawn(
            notif::supervisor(notif_fd, notif_policy, sup_state),
        ));
    }

    // CPU throttling is only started for limits below 100%.
    if let Some(cpu_pct) = self.policy.max_cpu {
        if cpu_pct < 100 {
            let child_pid = pid;
            self.throttle_handle = Some(tokio::spawn(throttle_cpu(child_pid, cpu_pct)));
        }
    }

    // Supervision is in place: send the ready value to the child.
    write_u32_fd(pipes.ready_w.as_raw_fd(), 1)
        .map_err(|e| SandboxError::Child(format!("write ready signal: {}", e)))?;

    self.pidfd = pidfd;

    Ok(())
}
947}
948
949impl Drop for Sandbox {
954 fn drop(&mut self) {
955 if let Some(pid) = self.child_pid {
956 if matches!(self.state, SandboxState::Running | SandboxState::Paused) {
957 unsafe { libc::killpg(pid, libc::SIGKILL) };
959 let mut status: i32 = 0;
961 unsafe { libc::waitpid(pid, &mut status, 0) };
962 }
963 }
964
965 if let Some(h) = self.notif_handle.take() {
966 h.abort();
967 }
968 if let Some(h) = self.throttle_handle.take() {
969 h.abort();
970 }
971
972 if let Some(ref branch) = self.cow_branch {
974 let is_error = matches!(
975 self.state,
976 SandboxState::Stopped(ref s) if !matches!(s, ExitStatus::Code(0))
977 );
978 let action = if is_error {
979 &self.policy.on_error
980 } else {
981 &self.policy.on_exit
982 };
983 match action {
984 BranchAction::Commit => { let _ = branch.commit(); }
985 BranchAction::Abort => { let _ = branch.abort(); }
986 BranchAction::Keep => {} }
988 }
989
990 if let Some(ref state) = self.supervisor_state {
992 let Ok(mut st) = state.try_lock() else { return; };
993 if let Some(ref mut cow) = st.cow_branch {
994 let is_error = matches!(
995 self.state,
996 SandboxState::Stopped(ref s) if !matches!(s, ExitStatus::Code(0))
997 );
998 let action = if is_error {
999 &self.policy.on_error
1000 } else {
1001 &self.policy.on_exit
1002 };
1003 match action {
1004 BranchAction::Commit => { let _ = cow.commit(); }
1005 BranchAction::Abort => { let _ = cow.abort(); }
1006 BranchAction::Keep => {}
1007 }
1008 }
1009 }
1010 }
1011}
1012
1013async fn throttle_cpu(pid: i32, cpu_pct: u8) {
1019 let period = Duration::from_millis(100);
1020 let run_time = period * cpu_pct as u32 / 100;
1021 let stop_time = period - run_time;
1022
1023 loop {
1024 tokio::time::sleep(run_time).await;
1025 if unsafe { libc::killpg(pid, libc::SIGSTOP) } < 0 {
1026 break;
1027 }
1028 tokio::time::sleep(stop_time).await;
1029 if unsafe { libc::killpg(pid, libc::SIGCONT) } < 0 {
1030 break;
1031 }
1032 }
1033}
1034
/// Fills `buf` from `fd` on a best-effort basis.
///
/// Reading stops early at end of file or on an I/O error, leaving the rest of `buf`
/// untouched. Reads interrupted by a signal are retried instead of being treated as a
/// failure. A negative `fd` is a no-op. The descriptor is borrowed: it is never closed here.
fn read_exact(fd: i32, buf: &mut [u8]) {
    use std::io::{ErrorKind, Read};

    if fd < 0 {
        return;
    }

    // SAFETY: the caller keeps `fd` open for the duration of the call, and ManuallyDrop
    // prevents the temporary File from closing a descriptor it does not own.
    let mut reader = std::mem::ManuallyDrop::new(unsafe { std::fs::File::from_raw_fd(fd) });

    let mut off = 0;
    while off < buf.len() {
        match reader.read(&mut buf[off..]) {
            Ok(0) => break,
            Ok(got) => off += got,
            // EINTR is not a failure; previously it ended the loop with a short buffer.
            Err(e) if e.kind() == ErrorKind::Interrupted => continue,
            Err(_) => break,
        }
    }
}
1050
/// Takes ownership of `fd`, reads it until end of file and closes it.
///
/// Read errors are ignored; whatever was read before the error is returned.
fn read_fd_to_end(fd: OwnedFd) -> Vec<u8> {
    use std::io::Read;

    let mut collected = Vec::new();
    // The temporary File owns the descriptor and closes it at the end of this statement.
    std::fs::File::from(fd).read_to_end(&mut collected).ok();
    collected
}
1058
1059fn wait_status_to_exit(status: i32) -> ExitStatus {
1060 if libc::WIFEXITED(status) {
1061 ExitStatus::Code(libc::WEXITSTATUS(status))
1062 } else if libc::WIFSIGNALED(status) {
1063 let sig = libc::WTERMSIG(status);
1064 if sig == libc::SIGKILL {
1065 ExitStatus::Killed
1066 } else {
1067 ExitStatus::Signal(sig)
1068 }
1069 } else {
1070 ExitStatus::Killed
1071 }
1072}