1use std::ffi::CString;
5use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd};
6use std::sync::Arc;
7use std::time::Duration;
8
9use tokio::sync::Mutex;
10use tokio::task::JoinHandle;
11
12use std::sync::atomic::{AtomicBool, Ordering};
13
14use crate::context::{self, PipePair, read_u32_fd, write_u32_fd};
15use crate::cow::{CowBranch, overlayfs::OverlayBranch, branchfs::BranchFsBranch};
16use crate::error::{SandboxError, SandlockError};
17use crate::network;
18use crate::policy::{BranchAction, FsIsolation, Policy};
19use crate::result::{ExitStatus, RunResult};
20use crate::seccomp::ctx::SupervisorCtx;
21use crate::seccomp::notif::{self, NotifPolicy};
22use crate::seccomp::state::{ChrootState, CowState, NetworkState, PolicyFnState, ProcfsState, ResourceState, TimeRandomState};
23use crate::sys::syscall;
24
/// Process-wide marker that this process has already been confined.
///
/// It is set to `true` by the forked template child in `Sandbox::fork` once
/// its filters have been installed, and is consulted by [`is_nested`].
pub(crate) static CONFINED: AtomicBool = AtomicBool::new(false);

/// Reports whether the current process is already running inside a sandbox.
///
/// The in-process [`CONFINED`] flag is checked first. If it is unset, the
/// `Seccomp:` line of `/proc/self/status` is inspected and a value ending in
/// `2` (seccomp filter mode per proc(5)) counts as nested. An unreadable
/// status file or a missing `Seccomp:` line yields `false`.
pub fn is_nested() -> bool {
    if CONFINED.load(Ordering::Relaxed) {
        return true;
    }

    match std::fs::read_to_string("/proc/self/status") {
        // Only the first `Seccomp:` line is considered.
        Ok(status) => status
            .lines()
            .find(|entry| entry.starts_with("Seccomp:"))
            .map_or(false, |entry| entry.trim().ends_with('2')),
        Err(_) => false,
    }
}
52
/// Lifecycle state of a [`Sandbox`].
enum SandboxState {
    /// Freshly constructed; nothing has been spawned yet.
    Created,
    /// A child process has been forked and has not been reaped yet.
    Running,
    /// The child's process group was sent `SIGSTOP` through `pause()`.
    Paused,
    /// The child has finished (or the sandbox was pre-populated as finished)
    /// with the recorded exit status.
    Stopped(ExitStatus),
}
63
/// A single sandboxed child process together with the supervisor-side
/// resources (tasks, pipes, copy-on-write branches) that belong to it.
pub struct Sandbox {
    /// Private clone of the policy this sandbox was created with.
    policy: Policy,
    /// Current lifecycle state.
    state: SandboxState,
    /// PID of the forked child; also used as the process-group id for `killpg`.
    child_pid: Option<i32>,
    /// pidfd for the child, when `pidfd_open` succeeded during spawn.
    pidfd: Option<OwnedFd>,
    /// Task running the seccomp notification supervisor.
    notif_handle: Option<JoinHandle<()>>,
    /// Task running `throttle_cpu` when the policy sets `max_cpu` below 100.
    throttle_handle: Option<JoinHandle<()>>,
    /// Task that periodically samples the load average into the resource state.
    loadavg_handle: Option<JoinHandle<()>>,
    /// Read end of the stdout capture pipe; drained by `wait()`.
    _stdout_read: Option<OwnedFd>,
    /// Read end of the stderr capture pipe; drained by `wait()`.
    _stderr_read: Option<OwnedFd>,
    /// Filesystem branch created for `FsIsolation::OverlayFs` / `BranchFs`.
    cow_branch: Option<Box<dyn CowBranch>>,
    /// Seccomp-based COW branch, moved out of `supervisor_cow` by `wait()`.
    seccomp_cow: Option<crate::cow::seccomp::SeccompCowBranch>,
    /// Resource state shared with the supervisor (used by `freeze`/`thaw`).
    supervisor_resource: Option<Arc<Mutex<ResourceState>>>,
    /// COW state shared with the supervisor.
    supervisor_cow: Option<Arc<Mutex<CowState>>>,
    /// Network state shared with the supervisor (read by `port_mappings`).
    supervisor_network: Option<Arc<Mutex<NetworkState>>>,
    /// Parent end of the control pipe created by `fork()`.
    ctrl_fd: Option<OwnedFd>,
    /// Read end of a per-clone output pipe set by `fork()`, consumed by `reduce()`.
    stdout_pipe: Option<OwnedFd>,
    /// One-shot initialiser run in the template child by `fork()`.
    init_fn: Option<Box<dyn FnOnce() + Send + 'static>>,
    /// Work function handed to the fork loop by `fork()`.
    work_fn: Option<Arc<dyn Fn(u32) + Send + Sync + 'static>>,
    /// Optional `(stdin, stdout, stderr)` fds that the child `dup2`s onto 0/1/2.
    io_overrides: Option<(Option<i32>, Option<i32>, Option<i32>)>,
    /// `(target_fd, source_fd)` pairs that the child `dup2`s before confinement.
    extra_fds: Vec<(i32, i32)>,
    /// Handle of the HTTP ACL proxy started when the policy has HTTP rules.
    http_acl_handle: Option<crate::http_acl::HttpAclProxyHandle>,
    /// Callback moved into the network state's port map when the sandbox is spawned.
    #[allow(clippy::type_complexity)]
    on_bind: Option<Box<dyn Fn(&std::collections::HashMap<u16, u16>) + Send + Sync>>,
}
112
113impl Sandbox {
114 pub fn new(policy: &Policy) -> Result<Self, SandlockError> {
116 Ok(Self::create(policy))
117 }
118
119 pub fn new_with_fns(
132 policy: &Policy,
133 init_fn: impl FnOnce() + Send + 'static,
134 work_fn: impl Fn(u32) + Send + Sync + 'static,
135 ) -> Result<Self, SandlockError> {
136 let mut sb = Self::create(policy);
137 sb.init_fn = Some(Box::new(init_fn));
138 sb.work_fn = Some(Arc::new(work_fn));
139 Ok(sb)
140 }
141
142 fn create(policy: &Policy) -> Self {
143 Self {
144 policy: policy.clone(),
145 state: SandboxState::Created,
146 child_pid: None,
147 pidfd: None,
148 notif_handle: None,
149 throttle_handle: None,
150 loadavg_handle: None,
151 _stdout_read: None,
152 _stderr_read: None,
153 cow_branch: None,
154 seccomp_cow: None,
155 supervisor_resource: None,
156 supervisor_cow: None,
157 supervisor_network: None,
158 ctrl_fd: None,
159 stdout_pipe: None,
160 init_fn: None,
161 work_fn: None,
162 io_overrides: None,
163 extra_fds: Vec::new(),
164 http_acl_handle: None,
165 on_bind: None,
166 }
167 }
168
169 pub async fn run(policy: &Policy, cmd: &[&str]) -> Result<RunResult, SandlockError> {
172 let mut sb = Self::new(policy)?;
173 sb.do_spawn(cmd, true).await?;
174 sb.wait().await
175 }
176
177 pub async fn run_interactive(policy: &Policy, cmd: &[&str]) -> Result<RunResult, SandlockError> {
179 let mut sb = Self::new(policy)?;
180 sb.do_spawn(cmd, false).await?;
181 sb.wait().await
182 }
183
184 pub async fn dry_run(policy: &Policy, cmd: &[&str]) -> Result<crate::dry_run::DryRunResult, SandlockError> {
188 let mut policy = policy.clone();
189 policy.on_exit = BranchAction::Keep;
190 policy.on_error = BranchAction::Keep;
191
192 let mut sb = Self::new(&policy)?;
193 sb.do_spawn(cmd, true).await?;
194 let run_result = sb.wait().await?;
195 let changes = sb.collect_changes().await;
196 sb.do_abort().await;
197 Ok(crate::dry_run::DryRunResult { run_result, changes })
198 }
199
200 pub async fn dry_run_interactive(policy: &Policy, cmd: &[&str]) -> Result<crate::dry_run::DryRunResult, SandlockError> {
202 let mut policy = policy.clone();
203 policy.on_exit = BranchAction::Keep;
204 policy.on_error = BranchAction::Keep;
205
206 let mut sb = Self::new(&policy)?;
207 sb.do_spawn(cmd, false).await?;
208 let run_result = sb.wait().await?;
209 let changes = sb.collect_changes().await;
210 sb.do_abort().await;
211 Ok(crate::dry_run::DryRunResult { run_result, changes })
212 }
213
214 async fn collect_changes(&self) -> Vec<crate::dry_run::Change> {
216 if let Some(ref branch) = self.cow_branch {
217 return branch.changes().unwrap_or_default();
218 }
219 if let Some(ref cow) = self.seccomp_cow {
220 return cow.changes().unwrap_or_default();
221 }
222 Vec::new()
223 }
224
225 async fn do_abort(&mut self) {
227 if let Some(branch) = self.cow_branch.take() {
228 let _ = branch.abort();
229 }
230 if let Some(ref mut cow) = self.seccomp_cow {
231 let _ = cow.abort();
232 }
233 }
234
235 pub async fn fork(&mut self, n: u32) -> Result<Vec<Sandbox>, SandlockError> {
255 let init_fn = self.init_fn.take()
256 .ok_or_else(|| SandboxError::Child("fork() requires new_with_fns()".into()))?;
257 let work_fn = self.work_fn.take()
258 .ok_or_else(|| SandboxError::Child("fork() requires new_with_fns()".into()))?;
259
260 let policy = self.policy.clone();
261
262
263 let mut ctrl_fds = [0i32; 2];
265 if unsafe { libc::pipe2(ctrl_fds.as_mut_ptr(), 0) } < 0 {
266 return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
267 }
268 let ctrl_parent = unsafe { OwnedFd::from_raw_fd(ctrl_fds[0]) };
269 let ctrl_child_fd = ctrl_fds[1];
270
271 let mut pipe_read_ends: Vec<OwnedFd> = Vec::with_capacity(n as usize);
273 let mut pipe_write_fds: Vec<i32> = Vec::with_capacity(n as usize);
274 for _ in 0..n {
275 let mut pfds = [0i32; 2];
276 if unsafe { libc::pipe(pfds.as_mut_ptr()) } >= 0 {
277 pipe_read_ends.push(unsafe { OwnedFd::from_raw_fd(pfds[0]) });
278 pipe_write_fds.push(pfds[1]);
279 } else {
280 pipe_write_fds.push(-1);
281 }
282 }
283
284 let pid = unsafe { libc::fork() };
286 if pid < 0 {
287 unsafe { libc::close(ctrl_child_fd) };
288 return Err(SandboxError::Fork(std::io::Error::last_os_error()).into());
289 }
290
291 if pid == 0 {
292 drop(ctrl_parent);
294
295 unsafe { libc::setpgid(0, 0) };
296 unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) };
297 unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
298
299 let _ = crate::landlock::confine(&policy);
300
301 let deny = crate::context::deny_syscall_numbers(&policy);
302 let args = crate::context::arg_filters(&policy);
303 let filter = match crate::seccomp::bpf::assemble_filter(&[], &deny, &args) {
304 Ok(f) => f,
305 Err(_) => unsafe { libc::_exit(1) },
306 };
307 let _ = crate::seccomp::bpf::install_deny_filter(&filter);
308
309 CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
310
311 init_fn();
313
314 drop(pipe_read_ends);
316
317 crate::fork::fork_ready_loop_fn(ctrl_child_fd, n, &*work_fn, &pipe_write_fds);
319 unsafe { libc::_exit(0) };
320 }
321
322 unsafe { libc::close(ctrl_child_fd) };
324 for wfd in &pipe_write_fds {
326 if *wfd >= 0 { unsafe { libc::close(*wfd) }; }
327 }
328 self.child_pid = Some(pid);
329 self.state = SandboxState::Running;
330
331 let ctrl_fd = ctrl_parent.as_raw_fd();
333 let mut pid_buf = vec![0u8; n as usize * 4];
334 read_exact(ctrl_fd, &mut pid_buf);
335
336 let clone_pids: Vec<i32> = pid_buf.chunks(4)
337 .map(|c| u32::from_be_bytes(c.try_into().unwrap_or([0; 4])) as i32)
338 .collect();
339 let live_count = clone_pids.iter().filter(|&&p| p > 0).count();
340
341 let mut code_buf = vec![0u8; live_count * 4];
343 read_exact(ctrl_fd, &mut code_buf);
344 self.ctrl_fd = Some(ctrl_parent);
345
346 let mut status = 0i32;
348 unsafe { libc::waitpid(pid, &mut status, 0) };
349
350 let mut code_idx = 0;
352 let mut clones = Vec::with_capacity(live_count);
353 let mut pipe_iter = pipe_read_ends.into_iter();
354
355 for &clone_pid in &clone_pids {
356 let pipe = pipe_iter.next();
357 if clone_pid <= 0 { continue; }
358
359 let code = i32::from_be_bytes(
360 code_buf[code_idx * 4..(code_idx + 1) * 4].try_into().unwrap_or([0; 4])
361 );
362 code_idx += 1;
363
364 let mut sb = Sandbox::create(&policy);
365 sb.child_pid = Some(clone_pid);
366 sb.state = SandboxState::Stopped(if code == 0 {
367 ExitStatus::Code(0)
368 } else if code > 0 {
369 ExitStatus::Code(code)
370 } else {
371 ExitStatus::Killed
372 });
373 sb.stdout_pipe = pipe;
374 clones.push(sb);
375 }
376
377 Ok(clones)
378 }
379
380 pub async fn reduce(
390 &self,
391 cmd: &[&str],
392 clones: &mut [Sandbox],
393 ) -> Result<RunResult, SandlockError> {
394 let mut combined = Vec::new();
396 for clone in clones.iter_mut() {
397 if let Some(pipe) = clone.stdout_pipe.take() {
398 combined.extend_from_slice(&read_fd_to_end(pipe));
399 }
400 }
401
402 let mut stdin_fds = [0i32; 2];
404 if unsafe { libc::pipe2(stdin_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
405 return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
406 }
407
408 let write_fd = stdin_fds[1];
410 let write_handle = tokio::task::spawn_blocking(move || {
411 unsafe {
412 libc::write(write_fd, combined.as_ptr() as *const _, combined.len());
413 libc::close(write_fd);
414 }
415 });
416
417 let mut reducer = Sandbox::new(&self.policy)?;
419 reducer.io_overrides = Some((Some(stdin_fds[0]), None, None));
420 reducer.do_spawn(cmd, true).await?;
421 unsafe { libc::close(stdin_fds[0]) };
422
423 let _ = write_handle.await;
424 reducer.wait().await
425 }
426
427 pub async fn wait(&mut self) -> Result<RunResult, SandlockError> {
429 let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
430
431 if let SandboxState::Stopped(ref es) = self.state {
432 return Ok(RunResult {
433 exit_status: es.clone(),
434 stdout: None,
435 stderr: None,
436 });
437 }
438
439 let exit_status = tokio::task::spawn_blocking(move || -> ExitStatus {
441 let mut status: i32 = 0;
442 loop {
443 let ret = unsafe { libc::waitpid(pid, &mut status, 0) };
444 if ret < 0 {
445 let err = std::io::Error::last_os_error();
446 if err.raw_os_error() == Some(libc::EINTR) {
447 continue;
448 }
449 return ExitStatus::Killed;
451 }
452 break;
453 }
454 wait_status_to_exit(status)
455 })
456 .await
457 .unwrap_or(ExitStatus::Killed);
458
459 self.state = SandboxState::Stopped(exit_status.clone());
460
461 if let Some(h) = self.notif_handle.take() {
463 h.abort();
464 }
465 if let Some(h) = self.throttle_handle.take() {
466 h.abort();
467 }
468 if let Some(h) = self.loadavg_handle.take() {
469 h.abort();
470 }
471
472 if let Some(ref cow_state) = self.supervisor_cow {
476 let mut cow = cow_state.lock().await;
477 self.seccomp_cow = cow.branch.take();
478 }
479
480 let stdout = self._stdout_read.take().map(|fd| read_fd_to_end(fd));
482 let stderr = self._stderr_read.take().map(|fd| read_fd_to_end(fd));
483
484 Ok(RunResult {
485 exit_status,
486 stdout,
487 stderr,
488 })
489 }
490
491 pub fn pause(&mut self) -> Result<(), SandlockError> {
493 let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
494 let ret = unsafe { libc::killpg(pid, libc::SIGSTOP) };
495 if ret < 0 {
496 return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
497 }
498 self.state = SandboxState::Paused;
499 Ok(())
500 }
501
502 pub fn resume(&mut self) -> Result<(), SandlockError> {
504 let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
505 let ret = unsafe { libc::killpg(pid, libc::SIGCONT) };
506 if ret < 0 {
507 return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
508 }
509 self.state = SandboxState::Running;
510 Ok(())
511 }
512
513 pub fn kill(&mut self) -> Result<(), SandlockError> {
515 let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
516 let ret = unsafe { libc::killpg(pid, libc::SIGKILL) };
517 if ret < 0 {
518 let err = std::io::Error::last_os_error();
519 if err.raw_os_error() != Some(libc::ESRCH) {
521 return Err(SandboxError::Io(err).into());
522 }
523 }
524 Ok(())
525 }
526
527 pub fn pid(&self) -> Option<i32> {
529 self.child_pid
530 }
531
532 pub fn set_on_bind(&mut self, cb: impl Fn(&std::collections::HashMap<u16, u16>) + Send + Sync + 'static) {
534 self.on_bind = Some(Box::new(cb));
535 }
536
537 pub async fn port_mappings(&self) -> std::collections::HashMap<u16, u16> {
543 if let Some(ref net) = self.supervisor_network {
544 let ns = net.lock().await;
545 ns.port_map.virtual_to_real.clone()
546 } else {
547 std::collections::HashMap::new()
548 }
549 }
550
551 #[doc(hidden)]
553 pub fn is_running(&self) -> bool {
554 matches!(self.state, SandboxState::Running | SandboxState::Paused)
555 }
556
557 pub fn policy(&self) -> &Policy {
559 &self.policy
560 }
561
562 #[doc(hidden)]
564 pub async fn commit(&mut self) -> Result<(), SandlockError> {
565 if let Some(branch) = self.cow_branch.take() {
566 branch.commit().map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
567 }
568 Ok(())
569 }
570
571 #[doc(hidden)]
573 pub async fn abort_branch(&mut self) -> Result<(), SandlockError> {
574 if let Some(branch) = self.cow_branch.take() {
575 branch.abort().map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
576 }
577 Ok(())
578 }
579
580 pub(crate) async fn freeze(&self) -> Result<(), SandlockError> {
582 let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
583
584 if let Some(ref resource) = self.supervisor_resource {
586 let mut rs = resource.lock().await;
587 rs.hold_forks = true;
588 }
589
590 unsafe { libc::killpg(pid, libc::SIGSTOP); }
592 Ok(())
593 }
594
595 pub(crate) async fn thaw(&self) -> Result<(), SandlockError> {
597 let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
598
599 if let Some(ref resource) = self.supervisor_resource {
601 let mut rs = resource.lock().await;
602 rs.hold_forks = false;
603 rs.held_notif_ids.clear();
604 }
605
606 unsafe { libc::killpg(pid, libc::SIGCONT); }
608 Ok(())
609 }
610
611 #[doc(hidden)]
614 pub async fn spawn(&mut self, cmd: &[&str]) -> Result<(), SandlockError> {
615 self.do_spawn(cmd, false).await
616 }
617
618 #[doc(hidden)]
621 pub async fn spawn_captured(&mut self, cmd: &[&str]) -> Result<(), SandlockError> {
622 self.do_spawn(cmd, true).await
623 }
624
625 #[doc(hidden)]
634 pub async fn spawn_with_io(
635 &mut self,
636 cmd: &[&str],
637 stdin_fd: Option<std::os::unix::io::RawFd>,
638 stdout_fd: Option<std::os::unix::io::RawFd>,
639 stderr_fd: Option<std::os::unix::io::RawFd>,
640 ) -> Result<(), SandlockError> {
641 self.io_overrides = Some((stdin_fd, stdout_fd, stderr_fd));
642 self.do_spawn(cmd, false).await
643 }
644
645 #[doc(hidden)]
648 pub async fn spawn_with_gather_io(
649 &mut self,
650 cmd: &[&str],
651 stdin_fd: Option<std::os::unix::io::RawFd>,
652 stdout_fd: Option<std::os::unix::io::RawFd>,
653 stderr_fd: Option<std::os::unix::io::RawFd>,
654 extra_fds: Vec<(i32, i32)>,
655 ) -> Result<(), SandlockError> {
656 self.io_overrides = Some((stdin_fd, stdout_fd, stderr_fd));
657 self.extra_fds = extra_fds;
658 self.do_spawn(cmd, false).await
659 }
660
661 pub async fn checkpoint(&self) -> Result<crate::checkpoint::Checkpoint, SandlockError> {
663 let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
664
665 self.freeze().await?;
667
668 let cp = crate::checkpoint::capture(pid, &self.policy);
670
671 self.thaw().await?;
673
674 cp
675 }
676
    /// Forks the sandboxed child for `cmd` and, unless the child reports that
    /// it runs nested, sets up the supervisor side (shared state, notification
    /// task, load-average sampler, optional CPU throttle).
    ///
    /// When `capture` is true the child's stdout and stderr are redirected
    /// into pipes whose read ends are kept for `wait()`.
    ///
    /// # Errors
    /// Fails if the sandbox was already spawned, `cmd` is empty or contains an
    /// interior NUL, host resolution / the HTTP ACL proxy / COW branch setup
    /// fails, a pipe or `fork` fails, or the handshake with the child fails.
    async fn do_spawn(&mut self, cmd: &[&str], capture: bool) -> Result<(), SandlockError> {
        // A sandbox can be spawned only once.
        if !matches!(self.state, SandboxState::Created) {
            return Err(SandboxError::Child("sandbox already spawned".into()).into());
        }

        if cmd.is_empty() {
            return Err(SandboxError::Child("empty command".into()).into());
        }

        // argv as C strings; a string with an interior NUL is rejected.
        let c_cmd: Vec<CString> = cmd
            .iter()
            .map(|s| CString::new(*s).map_err(|_| SandboxError::Child("invalid command string".into())))
            .collect::<Result<Vec<_>, _>>()?;

        let nested = is_nested();

        // Handshake pipes between parent and child (notif fd number, ready signal).
        let pipes = PipePair::new().map_err(SandboxError::Io)?;

        // Network allowlist: no list -> nothing resolved and no virtual hosts
        // file; an empty list -> empty virtual hosts file; otherwise resolve.
        let (resolved_ips, virtual_etc_hosts) = match self.policy.net_allow_hosts.as_deref() {
            None => (std::collections::HashSet::new(), None),
            Some([]) => (
                std::collections::HashSet::new(),
                Some(String::new()),
            ),
            Some(hosts) => {
                let resolved = network::resolve_hosts(hosts)
                    .await
                    .map_err(SandboxError::Io)?;
                (resolved.ips, Some(resolved.etc_hosts))
            }
        };

        // Start the HTTP ACL proxy only when the policy has HTTP rules.
        if !self.policy.http_allow.is_empty() || !self.policy.http_deny.is_empty() {
            let handle = crate::http_acl::spawn_http_acl_proxy(
                self.policy.http_allow.clone(),
                self.policy.http_deny.clone(),
                self.policy.https_ca.as_deref(),
                self.policy.https_key.as_deref(),
            ).await.map_err(SandboxError::Io)?;
            self.http_acl_handle = Some(handle);
        }

        // Filesystem COW branch for the overlayfs / branchfs isolation modes;
        // both require a workdir.
        let cow_branch: Option<Box<dyn CowBranch>> = match self.policy.fs_isolation {
            FsIsolation::OverlayFs => {
                let workdir = self.policy.workdir.as_ref()
                    .ok_or_else(|| SandlockError::Sandbox(SandboxError::Child("OverlayFs requires workdir".into())))?;
                // Storage defaults to "<temp dir>/sandlock-overlay".
                let storage = self.policy.fs_storage.as_ref()
                    .cloned()
                    .unwrap_or_else(|| std::env::temp_dir().join("sandlock-overlay"));
                std::fs::create_dir_all(&storage)
                    .map_err(|e| SandlockError::Sandbox(SandboxError::Io(e)))?;
                let branch = OverlayBranch::create(workdir, &storage)
                    .map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
                Some(Box::new(branch))
            }
            FsIsolation::BranchFs => {
                let workdir = self.policy.workdir.as_ref()
                    .ok_or_else(|| SandlockError::Sandbox(SandboxError::Child("BranchFs requires workdir".into())))?;
                let branch = BranchFsBranch::create(workdir)
                    .map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
                Some(Box::new(branch))
            }
            FsIsolation::None => None,
        };

        // Mount configuration the child needs for the branch, if any.
        let cow_config = cow_branch.as_ref().and_then(|b| b.child_mount_config());

        // Capture pipes: (read end, write end) for stdout and stderr.
        let (stdout_r, stderr_r) = if capture {
            let mut stdout_fds = [0i32; 2];
            let mut stderr_fds = [0i32; 2];
            if unsafe { libc::pipe2(stdout_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
                return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
            }
            if unsafe { libc::pipe2(stderr_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
                // NOTE(review): errno is read after these close() calls, which
                // may overwrite the pipe2 error — confirm and reorder if so.
                unsafe {
                    libc::close(stdout_fds[0]);
                    libc::close(stdout_fds[1]);
                }
                return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
            }
            (
                Some((
                    unsafe { OwnedFd::from_raw_fd(stdout_fds[0]) },
                    unsafe { OwnedFd::from_raw_fd(stdout_fds[1]) },
                )),
                Some((
                    unsafe { OwnedFd::from_raw_fd(stderr_fds[0]) },
                    unsafe { OwnedFd::from_raw_fd(stderr_fds[1]) },
                )),
            )
        } else {
            (None, None)
        };

        let pid = unsafe { libc::fork() };
        if pid < 0 {
            return Err(SandboxError::Fork(std::io::Error::last_os_error()).into());
        }

        if pid == 0 {
            // ---- child ----
            // Caller-supplied stdio overrides are applied first.
            if let Some((stdin_fd, stdout_fd, stderr_fd)) = self.io_overrides {
                if let Some(fd) = stdin_fd {
                    unsafe { libc::dup2(fd, 0) };
                }
                if let Some(fd) = stdout_fd {
                    unsafe { libc::dup2(fd, 1) };
                }
                if let Some(fd) = stderr_fd {
                    unsafe { libc::dup2(fd, 2) };
                }
            }

            // Extra descriptors: duplicate each source onto its target number.
            for &(target_fd, source_fd) in &self.extra_fds {
                unsafe { libc::dup2(source_fd, target_fd) };
            }

            // Capture pipes are applied last, so they win over the overrides
            // for stdout/stderr.
            if let Some((_, ref stdout_w)) = stdout_r {
                unsafe { libc::dup2(stdout_w.as_raw_fd(), 1) };
            }
            if let Some((_, ref stderr_w)) = stderr_r {
                unsafe { libc::dup2(stderr_w.as_raw_fd(), 2) };
            }
            // Close the child's copies of both pipe ends; fds 1/2 keep the
            // write side open.
            drop(stdout_r);
            drop(stderr_r);

            // Target numbers of the extra fds, passed to `confine_child`.
            let gather_keep_fds: Vec<i32> = self.extra_fds.iter().map(|&(target, _)| target).collect();

            // NOTE(review): presumably never returns (execs or exits);
            // otherwise the child would fall through into the parent path
            // below — confirm against `context::confine_child`.
            context::confine_child(&self.policy, &c_cmd, &pipes, cow_config.as_ref(), nested, &gather_keep_fds);
        }

        // ---- parent ----
        self.cow_branch = cow_branch;

        // Close the pipe ends that belong to the child side of the handshake.
        drop(pipes.notif_w);
        drop(pipes.ready_r);

        // Keep only the read ends; dropping the write ends here lets the
        // readers see EOF once the child side closes.
        self._stdout_read = stdout_r.map(|(r, _w)| r);
        self._stderr_read = stderr_r.map(|(r, _w)| r);

        self.child_pid = Some(pid);
        self.state = SandboxState::Running;

        // A pidfd is optional; failure falls back to /proc below.
        let pidfd = match syscall::pidfd_open(pid as u32, 0) {
            Ok(fd) => Some(fd),
            Err(_) => None,
        };

        // The child reports the number of its seccomp notification fd.
        let notif_fd_num = read_u32_fd(pipes.notif_r.as_raw_fd())
            .map_err(|e| SandboxError::Child(format!("read notif fd from child: {}", e)))?;

        // A reported fd number of 0 is treated as "nested": no supervisor is set up.
        let is_nested = notif_fd_num == 0;

        // Obtain our own copy of the notification fd: via pidfd_getfd when a
        // pidfd is available, otherwise by opening /proc/<pid>/fd/<n>.
        let notif_fd = if is_nested {
            None
        } else if let Some(ref pfd) = pidfd {
            Some(syscall::pidfd_getfd(pfd, notif_fd_num as i32, 0)
                .map_err(|e| SandboxError::Child(format!("pidfd_getfd: {}", e)))?)
        } else {
            let path = format!("/proc/{}/fd/{}", pid, notif_fd_num);
            let cpath = CString::new(path).unwrap();
            let raw = unsafe { libc::open(cpath.as_ptr(), libc::O_RDWR) };
            if raw < 0 {
                return Err(
                    SandboxError::Child("failed to open notif fd from /proc".into()).into(),
                );
            }
            Some(unsafe { OwnedFd::from_raw_fd(raw) })
        };

        if let Some(notif_fd) = notif_fd {
            // Patch the vDSO before exec when time or randomness is
            // virtualised; failure is only logged.
            if self.policy.time_start.is_some() || self.policy.random_seed.is_some() {
                let time_offset = self.policy.time_start.map(|t| crate::time::calculate_time_offset(t));
                if let Err(e) = crate::vdso::patch(pid, time_offset, self.policy.random_seed.is_some()) {
                    eprintln!("sandlock: pre-exec vDSO patching failed (will retry after exec): {}", e);
                }
            }

            let time_offset_val = self.policy.time_start
                .map(|t| crate::time::calculate_time_offset(t))
                .unwrap_or(0);

            // Immutable snapshot of the policy bits the supervisor consults.
            let notif_policy = NotifPolicy {
                max_memory_bytes: self.policy.max_memory.map(|m| m.0).unwrap_or(0),
                max_processes: self.policy.max_processes,
                has_memory_limit: self.policy.max_memory.is_some(),
                has_net_allowlist: self.policy.net_allow_hosts.is_some()
                    || self.policy.policy_fn.is_some()
                    || !self.policy.http_allow.is_empty()
                    || !self.policy.http_deny.is_empty(),
                has_random_seed: self.policy.random_seed.is_some(),
                has_time_start: self.policy.time_start.is_some(),
                time_offset: time_offset_val,
                num_cpus: self.policy.num_cpus,
                port_remap: self.policy.port_remap,
                // Seccomp COW applies when a workdir is set without fs isolation.
                cow_enabled: self.policy.workdir.is_some() && self.policy.fs_isolation == FsIsolation::None,
                chroot_root: self.policy.chroot.as_ref().and_then(|p| std::fs::canonicalize(p).ok()),
                chroot_readable: self.policy.fs_readable.clone(),
                chroot_writable: self.policy.fs_writable.clone(),
                chroot_denied: self.policy.fs_denied.clone(),
                // Host paths are canonicalised when possible, else used as given.
                chroot_mounts: self.policy.fs_mount.iter().map(|(vp, hp)| {
                    (vp.clone(), std::fs::canonicalize(hp).unwrap_or_else(|_| hp.clone()))
                }).collect(),
                deterministic_dirs: self.policy.deterministic_dirs,
                hostname: self.policy.hostname.clone(),
                has_http_acl: !self.policy.http_allow.is_empty() || !self.policy.http_deny.is_empty(),
                virtual_etc_hosts,
            };

            use rand::SeedableRng;
            use rand_chacha::ChaCha8Rng;

            // Seeded RNG (when a seed is configured) and time offset for the
            // time/random state.
            let random_state = self.policy.random_seed.map(|seed| ChaCha8Rng::seed_from_u64(seed));
            let time_offset = self.policy.time_start.map(|t| crate::time::calculate_time_offset(t));

            let time_random_state = TimeRandomState::new(time_offset, random_state);

            // Network state: allowlist of resolved IPs or unrestricted, plus
            // HTTP ACL proxy details.
            let mut net_state = NetworkState::new();
            net_state.network_policy = if self.policy.net_allow_hosts.is_some() {
                crate::seccomp::notif::NetworkPolicy::AllowList(resolved_ips)
            } else {
                crate::seccomp::notif::NetworkPolicy::Unrestricted
            };
            net_state.http_acl_addr = self.http_acl_handle.as_ref().map(|h| h.addr);
            net_state.http_acl_ports = self.policy.http_ports.iter().copied().collect();
            net_state.http_acl_orig_dest = self.http_acl_handle.as_ref().map(|h| h.orig_dest.clone());
            // The on-bind callback is moved out of the sandbox into the port map.
            if let Some(cb) = self.on_bind.take() {
                net_state.port_map.on_bind = Some(cb);
            }

            let procfs_state = ProcfsState::new();

            let mut res_state = ResourceState::new(
                notif_policy.max_memory_bytes,
                notif_policy.max_processes,
            );
            // The process count starts at 1 (the child that was just forked).
            res_state.proc_count = 1;

            // Seccomp COW branch; a creation failure is logged and the sandbox
            // continues without one.
            let mut cow_state = CowState::new();
            if self.policy.workdir.is_some() && self.policy.fs_isolation == FsIsolation::None {
                let workdir = self.policy.workdir.as_ref().unwrap();
                let storage = self.policy.fs_storage.as_deref();
                let max_disk = self.policy.max_disk.map(|b| b.0).unwrap_or(0);
                match crate::cow::seccomp::SeccompCowBranch::create(workdir, storage, max_disk) {
                    Ok(branch) => { cow_state.branch = Some(branch); }
                    Err(e) => { eprintln!("sandlock: seccomp COW branch creation failed: {}", e); }
                }
            }

            let mut policy_fn_state = PolicyFnState::new();

            // Seed the denied-path set from the policy; skipped if the lock
            // cannot be acquired.
            if let Ok(mut denied) = policy_fn_state.denied_paths.write() {
                for path in &self.policy.fs_denied {
                    denied.insert(path.to_string_lossy().into_owned());
                }
            }

            // Dynamic policy callback: `live` is the adjustable policy and
            // `ceiling` a copy of its initial value.
            if let Some(ref callback) = self.policy.policy_fn {
                let live = crate::policy_fn::LivePolicy {
                    allowed_ips: match &net_state.network_policy {
                        crate::seccomp::notif::NetworkPolicy::AllowList(ips) => ips.clone(),
                        crate::seccomp::notif::NetworkPolicy::Unrestricted => std::collections::HashSet::new(),
                    },
                    max_memory_bytes: notif_policy.max_memory_bytes,
                    max_processes: notif_policy.max_processes,
                };
                let ceiling = live.clone();
                let live = std::sync::Arc::new(std::sync::RwLock::new(live));
                let denied_paths = policy_fn_state.denied_paths.clone();
                let pid_overrides = net_state.pid_ip_overrides.clone();
                policy_fn_state.live_policy = Some(live.clone());
                let tx = crate::policy_fn::spawn_policy_fn(
                    callback.clone(), live, ceiling, pid_overrides, denied_paths,
                );
                policy_fn_state.event_tx = Some(tx);
            }

            let chroot_state = ChrootState::new();

            // Raw fd numbers recorded in the context; the owned fds stay with
            // the supervisor task (`notif_fd`) and the sandbox (`pidfd`).
            use std::os::unix::io::AsRawFd;
            let notif_raw_fd = notif_fd.as_raw_fd();
            let child_pidfd_raw = pidfd.as_ref().map(|pfd| pfd.as_raw_fd());

            // Shared state: the sandbox keeps handles to the resource, COW and
            // network state.
            let res_state = Arc::new(Mutex::new(res_state));
            self.supervisor_resource = Some(Arc::clone(&res_state));

            let cow_state = Arc::new(Mutex::new(cow_state));
            self.supervisor_cow = Some(Arc::clone(&cow_state));

            let net_state = Arc::new(Mutex::new(net_state));
            self.supervisor_network = Some(Arc::clone(&net_state));

            let procfs_state = Arc::new(Mutex::new(procfs_state));
            let time_random_state = Arc::new(Mutex::new(time_random_state));
            let policy_fn_state = Arc::new(Mutex::new(policy_fn_state));
            let chroot_state = Arc::new(Mutex::new(chroot_state));
            let processes = Arc::new(crate::seccomp::state::ProcessIndex::new());

            let ctx = Arc::new(SupervisorCtx {
                resource: Arc::clone(&res_state),
                cow: Arc::clone(&cow_state),
                procfs: Arc::clone(&procfs_state),
                network: Arc::clone(&net_state),
                time_random: Arc::clone(&time_random_state),
                policy_fn: Arc::clone(&policy_fn_state),
                chroot: Arc::clone(&chroot_state),
                netlink: Arc::new(crate::netlink::NetlinkState::new()),
                processes: Arc::clone(&processes),
                policy: Arc::new(notif_policy),
                child_pidfd: child_pidfd_raw,
                notif_fd: notif_raw_fd,
            });

            // Notification supervisor task; it takes ownership of the fd.
            self.notif_handle = Some(tokio::spawn(
                notif::supervisor(notif_fd, ctx),
            ));

            // Load-average sampler: one sample of the process count every
            // 5 seconds. The first `tick()` is awaited before the loop starts.
            let la_resource = Arc::clone(&res_state);
            self.loadavg_handle = Some(tokio::spawn(async move {
                let mut interval = tokio::time::interval(Duration::from_secs(5));
                interval.tick().await;
                loop {
                    interval.tick().await;
                    let mut rs = la_resource.lock().await;
                    let running = rs.proc_count;
                    rs.load_avg.sample(running);
                }
            }));
        }

        // CPU throttling is started only for limits below 100%.
        if let Some(cpu_pct) = self.policy.max_cpu {
            if cpu_pct < 100 {
                let child_pid = pid;
                self.throttle_handle = Some(tokio::spawn(throttle_cpu(child_pid, cpu_pct)));
            }
        }

        // Send the ready signal to the child now that the supervisor side is
        // in place.
        write_u32_fd(pipes.ready_w.as_raw_fd(), 1)
            .map_err(|e| SandboxError::Child(format!("write ready signal: {}", e)))?;

        self.pidfd = pidfd;

        Ok(())
    }
1089}
1090
1091impl Drop for Sandbox {
1096 fn drop(&mut self) {
1097 if let Some(pid) = self.child_pid {
1098 if matches!(self.state, SandboxState::Running | SandboxState::Paused) {
1099 unsafe { libc::killpg(pid, libc::SIGKILL) };
1101 let mut status: i32 = 0;
1103 unsafe { libc::waitpid(pid, &mut status, 0) };
1104 }
1105 }
1106
1107 if let Some(h) = self.notif_handle.take() {
1108 h.abort();
1109 }
1110 if let Some(h) = self.throttle_handle.take() {
1111 h.abort();
1112 }
1113 if let Some(h) = self.loadavg_handle.take() {
1114 h.abort();
1115 }
1116
1117 let is_error = matches!(
1120 self.state,
1121 SandboxState::Stopped(ref s) if !matches!(s, ExitStatus::Code(0))
1122 );
1123 let action = if is_error {
1124 &self.policy.on_error
1125 } else {
1126 &self.policy.on_exit
1127 };
1128
1129 if let Some(ref branch) = self.cow_branch {
1131 match action {
1132 BranchAction::Commit => { let _ = branch.commit(); }
1133 BranchAction::Abort => { let _ = branch.abort(); }
1134 BranchAction::Keep => {}
1135 }
1136 }
1137
1138 if let Some(ref mut cow) = self.seccomp_cow {
1140 match action {
1141 BranchAction::Commit => { let _ = cow.commit(); }
1142 BranchAction::Abort => { let _ = cow.abort(); }
1143 BranchAction::Keep => {}
1144 }
1145 }
1146 }
1147}
1148
1149async fn throttle_cpu(pid: i32, cpu_pct: u8) {
1155 let period = Duration::from_millis(100);
1156 let run_time = period * cpu_pct as u32 / 100;
1157 let stop_time = period - run_time;
1158
1159 loop {
1160 tokio::time::sleep(run_time).await;
1161 if unsafe { libc::killpg(pid, libc::SIGSTOP) } < 0 {
1162 break;
1163 }
1164 tokio::time::sleep(stop_time).await;
1165 if unsafe { libc::killpg(pid, libc::SIGCONT) } < 0 {
1166 break;
1167 }
1168 }
1169}
1170
/// Best-effort fill of `buf` from the raw descriptor `fd`.
///
/// Reads until `buf` is full, end-of-file is reached, or a read fails with
/// anything other than an interruption. Interrupted reads are retried rather
/// than mistaken for end-of-file. Bytes that could not be read are left
/// untouched. `fd` is borrowed: it is never closed here. A negative `fd` is
/// ignored.
fn read_exact(fd: i32, buf: &mut [u8]) {
    use std::io::Read;

    if fd < 0 {
        return;
    }

    // SAFETY: the caller guarantees `fd` is open for the duration of this
    // call. `ManuallyDrop` keeps the temporary `File` from closing a
    // descriptor this function does not own.
    let mut file = std::mem::ManuallyDrop::new(unsafe { std::fs::File::from_raw_fd(fd) });

    let mut filled = 0;
    while filled < buf.len() {
        match file.read(&mut buf[filled..]) {
            Ok(0) => break,
            Ok(n) => filled += n,
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(_) => break,
        }
    }
}
1186
/// Consumes `fd`, reads it until end-of-file, and returns the bytes read.
///
/// A read error is ignored; whatever was read before it is returned. The
/// descriptor is closed before this function returns.
fn read_fd_to_end(fd: OwnedFd) -> Vec<u8> {
    use std::io::Read;

    let mut contents = Vec::new();
    // The temporary `File` owns the descriptor and closes it at the end of
    // this statement.
    let _ = std::fs::File::from(fd).read_to_end(&mut contents);
    contents
}
1194
1195fn wait_status_to_exit(status: i32) -> ExitStatus {
1196 if libc::WIFEXITED(status) {
1197 ExitStatus::Code(libc::WEXITSTATUS(status))
1198 } else if libc::WIFSIGNALED(status) {
1199 let sig = libc::WTERMSIG(status);
1200 if sig == libc::SIGKILL {
1201 ExitStatus::Killed
1202 } else {
1203 ExitStatus::Signal(sig)
1204 }
1205 } else {
1206 ExitStatus::Killed
1207 }
1208}