use std::ffi::CString;
use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Mutex;
use tokio::task::JoinHandle;
use std::sync::atomic::{AtomicBool, Ordering};
use crate::context::{self, CowConfig, PipePair, read_u32_fd, write_u32_fd};
use crate::cow::{CowBranch, overlayfs::OverlayBranch, branchfs::BranchFsBranch};
use crate::error::{SandboxError, SandlockError};
use crate::network;
use crate::policy::{BranchAction, FsIsolation, Policy};
use crate::result::{ExitStatus, RunResult};
use crate::seccomp::notif::{self, NotifPolicy, SupervisorState};
use crate::sys::syscall;
/// Process-wide flag set once this process has confined itself (see `Sandbox::fork`).
pub(crate) static CONFINED: AtomicBool = AtomicBool::new(false);

/// Reports whether the current process already runs inside a sandbox.
///
/// Returns `true` when [`CONFINED`] is set, or when the first `Seccomp:` line of
/// `/proc/self/status` ends with `2`. Any failure to read the file yields `false`.
pub fn is_nested() -> bool {
    if CONFINED.load(Ordering::Relaxed) {
        return true;
    }
    std::fs::read_to_string("/proc/self/status")
        .ok()
        .and_then(|status| {
            status
                .lines()
                .find(|entry| entry.starts_with("Seccomp:"))
                .map(|entry| entry.trim().ends_with('2'))
        })
        .unwrap_or(false)
}
/// Lifecycle of a [`Sandbox`] as tracked by the parent process.
enum SandboxState {
/// Constructed, nothing spawned yet; the only state `do_spawn` accepts.
Created,
/// A child has been forked and has not been observed to exit.
Running,
/// Set by `pause()` after SIGSTOP was delivered to the child's process group.
Paused,
/// The child was waited for; carries the exit status that was recorded.
Stopped(ExitStatus),
}
/// Handle to one sandboxed child process and the parent-side resources that supervise it.
pub struct Sandbox {
/// Private copy of the policy this sandbox was created with.
policy: Policy,
/// Current lifecycle state; see [`SandboxState`].
state: SandboxState,
/// Pid of the forked child; also used as the process-group id for `killpg`.
child_pid: Option<i32>,
/// pidfd for the child, when `pidfd_open` succeeded in `do_spawn`.
pidfd: Option<OwnedFd>,
/// Tokio task running the seccomp notification supervisor; aborted in `wait()` and `Drop`.
notif_handle: Option<JoinHandle<()>>,
/// Tokio task running `throttle_cpu`; aborted in `wait()` and `Drop`.
throttle_handle: Option<JoinHandle<()>>,
/// Read end of the captured-stdout pipe (capture mode only); drained by `wait()`.
_stdout_read: Option<OwnedFd>,
/// Read end of the captured-stderr pipe (capture mode only); drained by `wait()`.
_stderr_read: Option<OwnedFd>,
/// Copy-on-write branch created for `FsIsolation::OverlayFs` / `FsIsolation::BranchFs`.
cow_branch: Option<Box<dyn CowBranch>>,
/// State shared with the notification supervisor task.
supervisor_state: Option<Arc<Mutex<SupervisorState>>>,
/// Parent end of the control pipe retained after `fork()`.
ctrl_fd: Option<OwnedFd>,
/// Read end of a per-clone pipe assigned by `fork()`; consumed by `reduce()`.
stdout_pipe: Option<OwnedFd>,
/// One-shot initializer run in the template process by `fork()`.
init_fn: Option<Box<dyn FnOnce() + Send + 'static>>,
/// Work function handed to `fork_ready_loop_fn` by `fork()`.
work_fn: Option<Arc<dyn Fn(u32) + Send + Sync + 'static>>,
/// Optional raw fds `(stdin, stdout, stderr)` that the child `dup2`s onto 0/1/2.
io_overrides: Option<(Option<i32>, Option<i32>, Option<i32>)>,
}
impl Sandbox {
/// Creates a sandbox in the `Created` state holding a clone of `policy`.
///
/// Currently never fails; the `Result` is part of the public signature.
pub fn new(policy: &Policy) -> Result<Self, SandlockError> {
    let sandbox = Self::create(policy);
    Ok(sandbox)
}
/// Creates a sandbox that carries the closures required by [`Sandbox::fork`].
///
/// `init_fn` is stored as the one-shot initializer and `work_fn` as the shared
/// work function; both are consumed by `fork()`.
pub fn new_with_fns(
    policy: &Policy,
    init_fn: impl FnOnce() + Send + 'static,
    work_fn: impl Fn(u32) + Send + Sync + 'static,
) -> Result<Self, SandlockError> {
    let boxed_init: Box<dyn FnOnce() + Send + 'static> = Box::new(init_fn);
    let shared_work: Arc<dyn Fn(u32) + Send + Sync + 'static> = Arc::new(work_fn);
    let mut sandbox = Self::create(policy);
    sandbox.init_fn = Some(boxed_init);
    sandbox.work_fn = Some(shared_work);
    Ok(sandbox)
}
/// Builds the initial value: cloned policy, `Created` state, every resource slot empty.
fn create(policy: &Policy) -> Self {
    let policy = policy.clone();
    Self {
        policy,
        state: SandboxState::Created,
        // Process identity.
        child_pid: None,
        pidfd: None,
        // Background tasks.
        notif_handle: None,
        throttle_handle: None,
        // Pipes and control channels.
        _stdout_read: None,
        _stderr_read: None,
        ctrl_fd: None,
        stdout_pipe: None,
        io_overrides: None,
        // Filesystem isolation and supervisor state.
        cow_branch: None,
        supervisor_state: None,
        // fork() closures.
        init_fn: None,
        work_fn: None,
    }
}
/// Runs `cmd` under `policy` with stdout/stderr captured and waits for it to finish.
pub async fn run(policy: &Policy, cmd: &[&str]) -> Result<RunResult, SandlockError> {
    let mut sandbox = Self::new(policy)?;
    sandbox.do_spawn(cmd, true).await?;
    let outcome = sandbox.wait().await;
    outcome
}
/// Runs `cmd` under `policy` without capturing output and waits for it to finish.
pub async fn run_interactive(policy: &Policy, cmd: &[&str]) -> Result<RunResult, SandlockError> {
    let mut sandbox = Self::new(policy)?;
    sandbox.do_spawn(cmd, false).await?;
    let outcome = sandbox.wait().await;
    outcome
}
/// Runs `cmd` with captured output, reports the filesystem changes it made, then
/// aborts the copy-on-write branch.
///
/// Both branch actions are forced to `Keep` on a private copy of the policy so the
/// branch is still available for inspection once the command has exited.
pub async fn dry_run(policy: &Policy, cmd: &[&str]) -> Result<crate::dry_run::DryRunResult, SandlockError> {
    let mut keep_policy = policy.clone();
    keep_policy.on_error = BranchAction::Keep;
    keep_policy.on_exit = BranchAction::Keep;

    let mut sandbox = Self::new(&keep_policy)?;
    sandbox.do_spawn(cmd, true).await?;
    let run_result = sandbox.wait().await?;

    let changes = sandbox.collect_changes().await;
    sandbox.do_abort().await;
    Ok(crate::dry_run::DryRunResult { run_result, changes })
}
/// Same as [`Sandbox::dry_run`] but spawns the command without capturing output.
///
/// Both branch actions are forced to `Keep` on a private copy of the policy so the
/// branch is still available for inspection once the command has exited.
pub async fn dry_run_interactive(policy: &Policy, cmd: &[&str]) -> Result<crate::dry_run::DryRunResult, SandlockError> {
    let mut keep_policy = policy.clone();
    keep_policy.on_error = BranchAction::Keep;
    keep_policy.on_exit = BranchAction::Keep;

    let mut sandbox = Self::new(&keep_policy)?;
    sandbox.do_spawn(cmd, false).await?;
    let run_result = sandbox.wait().await?;

    let changes = sandbox.collect_changes().await;
    sandbox.do_abort().await;
    Ok(crate::dry_run::DryRunResult { run_result, changes })
}
/// Returns the filesystem changes recorded by whichever copy-on-write branch exists.
///
/// The sandbox-owned branch takes precedence; otherwise the branch stored in the
/// supervisor state is consulted. A failing `changes()` call yields an empty list.
async fn collect_changes(&self) -> Vec<crate::dry_run::Change> {
    if let Some(ref branch) = self.cow_branch {
        return branch.changes().unwrap_or_default();
    }
    if let Some(ref state) = self.supervisor_state {
        // Wait for the lock instead of `try_lock`: the supervisor task may still hold
        // it for a moment after being aborted, and giving up at that point would
        // silently report "no changes".
        let st = state.lock().await;
        if let Some(ref cow) = st.cow_branch {
            return cow.changes().unwrap_or_default();
        }
    }
    Vec::new()
}
/// Aborts every copy-on-write branch associated with this sandbox.
///
/// The sandbox-owned branch is taken and aborted; the supervisor-owned branch is
/// aborted in place. Errors from `abort()` are deliberately ignored (best effort).
async fn do_abort(&mut self) {
    if let Some(branch) = self.cow_branch.take() {
        let _ = branch.abort();
    }
    if let Some(ref state) = self.supervisor_state {
        // Wait for the lock instead of `try_lock` so a momentarily busy supervisor
        // cannot cause the abort to be skipped and the branch to be left behind.
        let mut st = state.lock().await;
        if let Some(ref mut cow) = st.cow_branch {
            let _ = cow.abort();
        }
    }
}
/// Forks a confined template process, lets it fan out into `n` clones, and returns
/// one already-stopped `Sandbox` per clone that reported a positive pid.
///
/// The template applies Landlock and a seccomp deny filter, runs `init_fn` once, then
/// hands control to `fork_ready_loop_fn`. Over the control pipe the parent reads `n`
/// big-endian clone pids followed by one big-endian exit code per live clone, and then
/// reaps the template. Each returned sandbox owns the read end of its clone's pipe
/// (`stdout_pipe`), if that pipe could be created.
///
/// # Errors
/// Fails if the sandbox was not built with `new_with_fns()`, or if creating the
/// control pipe or forking fails.
///
/// NOTE(review): the control-pipe reads and `waitpid` block the calling runtime thread.
pub async fn fork(&mut self, n: u32) -> Result<Vec<Sandbox>, SandlockError> {
    let init_fn = self.init_fn.take()
        .ok_or_else(|| SandboxError::Child("fork() requires new_with_fns()".into()))?;
    let work_fn = self.work_fn.take()
        .ok_or_else(|| SandboxError::Child("fork() requires new_with_fns()".into()))?;
    let policy = self.policy.clone();

    let mut ctrl_fds = [0i32; 2];
    if unsafe { libc::pipe2(ctrl_fds.as_mut_ptr(), 0) } < 0 {
        return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
    }
    let ctrl_parent = unsafe { OwnedFd::from_raw_fd(ctrl_fds[0]) };
    let ctrl_child_fd = ctrl_fds[1];

    // One pipe per clone. Both vectors always receive exactly one entry per index so
    // that slot `i` of the read ends matches slot `i` of the write ends (and thus the
    // i-th reported pid) even when a pipe could not be created.
    let mut pipe_read_ends: Vec<Option<OwnedFd>> = Vec::with_capacity(n as usize);
    let mut pipe_write_fds: Vec<i32> = Vec::with_capacity(n as usize);
    for _ in 0..n {
        let mut pfds = [0i32; 2];
        if unsafe { libc::pipe(pfds.as_mut_ptr()) } >= 0 {
            pipe_read_ends.push(Some(unsafe { OwnedFd::from_raw_fd(pfds[0]) }));
            pipe_write_fds.push(pfds[1]);
        } else {
            pipe_read_ends.push(None);
            pipe_write_fds.push(-1);
        }
    }

    let pid = unsafe { libc::fork() };
    if pid < 0 {
        // Capture errno before any close() can overwrite it.
        let fork_err = std::io::Error::last_os_error();
        unsafe { libc::close(ctrl_child_fd) };
        for &wfd in &pipe_write_fds {
            if wfd >= 0 {
                unsafe { libc::close(wfd) };
            }
        }
        return Err(SandboxError::Fork(fork_err).into());
    }

    if pid == 0 {
        // Template process: confine, initialise, then serve the clone loop.
        drop(ctrl_parent);
        unsafe { libc::setpgid(0, 0) };
        unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) };
        unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
        let _ = crate::landlock::confine(&policy);
        let deny = crate::context::deny_syscall_numbers(&policy);
        let args = crate::context::arg_filters(&policy);
        let filter = crate::seccomp::bpf::assemble_filter(&[], &deny, &args);
        let _ = crate::seccomp::bpf::install_deny_filter(&filter);
        CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
        init_fn();
        drop(pipe_read_ends);
        crate::fork::fork_ready_loop_fn(ctrl_child_fd, n, &*work_fn, &pipe_write_fds);
        unsafe { libc::_exit(0) };
    }

    // Parent: release the child-side descriptors.
    unsafe { libc::close(ctrl_child_fd) };
    for &wfd in &pipe_write_fds {
        if wfd >= 0 {
            unsafe { libc::close(wfd) };
        }
    }
    self.child_pid = Some(pid);
    self.state = SandboxState::Running;

    // Protocol: n big-endian pids, then one big-endian exit code per live clone.
    let ctrl_fd = ctrl_parent.as_raw_fd();
    let mut pid_buf = vec![0u8; n as usize * 4];
    read_exact(ctrl_fd, &mut pid_buf);
    let clone_pids: Vec<i32> = pid_buf
        .chunks_exact(4)
        .map(|c| u32::from_be_bytes(c.try_into().unwrap_or([0; 4])) as i32)
        .collect();
    let live_count = clone_pids.iter().filter(|&&p| p > 0).count();
    let mut code_buf = vec![0u8; live_count * 4];
    read_exact(ctrl_fd, &mut code_buf);
    self.ctrl_fd = Some(ctrl_parent);

    // Reap the template and record that it is gone. Leaving the state at `Running`
    // would make `Drop` send SIGKILL to a pid/pgid that may have been recycled, and
    // would make a later `wait()` fail with ECHILD and report `Killed`.
    let mut status = 0i32;
    let reaped = loop {
        let ret = unsafe { libc::waitpid(pid, &mut status, 0) };
        if ret < 0 && std::io::Error::last_os_error().raw_os_error() == Some(libc::EINTR) {
            continue;
        }
        break ret;
    };
    self.state = SandboxState::Stopped(if reaped == pid {
        wait_status_to_exit(status)
    } else {
        ExitStatus::Killed
    });

    let mut codes = code_buf.chunks_exact(4);
    let mut clones = Vec::with_capacity(live_count);
    for (&clone_pid, pipe) in clone_pids.iter().zip(pipe_read_ends) {
        if clone_pid <= 0 {
            continue;
        }
        let code = codes
            .next()
            .map(|c| i32::from_be_bytes(c.try_into().unwrap_or([0; 4])))
            .unwrap_or(0);
        let mut sb = Sandbox::create(&policy);
        sb.child_pid = Some(clone_pid);
        // Negative codes are reported as `Killed`; everything else is an exit code.
        sb.state = SandboxState::Stopped(if code >= 0 {
            ExitStatus::Code(code)
        } else {
            ExitStatus::Killed
        });
        sb.stdout_pipe = pipe;
        clones.push(sb);
    }
    Ok(clones)
}
pub async fn reduce(
&self,
cmd: &[&str],
clones: &mut [Sandbox],
) -> Result<RunResult, SandlockError> {
let mut combined = Vec::new();
for clone in clones.iter_mut() {
if let Some(pipe) = clone.stdout_pipe.take() {
combined.extend_from_slice(&read_fd_to_end(pipe));
}
}
let mut stdin_fds = [0i32; 2];
if unsafe { libc::pipe2(stdin_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
}
let write_fd = stdin_fds[1];
let write_handle = tokio::task::spawn_blocking(move || {
unsafe {
libc::write(write_fd, combined.as_ptr() as *const _, combined.len());
libc::close(write_fd);
}
});
let mut reducer = Sandbox::new(&self.policy)?;
reducer.io_overrides = Some((Some(stdin_fds[0]), None, None));
reducer.do_spawn(cmd, true).await?;
unsafe { libc::close(stdin_fds[0]) };
let _ = write_handle.await;
reducer.wait().await
}
/// Waits for the child to exit and returns its exit status plus any captured output.
///
/// If the sandbox is already `Stopped`, the recorded status is returned with no
/// output. Otherwise the child is reaped on a blocking thread, the state becomes
/// `Stopped`, and the supervisor and throttle tasks are aborted.
///
/// Captured stdout/stderr are drained concurrently with the wait. Draining only after
/// `waitpid` would deadlock as soon as the child writes more than the pipe capacity:
/// the child blocks in `write` while the parent blocks in `waitpid`.
///
/// # Errors
/// Returns `SandboxError::NotRunning` if no child was ever spawned.
pub async fn wait(&mut self) -> Result<RunResult, SandlockError> {
    let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
    if let SandboxState::Stopped(ref es) = self.state {
        return Ok(RunResult {
            exit_status: es.clone(),
            stdout: None,
            stderr: None,
        });
    }

    // Start draining before waiting so the child can never block on a full pipe.
    let stdout_task = self
        ._stdout_read
        .take()
        .map(|fd| tokio::task::spawn_blocking(move || read_fd_to_end(fd)));
    let stderr_task = self
        ._stderr_read
        .take()
        .map(|fd| tokio::task::spawn_blocking(move || read_fd_to_end(fd)));

    let exit_status = tokio::task::spawn_blocking(move || -> ExitStatus {
        let mut status: i32 = 0;
        loop {
            let ret = unsafe { libc::waitpid(pid, &mut status, 0) };
            if ret < 0 {
                let err = std::io::Error::last_os_error();
                if err.raw_os_error() == Some(libc::EINTR) {
                    continue;
                }
                // Any other waitpid failure is reported as a killed child.
                return ExitStatus::Killed;
            }
            break;
        }
        wait_status_to_exit(status)
    })
    .await
    .unwrap_or(ExitStatus::Killed);

    self.state = SandboxState::Stopped(exit_status.clone());
    if let Some(h) = self.notif_handle.take() {
        h.abort();
    }
    if let Some(h) = self.throttle_handle.take() {
        h.abort();
    }

    // A drain task that panicked or was cancelled yields empty output.
    let stdout = match stdout_task {
        Some(task) => Some(task.await.unwrap_or_default()),
        None => None,
    };
    let stderr = match stderr_task {
        Some(task) => Some(task.await.unwrap_or_default()),
        None => None,
    };

    Ok(RunResult {
        exit_status,
        stdout,
        stderr,
    })
}
pub fn pause(&mut self) -> Result<(), SandlockError> {
let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
let ret = unsafe { libc::killpg(pid, libc::SIGSTOP) };
if ret < 0 {
return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
}
self.state = SandboxState::Paused;
Ok(())
}
pub fn resume(&mut self) -> Result<(), SandlockError> {
let pid = self.child_pid.ok_or(SandboxError::NotRunning)?;
let ret = unsafe { libc::killpg(pid, libc::SIGCONT) };
if ret < 0 {
return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
}
self.state = SandboxState::Running;
Ok(())
}
/// Sends SIGKILL to the child's process group.
///
/// A missing process group (`ESRCH`) counts as success. The recorded state is not
/// changed here.
///
/// # Errors
/// `SandboxError::NotRunning` if no child exists; `SandboxError::Io` for any other
/// `killpg` failure.
pub fn kill(&mut self) -> Result<(), SandlockError> {
    let pgid = self.child_pid.ok_or(SandboxError::NotRunning)?;
    if unsafe { libc::killpg(pgid, libc::SIGKILL) } >= 0 {
        return Ok(());
    }
    let err = std::io::Error::last_os_error();
    match err.raw_os_error() {
        Some(libc::ESRCH) => Ok(()),
        _ => Err(SandboxError::Io(err).into()),
    }
}
pub fn pid(&self) -> Option<i32> {
self.child_pid
}
/// Returns `true` while the recorded state is `Running` or `Paused`.
#[doc(hidden)]
pub fn is_running(&self) -> bool {
    match self.state {
        SandboxState::Running | SandboxState::Paused => true,
        SandboxState::Created | SandboxState::Stopped(_) => false,
    }
}
pub fn policy(&self) -> &Policy {
&self.policy
}
/// Commits the sandbox-owned copy-on-write branch, if any, and releases it.
///
/// # Errors
/// A failing commit is wrapped in `SandboxError::Branch`.
#[doc(hidden)]
pub async fn commit(&mut self) -> Result<(), SandlockError> {
    let Some(branch) = self.cow_branch.take() else {
        return Ok(());
    };
    branch
        .commit()
        .map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
    Ok(())
}
/// Aborts the sandbox-owned copy-on-write branch, if any, and releases it.
///
/// # Errors
/// A failing abort is wrapped in `SandboxError::Branch`.
#[doc(hidden)]
pub async fn abort_branch(&mut self) -> Result<(), SandlockError> {
    let Some(branch) = self.cow_branch.take() else {
        return Ok(());
    };
    branch
        .abort()
        .map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
    Ok(())
}
/// Stops the sandbox for inspection: sets `hold_forks` in the supervisor state, then
/// sends SIGSTOP to the child's process group.
///
/// Unlike `pause()`, the recorded `SandboxState` is not changed.
///
/// # Errors
/// `SandboxError::NotRunning` if no child exists. `SandboxError::Io` if SIGSTOP could
/// not be delivered; in that case the supervisor state is reset the same way `thaw()`
/// resets it, so forks are not left held for a process that was never stopped.
pub(crate) async fn freeze(&self) -> Result<(), SandlockError> {
    let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
    if let Some(ref state) = self.supervisor_state {
        let mut st = state.lock().await;
        st.hold_forks = true;
    }
    if unsafe { libc::killpg(pid, libc::SIGSTOP) } < 0 {
        // Read errno before awaiting anything else.
        let err = std::io::Error::last_os_error();
        if let Some(ref state) = self.supervisor_state {
            let mut st = state.lock().await;
            st.hold_forks = false;
            st.held_notif_ids.clear();
        }
        return Err(SandboxError::Io(err).into());
    }
    Ok(())
}
pub(crate) async fn thaw(&self) -> Result<(), SandlockError> {
let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
if let Some(ref state) = self.supervisor_state {
let mut st = state.lock().await;
st.hold_forks = false;
st.held_notif_ids.clear();
}
unsafe { libc::killpg(pid, libc::SIGCONT); }
Ok(())
}
/// Spawns `cmd` in the sandbox without capturing its output; does not wait for it.
#[doc(hidden)]
pub async fn spawn(&mut self, cmd: &[&str]) -> Result<(), SandlockError> {
    let capture_output = false;
    self.do_spawn(cmd, capture_output).await
}
/// Spawns `cmd` in the sandbox with stdout/stderr captured; does not wait for it.
#[doc(hidden)]
pub async fn spawn_captured(&mut self, cmd: &[&str]) -> Result<(), SandlockError> {
    let capture_output = true;
    self.do_spawn(cmd, capture_output).await
}
/// Spawns `cmd` without capture, redirecting the child's stdin/stdout/stderr to the
/// given raw fds (each `None` leaves that stream untouched).
///
/// The fds are only recorded here; the child duplicates them onto 0/1/2 after fork.
#[doc(hidden)]
pub async fn spawn_with_io(
    &mut self,
    cmd: &[&str],
    stdin_fd: Option<std::os::unix::io::RawFd>,
    stdout_fd: Option<std::os::unix::io::RawFd>,
    stderr_fd: Option<std::os::unix::io::RawFd>,
) -> Result<(), SandlockError> {
    let overrides = (stdin_fd, stdout_fd, stderr_fd);
    self.io_overrides = Some(overrides);
    self.do_spawn(cmd, false).await
}
/// Freezes the sandbox, captures a checkpoint of the child, and thaws it again.
///
/// The sandbox is thawed whether or not the capture succeeded; a thaw error takes
/// precedence over the capture result.
///
/// # Errors
/// `SandboxError::NotRunning` if no child exists, or any error from freezing,
/// capturing, or thawing.
pub async fn checkpoint(&self) -> Result<crate::checkpoint::Checkpoint, SandlockError> {
    let pid = self.child_pid.ok_or(SandlockError::Sandbox(SandboxError::NotRunning))?;
    self.freeze().await?;
    let snapshot = crate::checkpoint::capture(pid, &self.policy);
    self.thaw().await.and(snapshot)
}
/// Forks the sandboxed child for `cmd` and sets up all parent-side supervision.
///
/// `capture` selects whether stdout/stderr are redirected into pipes that `wait()`
/// later drains. Only valid in the `Created` state and with a non-empty command.
/// On success the sandbox is `Running` and the child has been sent the ready signal.
///
/// Errors returned after the fork leave `child_pid`/`state` set, so `Drop` will kill
/// and reap the child.
async fn do_spawn(&mut self, cmd: &[&str], capture: bool) -> Result<(), SandlockError> {
// --- Argument and state validation ---
if !matches!(self.state, SandboxState::Created) {
return Err(SandboxError::Child("sandbox already spawned".into()).into());
}
if cmd.is_empty() {
return Err(SandboxError::Child("empty command".into()).into());
}
// Arguments containing an interior NUL cannot be converted to C strings.
let c_cmd: Vec<CString> = cmd
.iter()
.map(|s| CString::new(*s).map_err(|_| SandboxError::Child("invalid command string".into())))
.collect::<Result<Vec<_>, _>>()?;
let nested = is_nested();
// Pipes used for the parent/child handshake (notif fd number one way, ready signal the other).
let pipes = PipePair::new().map_err(SandboxError::Io)?;
// Resolve the host allow-list up front; an empty list skips resolution.
let resolved_ips = if !self.policy.net_allow_hosts.is_empty() {
network::resolve_hosts(&self.policy.net_allow_hosts)
.await
.map_err(SandboxError::Io)?
} else {
std::collections::HashSet::new()
};
// --- Filesystem isolation: create the copy-on-write branch before forking ---
let cow_branch: Option<Box<dyn CowBranch>> = match self.policy.fs_isolation {
FsIsolation::OverlayFs => {
let workdir = self.policy.workdir.as_ref()
.ok_or_else(|| SandlockError::Sandbox(SandboxError::Child("OverlayFs requires workdir".into())))?;
// Storage defaults to "<temp dir>/sandlock-overlay" when the policy names none.
let storage = self.policy.fs_storage.as_ref()
.cloned()
.unwrap_or_else(|| std::env::temp_dir().join("sandlock-overlay"));
std::fs::create_dir_all(&storage)
.map_err(|e| SandlockError::Sandbox(SandboxError::Io(e)))?;
let branch = OverlayBranch::create(workdir, &storage)
.map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
Some(Box::new(branch))
}
FsIsolation::BranchFs => {
let workdir = self.policy.workdir.as_ref()
.ok_or_else(|| SandlockError::Sandbox(SandboxError::Child("BranchFs requires workdir".into())))?;
let branch = BranchFsBranch::create(workdir)
.map_err(|e| SandlockError::Sandbox(SandboxError::Branch(e)))?;
Some(Box::new(branch))
}
FsIsolation::None => None,
};
// Only OverlayFs hands a mount description to the child: "upper" and "work" are
// siblings of the merged directory, and the workdir is the single lower layer.
let cow_config = if let Some(ref branch) = cow_branch {
if self.policy.fs_isolation == FsIsolation::OverlayFs {
// Cannot be None here: the OverlayFs arm above already required a workdir.
let workdir = self.policy.workdir.as_ref().unwrap();
let merged = branch.branch_path().to_path_buf();
// NOTE(review): panics if the branch path has no parent directory.
let branch_dir = merged.parent().unwrap();
let upper = branch_dir.join("upper");
let work = branch_dir.join("work");
Some(CowConfig {
merged,
upper,
work,
lowers: vec![workdir.clone()],
})
} else {
None
}
} else {
None
};
// --- Output capture pipes: (read end, write end) per stream ---
let (stdout_r, stderr_r) = if capture {
let mut stdout_fds = [0i32; 2];
let mut stderr_fds = [0i32; 2];
if unsafe { libc::pipe2(stdout_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
}
if unsafe { libc::pipe2(stderr_fds.as_mut_ptr(), libc::O_CLOEXEC) } < 0 {
unsafe {
libc::close(stdout_fds[0]);
libc::close(stdout_fds[1]);
}
// NOTE(review): errno is read after the close() calls above, which may have changed it.
return Err(SandboxError::Io(std::io::Error::last_os_error()).into());
}
(
Some((
unsafe { OwnedFd::from_raw_fd(stdout_fds[0]) },
unsafe { OwnedFd::from_raw_fd(stdout_fds[1]) },
)),
Some((
unsafe { OwnedFd::from_raw_fd(stderr_fds[0]) },
unsafe { OwnedFd::from_raw_fd(stderr_fds[1]) },
)),
)
} else {
(None, None)
};
// --- Fork ---
let pid = unsafe { libc::fork() };
if pid < 0 {
return Err(SandboxError::Fork(std::io::Error::last_os_error()).into());
}
if pid == 0 {
// Child: apply explicit fd overrides first, then capture pipes, so capture wins
// for stdout/stderr when both are configured.
if let Some((stdin_fd, stdout_fd, stderr_fd)) = self.io_overrides {
if let Some(fd) = stdin_fd {
unsafe { libc::dup2(fd, 0) };
}
if let Some(fd) = stdout_fd {
unsafe { libc::dup2(fd, 1) };
}
if let Some(fd) = stderr_fd {
unsafe { libc::dup2(fd, 2) };
}
}
if let Some((_, ref stdout_w)) = stdout_r {
unsafe { libc::dup2(stdout_w.as_raw_fd(), 1) };
}
if let Some((_, ref stderr_w)) = stderr_r {
unsafe { libc::dup2(stderr_w.as_raw_fd(), 2) };
}
// The duplicates on fds 1/2 stay open; the original pipe fds are closed here.
drop(stdout_r);
drop(stderr_r);
// NOTE(review): confine_child is presumably diverging (exec or exit); if it
// returned, the child would fall through into the parent path below — confirm.
context::confine_child(&self.policy, &c_cmd, &pipes, cow_config.as_ref(), nested);
}
// --- Parent: record the child and drop the child-side pipe ends ---
self.cow_branch = cow_branch;
drop(pipes.notif_w);
drop(pipes.ready_r);
// Keep only the read ends; dropping the write ends lets reads reach EOF once the child is done.
self._stdout_read = stdout_r.map(|(r, _w)| r);
self._stderr_read = stderr_r.map(|(r, _w)| r);
self.child_pid = Some(pid);
self.state = SandboxState::Running;
// A pidfd is optional; without one the notif fd is opened through /proc below.
let pidfd = match syscall::pidfd_open(pid as u32, 0) {
Ok(fd) => Some(fd),
Err(_) => None, };
// --- Handshake: the child reports the number of its seccomp notification fd ---
let notif_fd_num = read_u32_fd(pipes.notif_r.as_raw_fd())
.map_err(|e| SandboxError::Child(format!("read notif fd from child: {}", e)))?;
// A reported value of 0 is treated as "no notification fd": no supervisor is started.
let is_nested = notif_fd_num == 0;
let notif_fd = if is_nested {
None
} else if let Some(ref pfd) = pidfd {
// Preferred path: duplicate the child's fd into this process via pidfd_getfd.
Some(syscall::pidfd_getfd(pfd, notif_fd_num as i32, 0)
.map_err(|e| SandboxError::Child(format!("pidfd_getfd: {}", e)))?)
} else {
// Fallback: open the child's fd through /proc/<pid>/fd/<n>.
let path = format!("/proc/{}/fd/{}", pid, notif_fd_num);
let cpath = CString::new(path).unwrap();
let raw = unsafe { libc::open(cpath.as_ptr(), libc::O_RDWR) };
if raw < 0 {
return Err(
SandboxError::Child("failed to open notif fd from /proc".into()).into(),
);
}
Some(unsafe { OwnedFd::from_raw_fd(raw) })
};
// --- Supervisor setup (only when a notification fd is available) ---
if let Some(notif_fd) = notif_fd {
// vDSO patching is attempted only when time or randomness is virtualised; a
// failure is logged and not fatal.
if self.policy.time_start.is_some() || self.policy.random_seed.is_some() {
let time_offset = self.policy.time_start.map(|t| crate::time::calculate_time_offset(t));
if let Err(e) = crate::vdso::patch(pid, time_offset, self.policy.random_seed.is_some()) {
eprintln!("sandlock: pre-exec vDSO patching failed (will retry after exec): {}", e);
}
}
let time_offset_val = self.policy.time_start
.map(|t| crate::time::calculate_time_offset(t))
.unwrap_or(0);
// Immutable settings derived from the policy and handed to the supervisor task.
let notif_policy = NotifPolicy {
max_memory_bytes: self.policy.max_memory.map(|m| m.0).unwrap_or(0),
max_processes: self.policy.max_processes,
has_memory_limit: self.policy.max_memory.is_some(),
has_net_allowlist: !self.policy.net_allow_hosts.is_empty()
|| self.policy.policy_fn.is_some(),
has_random_seed: self.policy.random_seed.is_some(),
has_time_start: self.policy.time_start.is_some(),
time_offset: time_offset_val,
num_cpus: self.policy.num_cpus,
has_proc_virt: self.policy.num_cpus.is_some() || self.policy.max_memory.is_some() || self.policy.isolate_pids || self.policy.port_remap,
isolate_pids: self.policy.isolate_pids,
port_remap: self.policy.port_remap,
// Seccomp-based COW is used when a workdir is set but no fs isolation backend is selected.
cow_enabled: self.policy.workdir.is_some() && self.policy.fs_isolation == FsIsolation::None,
chroot_root: self.policy.chroot.clone(),
chroot_readable: self.policy.fs_readable.clone(),
chroot_writable: self.policy.fs_writable.clone(),
deterministic_dirs: self.policy.deterministic_dirs,
hostname: self.policy.hostname.clone(),
};
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;
// RNG seeded from the policy's random_seed, when one is set.
let random_state = self.policy.random_seed.map(|seed| ChaCha8Rng::seed_from_u64(seed));
let time_offset = self.policy.time_start.map(|t| crate::time::calculate_time_offset(t));
// Mutable state shared between this handle and the supervisor task.
let mut sup_state = SupervisorState::new(
notif_policy.max_memory_bytes,
notif_policy.max_processes,
time_offset,
random_state,
);
sup_state.network_policy = if self.policy.net_allow_hosts.is_empty() {
crate::seccomp::notif::NetworkPolicy::Unrestricted
} else {
crate::seccomp::notif::NetworkPolicy::AllowList(resolved_ips)
};
// The supervisor stores only the raw fd number; the OwnedFd itself ends up in self.pidfd.
if let Some(ref pfd) = pidfd {
use std::os::unix::io::AsRawFd;
sup_state.child_pidfd = Some(pfd.as_raw_fd());
}
// Same condition as `cow_enabled` above; a creation failure is logged and the
// sandbox continues without a seccomp COW branch.
if self.policy.workdir.is_some() && self.policy.fs_isolation == FsIsolation::None {
let workdir = self.policy.workdir.as_ref().unwrap();
let storage = self.policy.fs_storage.as_deref();
match crate::cow::seccomp::SeccompCowBranch::create(workdir, storage) {
Ok(branch) => { sup_state.cow_branch = Some(branch); }
Err(e) => { eprintln!("sandlock: seccomp COW branch creation failed: {}", e); }
}
}
// Dynamic policy callback: `live` is the adjustable policy, `ceiling` a snapshot
// of its initial values passed alongside it to spawn_policy_fn.
if let Some(ref callback) = self.policy.policy_fn {
let live = crate::policy_fn::LivePolicy {
allowed_ips: match &sup_state.network_policy {
crate::seccomp::notif::NetworkPolicy::AllowList(ips) => ips.clone(),
crate::seccomp::notif::NetworkPolicy::Unrestricted => std::collections::HashSet::new(),
},
max_memory_bytes: notif_policy.max_memory_bytes,
max_processes: notif_policy.max_processes,
};
let ceiling = live.clone();
let live = std::sync::Arc::new(std::sync::RwLock::new(live));
let denied_paths = sup_state.denied_paths.clone();
let pid_overrides = sup_state.pid_ip_overrides.clone();
sup_state.live_policy = Some(live.clone());
let tx = crate::policy_fn::spawn_policy_fn(
callback.clone(), live, ceiling, pid_overrides, denied_paths,
);
sup_state.policy_event_tx = Some(tx);
}
let sup_state = Arc::new(Mutex::new(sup_state));
self.supervisor_state = Some(Arc::clone(&sup_state));
// The supervisor runs as a tokio task until aborted in wait() or Drop.
self.notif_handle = Some(tokio::spawn(
notif::supervisor(notif_fd, notif_policy, sup_state),
));
}
// --- CPU throttling: only for limits strictly below 100 percent ---
if let Some(cpu_pct) = self.policy.max_cpu {
if cpu_pct < 100 {
let child_pid = pid;
self.throttle_handle = Some(tokio::spawn(throttle_cpu(child_pid, cpu_pct)));
}
}
// --- Send the ready signal, written only after supervision is in place ---
write_u32_fd(pipes.ready_w.as_raw_fd(), 1)
.map_err(|e| SandboxError::Child(format!("write ready signal: {}", e)))?;
self.pidfd = pidfd;
Ok(())
}
}
impl Drop for Sandbox {
fn drop(&mut self) {
if let Some(pid) = self.child_pid {
if matches!(self.state, SandboxState::Running | SandboxState::Paused) {
unsafe { libc::killpg(pid, libc::SIGKILL) };
let mut status: i32 = 0;
unsafe { libc::waitpid(pid, &mut status, 0) };
}
}
if let Some(h) = self.notif_handle.take() {
h.abort();
}
if let Some(h) = self.throttle_handle.take() {
h.abort();
}
if let Some(ref branch) = self.cow_branch {
let is_error = matches!(
self.state,
SandboxState::Stopped(ref s) if !matches!(s, ExitStatus::Code(0))
);
let action = if is_error {
&self.policy.on_error
} else {
&self.policy.on_exit
};
match action {
BranchAction::Commit => { let _ = branch.commit(); }
BranchAction::Abort => { let _ = branch.abort(); }
BranchAction::Keep => {} }
}
if let Some(ref state) = self.supervisor_state {
let Ok(mut st) = state.try_lock() else { return; };
if let Some(ref mut cow) = st.cow_branch {
let is_error = matches!(
self.state,
SandboxState::Stopped(ref s) if !matches!(s, ExitStatus::Code(0))
);
let action = if is_error {
&self.policy.on_error
} else {
&self.policy.on_exit
};
match action {
BranchAction::Commit => { let _ = cow.commit(); }
BranchAction::Abort => { let _ = cow.abort(); }
BranchAction::Keep => {}
}
}
}
}
}
/// Limits the process group `pid` to roughly `cpu_pct` percent of wall-clock time by
/// alternating SIGCONT/SIGSTOP within a 100 ms period.
///
/// Runs until signalling the group fails (for example because it no longer exists)
/// or the task is aborted. A `cpu_pct` of 100 or more means "no limit" and returns
/// immediately; this also keeps the stop interval from underflowing.
async fn throttle_cpu(pid: i32, cpu_pct: u8) {
    if cpu_pct >= 100 {
        return;
    }
    let period = Duration::from_millis(100);
    let run_time = period * u32::from(cpu_pct) / 100;
    let stop_time = period.saturating_sub(run_time);
    loop {
        // Let the group run for its share of the period, then stop it for the rest.
        tokio::time::sleep(run_time).await;
        if unsafe { libc::killpg(pid, libc::SIGSTOP) } < 0 {
            break;
        }
        tokio::time::sleep(stop_time).await;
        if unsafe { libc::killpg(pid, libc::SIGCONT) } < 0 {
            break;
        }
    }
}
/// Best-effort fill of `buf` from the raw descriptor `fd`.
///
/// Reads until `buf` is full, end-of-file is reached, or a read fails. Interrupted
/// reads (`EINTR`) are retried instead of being treated as the end of the data.
/// Bytes that could not be read are left untouched. The descriptor is only borrowed:
/// it is never closed here. A negative `fd` is a no-op.
fn read_exact(fd: i32, buf: &mut [u8]) {
    use std::io::Read;

    if fd < 0 {
        return;
    }
    // SAFETY: `fd` is borrowed for the duration of this call only; `ManuallyDrop`
    // guarantees the temporary `File` never closes it.
    let mut source = std::mem::ManuallyDrop::new(unsafe { std::fs::File::from_raw_fd(fd) });
    let mut filled = 0;
    while filled < buf.len() {
        match source.read(&mut buf[filled..]) {
            Ok(0) => break,
            Ok(n) => filled += n,
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(_) => break,
        }
    }
}
/// Consumes `fd` and returns every byte readable from it until end-of-file.
///
/// A read error is ignored; whatever was collected before the error is returned.
/// The descriptor is closed when the function returns.
fn read_fd_to_end(fd: OwnedFd) -> Vec<u8> {
    use std::io::Read;
    let mut collected = Vec::new();
    let _ = std::fs::File::from(fd).read_to_end(&mut collected);
    collected
}
/// Translates a raw `waitpid` status word into an [`ExitStatus`].
///
/// A normal exit becomes `Code`, termination by SIGKILL becomes `Killed`, and
/// termination by any other signal becomes `Signal`. A status that is neither
/// "exited" nor "signaled" is also reported as `Killed`.
fn wait_status_to_exit(status: i32) -> ExitStatus {
    if libc::WIFEXITED(status) {
        return ExitStatus::Code(libc::WEXITSTATUS(status));
    }
    if !libc::WIFSIGNALED(status) {
        return ExitStatus::Killed;
    }
    match libc::WTERMSIG(status) {
        libc::SIGKILL => ExitStatus::Killed,
        other => ExitStatus::Signal(other),
    }
}