// syd 3.52.0
//
// rock-solid application kernel
// Documentation
use std::{
    ffi::CString,
    os::{
        fd::{AsFd, AsRawFd, FromRawFd, IntoRawFd},
        unix::io::RawFd,
    },
};

use libseccomp::ScmpFilterContext;
use nix::{
    errno::Errno,
    fcntl::OFlag,
    sched::CloneFlags,
    sys::{
        ptrace::{cont, Options},
        signal::{kill, Signal},
        wait::{Id, WaitPidFlag},
    },
    unistd::{read, write, Pid},
};

use crate::{
    compat::{waitid, WaitStatus, PTRACE_SEIZE},
    fd::{fdclone, pidfd_getfd, SafeOwnedFd},
    log::LOG_FD,
    retry::retry_on_eintr,
    rng::duprand,
    sandbox::RawIoctlMap,
    unshare::{child::child_after_clone, config::Config, Child, Command},
};

/// Boxed, argument-less callback that reports failure as an `Errno`.
/// Stored in `ChildInfo::pre_exec`; presumably invoked in the child
/// before exec (the call site is not in this file -- confirm).
type ChildPreExecFunc = Box<dyn Fn() -> Result<(), Errno>>;
/// Two pipes, each given as a pair of raw file descriptors.
///
/// As used by `Command::after_start` in the parent:
/// - `.0 .1` is the end the parent writes its one-byte acknowledgement to,
/// - `.1 .0` is the end the parent reads the child's seccomp notify fd
///   number from,
/// - `.0 .0` and `.1 .1` are closed by the parent as unused.
type PipePair = ((RawFd, RawFd), (RawFd, RawFd));

/// State handed over to the cloned child.
///
/// `Command::spawn` builds one boxed instance from the command's fields
/// and moves it into the closure that calls `child_after_clone`.
pub struct ChildInfo {
    /// Configuration taken from `Command::config`.
    pub cfg: Config,
    /// Taken from `Command::exe_file`; presumably the program to execute.
    pub exe_file: CString,
    /// Taken from `Command::exe_args`; presumably the argument vector.
    pub exe_args: Vec<CString>,
    /// Optional callback taken from `Command::pre_exec`.
    pub pre_exec: Option<ChildPreExecFunc>,
    /// Optional raw fd taken from `Command::pty_fd`.
    pub pty_fd: Option<RawFd>,
    /// Optional ioctl map taken from `Command::ioctl_denylist`.
    pub ioctl_denylist: Option<RawIoctlMap>,
    /// Optional seccomp filter context taken from `Command::seccomp_filter`.
    pub seccomp_filter: Option<ScmpFilterContext>,
    /// Raw fds of the two handshake pipes (see `PipePair`); the parent
    /// keeps using its own copy of these numbers in `Command::after_start`.
    pub seccomp_pipefd: PipePair,
}

impl Command {
    /// Spawn the command and return a handle that can be waited for.
    ///
    /// Clones a child that runs `child_after_clone`, moves the pid fd (and
    /// the log fd, if logging is enabled) to randomized descriptor numbers,
    /// and then completes the parent side of the start-up handshake via
    /// `after_start`.
    ///
    /// # Errors
    ///
    /// - `EFAULT` if the executable path or the argument list is unset.
    /// - Any error from `fdclone` or `duprand`.
    /// - If `after_start` fails, this waits for the child: when the child
    ///   exits, its exit status is returned as the errno; for any other
    ///   wait outcome the original `after_start` error is returned.
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as `after_start`.
    pub fn spawn(mut self) -> Result<Child, Errno> {
        let exe_file = self.exe_file.take().ok_or(Errno::EFAULT)?;
        let exe_args = self.exe_args.take().ok_or(Errno::EFAULT)?;

        // Prepare information for the Syd child.
        let child_info = Box::new(ChildInfo {
            exe_file,
            exe_args,
            cfg: self.config,
            pre_exec: std::mem::take(&mut self.pre_exec),
            pty_fd: std::mem::take(&mut self.pty_fd),
            ioctl_denylist: std::mem::take(&mut self.ioctl_denylist),
            seccomp_filter: std::mem::take(&mut self.seccomp_filter),
            seccomp_pipefd: self.seccomp_pipefd,
        });

        // Call clone(2), child_after_clone never returns.
        let (pid_fd, child) = fdclone(
            move || {
                child_after_clone(child_info);
            },
            CloneFlags::empty(),
            Some(libc::SIGCHLD),
        )?;

        // SAFETY: Randomize the pid FD for hardening.
        let pid_fd_rand = duprand(pid_fd.as_raw_fd(), OFlag::O_CLOEXEC)?;
        drop(pid_fd);
        let pid_fd = pid_fd_rand;

        // SAFETY: Randomize the log FD for hardening.
        // O_EXCL closes oldfd on success.
        let log_fd = LOG_FD.load(std::sync::atomic::Ordering::Relaxed);
        if log_fd >= 0 {
            let log_fd = duprand(log_fd, OFlag::O_CLOEXEC | OFlag::O_EXCL)?;
            LOG_FD.store(log_fd.into_raw_fd(), std::sync::atomic::Ordering::Relaxed);
        } // else logging is disabled.

        let seccomp_fd = match self.after_start(child, &pid_fd) {
            Ok(seccomp_fd) => seccomp_fd,
            // Handshake failed: reap the child and prefer its exit status
            // as the error, retrying the wait when interrupted.
            // NOTE(review): an exit status of 0 is passed to
            // `Errno::from_raw` as is -- confirm this is intended.
            Err(e) => loop {
                match waitid(Id::PIDFd(pid_fd.as_fd()), WaitPidFlag::WEXITED) {
                    Ok(WaitStatus::Exited(_, errno)) => return Err(Errno::from_raw(errno)),
                    Err(Errno::EINTR) => {}
                    _ => return Err(e),
                }
            },
        };

        Ok(Child {
            pid: child.into(),
            pid_fd: pid_fd.into_raw_fd(),
            seccomp_fd: seccomp_fd.into_raw_fd(),
            status: None,
        })
    }

    /// Complete the parent side of the start-up handshake with the child.
    ///
    /// In order:
    /// 1. If `config.stop` is set, wait for the child to stop itself,
    ///    seize it with `PTRACE_SEIZE` and resume it.
    /// 2. Run the `before_unfreeze` callback, if any, with the child's pid.
    /// 3. Read the child's seccomp notify fd number from the second pipe,
    ///    fetch that descriptor with `pidfd_getfd`, and acknowledge by
    ///    writing one byte to the first pipe.
    ///
    /// Returns a randomized duplicate of the seccomp notify fd.
    ///
    /// # Errors
    ///
    /// - `EIO` if the pipe hits EOF before the fd number is fully read, or
    ///   if the acknowledgement write transfers zero bytes.
    /// - Any error from `waitid`, `ptrace`, `kill`, the callback, `read`,
    ///   `pidfd_getfd`, `write` or `duprand`.
    ///
    /// The parent's pipe ends are closed on every return path.
    ///
    /// # Panics
    ///
    /// Panics if `PTRACE_SEIZE` fails or if any wait status during the
    /// seize/resume sequence differs from the expected one.
    fn after_start<Fd: AsFd>(mut self, pid: Pid, pid_fd: Fd) -> Result<SafeOwnedFd, Errno> {
        // SAFETY: Parent owns its copy of the pipes,
        // and is responsible for closing them.
        // Ownership is taken up front so that an early error return
        // below closes the parent's pipe ends rather than leaking them.
        let ((pipe_ack_rd, pipe_rw), (pipe_ro, pipe_fd_wr)) = unsafe {
            (
                (
                    SafeOwnedFd::from_raw_fd(self.seccomp_pipefd.0 .0),
                    SafeOwnedFd::from_raw_fd(self.seccomp_pipefd.0 .1),
                ),
                (
                    SafeOwnedFd::from_raw_fd(self.seccomp_pipefd.1 .0),
                    SafeOwnedFd::from_raw_fd(self.seccomp_pipefd.1 .1),
                ),
            )
        };

        if self.config.stop {
            // Seize the process for tracing.
            // This must happen before reading the seccomp fd.
            // TODO: Make ptrace options configurable.
            let ptrace_options: Options = Options::PTRACE_O_TRACEFORK
                | Options::PTRACE_O_TRACEVFORK
                | Options::PTRACE_O_TRACECLONE
                | Options::PTRACE_O_TRACEEXEC     // used by Exec TOCTOU mitigator.
                | Options::PTRACE_O_TRACEEXIT     // used by SegvGuard.
                | Options::PTRACE_O_TRACESECCOMP  // used by chdir and exec hooks.
                | Options::PTRACE_O_TRACESYSGOOD  // ditto.
                | Options::PTRACE_O_EXITKILL; // we also set PDEATHSIG so this is the second layer.

            // Step 1: Wait for the process to stop itself.
            // Note, we also wait for EXITED so that if the process
            // exits instead of stopping, the wait returns and the
            // assert below fails.
            let status = waitid(
                Id::PIDFd(pid_fd.as_fd()),
                WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::__WNOTHREAD,
            )?;
            assert_eq!(status, WaitStatus::Stopped(pid, libc::SIGSTOP));
            // Step 2: Seize the process.
            // 1. We use PTRACE_SEIZE in the parent rather than
            //    PTRACE_TRACEME in the child for its improved
            //    behaviour/API. This also gives us the chance to deny
            //    PTRACE_TRACEME and further confine the sandbox against
            //    e.g. trivial ptrace detectors.
            // 2. Panic if PTRACE_SEIZE fails as otherwise we will leave
            //    the sandbox process in an uninterruptible, broken state.
            //    The typical error case is EPERM which means parent is
            //    strace or YAMA is active.
            assert_eq!(
                // SAFETY: `pid` is a valid child pid from `fdclone`;
                // `PTRACE_SEIZE` with valid option flags.
                Errno::result(unsafe {
                    libc::ptrace(
                        PTRACE_SEIZE,
                        pid.as_raw(),
                        0,
                        ptrace_options.bits() as *mut libc::c_void,
                    )
                })
                .map(drop),
                Ok(()),
                "YAMA or strace? Use with strace -f syd -pD ..."
            );
            let status = waitid(
                Id::PIDFd(pid_fd.as_fd()),
                WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::__WNOTHREAD,
            )?;
            assert_eq!(
                status,
                WaitStatus::PtraceEvent(pid, libc::SIGSTOP, libc::PTRACE_EVENT_STOP)
            );
            // SAFETY: nix does not have a wrapper for PTRACE_LISTEN.
            Errno::result(unsafe {
                libc::ptrace(crate::compat::PTRACE_LISTEN, pid.as_raw(), 0, 0)
            })?;
            // Step 3: Successfully attached, resume the process.
            // We have to do a simple signal ping-pong here but
            // it's done once and it's worth the trouble.
            kill(pid, Signal::SIGCONT)?;
            let status = waitid(
                Id::PIDFd(pid_fd.as_fd()),
                WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::__WNOTHREAD,
            )?;
            assert_eq!(
                status,
                WaitStatus::PtraceEvent(pid, libc::SIGTRAP, libc::PTRACE_EVENT_STOP)
            );
            cont(pid, None)?;
            let status = waitid(
                Id::PIDFd(pid_fd.as_fd()),
                WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::__WNOTHREAD,
            )?;
            assert_eq!(status, WaitStatus::PtraceEvent(pid, libc::SIGCONT, 0));
            cont(pid, Some(Signal::SIGCONT))?;
        }

        if let Some(ref mut callback) = self.before_unfreeze {
            #[expect(clippy::cast_sign_loss)]
            callback(i32::from(pid) as u32)?;
        }

        // We read the seccomp notify fd from the second pipe (`pipe_ro`),
        // and write the acknowledgement notification to the first pipe
        // (`pipe_rw`). Close the ends of the pipes the parent does not use.
        drop(pipe_ack_rd);
        drop(pipe_fd_wr);

        // Read the value of the file descriptor from the pipe.
        // Handle interrupts and partial reads.
        // EOF means process died before writing to the pipe.
        // A fixed-size array is exactly one `RawFd` wide, so no heap
        // allocation or fallible slice conversion is needed.
        let mut buf = [0u8; size_of::<RawFd>()];
        let mut nread = 0;
        while nread < buf.len() {
            #[expect(clippy::arithmetic_side_effects)]
            match read(&pipe_ro, &mut buf[nread..]) {
                Ok(0) => return Err(Errno::EIO),
                Ok(n) => nread += n,
                Err(Errno::EINTR | Errno::EAGAIN) => continue,
                Err(errno) => return Err(errno),
            }
        }

        // Close the read end of the pipe.
        drop(pipe_ro);

        // The fd number is decoded as little-endian bytes.
        let remote_seccomp_fd = RawFd::from_le_bytes(buf);

        // Get the seccomp notify fd using pidfd_getfd(2).
        // The child is waiting on the read end of the pipe,
        // for us to safely transfer the file descriptor.
        let seccomp_fd = pidfd_getfd(pid_fd, remote_seccomp_fd)?;

        // Unblock the child to safely continue and close
        // their copy of the seccomp notify file descriptor.
        // Handle interrupts.
        // Partial write is not possible.
        // A zero-byte write is reported as EIO.
        let buf = [42u8; 1];
        match retry_on_eintr(|| write(&pipe_rw, &buf))? {
            0 => return Err(Errno::EIO),
            1 => {}
            n => unreachable!("BUG: invalid pipe write of size {n}!"),
        };

        // Close the write end of the pipe.
        drop(pipe_rw);

        // SAFETY: Randomize the seccomp(2) fd for hardening.
        // Old seccomp fd will be closed by Drop on function exit.
        duprand(seccomp_fd.as_raw_fd(), OFlag::O_CLOEXEC)
    }
}