syd 3.52.0

rock-solid application kernel
Documentation
//
// Syd: rock-solid application kernel
// src/workers/gdb.rs: `syd_main' ptrace(2) thread
//
// Copyright (c) 2024, 2025, 2026 Ali Polatel <alip@chesswob.org>
// Based in part upon rusty_pool which is:
//     Copyright (c) Robin Friedli <robinfriedli@icloud.com>
//     SPDX-License-Identifier: Apache-2.0
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    os::fd::AsFd,
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc, RwLock,
    },
};

use libc::AF_ALG;
use libseccomp::{scmp_cmp, ScmpAction, ScmpFilterContext, ScmpSyscall};
use nix::{
    errno::Errno,
    fcntl::OFlag,
    sys::wait::{Id, WaitPidFlag},
    unistd::{Gid, Pid, Uid},
};

use crate::{
    compat::{waitid, WaitStatus},
    config::*,
    confine::{
        confine_scmp_accept4, confine_scmp_bind, confine_scmp_close, confine_scmp_close_range,
        confine_scmp_execveat, confine_scmp_faccessat2, confine_scmp_getdents64,
        confine_scmp_ioctl_syd, confine_scmp_madvise, confine_scmp_open, confine_scmp_openat,
        confine_scmp_openat2, confine_scmp_pidfd_getfd, confine_scmp_pidfd_open,
        confine_scmp_pidfd_send_signal, confine_scmp_pipe2, confine_scmp_prctl,
        confine_scmp_ptrace, confine_scmp_recvmsg, confine_scmp_sendfile, confine_scmp_sendmsg,
        confine_scmp_setid, confine_scmp_sigaction, confine_scmp_socket, confine_scmp_splice,
        confine_scmp_write, confine_scmp_wx_syd,
    },
    err::SydResult,
    info,
    kernel::ptrace::event::{
        exec::sysevent_exec, exit::sysevent_exit, fork::sysevent_fork, scmp::sysevent_scmp,
        sig::sysevent_sig, sysx::sysevent_sysx,
    },
    ptrace::{ptrace_cont, ptrace_listen},
    sandbox::{LockState, Options, Sandbox, SandboxGuard},
    workers::{WorkerCache, WorkerData},
};

/// ptrace(2) tracer: reaps wait events with waitid(2) and dispatches
/// them to the ptrace event handlers.
///
/// Every field is a reference-counted handle, so cloning only bumps
/// reference counts.
#[derive(Clone)]
pub(crate) struct Tracer {
    /// Worker cache; entries of terminated TIDs/TGIDs are removed from
    /// it and it is handed to the ptrace event handlers.
    cache: Arc<WorkerCache>,
    /// Shared sandbox state; handed to the ptrace event handlers and
    /// write-locked when the eldest child exits.
    sandbox: Arc<RwLock<Sandbox>>,
    /// Exit notification flag; checked after every wait event and set
    /// once the tracer loop is done.
    should_exit: Arc<AtomicBool>,
    /// Worker data used to wake the monitor once the tracer loop is done.
    worker_data: Arc<WorkerData>,
}

impl Tracer {
    pub(crate) fn new(
        cache: Arc<WorkerCache>,
        sandbox: Arc<RwLock<Sandbox>>,
        should_exit: Arc<AtomicBool>,
        worker_data: Arc<WorkerData>,
    ) -> Self {
        Self {
            cache,
            sandbox,
            should_exit,
            worker_data,
        }
    }

    /// Run the ptrace(2) loop. This is the main entry point.
    ///
    /// Reaps wait events of all children with waitid(2) and dispatches
    /// them until one of the following happens:
    ///
    /// - The eldest child `child_pid` terminates and `wait_all` is false.
    /// - waitid(2) reports `ECHILD`, i.e. there is nothing left to wait for.
    /// - Another thread requested exit through `should_exit`.
    ///
    /// If the status of the eldest child has not been collected by then,
    /// a final wait is done on `child_pfd` to fetch it.
    ///
    /// Returns the exit code of the eldest child: its exit status if it
    /// exited, `128 + signal` if it was terminated by a signal, and 127
    /// if the status could not be collected or does not fit into a `u8`.
    ///
    /// # Errors
    ///
    /// Returns any waitid(2) error other than `EINTR`, `EAGAIN` and
    /// `ECHILD`.
    pub(crate) fn run<Fd: AsFd>(
        self,
        child_pfd: Fd,
        child_pid: Pid,
        wait_all: bool,
    ) -> SydResult<u8> {
        let flags = WaitPidFlag::WEXITED | WaitPidFlag::__WNOTHREAD;

        // Exit code of the eldest child.
        // `None` until its termination status has been collected; this
        // keeps "not collected yet" distinct from a real exit code 127.
        let mut xcode: Option<i32> = None;

        // Wait in a loop and dispatch each wait status.
        loop {
            match waitid(Id::All, flags) {
                Ok(WaitStatus::Exited(pid, exit_code)) => {
                    let is_child = pid == child_pid;
                    self.handle_exit(pid, is_child, wait_all);
                    if is_child {
                        xcode = Some(exit_code);
                        if !wait_all {
                            break;
                        }
                    }
                }
                Ok(WaitStatus::Signaled(pid, signal, _core)) => {
                    // Termination by signal gets the same cleanup as a
                    // regular exit: cache entries are dropped and, if
                    // this is the eldest child and we keep running for
                    // the rest, a lock:exec sandbox is locked for good.
                    let is_child = pid == child_pid;
                    self.handle_exit(pid, is_child, wait_all);
                    if is_child {
                        xcode = Some(128_i32.saturating_add(signal));
                        if !wait_all {
                            break;
                        }
                    }
                }
                Ok(status) => self.handle(status),
                Err(Errno::EINTR | Errno::EAGAIN) => {}
                Err(Errno::ECHILD) => break,
                Err(errno) => return Err(errno.into()),
            };

            // Check for exit notification.
            if self.should_exit.load(Ordering::Acquire) {
                break;
            }
        }

        // Ghost mode:
        // 1. If should_exit was set by one of the emulator threads
        //    before main thread could collect child's exit status,
        //    do a final blocking wait to get the correct exit code.
        // 2. Use the pid file descriptor to avoid PID recycling.
        if xcode.is_none() {
            loop {
                match waitid(Id::PIDFd(child_pfd.as_fd()), flags) {
                    Ok(WaitStatus::Exited(_, exit_code)) => {
                        xcode = Some(exit_code);
                        break;
                    }
                    Ok(WaitStatus::Signaled(_, signal, _)) => {
                        xcode = Some(128_i32.saturating_add(signal));
                        break;
                    }
                    Ok(status) => {
                        unreachable!("BUG: final waitid returned {status:?}, report a bug!")
                    }
                    Err(Errno::EINTR | Errno::EAGAIN) => {}
                    Err(Errno::ECHILD) => break,
                    Err(errno) => return Err(errno.into()),
                }
            }
        }

        // Inform other threads to exit and wake monitor.
        self.should_exit.store(true, Ordering::Release);
        self.worker_data.notify_monitor();

        // 127 is reported both when no status could be collected and
        // when the collected code is out of range for a u8.
        Ok(u8::try_from(xcode.unwrap_or(127)).unwrap_or(127))
    }

    /// Dispatch a single ptrace(2) wait status to its event handler.
    ///
    /// `WaitStatus::Exited` and `WaitStatus::Signaled` are consumed by
    /// the caller and must never be passed in.
    ///
    /// # Panics
    ///
    /// Panics on any wait status that has no handler here.
    fn handle(&self, status: WaitStatus) {
        let cache = &self.cache;
        let sandbox = &self.sandbox;

        match status {
            // Syscall stop.
            WaitStatus::PtraceSyscall(task) => {
                sysevent_sysx(task, cache, sandbox);
            }
            // PTRACE_EVENT_STOP: what to do depends on the reported signal.
            WaitStatus::PtraceEvent(task, signo, libc::PTRACE_EVENT_STOP) => {
                if matches!(
                    signo,
                    libc::SIGSTOP | libc::SIGTSTP | libc::SIGTTIN | libc::SIGTTOU
                ) {
                    // Stop signal reported: listen on the tracee
                    // rather than resuming it.
                    let _ = ptrace_listen(task);
                } else {
                    // Can this ever be !SIGTRAP?
                    // ptrace-stop, do not forward the signal.
                    let _ = ptrace_cont(task, None);
                }
            }
            // No ptrace event attached: plain signal delivery.
            WaitStatus::PtraceEvent(task, signo, 0) => {
                sysevent_sig(task, signo, cache, sandbox);
            }
            WaitStatus::PtraceEvent(task, libc::SIGTRAP, libc::PTRACE_EVENT_EXIT) => {
                sysevent_exit(task, cache, sandbox);
            }
            WaitStatus::PtraceEvent(task, libc::SIGTRAP, libc::PTRACE_EVENT_EXEC) => {
                sysevent_exec(task, cache, sandbox);
            }
            WaitStatus::PtraceEvent(
                task,
                libc::SIGTRAP,
                libc::PTRACE_EVENT_FORK | libc::PTRACE_EVENT_VFORK | libc::PTRACE_EVENT_CLONE,
            ) => {
                sysevent_fork(task, sandbox);
            }
            WaitStatus::PtraceEvent(task, libc::SIGTRAP, libc::PTRACE_EVENT_SECCOMP) => {
                sysevent_scmp(task, cache, sandbox);
            }
            // Anything else, including Exited/Signaled, is a bug.
            status => panic!("Unhandled wait event: {status:?}"),
        }
    }

    /// Clean up after a terminated task.
    ///
    /// Always drops the cache entries of `pid`. In addition, when `pid`
    /// is the eldest process (`is_child`) and we keep running for the
    /// remaining processes (`wait_all`), a sandbox in lock:exec state is
    /// switched to the locked state.
    ///
    /// # Panics
    ///
    /// Panics if locking the sandbox fails.
    fn handle_exit(&self, pid: Pid, is_child: bool, wait_all: bool) {
        // Drop the cache entries owned by this TID/TGID:
        // - pid is a TID with trace/allow_unsafe_ptrace:0 (default).
        // - pid is a TGID with trace/allow_unsafe_ptrace:1.
        // del_tgid takes care of del_tid internally.
        self.cache.del_tgid(pid);

        // Nothing else to do unless the eldest process is gone while
        // trace/exit_wait_all keeps us running; without exit_wait_all
        // we are about to exit anyway.
        if is_child && wait_all {
            // With the eldest process gone, a lock:exec sandbox can no
            // longer be edited, so make that explicit right away.
            let guard = self
                .sandbox
                .write()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            let mut sandbox = SandboxGuard::Write(guard);
            if sandbox.lock == Some(LockState::Exec) {
                // There is no way to recover from a failure here.
                #[expect(clippy::disallowed_methods)]
                sandbox
                    .lock(LockState::Set)
                    .expect("BUG: failed to lock sandbox, report a bug!");
            }
        }
    }

    /// Prepare to confine the Tracer threads.
    ///
    /// Builds a seccomp(2) filter context whose default action is
    /// `KillProcess` and adds allow rules for the system calls listed
    /// below. The context is returned to the caller; it is not loaded
    /// into the kernel here.
    ///
    /// # Arguments
    ///
    /// * `options` - Sandbox options; consulted for
    ///   `allow_unsafe_exec_speculative`, `allow_unsafe_nocookie`,
    ///   `allow_safe_setuid` and `allow_safe_setgid`.
    /// * `safe_kcapi` - When true, the socket, bind, accept4, pipe2,
    ///   splice, sendfile, recvmsg and sendmsg rules needed for AF_ALG
    ///   networking are added as well.
    /// * `transit_uids` - UID pairs handed to `confine_scmp_setid`;
    ///   only used when safe setuid or safe setgid is enabled.
    /// * `transit_gids` - GID pairs handed to `confine_scmp_setid`;
    ///   only used when safe setuid or safe setgid is enabled.
    ///
    /// # Errors
    ///
    /// Propagates errors from creating or configuring the filter
    /// context, from adding rules, and from the `confine_scmp_*`
    /// helpers. System call names unknown to libseccomp are logged and
    /// skipped rather than treated as errors.
    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn prepare_confine(
        options: Options,
        safe_kcapi: bool,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
    ) -> SydResult<ScmpFilterContext> {
        // Derive the knobs used below from the sandbox options.
        let ssb = options.allow_unsafe_exec_speculative();
        let restrict_cookie = !options.allow_unsafe_nocookie();
        let safe_setuid = options.allow_safe_setuid();
        let safe_setgid = options.allow_safe_setgid();
        let safe_setid = safe_setuid || safe_setgid;

        // Default action: kill the process on any system call
        // that is not explicitly allowed below.
        let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // Enforce the NO_NEW_PRIVS functionality before
        // loading the seccomp filter into the kernel.
        ctx.set_ctl_nnp(true)?;

        // Disable Speculative Store Bypass mitigations
        // with trace/allow_unsafe_exec_speculative:1
        ctx.set_ctl_ssb(ssb)?;

        // DO NOT synchronize filter to all threads.
        // Thread pool confines itself as necessary.
        ctx.set_ctl_tsync(false)?;

        // We kill for bad system call and bad arch.
        ctx.set_act_badarch(ScmpAction::KillProcess)?;

        // Use a binary tree sorted by syscall number if possible.
        // Best effort: failure to set the optimization level is ignored.
        let _ = ctx.set_ctl_optimize(2);

        // Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut ctx)?;

        // Prevent executable memory.
        confine_scmp_wx_syd(&mut ctx)?;

        // Deny open and {l,}stat with ENOSYS rather than KillProcess.
        confine_scmp_open(&mut ctx)?;

        // openat(2) may be used to open the parent directory only by getdir_long().
        confine_scmp_openat(&mut ctx)?;

        // openat2(2) may be used only with syscall argument cookies.
        confine_scmp_openat2(&mut ctx, restrict_cookie)?;

        // close(2) and close_range(2) may be used only with syscall argument cookies.
        confine_scmp_close(&mut ctx, restrict_cookie)?;
        confine_scmp_close_range(&mut ctx, restrict_cookie)?;

        // Allow writes to the log-fd and proc_pid_mem(5) as necessary.
        confine_scmp_write(&mut ctx, None, true)?;

        // Allow safe madvise(2) advice.
        confine_scmp_madvise(&mut ctx)?;

        // Allow safe fcntl(2) utility calls.
        for sysname in ["fcntl", "fcntl64"] {
            // Skip names libseccomp cannot resolve, after logging them.
            let syscall = match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => syscall,
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_gdb_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                    continue;
                }
            };

            // TODO: Figure out what fcntl(2) ops are needed for KCOV.
            // Until then the system call is allowed unconditionally
            // when the `kcov` feature is enabled.
            if cfg!(feature = "kcov") {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
                continue;
            }

            // Otherwise allow only the operations in MAIN_FCNTL_OPS,
            // matched against the second system call argument.
            for op in MAIN_FCNTL_OPS {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &[scmp_cmp!($arg1 == *op)])?;
            }
        }

        // Allow safe prctl(2) operations.
        confine_scmp_prctl(&mut ctx, MAIN_PRCTL_OPS)?;

        // Allow ioctl(2) request PROCMAP_QUERY to lookup proc_pid_maps(5) efficiently.
        // This request is new in Linux-6.11.
        confine_scmp_ioctl_syd(&mut ctx, restrict_cookie, None /*seccomp_fd*/)?;

        // Deny installing new signal handlers for {rt_,}sigaction(2).
        confine_scmp_sigaction(&mut ctx)?;

        // Confine network system calls and zero-copy as necessary.
        //
        // Main thread uses AF_ALG networking for Force sandboxing.
        if safe_kcapi {
            confine_scmp_socket(
                &mut ctx,
                Some(&[AF_ALG]),
                true, /* restrict_socket */
                restrict_cookie,
            )?;
            confine_scmp_bind(&mut ctx, restrict_cookie)?;
            confine_scmp_accept4(&mut ctx, restrict_cookie)?;

            // Allow pipe(2), splice(2), and sendfile(2) for zero-copy.
            confine_scmp_pipe2(&mut ctx, restrict_cookie, OFlag::O_CLOEXEC)?;
            confine_scmp_splice(&mut ctx)?;
            confine_scmp_sendfile(&mut ctx, restrict_cookie)?;

            // Allow recvmsg(2) and sendmsg(2) for AF_ALG networking.
            confine_scmp_recvmsg(&mut ctx, restrict_cookie)?;
            confine_scmp_sendmsg(&mut ctx, restrict_cookie)?;
        };

        // Allow safe system calls.
        //
        // KCOV_SYSCALLS is empty in case `kcov` feature is disabled.
        // PROF_SYSCALLS is empty in case `prof` feature is disabled.
        for sysname in MAIN_SYSCALLS
            .iter()
            .chain(ALLOC_SYSCALLS)
            .chain(FUTEX_SYSCALLS)
            .chain(GETID_SYSCALLS)
            .chain(KCOV_SYSCALLS)
            .chain(PROF_SYSCALLS)
            .chain(VDSO_SYSCALLS)
        {
            // Names libseccomp cannot resolve are logged and skipped.
            if let Ok(syscall) = ScmpSyscall::from_name(sysname) {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                info!("ctx": "confine", "op": "allow_gdb_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // Allow execveat(2) with AT_EXECVE_CHECK for Linux>=6.14.
        confine_scmp_execveat(&mut ctx, restrict_cookie)?;

        // Allow faccessat2(2) system call.
        confine_scmp_faccessat2(&mut ctx, restrict_cookie)?;

        // getdents64(2) may be used only with syscall argument cookies.
        // TODO: Only allow this if pid-limiter is enabled.
        confine_scmp_getdents64(&mut ctx, restrict_cookie)?;

        // pidfd family system calls may be used only with syscall argument cookies.
        confine_scmp_pidfd_getfd(&mut ctx, restrict_cookie)?;
        confine_scmp_pidfd_open(&mut ctx, restrict_cookie)?;
        confine_scmp_pidfd_send_signal(&mut ctx, restrict_cookie)?;

        // ptrace(2) may be used only with syscall argument cookies.
        confine_scmp_ptrace(&mut ctx, restrict_cookie)?;

        // Allow UID/GID changing system calls as necessary.
        if safe_setid {
            confine_scmp_setid(
                "main",
                &mut ctx,
                safe_setuid,
                safe_setgid,
                transit_uids,
                transit_gids,
            )?;
        }

        // Hand the prepared, not yet loaded, filter to the caller.
        Ok(ctx)
    }
}