syd 3.52.0

rock-solid application kernel
Documentation
//
// Syd: rock-solid application kernel
// src/workers/out.rs: `syd_out' timeouter thread
//
// Copyright (c) 2024, 2025, 2026 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

// SAFETY:
// 1. This module has (almost) been liberated from unsafe code.
//    SafeOwnedFd::from_raw_fd is used for notif_pipe which is unsafe.
//    Use deny rather than forbid so we can allow this case.
// 2. This module forbids arithmetic side effects, et al.
#![deny(unsafe_code)]
#![forbid(clippy::arithmetic_side_effects)]
#![forbid(clippy::cast_possible_truncation)]
#![forbid(clippy::cast_possible_wrap)]

use std::{
    os::fd::{FromRawFd, RawFd},
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc,
    },
    thread,
};

use dur::Duration;
use libseccomp::{ScmpAction, ScmpFilterContext, ScmpSyscall};
use nix::{
    errno::Errno,
    sched::{unshare, CloneFlags},
    unistd::{write, Gid, Uid},
};

use crate::{
    alert,
    config::*,
    confine::{
        confine_scmp_fcntl, confine_scmp_madvise, confine_scmp_open_stat, confine_scmp_setid,
        confine_scmp_write, confine_scmp_wx_syd, secure_getenv, ExportMode,
    },
    err::{err2no, scmp2no, SydJoinHandle, SydResult},
    error,
    fd::{closeexcept, SafeOwnedFd},
    info,
    landlock::Errata,
    landlock_policy::LandlockPolicy,
    retry::retry_on_eintr,
    sandbox::Options,
};

/// State of the `syd_out' timeouter thread, which exits the whole
/// process once the sandbox timeout has elapsed (see `Timeouter::main`).
#[derive(Clone)]
pub(crate) struct Timeouter {
    /// How long `main` sleeps before it checks `should_exit`.
    tmout: Duration,
    /// Sandbox options consulted while building the seccomp(2) filter.
    options: Options,

    /// UID pairs handed to `confine_scmp_setid` when safe setuid/setgid
    /// is enabled in `options`.
    transit_uids: Vec<(Uid, Uid)>,
    /// GID pairs handed to `confine_scmp_setid` when safe setuid/setgid
    /// is enabled in `options`.
    transit_gids: Vec<(Gid, Gid)>,

    /// Exit notification: if this is set by the time the sleep ends,
    /// `main` returns quietly instead of exiting the process.
    should_exit: Arc<AtomicBool>,
}

impl Timeouter {
    /// Create a new timeouter.
    ///
    /// The UID/GID transition tables are copied into owned vectors so
    /// that the timeouter can later be moved into its own thread.
    pub(crate) fn new(
        tmout: Duration,
        options: Options,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
        should_exit: Arc<AtomicBool>,
    ) -> Self {
        // Take owned copies of the borrowed transition tables.
        let transit_uids = Vec::from(transit_uids);
        let transit_gids = Vec::from(transit_gids);

        Self {
            tmout,
            options,
            transit_uids,
            transit_gids,
            should_exit,
        }
    }

    /// Spawn the `syd_out' timeouter thread.
    ///
    /// `notif_pipe` is a `(read end, write end)` pair of raw pipe FDs.
    /// After unsharing its FD table the new thread writes a single
    /// byte to the write end, presumably as a readiness signal for
    /// whoever holds the read end (verify against the caller).
    ///
    /// The thread then closes the FDs it does not need, confines
    /// itself (unless a dry run is requested through the environment)
    /// and finally runs `Self::main`.
    ///
    /// Returns the join handle on success, or the spawn error mapped
    /// to an `Errno` by `err2no`. Failures of unshare(2), of closing
    /// FDs, or of confinement inside the thread are not reported here:
    /// they log an alert and exit the whole process with code 101.
    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn try_spawn(self, notif_pipe: (RawFd, RawFd)) -> Result<SydJoinHandle<()>, Errno> {
        thread::Builder::new()
            .name("syd_out".to_string())
            .stack_size(OUT_STACK_SIZE)
            .spawn(move || {
                // We use exit_group(2) here to bail, because this
                // unsharing is a critical safety feature.
                if let Err(errno) = unshare(CloneFlags::CLONE_FS | CloneFlags::CLONE_FILES | CloneFlags::CLONE_SYSVSEM) {
                    // NOTE(review): the message below has a stray space
                    // in `CLONE_FILES |CLONE_SYSVSEM`.
                    alert!("ctx": "boot", "op": "unshare_timeout_thread",
                        "msg": format!("failed to unshare(CLONE_FS|CLONE_FILES |CLONE_SYSVSEM): {errno}"),
                        "err": errno as i32);
                    std::process::exit(101);
                }

                // Take ownership of both pipe ends so that they are
                // closed when the wrappers are dropped.
                // SAFETY: notif_pipe points to valid FDs.
                #[expect(unsafe_code)]
                let (pipe_rd, pipe_wr) = unsafe {
                    (
                        SafeOwnedFd::from_raw_fd(notif_pipe.0),
                        SafeOwnedFd::from_raw_fd(notif_pipe.1),
                    )
                };
                // This thread only writes: drop the read end right away.
                drop(pipe_rd);
                // Send a one-byte notification through the write end.
                // NOTE(review): unwrap() panics this thread if the write
                // fails with an error that retry_on_eintr does not retry
                // (presumably anything but EINTR -- confirm).
                let buf = [42u8; 1];
                #[expect(clippy::disallowed_methods)]
                match retry_on_eintr(|| write(&pipe_wr, &buf)).unwrap() {
                    0 => return Err(Errno::EIO.into()), // Syd died before reading.
                    1 => {}
                    n => unreachable!("BUG: invalid pipe write of size {n}!"),
                }

                // Close the notification pipe.
                drop(pipe_wr);

                // Timeout thread needs to inherit the following FDs:
                // 1. Log FD.
                // The cast is expected to lose the sign; the value is
                // passed to closeexcept as the set of FDs to keep.
                #[expect(clippy::cast_sign_loss)]
                let set = &[crate::log::LOG_FD.load(Ordering::Relaxed) as libc::c_uint];
                if let Err(errno) = closeexcept(set) {
                    alert!("ctx": "boot", "op": "close_range_timeout_thread",
                        "msg": format!("failed to close range: {errno}"),
                        "err": errno as i32);
                    std::process::exit(101);
                }

                // Honour dry-run when exporting.
                // Confinement is skipped when ENV_SKIP_SCMP is set in
                // the environment or an export mode is requested.
                let dry_run =
                    secure_getenv(ENV_SKIP_SCMP).is_some() || ExportMode::from_env().is_some();

                // Confine `syd_out' thread.
                if !dry_run {
                    // We use exit_group(2) here to bail, because this
                    // confinement is a critical safety feature.
                    // `false` is the dry_run argument of prepare_confine,
                    // so the landlock(7) policy is applied as well.
                    let ctx = match Self::prepare_confine(
                        self.options,
                        &self.transit_uids,
                        &self.transit_gids,
                        false,
                    ) {
                        Ok(ctx) => ctx,
                        Err(error) => {
                            // Fall back to ENOSYS if the error carries no errno.
                            let errno = error.errno().unwrap_or(Errno::ENOSYS);
                            alert!("ctx": "boot", "op": "confine_out_thread",
                                "msg": format!("failed to confine: {error}"),
                                "err": errno as i32);
                            std::process::exit(101);
                        }
                    };

                    // Load seccomp(2) BPF into the kernel.
                    // We use exit_group(2) here to bail, because this
                    // confinement is a critical safety feature.
                    if let Err(error) = ctx.load() {
                        let errno = scmp2no(&error).unwrap_or(Errno::ENOSYS);
                        alert!("ctx": "boot", "op": "confine_out_thread",
                            "msg": format!("failed to confine: {error}"),
                            "err": errno as i32);
                        std::process::exit(101);
                    }

                    // Report whether confinement includes SROP mitigation:
                    // it is logged as "without" when safe setuid or safe
                    // setgid is enabled, and as "with" otherwise.
                    let safe_setid = self
                        .options
                        .intersects(Options::OPT_ALLOW_SAFE_SETUID | Options::OPT_ALLOW_SAFE_SETGID);
                    info!("ctx": "confine", "op": "confine_out_thread",
                        "msg": format!("timeout thread confined with{} SROP mitigation",
                            if safe_setid { "out" } else { "" }));
                } else {
                    // Dry run: no landlock(7), no seccomp(2).
                    error!("ctx": "confine", "op": "confine_out_thread",
                        "msg": "timeout thread is running unconfined in debug mode");
                }

                // Enter main loop.
                self.main()
            })
            // Map the io::Error of a failed spawn to an Errno.
            .map_err(|err| err2no(&err))
    }

    fn main(self) -> SydResult<()> {
        // Sleep for sandbox timeout.
        std::thread::sleep(self.tmout.into());

        // Check for exit notification.
        if self.should_exit.load(Ordering::Acquire) {
            return Ok(());
        }

        let tmout = self.tmout.to_string();
        alert!("ctx": "out", "op": "timeout_exceeded",
            "msg": format!("sandbox timeout `{tmout}' exceeded"),
            "dur": self.tmout.as_secs());
        std::process::exit(124);
    }

    /// Prepare the seccomp(2) filter which confines the timeouter thread.
    ///
    /// Unless `dry_run` is set, a landlock(7) policy is applied with
    /// `restrict_self` first; its result is ignored. The returned
    /// filter is NOT loaded here: the caller decides when to load it.
    ///
    /// `transit_uids` and `transit_gids` are only consulted when
    /// `options` enables safe setuid or safe setgid.
    ///
    /// Fails if the filter cannot be created or configured, or if a
    /// rule cannot be added.
    pub(crate) fn prepare_confine(
        options: Options,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
        dry_run: bool,
    ) -> SydResult<ScmpFilterContext> {
        // landlock(7) comes first: the sandbox is set up
        // to disallow all access. Skipped on dry runs.
        if !dry_run {
            let abi = crate::landlock::ABI::new_current();
            let errata = crate::landlock::Errata::query();
            let scoped_sig = errata.contains(Errata::SCOPED_SIGNAL_SAME_TGID);
            let policy = LandlockPolicy {
                scoped_abs: true,
                scoped_sig,
                ..Default::default()
            };
            // The result is deliberately discarded.
            let _ = policy.restrict_self(abi);
        }

        // Anything not explicitly allowed below kills the process.
        let mut filter = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // NO_NEW_PRIVS must be enforced before
        // the filter is loaded into the kernel.
        filter.set_ctl_nnp(true)?;

        // trace/allow_unsafe_exec_speculative:1 disables
        // the Speculative Store Bypass mitigations.
        filter.set_ctl_ssb(options.allow_unsafe_exec_speculative())?;

        // No thread synchronization of the filter:
        // other threads will self-confine.
        filter.set_ctl_tsync(false)?;

        // Bad architecture is handled like a bad system call: kill.
        filter.set_act_badarch(ScmpAction::KillProcess)?;

        // Ask for a binary tree sorted by syscall number if possible.
        let _ = filter.set_ctl_optimize(2);

        // Supported architectures are intentionally NOT added to the
        // filter (no seccomp_add_architectures call here): this
        // ensures Syd can never run a non-native system call, which
        // we do not need at all.

        // The rest of the open and stat family is denied with ENOSYS
        // rather than KillProcess.
        confine_scmp_open_stat(&mut filter, false /* openat2 */)?;

        // Safe fcntl(2) utility calls.
        confine_scmp_fcntl(&mut filter, OUT_FCNTL_OPS)?;

        // No executable memory.
        confine_scmp_wx_syd(&mut filter)?;

        // Writes to the log-fd.
        // No proc_pid_mem(5) access required here.
        confine_scmp_write(&mut filter, None, false)?;

        // Safe madvise(2) advice.
        confine_scmp_madvise(&mut filter)?;

        // Safe system calls.
        //
        // KCOV_SYSCALLS is empty in case `kcov` feature is disabled.
        let allowed = OUT_SYSCALLS
            .iter()
            .chain(ALLOC_SYSCALLS)
            .chain(FUTEX_SYSCALLS)
            .chain(GETID_SYSCALLS)
            .chain(KCOV_SYSCALLS)
            .chain(VDSO_SYSCALLS);
        for sysname in allowed {
            // Names that cannot be resolved are logged and skipped.
            let Ok(syscall) = ScmpSyscall::from_name(sysname) else {
                info!("ctx": "confine", "op": "allow_out_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
                continue;
            };
            filter.add_rule(ScmpAction::Allow, syscall)?;
        }

        // UID/GID changing system calls are only allowed on demand.
        let safe_setuid = options.allow_safe_setuid();
        let safe_setgid = options.allow_safe_setgid();
        if !(safe_setuid || safe_setgid) {
            return Ok(filter);
        }

        confine_scmp_setid(
            "out",
            &mut filter,
            safe_setuid,
            safe_setgid,
            transit_uids,
            transit_gids,
        )?;

        Ok(filter)
    }
}