syd 3.54.1

rock-solid application kernel
Documentation
//
// Syd: rock-solid application kernel
// src/workers/out.rs: `syd_out' timeouter thread
//
// Copyright (c) 2024, 2025, 2026 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

// SAFETY:
// 1. This module has been liberated from unsafe code!
// 2. This module forbids arithmetic side effects, et al.
#![forbid(unsafe_code)]
#![forbid(clippy::arithmetic_side_effects)]
#![forbid(clippy::cast_possible_truncation)]
#![forbid(clippy::cast_possible_wrap)]

use std::{sync::atomic::Ordering, thread};

use dur::Duration;
use libseccomp::{ScmpAction, ScmpFilterContext};
use nix::{
    errno::Errno,
    sched::{unshare, CloneFlags},
    unistd::{Gid, Uid},
};

use crate::{
    alert,
    config::*,
    confine::{
        confine_scmp_fcntl, confine_scmp_madvise, confine_scmp_open_stat, confine_scmp_setid,
        confine_scmp_write, confine_scmp_wx_syd, secure_getenv, ExportMode, Sydcall,
    },
    err::{err2no, scmp2no, SydJoinHandle, SydResult},
    error,
    fd::closeexcept,
    id::SydId,
    info,
    landlock::Errata,
    landlock_policy::LandlockPolicy,
    log_enabled,
    sandbox::Options,
    syslog::LogLevel,
};

/// State carried into the `syd_out' timeouter thread.
///
/// The thread sleeps for `timeout` and then terminates the whole
/// process with exit code 124.
#[derive(Clone)]
pub(crate) struct Timeouter {
    /// Sandbox timeout: how long the thread sleeps before exiting.
    timeout: Duration,
    /// Sandbox options consulted while building the seccomp(2) filter.
    options: Options,

    /// UID pairs handed to `confine_scmp_setid` when safe setuid/setgid
    /// is enabled.
    transit_uids: Vec<(Uid, Uid)>,
    /// GID pairs handed to `confine_scmp_setid` when safe setuid/setgid
    /// is enabled.
    transit_gids: Vec<(Gid, Gid)>,
}

impl Timeouter {
    /// Create a new `Timeouter`.
    ///
    /// The UID/GID transition slices are copied into owned vectors so
    /// the resulting value can later be moved into the spawned thread.
    pub(crate) fn new(
        timeout: Duration,
        options: Options,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
    ) -> Self {
        // Shadow the borrowed slices with owned copies.
        let transit_uids = Vec::from(transit_uids);
        let transit_gids = Vec::from(transit_gids);

        Self {
            timeout,
            options,
            transit_uids,
            transit_gids,
        }
    }

    /// Spawn the `syd_out' timeouter thread.
    ///
    /// Inside the new thread, in order:
    /// 1. `unshare(2)` is called with `CLONE_FS | CLONE_FILES | CLONE_SYSVSEM`.
    /// 2. `closeexcept` is called with the log fd as the only fd to keep.
    /// 3. Unless a dry-run is requested through the environment
    ///    (`ENV_SKIP_SCMP` or an export mode), the seccomp(2) filter
    ///    from `prepare_confine` is built and loaded.
    /// 4. `main` is entered, which sleeps for the timeout and exits.
    ///
    /// A failure in steps 1-3 is logged at alert level and the whole
    /// process exits with code 101.
    ///
    /// # Errors
    ///
    /// Returns the `Errno` produced by `err2no` when the thread itself
    /// cannot be spawned.
    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn try_spawn(self) -> Result<SydJoinHandle<()>, Errno> {
        thread::Builder::new()
            .name(SydId::get_name("syd_out").to_string())
            .stack_size(OUT_STACK_SIZE)
            .spawn(move || {
                // Use exit_group(2) here to bail, because this
                // unsharing is a critical safety feature.
                let flags =
                    CloneFlags::CLONE_FS | CloneFlags::CLONE_FILES | CloneFlags::CLONE_SYSVSEM;
                if let Err(errno) = unshare(flags) {
                    alert!("ctx": "boot", "op": "unshare_timeout_thread",
                        "msg": format!("failed to unshare(CLONE_FS|CLONE_FILES|CLONE_SYSVSEM): {errno}"),
                        "err": errno as i32);
                    std::process::exit(101);
                }

                // Timeout thread needs to inherit Log FD.
                // Everything else is closed; this happens after the
                // unshare(2) call above, which included CLONE_FILES.
                #[expect(clippy::cast_sign_loss)]
                let set = &[crate::log::LOG_FD.load(Ordering::Relaxed) as libc::c_uint];
                if let Err(errno) = closeexcept(set) {
                    alert!("ctx": "boot", "op": "close_range_timeout_thread",
                        "msg": format!("failed to close range: {errno}"),
                        "err": errno as i32);
                    std::process::exit(101);
                }

                // Honour dry-run when exporting.
                let dry_run =
                    secure_getenv(ENV_SKIP_SCMP).is_some() || ExportMode::from_env().is_some();

                // Confine `syd_out' thread.
                if !dry_run {
                    // We use exit_group(2) here to bail, because this
                    // confinement is a critical safety feature.
                    let ctx = match Self::prepare_confine(
                        self.options,
                        &self.transit_uids,
                        &self.transit_gids,
                        false,
                    ) {
                        Ok(ctx) => ctx,
                        Err(error) => {
                            // Fall back to ENOSYS when the error carries no errno.
                            let errno = error.errno().unwrap_or(Errno::ENOSYS);
                            alert!("ctx": "boot", "op": "confine_out_thread",
                                "msg": format!("failed to confine: {error}"),
                                "err": errno as i32);
                            std::process::exit(101);
                        }
                    };

                    // Load seccomp(2) BPF into the kernel.
                    // We use exit_group(2) here to bail, because this
                    // confinement is a critical safety feature.
                    if let Err(error) = ctx.load() {
                        let errno = scmp2no(&error).unwrap_or(Errno::ENOSYS);
                        alert!("ctx": "boot", "op": "confine_out_thread",
                            "msg": format!("failed to confine: {error}"),
                            "err": errno as i32);
                        std::process::exit(101);
                    }

                    // The message reads "without SROP mitigation" when
                    // safe setuid or safe setgid is enabled, and
                    // "with SROP mitigation" otherwise.
                    let safe_setid = self
                        .options
                        .intersects(Options::OPT_ALLOW_SAFE_SETUID | Options::OPT_ALLOW_SAFE_SETGID);
                    info!("ctx": "confine", "op": "confine_out_thread",
                        "msg": format!("timeout thread confined with{} SROP mitigation",
                            if safe_setid { "out" } else { "" }));
                } else {
                    error!("ctx": "confine", "op": "confine_out_thread",
                        "msg": "timeout thread is running unconfined in debug mode");
                }

                // Enter main loop.
                self.main()
            })
            .map_err(|err| err2no(&err))
    }

    fn main(self) -> SydResult<()> {
        // Sleep for sandbox timeout.
        std::thread::sleep(self.timeout.into());

        if log_enabled!(LogLevel::Alert) {
            let timeout = self.timeout.to_string();
            alert!("ctx": "out", "op": "timeout_exceeded",
                "msg": format!("sandbox timeout `{timeout}' exceeded"),
                "dur": self.timeout.as_secs());
        }

        // Good bye cruel world.
        std::process::exit(124);
    }

    /// Build the seccomp(2) filter that confines the Timeouter thread.
    ///
    /// Unless `dry_run` is set, a landlock(7) policy is first applied
    /// through `restrict_self`; its result is ignored. The returned
    /// filter context is not loaded here, loading is left to the caller.
    ///
    /// `transit_uids` and `transit_gids` are only used when safe setuid
    /// or safe setgid is enabled in `options`.
    pub(crate) fn prepare_confine(
        options: Options,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
        dry_run: bool,
    ) -> SydResult<ScmpFilterContext> {
        if !dry_run {
            // Set up a landlock(7) sandbox to disallow all access.
            // This is best-effort: the outcome is not checked.
            let abi = crate::landlock::ABI::new_current();
            let scoped_sig = Errata::query().contains(Errata::SCOPED_SIGNAL_SAME_TGID);
            let policy = LandlockPolicy {
                scoped_abs: true,
                scoped_sig,
                ..Default::default()
            };
            let _ = policy.restrict_self(abi);
        }

        // Anything not explicitly allowed below kills the process.
        let mut filter = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // NO_NEW_PRIVS must be enforced before the filter
        // is loaded into the kernel.
        filter.set_ctl_nnp(true)?;

        // trace/allow_unsafe_exec_speculative:1 disables the
        // Speculative Store Bypass mitigations.
        filter.set_ctl_ssb(options.allow_unsafe_exec_speculative())?;

        // No thread synchronization of the filter:
        // every other thread confines itself.
        filter.set_ctl_tsync(false)?;

        // A bad architecture is as fatal as a bad system call.
        filter.set_act_badarch(ScmpAction::KillProcess)?;

        // Ask for a binary tree sorted by syscall number; not fatal if unavailable.
        let _ = filter.set_ctl_optimize(2);

        // Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut filter)?;

        // trace/allow_unsafe_nocookie:1 turns syscall argument
        // cookies off at startup.
        let restrict_cookie = !options.allow_unsafe_nocookie();

        // Rest of the open and stat family is denied with ENOSYS, not KillProcess.
        confine_scmp_open_stat(&mut filter, false /*openat2*/)?;

        // Safe fcntl(2) utility calls.
        confine_scmp_fcntl(&mut filter, OUT_FCNTL_OPS)?;

        // No executable memory.
        confine_scmp_wx_syd(&mut filter)?;

        // Writes go to the log-fd only;
        // proc_pid_mem(5) access is not needed here.
        confine_scmp_write(&mut filter, None, true /*log_only*/, restrict_cookie)?;

        // Safe madvise(2) advice.
        confine_scmp_madvise(&mut filter)?;

        // Safe system calls.
        //
        // KCOV_SYSCALLS is empty in case `kcov` feature is disabled.
        let allowlist = OUT_SYSCALLS
            .iter()
            .chain(ALLOC_SYSCALLS)
            .chain(FUTEX_SYSCALLS)
            .chain(GETID_SYSCALLS)
            .chain(KCOV_SYSCALLS)
            .chain(VDSO_SYSCALLS);
        for sysname in allowlist {
            if let Ok(syscall) = Sydcall::from_name(sysname) {
                filter.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                // Names that do not resolve are logged and skipped.
                info!("ctx": "confine", "op": "allow_out_syscall",
                    "msg": format!("invalid or unsupported syscall {sysname}"));
            }
        }

        // UID/GID changing system calls, only when requested.
        let safe_setuid = options.allow_safe_setuid();
        let safe_setgid = options.allow_safe_setgid();
        if safe_setuid || safe_setgid {
            confine_scmp_setid(
                "out",
                &mut filter,
                safe_setuid,
                safe_setgid,
                transit_uids,
                transit_gids,
            )?;
        }

        Ok(filter)
    }
}