syd 3.53.0

rock-solid application kernel
Documentation
//
// Syd: rock-solid application kernel
// src/workers/not.rs: `syd_not' notifier thread
//
// Copyright (c) 2026 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

// SAFETY:
// 1. This module has been liberated from unsafe code!
// 2. This module forbids arithmetic side effects, et al.
#![forbid(unsafe_code)]
#![forbid(clippy::arithmetic_side_effects)]
#![forbid(clippy::cast_possible_truncation)]
#![forbid(clippy::cast_possible_wrap)]

use std::{
    os::fd::RawFd,
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc,
    },
    thread,
};

use flume::{TryRecvError, TrySendError};
use libseccomp::{ScmpAction, ScmpFilterContext};
use nix::{
    errno::Errno,
    sched::{unshare, CloneFlags},
    unistd::{Gid, Uid},
};

use crate::{
    alert,
    cache::SysNotif,
    compat::seccomp_notif_resp,
    config::*,
    confine::{
        confine_scmp_fcntl, confine_scmp_ioctl_not, confine_scmp_madvise, confine_scmp_open_stat,
        confine_scmp_prctl, confine_scmp_setid, confine_scmp_write, confine_scmp_wx_syd,
        secure_getenv, ExportMode, ScmpNotifReq, Sydcall,
    },
    err::{err2no, scmp2no, SydJoinHandle, SydResult},
    error,
    fd::closeexcept,
    fs::{seccomp_notify_receive, seccomp_notify_respond},
    info,
    landlock::Errata,
    landlock_policy::LandlockPolicy,
    sandbox::Options,
    workers::WorkerCache,
};

/// State owned by the `syd_not' notifier thread.
///
/// The thread receives seccomp notifications from `seccomp_fd` and
/// forwards them over the channel handed to `try_spawn`.
#[derive(Clone)]
pub(crate) struct Notifier {
    /// Seccomp-notify file descriptor: requests are received from it and
    /// denial responses are sent back through it.
    seccomp_fd: RawFd,
    /// Sandbox options consulted when building the thread's seccomp filter.
    options: Options,
    /// UID pairs forwarded to `confine_scmp_setid`.
    /// NOTE(review): presumably the permitted (from, to) transitions -- confirm.
    transit_uids: Vec<(Uid, Uid)>,
    /// GID pairs forwarded to `confine_scmp_setid`.
    /// NOTE(review): presumably the permitted (from, to) transitions -- confirm.
    transit_gids: Vec<(Gid, Gid)>,
    /// Shared flag checked at the top of every main-loop iteration;
    /// once it reads `true` the thread returns.
    should_exit: Arc<AtomicBool>,
    /// Shared worker cache; its `sysreq_queue` is polled to make room
    /// when the notification channel is full.
    cache: Arc<WorkerCache>,
}

impl Notifier {
    pub(crate) fn new(
        seccomp_fd: RawFd,
        options: Options,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
        should_exit: Arc<AtomicBool>,
        cache: Arc<WorkerCache>,
    ) -> Self {
        Self {
            options,
            seccomp_fd,
            should_exit,
            cache,
            transit_uids: transit_uids.to_vec(),
            transit_gids: transit_gids.to_vec(),
        }
    }

    /// Spawn the `syd_not' notifier thread and return its join handle.
    ///
    /// Inside the new thread, in order:
    /// 1. `unshare(CLONE_FS|CLONE_FILES|CLONE_SYSVSEM)` is called.
    /// 2. `closeexcept` is called with the seccomp-notify FD and the log FD
    ///    as the set of descriptors to keep.
    /// 3. Unless `ENV_SKIP_SCMP` is set or an export mode is active, the
    ///    thread builds its seccomp filter with `prepare_confine` and loads it.
    /// 4. The main loop runs, forwarding notifications over `sysreq_notif`.
    ///
    /// A failure in steps 1-3 is logged with `alert!` and terminates the
    /// whole process with exit code 101.
    ///
    /// # Errors
    ///
    /// Returns the errno derived from the spawn error (via `err2no`) when
    /// the thread cannot be created.
    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn try_spawn(self, sysreq_notif: SysNotif) -> Result<SydJoinHandle<()>, Errno> {
        thread::Builder::new()
            .name("syd_not".to_string())
            .stack_size(NOT_STACK_SIZE)
            .spawn(move || {
                // Use exit_group(2) here to bail, because this
                // unsharing is a critical safety feature.
                if let Err(errno) = unshare(CloneFlags::CLONE_FS | CloneFlags::CLONE_FILES | CloneFlags::CLONE_SYSVSEM) {
                    alert!("ctx": "boot", "op": "unshare_not_thread",
                        "msg": format!("failed to unshare(CLONE_FS|CLONE_FILES|CLONE_SYSVSEM): {errno}"),
                        "err": errno as i32);
                    std::process::exit(101);
                }

                // Notifier thread needs to inherit the following FDs:
                // 1. Seccomp-notify FD.
                // 2. Log FD.
                // We have to sort the set as the FDs are randomized.
                #[expect(clippy::cast_sign_loss)]
                let mut set = vec![
                    self.seccomp_fd as libc::c_uint,
                    crate::log::LOG_FD.load(Ordering::Relaxed) as libc::c_uint,
                ];
                set.sort_unstable();
                if let Err(errno) = closeexcept(&set) {
                    alert!("ctx": "boot", "op": "close_range_not_thread",
                        "msg": format!("failed to close range: {errno}"),
                        "err": errno as i32);
                    std::process::exit(101);
                }
                drop(set);

                // Honour dry-run when exporting.
                let dry_run =
                    secure_getenv(ENV_SKIP_SCMP).is_some() || ExportMode::from_env().is_some();

                // Confine `syd_not' thread.
                if !dry_run {
                    // We use exit_group(2) here to bail, because this
                    // confinement is a critical safety feature.
                    let ctx = match Self::prepare_confine(
                        self.seccomp_fd,
                        self.options,
                        &self.transit_uids,
                        &self.transit_gids,
                        false,
                    ) {
                        Ok(ctx) => ctx,
                        Err(error) => {
                            let errno = error.errno().unwrap_or(Errno::ENOSYS);
                            alert!("ctx": "boot", "op": "confine_not_thread",
                                "msg": format!("failed to confine: {error}"),
                                "err": errno as i32);
                            std::process::exit(101);
                        }
                    };

                    // Load seccomp(2) BPF into the kernel.
                    // We use exit_group(2) here to bail, because this
                    // confinement is a critical safety feature.
                    if let Err(error) = ctx.load() {
                        let errno = scmp2no(&error).unwrap_or(Errno::ENOSYS);
                        // Log under the notifier's own op name so the failure
                        // is not attributed to the interrupt thread.
                        alert!("ctx": "boot", "op": "confine_not_thread",
                            "msg": format!("failed to confine: {error}"),
                            "err": errno as i32);
                        std::process::exit(101);
                    }

                    // The log message reports "without" SROP mitigation
                    // when either safe set-id option is enabled.
                    let safe_setid = self
                        .options
                        .intersects(Options::OPT_ALLOW_SAFE_SETUID | Options::OPT_ALLOW_SAFE_SETGID);
                    info!("ctx": "confine", "op": "confine_not_thread",
                        "msg": format!("notify thread confined with{} SROP mitigation",
                            if safe_setid { "out" } else { "" }));
                } else {
                    error!("ctx": "confine", "op": "confine_not_thread",
                        "msg": "notify thread is running unconfined in debug mode");
                }

                // Enter main loop.
                self.main(sysreq_notif)
            })
            .map_err(|err| err2no(&err))
    }

    /// Notifier main loop.
    ///
    /// Keeps receiving seccomp notifications and queueing them on
    /// `sysreq_notif` until the shared exit flag is observed, then
    /// returns `Ok(())`. Errors from `receive` or `queue` end the loop
    /// and are propagated to the caller.
    fn main(self, sysreq_notif: SysNotif) -> SydResult<()> {
        // The exit flag is re-checked before every receive attempt.
        while !self.should_exit.load(Ordering::Acquire) {
            // `None` means the receive failed in a way we ignore; just retry.
            if let Some(request) = self.receive()? {
                self.queue(&sysreq_notif, request)?;
            }
        }

        Ok(())
    }

    /// Hand `req` over to the notification channel without blocking.
    ///
    /// While the channel is full, one already queued request is pulled
    /// from `self.cache.sysreq_queue` (if any is available) and denied
    /// with `EINTR`, then the send is retried.
    ///
    /// Returns `ENOTCONN` when either end reports a disconnect.
    fn queue(&self, sysreq_notif: &SysNotif, req: ScmpNotifReq) -> Result<(), Errno> {
        let backlog = &self.cache.sysreq_queue;

        loop {
            // First try to enqueue the new request.
            match sysreq_notif.try_send(req) {
                Ok(()) => break Ok(()),
                Err(TrySendError::Disconnected(_)) => break Err(Errno::ENOTCONN),
                Err(TrySendError::Full(_)) => {}
            }

            // Channel is full: make room by denying a queued request.
            match backlog.try_recv() {
                Ok(stale) => self.deny_syscall(stale.id, Errno::EINTR),
                Err(TryRecvError::Disconnected) => break Err(Errno::ENOTCONN),
                // Nothing to evict right now; retry the send.
                Err(TryRecvError::Empty) => {}
            }
        }
    }

    /// Receive one seccomp notification from the notify descriptor.
    ///
    /// * `Ok(Some(request))`: a request was received.
    /// * `Err(EBADF)`: the file descriptor was closed; the caller stops.
    /// * `Ok(None)`: any other error, which cannot be handled here
    ///   (e.g. EINTR|ENOENT: task is killed mid-way).
    fn receive(&self) -> Result<Option<ScmpNotifReq>, Errno> {
        seccomp_notify_receive(self.seccomp_fd)
            .map(Some)
            .or_else(|errno| {
                if errno == Errno::EBADF {
                    // Only a closed descriptor is reported to the caller.
                    Err(errno)
                } else {
                    Ok(None)
                }
            })
    }

    /// Answer the notification `id` with a failure carrying `errno`.
    ///
    /// The response is best-effort: any error from sending it is dropped.
    fn deny_syscall(&self, id: u64, errno: Errno) {
        // The response carries the negated errno; fall back to -ENOSYS
        // should the negation overflow.
        let negated = (errno as i32).checked_neg().unwrap_or(-libc::ENOSYS);

        let reply = seccomp_notif_resp {
            id,
            val: 0,
            error: negated,
            flags: 0,
        };
        let reply_ptr = std::ptr::addr_of!(reply);

        // EINTR is not retried because it may mean child is signaled.
        // ENOENT means child died mid-way.
        // Nothing else we can do on errors here.
        let _ = seccomp_notify_respond(self.seccomp_fd, reply_ptr);
    }

    /// Build the seccomp(2) filter that confines the Notifier thread.
    ///
    /// The filter is only prepared here, not loaded: the returned context
    /// is handed back so the caller decides when to load it.
    ///
    /// # Arguments
    ///
    /// * `seccomp_fd` - seccomp-notify descriptor, passed on to
    ///   `confine_scmp_ioctl_not`.
    /// * `options` - sandbox options; they select cookie restriction,
    ///   the SSB control value and whether set-id rules are added.
    /// * `transit_uids` / `transit_gids` - passed on to `confine_scmp_setid`
    ///   when safe setuid or safe setgid is enabled.
    /// * `dry_run` - when `true`, the landlock(7) policy is not applied;
    ///   the seccomp filter is built either way.
    ///
    /// # Errors
    ///
    /// Any error from creating or configuring the filter context, from the
    /// `confine_scmp_*` helpers, or from adding a rule is propagated.
    /// Failures of the landlock(7) restriction and of the optimize control
    /// are deliberately ignored.
    pub(crate) fn prepare_confine(
        seccomp_fd: RawFd,
        options: Options,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
        dry_run: bool,
    ) -> SydResult<ScmpFilterContext> {
        if !dry_run {
            // Set up a landlock(7) sandbox to disallow all access.
            let abi = crate::landlock::ABI::new_current();
            let errata = crate::landlock::Errata::query();
            let policy = LandlockPolicy {
                scoped_abs: true,
                // Signal scoping is requested only when this erratum is reported.
                scoped_sig: errata.contains(Errata::SCOPED_SIGNAL_SAME_TGID),
                ..Default::default()
            };
            // Best-effort: the result of the restriction is ignored.
            let _ = policy.restrict_self(abi);
        }

        // Cookie restriction is on unless explicitly allowed off by options.
        let restrict_cookie = !options.allow_unsafe_nocookie();

        // Create seccomp filter with default action.
        let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // Enforce the NO_NEW_PRIVS functionality before
        // loading the seccomp filter into the kernel.
        ctx.set_ctl_nnp(true)?;

        // Disable Speculative Store Bypass mitigations
        // with trace/allow_unsafe_exec_speculative:1
        ctx.set_ctl_ssb(options.allow_unsafe_exec_speculative())?;

        // DO NOT synchronize filter to all threads.
        // Other threads will self-confine.
        ctx.set_ctl_tsync(false)?;

        // We kill for bad system call and bad arch.
        ctx.set_act_badarch(ScmpAction::KillProcess)?;

        // Use a binary tree sorted by syscall number if possible.
        // Failure to set the optimization level is ignored.
        let _ = ctx.set_ctl_optimize(2);

        // Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut ctx)?;

        // Deny rest of open and stat family with ENOSYS rather than KillProcess.
        confine_scmp_open_stat(&mut ctx, true /*openat2*/)?;

        // Allow safe seccomp ioctl(2) requests.
        confine_scmp_ioctl_not(&mut ctx, restrict_cookie, seccomp_fd)?;

        // Allow safe fcntl(2) utility calls.
        confine_scmp_fcntl(&mut ctx, NOT_FCNTL_OPS)?;

        // Allow safe prctl(2) operations.
        confine_scmp_prctl(&mut ctx, NOT_PRCTL_OPS)?;

        // Prevent executable memory.
        confine_scmp_wx_syd(&mut ctx)?;

        // Allow writes to the log-fd.
        // No proc_pid_mem(5) access required here.
        confine_scmp_write(&mut ctx, None, true /*log_only*/, restrict_cookie)?;

        // Allow safe madvise(2) advice.
        confine_scmp_madvise(&mut ctx)?;

        // Allow the notifier's own safe system calls together with the
        // allocator, futex, getid, kcov and vDSO system call lists.
        //
        // KCOV_SYSCALLS is empty in case `kcov` feature is disabled.
        for sysname in NOT_SYSCALLS
            .iter()
            .chain(ALLOC_SYSCALLS)
            .chain(FUTEX_SYSCALLS)
            .chain(GETID_SYSCALLS)
            .chain(KCOV_SYSCALLS)
            .chain(VDSO_SYSCALLS)
        {
            match Sydcall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    // Names that cannot be resolved are logged and skipped,
                    // not treated as fatal.
                    info!("ctx": "confine", "op": "allow_not_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // Allow UID/GID changing system calls as necessary.
        let safe_setuid = options.allow_safe_setuid();
        let safe_setgid = options.allow_safe_setgid();
        if safe_setuid || safe_setgid {
            confine_scmp_setid(
                "not",
                &mut ctx,
                safe_setuid,
                safe_setgid,
                transit_uids,
                transit_gids,
            )?;
        }

        Ok(ctx)
    }
}