cellos-supervisor 0.5.1

CellOS execution-cell runner — boots cells in Firecracker microVMs or gVisor, enforces narrow typed authority, emits signed CloudEvents.
Documentation
//! Optional seccomp(2) filter applied in the `spec.run` child after namespace hooks, before `execve`.
//! Supports either an operator-supplied classic BPF program or a bundled baseline profile.

use std::path::{Path, PathBuf};

/// Largest filter program accepted by `validate_seccomp_bpf_bytes`, in bytes.
///
/// Written as 4096 * 8, i.e. 4096 instructions assuming 8-byte `sock_filter` entries.
/// NOTE(review): seccomp(2) documents `BPF_MAXINSNS` (4096 instructions) as the per-filter
/// limit, so this looks like the kernel limit itself rather than a tighter bound — confirm.
const MAX_FILTER_LEN: usize = 4096 * 8;
/// Size in bytes of one classic BPF instruction (`struct sock_filter`).
const SOCK_FILTER_SIZE: usize = std::mem::size_of::<libc::sock_filter>();
// Byte offsets passed to the absolute-load instructions in the baseline program to read the
// syscall number and the architecture field of the seccomp data.
const SECCOMP_DATA_NR_OFFSET: u32 = 0;
const SECCOMP_DATA_ARCH_OFFSET: u32 = 4;
// Filter return values used by the baseline program. `SECCOMP_RET_ERRNO` is OR-ed with the
// errno to report.
const SECCOMP_RET_ALLOW: u32 = 0x7fff_0000;
const SECCOMP_RET_ERRNO: u32 = 0x0005_0000;
const SECCOMP_RET_KILL_PROCESS: u32 = 0x8000_0000;
// Classic BPF opcode components; an instruction's `code` is the sum of the parts it needs
// (for example `BPF_LD + BPF_W + BPF_ABS`).
const BPF_LD: u16 = 0x00;
const BPF_W: u16 = 0x00;
const BPF_ABS: u16 = 0x20;
const BPF_JMP: u16 = 0x05;
const BPF_JEQ: u16 = 0x10;
const BPF_K: u16 = 0x00;
const BPF_RET: u16 = 0x06;
// `prctl` option set by `apply_seccomp_filter` before the filter is installed.
const PR_SET_NO_NEW_PRIVS: libc::c_int = 38;

/// Profile source selected through `CELLOS_SUBPROCESS_SECCOMP`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SeccompSelection {
    /// The bundled baseline denylist profile.
    Baseline,
    /// An operator-supplied classic BPF program read from `CELLOS_SECCOMP_BPF_PATH`.
    ExternalBpf,
}

// Architecture value the baseline filter compares against the loaded arch field; a mismatch
// kills the process. Exactly one of the three definitions below is compiled in.
#[cfg(target_arch = "x86_64")]
const SECCOMP_AUDIT_ARCH: u32 = 0xc000_003e;
#[cfg(target_arch = "aarch64")]
const SECCOMP_AUDIT_ARCH: u32 = 0xc000_00b7;
// Placeholder for every other target: `baseline_seccomp_program` treats 0 as "unsupported"
// and returns an error instead of building a filter.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
const SECCOMP_AUDIT_ARCH: u32 = 0;

/// Read the external filter location from `CELLOS_SECCOMP_BPF_PATH`.
///
/// Returns `None` when the variable is unset or set to the empty string; otherwise the raw
/// value is returned as a path without any further checks.
pub(crate) fn seccomp_bpf_path_from_env() -> Option<PathBuf> {
    std::env::var_os("CELLOS_SECCOMP_BPF_PATH")
        .filter(|raw| !raw.is_empty())
        .map(PathBuf::from)
}

/// Report whether the environment asks for seccomp filtering.
///
/// True when `CELLOS_SECCOMP_BPF_PATH` is set to a non-empty value, or when
/// `CELLOS_SUBPROCESS_SECCOMP` is readable and, after trimming whitespace, is anything other
/// than empty, `0`, `off` or `none` (keywords compared ASCII case-insensitively). The value
/// is not validated here, so an unrecognised profile name still counts as a request.
#[allow(dead_code)]
pub(crate) fn seccomp_requested_from_env() -> bool {
    if seccomp_bpf_path_from_env().is_some() {
        return true;
    }
    let Ok(raw) = std::env::var("CELLOS_SUBPROCESS_SECCOMP") else {
        return false;
    };
    let value = raw.trim();
    let switched_off = value.is_empty()
        || value == "0"
        || value.eq_ignore_ascii_case("off")
        || value.eq_ignore_ascii_case("none");
    !switched_off
}

/// Build the seccomp program requested by the environment, if any.
///
/// Returns `Ok(None)` when neither variable asks for a filter.
///
/// Supported forms:
/// - `CELLOS_SUBPROCESS_SECCOMP=baseline|default|1` -> bundled baseline denylist
/// - `CELLOS_SUBPROCESS_SECCOMP=bpf` + `CELLOS_SECCOMP_BPF_PATH=/path/to/filter.bpf`
/// - legacy: `CELLOS_SECCOMP_BPF_PATH=/path/to/filter.bpf` with profile unset
///
/// # Errors
/// Returns a message when the profile value is not recognised, when the baseline profile is
/// combined with an external path, when the external profile is selected without a path, or
/// when building/loading the chosen program fails.
pub(crate) fn load_seccomp_program_from_env() -> Result<Option<Vec<u8>>, String> {
    let selection = seccomp_selection_from_env()?;
    let bpf_path = seccomp_bpf_path_from_env();

    let Some(profile) = selection else {
        // Legacy form: no profile selected, so a bare path (if present) decides.
        return match bpf_path {
            Some(path) => load_seccomp_program_from_file(&path).map(Some),
            None => Ok(None),
        };
    };

    match profile {
        SeccompSelection::Baseline => match bpf_path {
            None => Ok(Some(baseline_seccomp_program()?)),
            Some(path) => Err(format!(
                "CELLOS_SUBPROCESS_SECCOMP=baseline cannot be combined with CELLOS_SECCOMP_BPF_PATH ({})",
                path.display()
            )),
        },
        SeccompSelection::ExternalBpf => match bpf_path {
            Some(path) => load_seccomp_program_from_file(&path).map(Some),
            None => Err(String::from(
                "CELLOS_SUBPROCESS_SECCOMP=bpf requires CELLOS_SECCOMP_BPF_PATH",
            )),
        },
    }
}

/// Parse `CELLOS_SUBPROCESS_SECCOMP` into a profile selection.
///
/// Surrounding whitespace is ignored and keywords are matched ASCII case-insensitively:
/// - unset, empty, `0`, `off`, `none` -> `Ok(None)` (no profile selected)
/// - `1`, `default`, `baseline` -> `SeccompSelection::Baseline`
/// - `bpf`, `path`, `file` -> `SeccompSelection::ExternalBpf`
///
/// # Errors
/// Returns a message for any other value, and for a value that is not valid Unicode. The
/// latter is rejected instead of being treated as "unset" so that a mangled setting cannot
/// silently turn the filter off.
fn seccomp_selection_from_env() -> Result<Option<SeccompSelection>, String> {
    /// True when `value` equals one of `keywords`, ignoring ASCII case.
    fn matches_any(value: &str, keywords: &[&str]) -> bool {
        keywords.iter().any(|kw| value.eq_ignore_ascii_case(kw))
    }

    let raw = match std::env::var("CELLOS_SUBPROCESS_SECCOMP") {
        Ok(raw) => raw,
        Err(std::env::VarError::NotPresent) => return Ok(None),
        // Fail closed: the operator set *something*, we just cannot read it.
        Err(std::env::VarError::NotUnicode(bad)) => {
            return Err(format!(
                "CELLOS_SUBPROCESS_SECCOMP is not valid Unicode (got {bad:?})"
            ));
        }
    };
    let value = raw.trim();

    if value.is_empty() || matches_any(value, &["0", "off", "none"]) {
        Ok(None)
    } else if matches_any(value, &["1", "default", "baseline"]) {
        Ok(Some(SeccompSelection::Baseline))
    } else if matches_any(value, &["bpf", "path", "file"]) {
        Ok(Some(SeccompSelection::ExternalBpf))
    } else {
        // The list names every accepted spelling, including the `path`/`file` aliases.
        Err(format!(
            "CELLOS_SUBPROCESS_SECCOMP must be one of: 1, default, baseline, bpf, path, file, 0, off, none (got {value:?})"
        ))
    }
}

/// Read a raw classic BPF program from `path` and validate its size.
///
/// # Errors
/// Returns a message prefixed with `CELLOS_SECCOMP_BPF_PATH` and the path when the file
/// cannot be read, or the validation message when the contents are rejected.
fn load_seccomp_program_from_file(path: &Path) -> Result<Vec<u8>, String> {
    match std::fs::read(path) {
        Ok(bytes) => validate_seccomp_bpf_bytes(&bytes).map(|()| bytes),
        Err(e) => Err(format!("CELLOS_SECCOMP_BPF_PATH {}: {e}", path.display())),
    }
}

/// Size checks for a raw classic BPF program (an array of `sock_filter` entries).
///
/// Only the byte length is inspected: the program must be non-empty, a whole number of
/// `sock_filter` entries, and no larger than `MAX_FILTER_LEN`. The instructions themselves
/// are not examined.
///
/// # Errors
/// Returns a message describing the first check that failed, in the order listed above.
pub(crate) fn validate_seccomp_bpf_bytes(bytes: &[u8]) -> Result<(), String> {
    let len = bytes.len();
    match len {
        0 => Err("seccomp BPF program is empty".into()),
        _ if len % SOCK_FILTER_SIZE != 0 => Err(format!(
            "seccomp BPF length {} is not a multiple of {} (sock_filter size)",
            len, SOCK_FILTER_SIZE
        )),
        _ if len > MAX_FILTER_LEN => Err(format!(
            "seccomp BPF program too large (max {MAX_FILTER_LEN} bytes)"
        )),
        _ => Ok(()),
    }
}

/// Build the bundled baseline denylist as raw classic BPF bytes.
///
/// Program layout:
/// 1. Load the architecture field; kill the process unless it equals `SECCOMP_AUDIT_ARCH`.
/// 2. Load the syscall number.
/// 3. On x86_64 only: fail with `EPERM` any syscall number that has the x32 ABI bit set.
/// 4. For each entry of `baseline_blocked_syscalls`, fail with `EPERM` on a match.
/// 5. Allow everything else.
///
/// Step 3 exists because x32 syscalls report the same architecture value as native x86_64
/// syscalls but carry `__X32_SYSCALL_BIT` in the number. Without it, a denylist keyed on the
/// native numbers alone could be bypassed by issuing the blocked call through the x32 ABI
/// (see seccomp(2)).
///
/// # Errors
/// Returns a message on architectures other than x86_64 and aarch64, or when the assembled
/// program fails `validate_seccomp_bpf_bytes`.
fn baseline_seccomp_program() -> Result<Vec<u8>, String> {
    if SECCOMP_AUDIT_ARCH == 0 {
        return Err(
            "bundled baseline seccomp profile currently supports x86_64 and aarch64 only".into(),
        );
    }

    // Shared deny action for both the x32 guard and the per-syscall entries.
    let deny = || stmt(BPF_RET + BPF_K, SECCOMP_RET_ERRNO | (libc::EPERM as u32));

    let blocked = baseline_blocked_syscalls();
    // 4 prologue + 2 x32 guard + 2 per blocked syscall + 1 final allow.
    let mut filters = Vec::with_capacity(7 + 2 * blocked.len());

    // Architecture gate: equal -> skip the kill, otherwise fall through into it.
    filters.push(stmt(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_ARCH_OFFSET));
    filters.push(jump(BPF_JMP + BPF_JEQ + BPF_K, SECCOMP_AUDIT_ARCH, 1, 0));
    filters.push(stmt(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS));

    filters.push(stmt(BPF_LD + BPF_W + BPF_ABS, SECCOMP_DATA_NR_OFFSET));

    if cfg!(target_arch = "x86_64") {
        // Classic BPF "jump if A >= k" (unsigned) and the x32 marker bit from seccomp(2).
        const BPF_JGE: u16 = 0x30;
        const X32_SYSCALL_BIT: u32 = 0x4000_0000;
        // nr >= bit -> fall into the deny; otherwise skip it.
        filters.push(jump(BPF_JMP + BPF_JGE + BPF_K, X32_SYSCALL_BIT, 0, 1));
        filters.push(deny());
    }

    for nr in blocked {
        // Match -> fall into the deny; no match -> skip it and try the next entry.
        // Syscall numbers are small non-negative values, so the narrowing cast is lossless.
        filters.push(jump(BPF_JMP + BPF_JEQ + BPF_K, *nr as u32, 0, 1));
        filters.push(deny());
    }

    filters.push(stmt(BPF_RET + BPF_K, SECCOMP_RET_ALLOW));
    sock_filter_bytes(&filters)
}

/// Syscall numbers the baseline profile answers with `EPERM`.
///
/// The order here is the order in which `baseline_seccomp_program` emits its comparisons.
fn baseline_blocked_syscalls() -> &'static [libc::c_long] {
    const BLOCKED: &[libc::c_long] = &[
        // Inspecting or modifying other processes.
        libc::SYS_ptrace,
        libc::SYS_process_vm_readv,
        libc::SYS_process_vm_writev,
        // eBPF and performance monitoring.
        libc::SYS_bpf,
        libc::SYS_perf_event_open,
        // Kernel keyring.
        libc::SYS_keyctl,
        libc::SYS_add_key,
        libc::SYS_request_key,
        // Mount table changes.
        libc::SYS_mount,
        libc::SYS_umount2,
        libc::SYS_pivot_root,
        // Namespace changes.
        libc::SYS_setns,
        libc::SYS_unshare,
        // Kernel module and kernel image loading.
        libc::SYS_init_module,
        libc::SYS_finit_module,
        libc::SYS_delete_module,
        libc::SYS_kexec_load,
    ];
    BLOCKED
}

/// Build a non-branching classic BPF instruction: opcode `code` with operand `k` and both
/// jump offsets zero.
fn stmt(code: u16, k: u32) -> libc::sock_filter {
    let (jt, jf) = (0, 0);
    libc::sock_filter { code, jt, jf, k }
}

fn jump(code: u16, k: u32, jt: u8, jf: u8) -> libc::sock_filter {
    libc::sock_filter { code, jt, jf, k }
}

/// Serialise `filters` into the raw byte form expected by `apply_seccomp_filter`.
///
/// Each instruction is written field by field in declaration order (`code`, `jt`, `jf`,
/// `k`) using native endianness, which reproduces the in-memory bytes of the `sock_filter`
/// array. The result is size-checked before it is returned.
///
/// # Errors
/// Returns the message from `validate_seccomp_bpf_bytes` when the serialised program is
/// rejected (for example when `filters` is empty).
fn sock_filter_bytes(filters: &[libc::sock_filter]) -> Result<Vec<u8>, String> {
    let mut bytes = Vec::with_capacity(std::mem::size_of_val(filters));
    for insn in filters {
        bytes.extend_from_slice(&insn.code.to_ne_bytes());
        bytes.push(insn.jt);
        bytes.push(insn.jf);
        bytes.extend_from_slice(&insn.k.to_ne_bytes());
    }
    validate_seccomp_bpf_bytes(&bytes)?;
    Ok(bytes)
}

/// Install `program` as a classic BPF seccomp filter on the calling thread (intended for the
/// child before exec).
///
/// The program is size-checked first, then `PR_SET_NO_NEW_PRIVS` is set, then the filter is
/// installed with `seccomp(SECCOMP_SET_MODE_FILTER)` and no flags.
///
/// # Errors
/// Returns `InvalidInput` when `program` fails `validate_seccomp_bpf_bytes`, otherwise the
/// OS error reported by `prctl` or `seccomp`.
pub(crate) fn apply_seccomp_filter(program: &[u8]) -> Result<(), std::io::Error> {
    const SECCOMP_SET_MODE_FILTER: libc::c_uint = 1;

    if let Err(reason) = validate_seccomp_bpf_bytes(program) {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            reason,
        ));
    }

    // SAFETY: `prctl` is called with integer arguments only; no memory is shared with it.
    let no_new_privs = unsafe { libc::prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
    if no_new_privs != 0 {
        return Err(std::io::Error::last_os_error());
    }

    // Validation caps the byte length at `MAX_FILTER_LEN`, so the instruction count is
    // expected to fit the 16-bit `len` field.
    let n_insn = program.len() / SOCK_FILTER_SIZE;
    let fprog = libc::sock_fprog {
        len: n_insn as libc::c_ushort,
        filter: program.as_ptr().cast_mut().cast(),
    };

    // SAFETY: `fprog` lives on this stack frame and `program` is borrowed for the whole
    // function, so both pointers handed to the kernel stay valid for the duration of the call.
    let rc = unsafe {
        libc::syscall(
            libc::SYS_seccomp,
            SECCOMP_SET_MODE_FILTER as libc::c_long,
            0 as libc::c_long,
            &fprog as *const libc::sock_fprog,
        )
    };
    if rc != 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::{
        baseline_seccomp_program, load_seccomp_program_from_env, seccomp_requested_from_env,
        validate_seccomp_bpf_bytes, MAX_FILTER_LEN, SOCK_FILTER_SIZE,
    };
    use std::sync::{Mutex, MutexGuard};

    /// Serialize tests that mutate `CELLOS_SUBPROCESS_SECCOMP` / `CELLOS_SECCOMP_BPF_PATH`.
    /// `std::env::set_var` is not thread-safe; without this lock, parallel tests race on the
    /// same env keys and produce flaky results.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Take `ENV_LOCK`, recovering the guard if an earlier test panicked while holding it.
    ///
    /// The mutex protects no data of its own, so a poisoned lock carries no broken state;
    /// recovering keeps one failing test from turning every other env test into a
    /// `PoisonError` failure.
    fn lock_env() -> MutexGuard<'static, ()> {
        ENV_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Restores one environment variable to its previous state when dropped.
    struct EnvGuard {
        key: &'static str,
        /// Value the variable had before the guard touched it (`None` = it was unset).
        value: Option<std::ffi::OsString>,
    }

    impl EnvGuard {
        /// Set `key` to `value`, remembering what was there before.
        fn set(key: &'static str, value: &str) -> Self {
            let original = std::env::var_os(key);
            std::env::set_var(key, value);
            Self {
                key,
                value: original,
            }
        }

        /// Unset `key`, remembering what was there before.
        fn remove(key: &'static str) -> Self {
            let original = std::env::var_os(key);
            std::env::remove_var(key);
            Self {
                key,
                value: original,
            }
        }
    }

    impl Drop for EnvGuard {
        fn drop(&mut self) {
            if let Some(value) = self.value.take() {
                std::env::set_var(self.key, value);
            } else {
                std::env::remove_var(self.key);
            }
        }
    }

    #[test]
    fn validate_rejects_empty() {
        assert!(validate_seccomp_bpf_bytes(&[]).is_err());
    }

    #[test]
    fn validate_rejects_bad_length() {
        assert!(validate_seccomp_bpf_bytes(&[0u8; 7]).is_err());
    }

    #[test]
    fn validate_accepts_minimal_non_empty_multiple_of_8() {
        assert!(validate_seccomp_bpf_bytes(&[0u8; 8]).is_ok());
    }

    #[test]
    fn validate_rejects_oversized() {
        // One whole instruction past the limit, so only the size check can reject it.
        let too_big = vec![0u8; MAX_FILTER_LEN + SOCK_FILTER_SIZE];
        assert!(validate_seccomp_bpf_bytes(&too_big).is_err());
    }

    #[test]
    fn baseline_program_is_valid_bpf() {
        let bytes = baseline_seccomp_program().expect("baseline program");
        validate_seccomp_bpf_bytes(&bytes).expect("valid baseline bytes");
    }

    #[test]
    fn requested_detects_baseline_profile() {
        let _lock = lock_env();
        let _profile = EnvGuard::set("CELLOS_SUBPROCESS_SECCOMP", "baseline");
        let _path = EnvGuard::remove("CELLOS_SECCOMP_BPF_PATH");
        assert!(seccomp_requested_from_env());
    }

    #[test]
    fn load_prefers_baseline_profile() {
        let _lock = lock_env();
        let _profile = EnvGuard::set("CELLOS_SUBPROCESS_SECCOMP", "baseline");
        let _path = EnvGuard::remove("CELLOS_SECCOMP_BPF_PATH");
        assert!(load_seccomp_program_from_env().unwrap().is_some());
    }

    #[test]
    fn load_rejects_unknown_profile() {
        let _lock = lock_env();
        let _profile = EnvGuard::set("CELLOS_SUBPROCESS_SECCOMP", "nope");
        let _path = EnvGuard::remove("CELLOS_SECCOMP_BPF_PATH");
        assert!(load_seccomp_program_from_env().is_err());
    }

    #[test]
    fn load_returns_none_when_disabled() {
        let _lock = lock_env();
        let _profile = EnvGuard::set("CELLOS_SUBPROCESS_SECCOMP", "off");
        let _path = EnvGuard::remove("CELLOS_SECCOMP_BPF_PATH");
        assert!(load_seccomp_program_from_env().unwrap().is_none());
    }

    #[test]
    fn load_rejects_bpf_without_path() {
        let _lock = lock_env();
        let _profile = EnvGuard::set("CELLOS_SUBPROCESS_SECCOMP", "bpf");
        let _path = EnvGuard::remove("CELLOS_SECCOMP_BPF_PATH");
        assert!(load_seccomp_program_from_env().is_err());
    }

    #[test]
    fn load_rejects_baseline_with_path() {
        let _lock = lock_env();
        let _profile = EnvGuard::set("CELLOS_SUBPROCESS_SECCOMP", "baseline");
        // The conflict is reported before the file is opened, so the path need not exist.
        let _path = EnvGuard::set("CELLOS_SECCOMP_BPF_PATH", "/nonexistent/cellos-filter.bpf");
        assert!(load_seccomp_program_from_env().is_err());
    }
}