// zeph-tools 0.19.2

// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0

//! Linux sandbox backend using `bwrap` (bubblewrap) + Landlock + seccomp BPF.
//!
//! # Architecture (zero `unsafe_code`)
//!
//! 1. **bwrap** — the command is rewritten as `bwrap <namespace flags> <bind mounts> -- bash -c
//!    <code>`. `bwrap` creates a new user namespace, PID namespace, and network namespace (when
//!    network is blocked), giving strong isolation at the OS level.
//!
//! 2. **Landlock** — [`apply_landlock`] builds a ruleset and calls `restrict_self()` on it from
//!    a throw-away `std::thread::spawn` thread *before* `Command::spawn()`. The restriction
//!    tightens only that thread's filesystem access, which is inherited across `fork()+exec()`
//!    into the `bwrap` child. The main Tokio runtime threads are never restricted.
//!
//!    Why a throw-away thread: `restrict_self` restricts the *calling thread* (and whatever it
//!    spawns afterwards), not the whole process. Calling it on a Tokio worker thread would
//!    permanently restrict that thread's filesystem access for all subsequent tasks. Isolating
//!    the restriction to a dedicated thread that only calls `spawn()` and exits preserves the
//!    runtime's health.
//!
//! 3. **seccomp BPF** — a compiled BPF program is passed to `bwrap --seccomp <fd>`. The BPF
//!    filter is generated by `seccompiler` and applied inside the bwrap child process. The
//!    parent Zeph process is never restricted.
//!
//! # NFR-SB-3
//!
//! If `bwrap` is not found on `PATH`:
//! - `strict = true` (default): returns `SandboxError::Unavailable` → startup aborts.
//! - `strict = false`: falls back to `NoopSandbox` and logs `WARN`.

use std::os::fd::OwnedFd;
use std::path::{Path, PathBuf};
use std::sync::Mutex;

use landlock::{
    Access, AccessFs, BitFlags, PathBeneath, PathFd, RestrictionStatus, Ruleset, RulesetAttr,
    RulesetCreatedAttr,
};
use seccompiler::{BpfProgram, SeccompAction, SeccompFilter, TargetArch};
use tokio::process::Command;

use super::{Sandbox, SandboxError, SandboxPolicy, SandboxProfile};

/// Linux sandbox backend combining `bwrap` namespaces, Landlock FS rules, and seccomp BPF.
///
/// Constructed via [`LinuxSandbox::new`], which resolves the `bwrap` binary and serializes
/// the seccomp BPF program once. Every non-`Off` `wrap()` call then writes those bytes to a
/// fresh temporary file and rewrites the command to run under `bwrap`.
#[derive(Debug)]
pub struct LinuxSandbox {
    /// Path of the `bwrap` executable found by `locate_bwrap()` at construction time.
    bwrap_path: PathBuf,
    /// BPF program bytes serialized once at construction.
    bpf_bytes: Vec<u8>,
    /// Keeps the BPF tempfile fd alive between `wrap()` and the subsequent `spawn()`.
    ///
    /// `bwrap --seccomp <fd>` requires the fd to remain open until after fork. Without this
    /// slot, `into_raw_fd()` would leave an unowned fd that leaks if spawn fails or panics.
    /// Each `wrap()` call replaces the slot (dropping and closing the previous fd).
    ///
    /// NOTE(review): there is a single slot, so a second `wrap()` issued before the first
    /// command has been spawned closes the first command's descriptor. Nothing in this file
    /// clears the slot after spawn either — confirm callers serialize `wrap()` + `spawn()`.
    pending_fd: Mutex<Option<OwnedFd>>,
}

impl LinuxSandbox {
    /// Resolve `bwrap`, pre-compile the seccomp filter, and build the backend.
    ///
    /// `strict` only changes how a missing `bwrap` is reported: in strict mode the error
    /// carries an installation hint, otherwise a warning is logged first. Both modes return
    /// an error, so the actual fallback to [`super::NoopSandbox`] is the caller's job.
    ///
    /// # Errors
    ///
    /// Returns [`SandboxError::Unavailable`] when `bwrap` cannot be located, and propagates
    /// any error from compiling the seccomp BPF program.
    pub fn new(strict: bool) -> Result<Self, SandboxError> {
        let Some(bwrap_path) = locate_bwrap() else {
            if strict {
                return Err(SandboxError::Unavailable {
                    reason: "bwrap not found on PATH; install bubblewrap or set strict=false to fall back to noop".into(),
                });
            }

            tracing::warn!(
                "bwrap not found — Linux sandbox falling back to noop (strict=false)"
            );
            return Err(SandboxError::Unavailable {
                reason: "bwrap not found".into(),
            });
        };

        // The filter is only compiled once a usable bwrap is known to exist.
        let bpf_bytes = compile_bpf_bytes()?;

        Ok(Self {
            bwrap_path,
            bpf_bytes,
            pending_fd: Mutex::new(None),
        })
    }
}

impl Sandbox for LinuxSandbox {
    /// Stable identifier of this backend.
    fn name(&self) -> &'static str {
        "linux-bwrap-landlock"
    }

    /// Accepts every policy; this backend performs no capability check up front.
    fn supports(&self, _policy: &SandboxPolicy) -> Result<(), SandboxError> {
        Ok(())
    }

    /// Rewrites `cmd` to execute inside `bwrap` with the pre-compiled seccomp filter.
    ///
    /// The rewrite modifies `cmd` in place so that the next `.spawn()` call launches the
    /// sandboxed subprocess. A policy with profile [`SandboxProfile::Off`] leaves `cmd`
    /// untouched. Landlock is *not* applied here; [`apply_landlock`] is a separate step
    /// that the caller runs on the spawning thread.
    ///
    /// The descriptor holding the BPF program is parked in `pending_fd` so it stays open
    /// until the caller has spawned the command. The slot holds one descriptor only, so a
    /// later `wrap()` closes the descriptor handed out by the previous one.
    ///
    /// # Errors
    ///
    /// Returns [`SandboxError`] if the BPF bytes cannot be written to a temporary file.
    fn wrap(&self, cmd: &mut Command, policy: &SandboxPolicy) -> Result<(), SandboxError> {
        if policy.profile == SandboxProfile::Off {
            return Ok(());
        }

        // Write pre-compiled BPF bytes to a fresh anonymous file for bwrap.
        let (owned_fd, fd_num) = write_bytes_to_tmpfd(&self.bpf_bytes)?;

        // The slot is a plain `Option<OwnedFd>` with no invariant a panicking thread could
        // have broken, so a poisoned lock is recovered instead of turned into a panic.
        *self
            .pending_fd
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner) = Some(owned_fd);

        rewrite_with_bwrap(cmd, &self.bwrap_path, policy, fd_num);

        Ok(())
    }
}

/// Find a usable `bwrap` binary.
///
/// The well-known install locations are probed first, then every directory listed in
/// `PATH`, in order. A candidate only counts when it resolves (following symlinks) to a
/// regular file with at least one execute bit set, so a directory or a non-executable file
/// that happens to be named `bwrap` is skipped here instead of failing later at spawn time.
///
/// Returns `None` when no candidate qualifies or `PATH` is unset.
fn locate_bwrap() -> Option<PathBuf> {
    use std::os::unix::fs::PermissionsExt as _;

    let is_executable_file = |candidate: &Path| -> bool {
        std::fs::metadata(candidate)
            .map(|meta| meta.is_file() && (meta.permissions().mode() & 0o111) != 0)
            .unwrap_or(false)
    };

    // Check common locations first.
    ["/usr/bin/bwrap", "/usr/local/bin/bwrap"]
        .iter()
        .map(|candidate| PathBuf::from(candidate))
        .find(|candidate| is_executable_file(candidate))
        .or_else(|| {
            // Fall back to PATH search.
            let path_var = std::env::var_os("PATH")?;
            std::env::split_paths(&path_var)
                .map(|dir| dir.join("bwrap"))
                .find(|candidate| is_executable_file(candidate))
        })
}

/// Build the seccomp filter for the current architecture and serialize it to raw bytes.
///
/// The filter is a denylist: syscalls present in the rule map returned by
/// `escalation_deny_rules()` are answered with `EPERM`, everything else is allowed. A strict
/// allowlist would break arbitrary bash scripts; bwrap namespace isolation (user-ns, pid-ns,
/// net-ns) is the primary containment layer.
///
/// Syscalls intended to be blocked: `ptrace`, `kexec_load`, `init_module`, `finit_module`,
/// `bpf`, `perf_event_open`, `mount`, `umount2`, `pivot_root`, `reboot`, `userfaultfd`,
/// `keyctl`, `add_key`, `request_key`, `swapon`, `swapoff`.
///
/// # Errors
///
/// Returns [`SandboxError::Policy`] when the filter cannot be built or compiled to BPF.
fn compile_bpf_bytes() -> Result<Vec<u8>, SandboxError> {
    let filter = SeccompFilter::new(
        escalation_deny_rules(),
        // Action for syscalls that are NOT in the denylist.
        SeccompAction::Allow,
        // Action for syscalls that ARE in the denylist.
        SeccompAction::Errno(libc_eperm()),
        target_arch(),
    )
    .map_err(|e| SandboxError::Policy(format!("seccomp filter build failed: {e}")))?;

    let program: BpfProgram = filter
        .try_into()
        .map_err(|e| SandboxError::Policy(format!("seccomp BPF compilation failed: {e}")))?;

    // Each instruction is emitted as 8 bytes in native endianness:
    // code (u16), jt (u8), jf (u8), k (u32).
    let mut raw = Vec::with_capacity(program.len() * 8);
    for insn in program.iter() {
        let [c0, c1] = insn.code.to_ne_bytes();
        let [k0, k1, k2, k3] = insn.k.to_ne_bytes();
        raw.extend_from_slice(&[c0, c1, insn.jt, insn.jf, k0, k1, k2, k3]);
    }
    Ok(raw)
}

/// Select the seccompiler target matching the architecture this crate was compiled for.
///
/// Only `aarch64` is detected explicitly; every other target is treated as `x86_64`.
fn target_arch() -> TargetArch {
    if cfg!(target_arch = "aarch64") {
        TargetArch::aarch64
    } else {
        TargetArch::x86_64
    }
}

/// Linux syscall numbers that make up the privilege-escalation denylist.
///
/// The table is chosen at compile time: the aarch64 numbering when building for aarch64,
/// the x86_64 numbering for every other target. Both tables list the same sixteen syscalls
/// in the same (alphabetical) order: add_key, bpf, finit_module, init_module, kexec_load,
/// keyctl, mount, perf_event_open, pivot_root, ptrace, reboot, request_key, swapoff,
/// swapon, umount2, userfaultfd.
fn escalation_deny_syscalls() -> &'static [i64] {
    #[cfg(target_arch = "aarch64")]
    static DENYLIST: [i64; 16] = [
        217, // add_key
        280, // bpf
        273, // finit_module
        105, // init_module
        104, // kexec_load
        219, // keyctl
        40,  // mount
        241, // perf_event_open
        41,  // pivot_root
        117, // ptrace
        142, // reboot
        218, // request_key
        225, // swapoff
        224, // swapon
        39,  // umount2
        282, // userfaultfd
    ];

    #[cfg(not(target_arch = "aarch64"))]
    static DENYLIST: [i64; 16] = [
        248, // add_key
        321, // bpf
        313, // finit_module
        175, // init_module
        246, // kexec_load
        250, // keyctl
        165, // mount
        298, // perf_event_open
        155, // pivot_root
        101, // ptrace
        169, // reboot
        249, // request_key
        168, // swapoff
        167, // swapon
        166, // umount2
        323, // userfaultfd
    ];

    &DENYLIST
}

/// Build the seccomp rule map covering every syscall in `escalation_deny_syscalls()`.
///
/// Each syscall number is mapped to an **empty rule vector**, which is how seccompiler
/// expresses "match on the syscall number alone, regardless of arguments".
///
/// Do not build the entries with `SeccompRule::new(vec![])`: seccompiler rejects a rule
/// that has no conditions, so that call always fails. Discarding the error would silently
/// drop the syscall from the map and leave the filter denying nothing.
fn escalation_deny_rules() -> std::collections::BTreeMap<i64, Vec<seccompiler::SeccompRule>> {
    escalation_deny_syscalls()
        .iter()
        .map(|&nr| (nr, Vec::new()))
        .collect()
}

/// Errno returned to the sandboxed process for denied syscalls: `EPERM`, numeric value 1.
const fn libc_eperm() -> u32 {
    const EPERM: u32 = 1;
    EPERM
}

/// Copy the serialized BPF program into a fresh temporary file and hand back its descriptor.
///
/// Returns the owning [`OwnedFd`] together with its raw number; the number is what gets
/// passed to `bwrap --seccomp <fd>`. The file offset is rewound to the start so the reader
/// sees the whole program. The caller must keep the returned [`OwnedFd`] alive until after
/// `Command::spawn()`, because dropping it closes the descriptor.
///
/// NOTE(review): bwrap can only read the program if the descriptor survives `exec`. Nothing
/// in this function clears the close-on-exec flag, and Rust's standard library opens files
/// close-on-exec on Unix — confirm that `tempfile::tempfile()` yields an inheritable fd or
/// that the flag is cleared elsewhere before spawn.
///
/// # Errors
///
/// Returns [`SandboxError::Setup`] when creating, writing, flushing, or rewinding the
/// temporary file fails.
fn write_bytes_to_tmpfd(bpf_bytes: &[u8]) -> Result<(OwnedFd, i32), SandboxError> {
    use std::io::{Seek as _, Write as _};
    use std::os::unix::io::AsRawFd as _;

    let mut file = tempfile::tempfile().map_err(SandboxError::Setup)?;

    file.write_all(bpf_bytes).map_err(SandboxError::Setup)?;
    file.flush().map_err(SandboxError::Setup)?;
    // Reposition at offset 0 so the consumer reads from the beginning.
    file.rewind().map_err(SandboxError::Setup)?;

    let fd = OwnedFd::from(file);
    let raw = fd.as_raw_fd();
    Ok((fd, raw))
}

/// Rewrite `cmd` to execute via `bwrap <flags> -- <original>`.
fn rewrite_with_bwrap(cmd: &mut Command, bwrap: &Path, policy: &SandboxPolicy, seccomp_fd: i32) {
    let std_cmd = cmd.as_std_mut();

    let original_program = std_cmd.get_program().to_os_string();
    let original_args: Vec<std::ffi::OsString> =
        std_cmd.get_args().map(|a| a.to_os_string()).collect();

    let mut bwrap_args: Vec<std::ffi::OsString> = Vec::new();

    // Namespace flags.
    bwrap_args.extend(
        [
            "--unshare-user",
            "--unshare-pid",
            "--unshare-ipc",
            "--unshare-uts",
        ]
        .map(Into::into),
    );

    if !policy.allow_network && policy.profile != SandboxProfile::NetworkAllowAll {
        bwrap_args.push("--unshare-net".into());
    }

    // Essential read-only bind mounts.
    for ro in &["/usr", "/bin", "/sbin", "/lib", "/lib64", "/etc"] {
        let p = Path::new(ro);
        if p.exists() {
            bwrap_args.extend(["--ro-bind".into(), ro.into(), ro.into()]);
        }
    }

    // /proc and /dev.
    bwrap_args.extend(["--proc".into(), "/proc".into()]);
    bwrap_args.extend(["--dev".into(), "/dev".into()]);

    // Tmp.
    bwrap_args.extend(["--tmpfs".into(), "/tmp".into()]);

    // allow_read paths.
    for path in &policy.allow_read {
        let p = path.display().to_string();
        bwrap_args.extend(["--ro-bind".into(), p.clone().into(), p.into()]);
    }

    // allow_write paths.
    for path in &policy.allow_write {
        let p = path.display().to_string();
        bwrap_args.extend(["--bind".into(), p.clone().into(), p.into()]);
    }

    // Seccomp fd.
    bwrap_args.push("--seccomp".into());
    bwrap_args.push(seccomp_fd.to_string().into());

    bwrap_args.push("--".into());
    bwrap_args.push(original_program);
    for arg in original_args {
        bwrap_args.push(arg);
    }

    *std_cmd = std::process::Command::new(bwrap);
    for arg in bwrap_args {
        std_cmd.arg(arg);
    }
}

/// Apply a Landlock FS ruleset on the *calling* thread and restrict it to `policy`.
///
/// # Thread-exit and child-inheritance chain
///
/// This function is designed to be called from a **throw-away `std::thread::spawn` thread**
/// that also calls `cmd.spawn()` immediately afterwards. The sequence is:
///
/// 1. `apply_landlock(policy)` — calls `landlock_restrict_self(2)` (via the safe
///    [`landlock`] crate API) on the **current Linux task** (thread). The kernel installs
///    a new LSM ruleset that tightens filesystem access to `allow_read` and `allow_write`
///    paths only.
///
/// 2. `cmd.spawn()` — the kernel `clone3`/`fork+exec` call inherits the per-thread Landlock
///    ruleset into the child process. Per Linux `Documentation/userspace-api/landlock.rst`,
///    Landlock rulesets are inherited across `execve` via the thread's LSM credential domain.
///    The `bwrap` child process (and transitively the `bash` sub-process inside it) therefore
///    runs under the same filesystem restrictions.
///
/// 3. The throw-away thread exits after `cmd.spawn()` returns (not after `.wait()`).
///    Thread exit releases the thread-local LSM state. **Tokio worker threads (and the main
///    thread) are never restricted** — they were not the ones that called `restrict_self`.
///
/// 4. The caller awaits `child.wait()` on the Tokio runtime. At this point only the child
///    process carries the Landlock restriction; the parent Zeph process is unaffected.
///
/// ## Why a throw-away thread (not `pre_exec`)
///
/// `std::os::unix::process::CommandExt::pre_exec` would require `unsafe` code and is
/// prohibited by `specs/constitution.md §52,82`. Calling `restrict_self` on a Tokio worker
/// thread would permanently degrade that worker's filesystem access for all subsequent
/// tasks. A throw-away `std::thread::spawn` is the only safe, `unsafe`-free approach.
///
/// ## Kernel documentation reference
///
/// <https://www.kernel.org/doc/html/latest/userspace-api/landlock.html>
///
/// # Errors
///
/// Returns [`SandboxError::Policy`] when ruleset creation or rule addition fails.
/// Returns [`SandboxError::Setup`] on I/O errors opening path file descriptors.
pub fn apply_landlock(policy: &SandboxPolicy) -> Result<RestrictionStatus, SandboxError> {
    let abi = landlock::ABI::V4;

    let ruleset = Ruleset::default()
        .handle_access(AccessFs::from_all(abi))
        .map_err(|e| SandboxError::Policy(format!("landlock handle_access: {e}")))?
        .create()
        .map_err(|e| SandboxError::Policy(format!("landlock create: {e}")))?;

    let mut ruleset = ruleset;

    // Grant read+execute to allow_read paths.
    let read_access = AccessFs::ReadFile | AccessFs::ReadDir | AccessFs::Execute;
    for path in &policy.allow_read {
        if path.exists() {
            let fd =
                PathFd::new(path).map_err(|e| SandboxError::Setup(std::io::Error::other(e)))?;
            ruleset = ruleset
                .add_rule(PathBeneath::new(fd, read_access))
                .map_err(|e| SandboxError::Policy(format!("landlock add_rule read: {e}")))?;
        }
    }

    // Grant read+write+execute to allow_write paths.
    let write_access = read_access | AccessFs::WriteFile | AccessFs::MakeDir | AccessFs::MakeReg;
    for path in &policy.allow_write {
        if path.exists() {
            let fd =
                PathFd::new(path).map_err(|e| SandboxError::Setup(std::io::Error::other(e)))?;
            ruleset = ruleset
                .add_rule(PathBeneath::new(fd, write_access))
                .map_err(|e| SandboxError::Policy(format!("landlock add_rule write: {e}")))?;
        }
    }

    // Essential system paths always need read access.
    let sys_read = AccessFs::ReadFile | AccessFs::ReadDir | AccessFs::Execute;
    for sys_path in &["/usr", "/bin", "/sbin", "/lib", "/lib64", "/etc"] {
        let p = Path::new(sys_path);
        if p.exists() {
            let fd = PathFd::new(p).map_err(|e| SandboxError::Setup(std::io::Error::other(e)))?;
            ruleset = ruleset
                .add_rule(PathBeneath::new(fd, sys_read))
                .map_err(|e| {
                    SandboxError::Policy(format!("landlock add_rule sys {sys_path}: {e}"))
                })?;
        }
    }

    let status = ruleset
        .restrict_self()
        .map_err(|e| SandboxError::Policy(format!("landlock restrict_self: {e}")))?;

    Ok(status)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: locating bwrap must never panic, whether or not it is installed.
    ///
    /// Presence cannot be asserted because CI may or may not have bwrap.
    #[test]
    fn locate_bwrap_returns_none_when_absent() {
        let _ = locate_bwrap();
    }

    /// Verify that Landlock restriction on a throw-away thread does not affect the
    /// current thread's filesystem access.
    ///
    /// Only compiled on Linux with the `sandbox` feature enabled.
    #[cfg(all(target_os = "linux", feature = "sandbox"))]
    #[test]
    fn landlock_restriction_isolated_to_spawned_thread() {
        // The current thread can see /etc/hostname before any restriction is applied.
        assert!(
            std::path::Path::new("/etc/hostname").exists(),
            "/etc/hostname must exist for this test"
        );

        let policy = SandboxPolicy {
            profile: SandboxProfile::Workspace,
            allow_read: vec![std::path::PathBuf::from("/tmp")],
            allow_write: vec![std::path::PathBuf::from("/tmp")],
            allow_network: false,
            ..Default::default()
        };

        // Apply landlock on a dedicated thread, simulating the spawn path.
        let handle = std::thread::spawn(move || apply_landlock(&policy));

        // The outcome is deliberately not asserted: kernels without (full) Landlock support
        // may report a partial status or an error. Only a panic is a failure here.
        let result = handle.join().expect("landlock thread should not panic");
        drop(result);

        // The restriction was confined to the helper thread, so this thread still sees
        // a path outside the policy.
        assert!(
            std::path::Path::new("/etc/hostname").exists(),
            "main thread must retain filesystem access after spawned thread applies landlock"
        );
    }

    /// Run `cat <target>` inside a minimal bwrap sandbox and capture its output.
    ///
    /// The sandbox read-only binds whichever of `/usr`, `/bin`, `/lib`, `/lib64` exist on
    /// the host. `/lib64` matters: on x86_64 glibc systems the ELF interpreter lives there,
    /// and without it `cat` cannot be executed at all. `ro_bind`, when given, is bound
    /// read-only at its own path after the `/tmp` tmpfs so it is visible even below `/tmp`.
    #[cfg(all(target_os = "linux", feature = "sandbox"))]
    fn run_cat_in_bwrap(
        bwrap: &std::path::Path,
        ro_bind: Option<&std::path::Path>,
        target: &std::path::Path,
    ) -> std::process::Output {
        use std::process::Stdio;

        let mut cmd = std::process::Command::new(bwrap);
        cmd.args(["--unshare-user", "--unshare-pid"]);
        for dir in ["/usr", "/bin", "/lib", "/lib64"] {
            if std::path::Path::new(dir).exists() {
                cmd.args(["--ro-bind", dir, dir]);
            }
        }
        cmd.args(["--proc", "/proc", "--dev", "/dev", "--tmpfs", "/tmp"]);
        if let Some(path) = ro_bind {
            cmd.arg("--ro-bind").arg(path).arg(path);
        }
        cmd.arg("--")
            .arg("cat")
            .arg(target)
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .output()
            .expect("spawn bwrap")
    }

    /// Integration test: bwrap bind mounts enforce path isolation for the child process
    /// while the parent test process retains full filesystem access after the child exits.
    ///
    /// Landlock is not involved; isolation comes purely from what is bound into the
    /// sandbox. Skipped when `bwrap` is not installed.
    ///
    /// Assertions:
    /// - (a) Child can read a file that is bound into the sandbox.
    /// - (b) Child cannot read a file that is not bound into the sandbox.
    /// - (c) Parent test thread can still see `/etc/hostname` after the child exits.
    #[cfg(all(target_os = "linux", feature = "sandbox"))]
    #[test]
    fn bwrap_landlock_path_isolation() {
        use std::fs;

        // Skip test if bwrap is not installed.
        let Some(bwrap) = locate_bwrap() else {
            eprintln!("bwrap not installed — skipping bwrap_landlock_path_isolation");
            return;
        };

        // Create two temp files: one allowed, one denied.
        let tmp = tempfile::TempDir::new().expect("TempDir");
        let allowed_path = tmp.path().join("sandbox-ro-allowed");
        let denied_path = tmp.path().join("sandbox-ro-denied");
        fs::write(&allowed_path, "allowed-content").expect("write allowed");
        fs::write(&denied_path, "denied-content").expect("write denied");

        // (a) Reading allowed path must succeed.
        let out_a = run_cat_in_bwrap(&bwrap, Some(&allowed_path), &allowed_path);
        assert!(
            out_a.status.success(),
            "child should read allowed path; stderr: {}",
            String::from_utf8_lossy(&out_a.stderr)
        );
        assert!(
            String::from_utf8_lossy(&out_a.stdout).contains("allowed-content"),
            "allowed path content mismatch"
        );

        // (b) Reading denied path must fail (not bound into bwrap namespace).
        let out_b = run_cat_in_bwrap(&bwrap, None, &denied_path);
        assert!(
            !out_b.status.success(),
            "child should fail to read denied path; stdout: {}",
            String::from_utf8_lossy(&out_b.stdout)
        );

        // (c) Parent process retains full filesystem access after child exits.
        assert!(
            std::path::Path::new("/etc/hostname").exists(),
            "parent test process must retain full filesystem access after child exits"
        );
    }
}