envseal 0.3.9 - Docs.rs

//! Post-clone sandbox helper — finishes Lockdown setup and execs the target.
//!
//! `apply_sandbox(SandboxTier::Lockdown)` runs in the child's `pre_exec`
//! closure, which is restricted to async-signal-safe syscalls. It can call
//! [`unshare(2)`] to create the new namespaces but cannot call [`mount(2)`]
//! (not async-signal-safe). To finish the job we re-exec into envseal itself
//! in `__sandbox_helper` mode: by that point we are out of `pre_exec` and
//! free to use any syscall, but still inside the new namespaces.
//!
//! # Helper protocol
//!
//! ```text
//! envseal __sandbox_helper --target-fd <N> --arg0 <PATH> -- <argv1> [argv...]
//! ```
//!
//! - `--target-fd <N>` — file-descriptor (inherited from the parent without
//!   `FD_CLOEXEC`) pinning the target binary. The helper `execve`s
//!   `/proc/self/fd/N` so TOCTOU-binding is preserved across the helper
//!   indirection.
//! - `--arg0 <PATH>` — `argv[0]` to pass to the target so it sees its
//!   conventional path, not `/proc/self/fd/N`.
//! - `-- <argv1> ...` — the rest of the target's argv.
//!
//! # Setup performed
//!
//! - Recursive `MS_PRIVATE` propagation on `/` so subsequent mounts don't
//!   leak back to the host.
//! - Read-only bind-remount of `/`.
//! - Fresh `tmpfs` mounted on `/tmp`, `/dev/shm` and `/var/tmp`. This is what
//!   gives Lockdown its "child cannot exfiltrate via the filesystem"
//!   property: anything the child writes to `/tmp` lives in tmpfs that the
//!   host parent has no handle to.
//!
//! # Process lifetime
//!
//! Returns `!` — on success it never returns (it `execve`s the target).
//! On failure it prints to stderr and exits non-zero so the parent
//! supervisor can detect it.

#![cfg(target_os = "linux")]

use std::ffi::CString;
use std::os::unix::io::RawFd;

/// Enter helper mode: parse helper args, finish Lockdown setup, exec target.
///
/// Called from the CLI dispatcher when `argv[1] == "__sandbox_helper"`.
/// Never returns — either `execve`s the target or `exit`s with an error.
///
/// # Errors / exits
///
/// Writes a one-line diagnostic to stderr and `exit(127)`s on argv parse
/// failure, mount failure, or `execve` failure.
pub fn run_helper(args: &[String]) -> ! {
    let parsed = match parse_args(args) {
        Ok(p) => p,
        Err(msg) => {
            eprintln!("envseal __sandbox_helper: {msg}");
            std::process::exit(127);
        }
    };

    if let Err(msg) = setup_private_mounts() {
        eprintln!("envseal __sandbox_helper: mount setup failed: {msg}");
        std::process::exit(127);
    }

    // `exec_target` returns `Result<Infallible, String>`. The `Ok` arm
    // cannot be constructed (`Infallible` is uninhabited), so this
    // match exhaustively handles the only reachable case — and we
    // never have to fabricate a panic for the impossible branch.
    match exec_target(&parsed) {
        Err(msg) => {
            eprintln!("envseal __sandbox_helper: exec failed: {msg}");
            std::process::exit(127);
        }
    }
}

/// Helper-mode arguments after parsing (see the module-level protocol).
struct ParsedArgs {
    /// Value of `--target-fd`: the inherited descriptor pinning the target
    /// binary. Guaranteed non-negative by `parse_args`.
    target_fd: RawFd,
    /// Value of `--arg0`: the `argv[0]` handed to the target.
    arg0: String,
    /// Everything after the first `--`: the remainder of the target's argv.
    rest_argv: Vec<String>,
}

/// Parse the helper's argument list.
///
/// Expected shape: `--target-fd <N> --arg0 <PATH> -- <argv1> [argv...]`.
/// The first `--` splits the list: flags before it are interpreted here,
/// everything after it is passed through untouched as the target's argv
/// (so a later `--` belongs to the target).
///
/// # Errors
///
/// Returns a human-readable message when:
/// - the `--` separator is missing,
/// - a flag is missing its value,
/// - `--target-fd` is not an integer, or is negative (a negative number can
///   never name an open descriptor, so it is rejected here instead of
///   surfacing later as an opaque `execv` failure),
/// - an unknown flag appears before `--`,
/// - `--target-fd` or `--arg0` is absent.
fn parse_args(args: &[String]) -> Result<ParsedArgs, String> {
    let sep = args
        .iter()
        .position(|a| a == "--")
        .ok_or_else(|| "missing `--` separator before target argv".to_string())?;
    let (head, tail) = (&args[..sep], &args[sep + 1..]);

    let mut found_target_fd: Option<RawFd> = None;
    let mut found_arg0: Option<String> = None;

    // `while let` rather than `for`: each flag pulls its value from the same
    // iterator inside the loop body.
    let mut iter = head.iter();
    while let Some(arg) = iter.next() {
        match arg.as_str() {
            "--target-fd" => {
                let v = iter
                    .next()
                    .ok_or_else(|| "--target-fd requires a value".to_string())?;
                let fd = v
                    .parse::<RawFd>()
                    .map_err(|e| format!("invalid --target-fd {v:?}: {e}"))?;
                if fd < 0 {
                    return Err(format!(
                        "invalid --target-fd {v:?}: file descriptor must be non-negative"
                    ));
                }
                found_target_fd = Some(fd);
            }
            "--arg0" => {
                found_arg0 = Some(
                    iter.next()
                        .ok_or_else(|| "--arg0 requires a value".to_string())?
                        .clone(),
                );
            }
            other => return Err(format!("unrecognized helper flag: {other}")),
        }
    }

    Ok(ParsedArgs {
        target_fd: found_target_fd.ok_or_else(|| "--target-fd is required".to_string())?,
        arg0: found_arg0.ok_or_else(|| "--arg0 is required".to_string())?,
        rest_argv: tail.to_vec(),
    })
}

/// Finish the mount-namespace setup for Lockdown.
///
/// Steps, in order:
///
/// 1. Refuse to proceed unless `in_private_mount_ns()` reports a mount
///    namespace distinct from PID 1's.
/// 2. Apply `MS_REC | MS_PRIVATE` to `/` so later mounts made here do not
///    propagate back to the host.
/// 3. Bind-remount `/` read-only (`MS_REMOUNT | MS_RDONLY | MS_BIND`).
/// 4. Mount a fresh `tmpfs` (`size=256m,mode=1777`) on each of `/tmp`,
///    `/dev/shm` and `/var/tmp` as writable scratch space.
///
/// NOTE(review): step 3 is issued against `/` only; whether other mounts
/// beneath `/` (e.g. a separate `/home`) also become read-only is not
/// established by this code — confirm against `mount(2)` before relying on
/// "the child cannot write to host paths".
///
/// # Errors
///
/// Returns a message naming the failing step together with the OS error.
/// The first failure aborts the sequence; earlier steps are not undone.
fn setup_private_mounts() -> Result<(), String> {
    // Thin wrapper around a single mount(2) call. `None` for `fstype` / `data`
    // is passed to the kernel as a NULL pointer. errno is captured right
    // after the call, before anything else can overwrite it.
    fn mount_once(
        source: &std::ffi::CStr,
        target: &std::ffi::CStr,
        fstype: Option<&std::ffi::CStr>,
        flags: libc::c_ulong,
        data: Option<&std::ffi::CStr>,
    ) -> std::io::Result<()> {
        let fstype_ptr: *const libc::c_char = fstype.map_or(std::ptr::null(), |s| s.as_ptr());
        let data_ptr: *const libc::c_void =
            data.map_or(std::ptr::null(), |s| s.as_ptr().cast::<libc::c_void>());
        // SAFETY: every non-null pointer is derived from a NUL-terminated
        // `CStr` that is borrowed for the whole duration of the call.
        let rc = unsafe {
            libc::mount(
                source.as_ptr(),
                target.as_ptr(),
                fstype_ptr,
                flags,
                data_ptr,
            )
        };
        if rc == 0 {
            Ok(())
        } else {
            Err(std::io::Error::last_os_error())
        }
    }

    // Guard: if we share PID 1's mount namespace, a read-only remount of `/`
    // would hit the host itself, so bail out before touching anything.
    if !in_private_mount_ns() {
        return Err("refusing to modify mounts: not in a private mount namespace".to_string());
    }

    let none = CString::new("none").map_err(|e| e.to_string())?;
    let root = CString::new("/").map_err(|e| e.to_string())?;
    let tmpfs = CString::new("tmpfs").map_err(|e| e.to_string())?;
    let opts = CString::new("size=256m,mode=1777").map_err(|e| e.to_string())?;
    let tmp = CString::new("/tmp").map_err(|e| e.to_string())?;
    let dev_shm = CString::new("/dev/shm").map_err(|e| e.to_string())?;
    let var_tmp = CString::new("/var/tmp").map_err(|e| e.to_string())?;

    // Stop mount events in this namespace from propagating outwards.
    mount_once(&none, &root, None, libc::MS_REC | libc::MS_PRIVATE, None)
        .map_err(|err| format!("MS_PRIVATE on /: {err}"))?;

    // Flip the root mount to read-only.
    mount_once(
        &none,
        &root,
        None,
        libc::MS_REMOUNT | libc::MS_RDONLY | libc::MS_BIND,
        None,
    )
    .map_err(|err| format!("MS_REMOUNT | MS_RDONLY on /: {err}"))?;

    // Fresh tmpfs over the conventional writable scratch locations.
    for mount_point in [&tmp, &dev_shm, &var_tmp] {
        mount_once(
            &tmpfs,
            mount_point,
            Some(tmpfs.as_c_str()),
            0,
            Some(opts.as_c_str()),
        )
        .map_err(|err| {
            format!(
                "mount tmpfs on {}: {err}",
                mount_point.to_string_lossy()
            )
        })?;
    }

    Ok(())
}

/// Report whether this process's mount namespace differs from PID 1's.
///
/// Compares the link targets of `/proc/self/ns/mnt` and `/proc/1/ns/mnt`.
/// If either link cannot be read the answer is `false`, so a caller that
/// gates mount changes on this check aborts instead of risking the host's
/// mounts.
fn in_private_mount_ns() -> bool {
    // Both links are always read; a failure on either side collapses to `None`.
    let own = std::fs::read_link("/proc/self/ns/mnt").ok();
    let init = std::fs::read_link("/proc/1/ns/mnt").ok();

    own.zip(init)
        .map_or(false, |(mine, theirs)| mine != theirs)
}

/// Replace the current process image with the target binary.
///
/// The binary is executed through `/proc/self/fd/<target_fd>` rather than by
/// path, so the file that runs is the one pinned by the inherited
/// descriptor. The target's argv is `parsed.arg0` followed by
/// `parsed.rest_argv`.
///
/// # Errors
///
/// A successful `execv` never returns, which is why the `Ok` type is
/// [`std::convert::Infallible`]: the success value cannot be constructed.
/// An `Err` carries either the message for an argument containing an
/// interior NUL byte, or `execv(<path>): <os error>` when the call itself
/// fails.
fn exec_target(parsed: &ParsedArgs) -> Result<std::convert::Infallible, String> {
    // Convert a Rust string into an owned C string, stringifying the NUL error.
    fn to_cstring(s: &str) -> Result<CString, String> {
        CString::new(s.as_bytes()).map_err(|e| e.to_string())
    }

    let exec_path = format!("/proc/self/fd/{}", parsed.target_fd);
    let exec_path_c = to_cstring(exec_path.as_str())?;

    // Owned argv storage: arg0 first, then the pass-through arguments. This
    // vector must outlive the `execv` call because `argv_ptrs` borrows from it.
    let argv_owned = std::iter::once(&parsed.arg0)
        .chain(parsed.rest_argv.iter())
        .map(|s| to_cstring(s.as_str()))
        .collect::<Result<Vec<CString>, String>>()?;

    // NULL-terminated pointer array in the layout `execv` expects.
    let argv_ptrs: Vec<*const libc::c_char> = argv_owned
        .iter()
        .map(|arg| arg.as_ptr())
        .chain(std::iter::once(std::ptr::null()))
        .collect();

    // SAFETY: `exec_path_c` and every element of `argv_owned` are live,
    // NUL-terminated C strings, and `argv_ptrs` ends with a NULL entry.
    unsafe {
        libc::execv(exec_path_c.as_ptr(), argv_ptrs.as_ptr());
    }

    // Reaching this point means `execv` failed; read errno before anything
    // else can clobber it.
    let err = std::io::Error::last_os_error();
    Err(format!("execv({exec_path}): {err}"))
}