// envseal 0.3.6 - Docs.rs

//! Linux namespace isolation backend for [`super::SandboxTier`].
//!
//! Translates the abstract tier into
//! [`unshare(2)`](https://man7.org/linux/man-pages/man2/unshare.2.html) flags
//! for unprivileged user namespaces.
//!
//! # Tier mapping
//!
//! - [`SandboxTier::None`] — no `unshare`; the always-on parent hardening still applies.
//! - [`SandboxTier::Hardened`] — `CLONE_NEWPID | CLONE_NEWIPC` inside `CLONE_NEWUSER`.
//! - [`SandboxTier::Lockdown`] — Hardened plus `CLONE_NEWNS | CLONE_NEWNET`.
//!
//! # Mount namespace and the helper
//!
//! `unshare(CLONE_NEWNS)` creates an isolated mount namespace, but populating
//! it with private mounts (e.g. tmpfs over `/tmp`) requires `mount(2)`, which
//! is not async-signal-safe and therefore unsafe to call from the `pre_exec`
//! closure. So `apply_sandbox` only creates the namespace here; the
//! private-mount setup is performed by [`crate::execution::sandbox_helper`],
//! which the supervisor `execve`s into for [`SandboxTier::Lockdown`] runs.
//! The helper runs inside the new namespace (out of `pre_exec` context), does
//! the mount work, then `execve`s the target through its inherited fd so
//! TOCTOU pinning is preserved across the helper indirection.
//!
//! # Requirements
//!
//! - Linux 3.8+ for unprivileged user namespaces.
//! - `/proc/sys/kernel/unprivileged_userns_clone = 1` (default on most distros).
//! - For full network isolation: kernel 5.9+ recommended.

#![cfg(target_os = "linux")]

use super::SandboxTier;

/// Apply sandbox isolation matching the requested tier.
///
/// Called between `fork()` and `exec()` via `Command::pre_exec`. Only
/// async-signal-safe syscalls are used (just `unshare(2)`).
///
/// # Tier coverage
///
/// - [`SandboxTier::None`] — no-op.
/// - [`SandboxTier::Hardened`] — PID + IPC namespaces (and the wrapping user namespace).
/// - [`SandboxTier::Lockdown`] — PID + IPC + mount + network namespaces (plus user
///   namespace). The mount namespace is created here but populated with private
///   mounts by [`crate::execution::sandbox_helper`] post-`pre_exec`.
///
/// # Errors
///
/// [`std::io::ErrorKind::PermissionDenied`] if `unshare` fails (e.g.
/// unprivileged user namespaces disabled by sysctl).
///
/// # Safety
///
/// Must only be called from a `pre_exec` closure.
pub fn apply_sandbox(tier: SandboxTier) -> std::io::Result<()> {
    // The mount-namespace branch of Lockdown only creates the namespace here.
    // Populating it with private mounts (`mount tmpfs over /tmp`, `MS_PRIVATE`
    // propagation) uses `mount(2)`, which is **not** async-signal-safe and
    // therefore cannot run in `pre_exec`. The supervisor arranges for those
    // mounts to be performed inside the new namespace, before the target
    // binary is exec'd, by routing the spawn through
    // [`crate::execution::sandbox_helper::run_helper`]. See the helper for
    // the post-`unshare` setup.
    let flags = match tier {
        SandboxTier::None => return Ok(()),
        SandboxTier::Hardened => libc::CLONE_NEWUSER | libc::CLONE_NEWPID | libc::CLONE_NEWIPC,
        SandboxTier::Lockdown => {
            libc::CLONE_NEWUSER
                | libc::CLONE_NEWPID
                | libc::CLONE_NEWIPC
                | libc::CLONE_NEWNS
                | libc::CLONE_NEWNET
        }
    };

    let ret = unsafe { libc::unshare(flags) };
    if ret != 0 {
        let err = std::io::Error::last_os_error();
        // Retry without CLONE_NEWUSER — works only if caller has CAP_SYS_ADMIN.
        let fallback_flags = flags & !libc::CLONE_NEWUSER;
        if fallback_flags != 0 {
            let ret2 = unsafe { libc::unshare(fallback_flags) };
            if ret2 != 0 {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::PermissionDenied,
                    format!(
                        "failed to apply namespace isolation: {err}. \
                         unprivileged user namespaces may be disabled"
                    ),
                ));
            }
        } else {
            return Err(std::io::Error::new(
                std::io::ErrorKind::PermissionDenied,
                format!("failed to apply namespace isolation: {err}"),
            ));
        }
    }

    Ok(())
}

/// Check whether unprivileged user namespaces are available.
///
/// Returns `true` if the kernel supports creating user namespaces
/// without root. Prerequisite for any non-`None` tier on Linux.
///
/// The check has two stages:
///
/// 1. Two sysctls are consulted, when present:
///    `kernel.unprivileged_userns_clone = 0` or
///    `kernel.apparmor_restrict_unprivileged_userns = 1` yields `false`.
///    A missing or unreadable file is not treated as a restriction.
/// 2. `unshare(CLONE_NEWUSER)` is attempted in a short-lived forked child.
///    The probe runs in a child because a successful `unshare` moves the
///    *calling* process into the new user namespace, and because the call is
///    rejected outright when the caller is multithreaded; the freshly forked
///    child is single-threaded and is discarded afterwards.
///
/// Any failure to fork or to reap the child is reported as `false`.
#[must_use]
pub fn user_namespaces_available() -> bool {
    // True only when the sysctl file is readable and holds exactly `value`.
    let sysctl_is = |path: &str, value: &str| {
        matches!(std::fs::read_to_string(path), Ok(v) if v.trim() == value)
    };

    if sysctl_is("/proc/sys/kernel/unprivileged_userns_clone", "0")
        || sysctl_is("/proc/sys/kernel/apparmor_restrict_unprivileged_userns", "1")
    {
        return false;
    }

    // SAFETY: `fork` has no memory-safety preconditions. The child branch
    // below restricts itself to async-signal-safe calls (`unshare`, `_exit`)
    // and never returns into Rust code that could allocate or take locks.
    let pid = unsafe { libc::fork() };
    match pid {
        // Could not fork, so the probe cannot run; report "unavailable".
        -1 => false,
        0 => {
            // SAFETY: plain syscall on an integer flag; no memory is touched.
            let created = unsafe { libc::unshare(libc::CLONE_NEWUSER) } == 0;
            // SAFETY: `_exit` terminates the child immediately without
            // running atexit handlers or flushing stdio buffers inherited
            // from the parent.
            unsafe { libc::_exit(if created { 0 } else { 1 }) }
        }
        child => {
            let mut status: libc::c_int = 0;
            loop {
                // SAFETY: `status` is a valid, writable `c_int` for the
                // duration of the call and `child` is the pid we just forked.
                let reaped = unsafe { libc::waitpid(child, &mut status, 0) };
                if reaped == child {
                    break;
                }
                // Retry only when a signal interrupted the wait; any other
                // failure (e.g. the child was reaped elsewhere) means the
                // probe result is unknown.
                let interrupted = reaped == -1
                    && std::io::Error::last_os_error().kind()
                        == std::io::ErrorKind::Interrupted;
                if !interrupted {
                    return false;
                }
            }
            libc::WIFEXITED(status) && libc::WEXITSTATUS(status) == 0
        }
    }
}