kavach 0.22.3 - Docs.rs

//! Process backend — OS process with seccomp, Landlock, namespaces, cgroups.

pub mod cgroups;
pub mod landlock_enforce;
pub mod namespaces;
pub mod seccomp;

use crate::backend::{Backend, SandboxBackend};
use crate::lifecycle::{ExecResult, SandboxConfig};
use crate::policy::SandboxPolicy;

/// Process-based sandbox backend.
///
/// Runs each command as a child OS process. On Linux the child is additionally
/// restricted before `exec` via the sibling `namespaces`, `landlock_enforce`,
/// `cgroups` and `seccomp` modules (see the `SandboxBackend` impl below).
#[derive(Debug)]
pub struct ProcessBackend {
    /// Configuration cloned at construction time.
    ///
    /// Despite the leading underscore this field is read by `exec`, which
    /// takes the environment, working directory and timeout from it.
    _config: SandboxConfig,
}

impl ProcessBackend {
    /// Build a process backend that owns its own copy of `config`.
    ///
    /// The configuration is cloned, so later changes to the caller's value do
    /// not affect this backend.
    ///
    /// # Errors
    ///
    /// The current implementation never fails; it always returns `Ok`.
    pub fn new(config: &SandboxConfig) -> crate::Result<Self> {
        let _config = config.clone();
        Ok(Self { _config })
    }
}

#[async_trait::async_trait]
impl SandboxBackend for ProcessBackend {
    /// Identify this backend as [`Backend::Process`].
    fn backend_type(&self) -> Backend {
        Backend::Process
    }

    /// Run `command` as a child process and collect its result.
    ///
    /// The command string is split into a program and arguments with
    /// `shell_words` (no shell is invoked), stdout/stderr are piped, and the
    /// environment variables and working directory from the stored
    /// configuration are applied. The child is then run through
    /// `exec_util::execute_with_timeout` with the configured `timeout_ms`.
    ///
    /// On Linux, a `pre_exec` hook is installed that applies, in order:
    /// namespaces, Landlock, capability dropping, rlimits, and finally the
    /// seccomp filter. Every step except seccomp is best-effort; a failure to
    /// *apply* the seccomp filter aborts the spawn. `policy` is only read
    /// inside the Linux-only block, so on other targets it has no effect here.
    ///
    /// # Errors
    ///
    /// Returns `KavachError::ExecFailed("empty command")` when `command`
    /// contains no words. Any other error comes from
    /// `execute_with_timeout`.
    async fn exec(&self, command: &str, policy: &SandboxPolicy) -> crate::Result<ExecResult> {
        // Parse command into program + args
        let parts = shell_words(command);
        if parts.is_empty() {
            return Err(crate::KavachError::ExecFailed("empty command".into()));
        }

        let program = &parts[0];
        let args = &parts[1..];

        let mut cmd = tokio::process::Command::new(program);
        cmd.args(args)
            .stdout(std::process::Stdio::piped())
            .stderr(std::process::Stdio::piped());

        // Apply environment from config.
        // NOTE(review): these variables are added on top of whatever
        // environment the command would otherwise get; nothing here clears
        // it first — confirm that inheriting the parent environment is intended.
        for (k, v) in &self._config.env {
            cmd.env(k, v);
        }

        // Apply working directory
        if let Some(ref workdir) = self._config.workdir {
            cmd.current_dir(workdir);
        }

        // ── Pre-exec isolation (Linux only) ─────────────────────────────
        #[cfg(target_os = "linux")]
        {
            use crate::backend::capabilities;

            // Probe which isolation mechanisms are usable; each mechanism
            // below is skipped when its capability flag is false.
            let caps = capabilities::detect_capabilities();

            // Pre-build seccomp BPF program (before fork, can allocate freely).
            // Falls back to the "basic" profile when the policy names none.
            // A build failure is logged and seccomp is skipped entirely, i.e.
            // the command still runs, just without a filter.
            let seccomp_program = if policy.seccomp_enabled && caps.seccomp_available {
                let profile = policy.seccomp_profile.as_deref().unwrap_or("basic");
                match seccomp::build_filter(profile) {
                    Ok(p) => Some(p),
                    Err(e) => {
                        tracing::warn!("seccomp filter build failed, skipping: {e}");
                        None
                    }
                }
            } else {
                None
            };

            // Derive namespace config — only apply if namespaces are available
            let ns_config = if caps.namespaces_available {
                Some(namespaces::NamespaceConfig::from_policy(policy))
            } else {
                None
            };

            // Only apply landlock if kernel supports it
            let apply_ll = caps.landlock_available && landlock_enforce::should_apply(policy);

            // Clone policy for pre_exec closure (the closure is `move` and
            // cannot borrow from the `policy` reference).
            let policy_clone = policy.clone();

            // SAFETY: `CommandExt::pre_exec` requires unsafe because the closure
            // runs in the child process between fork() and exec(), where only
            // async-signal-safe operations are permitted (no heap allocation,
            // no mutex acquisition, no stdio beyond write()).
            //
            // How this closure is intended to meet those requirements:
            // 1. The BPF program is pre-compiled above (before fork), so the
            //    filter is not built inside the closure.
            // 2. The isolation steps are delegated to helpers in the sibling
            //    modules; presumably these are thin wrappers over direct
            //    kernel syscalls (unshare(2), landlock_*(2), prctl(2)/capset(2),
            //    setrlimit(2), seccomp(2)) — TODO confirm none of them
            //    allocate or take locks.
            // 3. All captured values (ns_config, policy_clone, seccomp_program,
            //    apply_ll) are moved in and only read.
            // 4. Ordering is critical and documented inline: namespaces first
            //    (needs unshare), then landlock (needs landlock_* syscalls),
            //    then caps (needs capset), then seccomp last (would block
            //    all preceding syscalls).
            //
            // NOTE(review): two spots in the closure do not obviously satisfy
            // the async-signal-safety rule stated above and should be revisited:
            //    - `eprintln!` formats its arguments and writes through std's
            //      locked stderr handle; it is not a bare write(2).
            //    - the seccomp error path calls `e.to_string()` and
            //      `std::io::Error::other(..)`, both of which heap-allocate.
            unsafe {
                cmd.pre_exec(move || {
                    // Order matters: each step needs syscalls the next would block.
                    // 1. Namespaces (needs unshare syscall) — best-effort
                    if let Some(ref ns) = ns_config
                        && ns.any_enabled()
                        && let Err(e) = namespaces::apply_namespaces(ns)
                    {
                        eprintln!("kavach: namespace isolation skipped: {e}");
                    }

                    // 2. Landlock (needs landlock_* syscalls) — best-effort
                    if apply_ll && let Err(e) = landlock_enforce::apply_landlock(&policy_clone) {
                        eprintln!("kavach: landlock skipped: {e}");
                    }

                    // 3. Drop capabilities (needs capset syscall) — best-effort;
                    //    the result is deliberately discarded.
                    let _ = namespaces::drop_capabilities();

                    // 4. Apply resource limits via rlimits — best-effort;
                    //    the result is deliberately discarded.
                    let _ = cgroups::apply_rlimits(&policy_clone);

                    // 5. Seccomp filter (MUST BE LAST — blocks future syscalls).
                    //    Unlike the steps above this one is not best-effort: an
                    //    error here is returned from the hook.
                    if let Some(ref program) = seccomp_program {
                        seccomp::apply_filter(program)
                            .map_err(|e| std::io::Error::other(e.to_string()))?;
                    }

                    Ok(())
                });
            }
        }

        crate::backend::exec_util::execute_with_timeout(
            &mut cmd,
            self._config.timeout_ms,
            "process",
        )
        .await
    }

    /// Report backend health.
    ///
    /// Always returns `Ok(true)`; no probing is performed.
    async fn health_check(&self) -> crate::Result<bool> {
        Ok(true)
    }

    /// Tear down the backend.
    ///
    /// This is a no-op that always returns `Ok(())`.
    async fn destroy(&self) -> crate::Result<()> {
        Ok(())
    }
}

/// Split a command line into words using a small subset of shell quoting.
///
/// No shell is involved: there is no variable, glob or tilde expansion and no
/// command substitution. The rules are:
///
/// - Unquoted spaces and tabs separate words; runs of them are collapsed.
/// - Text inside single quotes is taken literally (backslash included).
/// - Text inside double quotes is taken literally except that a backslash
///   escapes the next character.
/// - Outside quotes a backslash escapes the next character, so `a\ b` is the
///   single word `a b`.
/// - An explicitly quoted empty string (`""` or `''`) produces an empty word
///   instead of being dropped, matching shell behaviour for `cmd ""`.
/// - A trailing backslash with nothing left to escape is kept literally.
///
/// Unterminated quotes are not an error; the word simply runs to the end of
/// the input.
///
/// Returns the words in order; the result is empty when `input` contains only
/// unquoted whitespace (or nothing at all).
fn shell_words(input: &str) -> Vec<String> {
    let mut words = Vec::with_capacity(8);
    let mut current = String::new();
    // True once the current word has started, even if it holds no characters
    // yet (e.g. right after `""`). Emission is keyed on this flag rather than
    // on `current.is_empty()` so that empty quoted arguments survive.
    let mut in_word = false;
    let mut in_single = false;
    let mut in_double = false;
    let mut escape = false;

    for ch in input.chars() {
        if escape {
            // Previous character was a backslash: take this one verbatim.
            current.push(ch);
            escape = false;
            continue;
        }
        match ch {
            '\\' if !in_single => {
                escape = true;
                in_word = true;
            }
            '\'' if !in_double => {
                in_single = !in_single;
                in_word = true;
            }
            '"' if !in_single => {
                in_double = !in_double;
                in_word = true;
            }
            ' ' | '\t' if !in_single && !in_double => {
                if in_word {
                    words.push(std::mem::take(&mut current));
                    in_word = false;
                }
            }
            _ => {
                current.push(ch);
                in_word = true;
            }
        }
    }

    // A backslash at the very end has nothing to escape; keep it rather than
    // silently discarding a character the caller typed.
    if escape {
        current.push('\\');
    }
    if in_word {
        words.push(current);
    }
    words
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::backend::Backend;

    /// Run `command` on a fresh process backend built from `config`, using
    /// the minimal sandbox policy.
    async fn run(config: &SandboxConfig, command: &str) -> crate::Result<ExecResult> {
        let backend = ProcessBackend::new(config).unwrap();
        let policy = SandboxPolicy::minimal();
        backend.exec(command, &policy).await
    }

    // ── Command-line splitting ──────────────────────────────────────

    #[test]
    fn shell_words_basic() {
        let echo = shell_words("echo hello");
        assert_eq!(echo, vec!["echo", "hello"]);

        let ls = shell_words("ls -la /tmp");
        assert_eq!(ls, vec!["ls", "-la", "/tmp"]);
    }

    #[test]
    fn shell_words_quoted() {
        let double_quoted = shell_words(r#"echo "hello world""#);
        assert_eq!(double_quoted, vec!["echo", "hello world"]);

        let single_quoted = shell_words("echo 'hello world'");
        assert_eq!(single_quoted, vec!["echo", "hello world"]);
    }

    #[test]
    fn shell_words_empty() {
        assert!(shell_words("").is_empty());
        assert!(shell_words("   ").is_empty());
    }

    // ── End-to-end execution ────────────────────────────────────────

    #[tokio::test]
    async fn exec_echo() {
        let config = SandboxConfig::builder().backend(Backend::Process).build();
        let result = run(&config, "echo hello").await.unwrap();
        assert_eq!(result.exit_code, 0);
        assert_eq!(result.stdout.trim(), "hello");
        assert!(!result.timed_out);
    }

    #[tokio::test]
    async fn exec_false_returns_nonzero() {
        let config = SandboxConfig::builder().backend(Backend::Process).build();
        let result = run(&config, "false").await.unwrap();
        assert_ne!(result.exit_code, 0);
    }

    #[tokio::test]
    async fn exec_timeout() {
        // A 100 ms budget against a 10 s sleep must trip the timeout path.
        let config = SandboxConfig::builder()
            .backend(Backend::Process)
            .timeout_ms(100)
            .build();
        let result = run(&config, "sleep 10").await.unwrap();
        assert!(result.timed_out);
        assert_eq!(result.exit_code, -1);
    }

    #[tokio::test]
    async fn exec_empty_command() {
        let config = SandboxConfig::builder().backend(Backend::Process).build();
        let result = run(&config, "").await;
        assert!(result.is_err());
    }
}