// droidsaw 1.0.0 - Docs.rs

//! Timeout-aware subprocess runner for MCP tool handlers.
//!
//! Without a wall-clock timeout and stderr cap, subprocess calls to semgrep
//! and trufflehog can block a tokio worker indefinitely — which (combined
//! with no `spawn_blocking` discipline) can freeze the entire MCP server.
//!
//! **This module provides:**
//! - `run_with_timeout` — runs a `std::process::Command` with:
//!   - SIGTERM at `timeout_secs` (default from `DROIDSAW_MCP_SUBPROCESS_TIMEOUT_SEC`,
//!     falling back to 600 s).
//!   - SIGKILL 5 s after SIGTERM if the process is still alive.
//!   - stderr captured to a `NamedTempFile`; the last 64 KiB tail is
//!     returned in `SubprocessOutput::stderr_tail`.
//! - Typed errors: `McpSubprocessError::Timeout`, `McpSubprocessError::Aborted`,
//!   and `McpSubprocessError::Io`.
//!
//! See `DROIDSAW_MCP_SUBPROCESS_TIMEOUT_SEC` env var (u64 seconds, default 600).

use std::ffi::OsString;
use std::io::{Read, Seek, SeekFrom, Write};
use std::process::{Command, Stdio};
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::{Duration, Instant};

use tempfile::NamedTempFile;

/// Shared boolean used to cooperatively cancel blocking-pool work.
///
/// The async layer creates one flag per call and pairs it with a tokio
/// task that waits on `CancellationToken::cancelled()` and then sets the
/// inner boolean. The `spawn_blocking` closure holds a clone and checks
/// it at phase boundaries (between findings collection, db writes,
/// subprocess invocations). Subprocess runners check it on the same
/// 250 ms tick they use for the timeout; `run_with_timeout` takes the
/// flag as `Option<&AtomicBool>`, so callers lend it the inner value
/// (for example `Some(&*flag)`).
///
/// A flag fires at most once: after it is set it stays set for the rest
/// of the call. Drop the handle when the blocking task finishes; each
/// later call gets a new flag.
pub type AbortFlag = Arc<AtomicBool>;

/// Build a new abort flag in the un-fired (`false`) state.
pub fn new_abort_flag() -> AbortFlag {
    // `AtomicBool::default()` is `false`, so the default `Arc` is an
    // un-fired flag with a single owner.
    AbortFlag::default()
}

/// Maximum number of trailing stderr bytes that `run_with_timeout` asks
/// `read_tail` to read back from the stderr temp file (64 KiB). Keeping
/// only the tail bounds memory use when a scanner is noisy. Typed as
/// `u64` because it is compared against the file length reported by
/// `metadata().len()`.
const STDERR_TAIL_CAP: u64 = 64 * 1024;

/// Default subprocess timeout in seconds, used when the env var is unset
/// or does not hold a valid `u64`.
const DEFAULT_TIMEOUT_SECS: u64 = 600;

/// Env var that overrides the default subprocess timeout.
const TIMEOUT_ENV_VAR: &str = "DROIDSAW_MCP_SUBPROCESS_TIMEOUT_SEC";

/// Read the active subprocess timeout from the environment (or the default).
///
/// The value of `DROIDSAW_MCP_SUBPROCESS_TIMEOUT_SEC` is parsed as a
/// decimal `u64` number of seconds. Surrounding whitespace is ignored so
/// that values coming from env files or shell substitutions (for example
/// with a trailing newline) are honoured instead of silently falling back
/// to the default. An unset, non-UTF-8, or unparsable value yields
/// `DEFAULT_TIMEOUT_SECS` (600).
pub fn subprocess_timeout_secs() -> u64 {
    std::env::var(TIMEOUT_ENV_VAR)
        .ok()
        .and_then(|raw| raw.trim().parse::<u64>().ok())
        .unwrap_or(DEFAULT_TIMEOUT_SECS)
}

/// Outcome of a successful `run_with_timeout` call, i.e. the child exited
/// on its own before the deadline and before any abort was observed.
///
/// "Successful" refers to the run completing, not to the exit code:
/// callers still need to inspect `status`.
#[derive(Debug)]
pub struct SubprocessOutput {
    /// Exit status reported for the child by `try_wait`.
    pub status: std::process::ExitStatus,
    /// Everything the child wrote to stdout, held in memory. There is no
    /// size cap here; callers are expected to bound it through the
    /// tool's own flags.
    pub stdout: Vec<u8>,
    /// Last `STDERR_TAIL_CAP` bytes of stderr output. The full stderr
    /// is written to a `NamedTempFile` during execution; only the tail
    /// is read back to bound memory usage. Empty if the tail could not
    /// be read back.
    pub stderr_tail: Vec<u8>,
    /// Whether stderr exceeded `STDERR_TAIL_CAP` bytes. Also `false`
    /// when reading the tail failed, since that failure is not reported.
    pub stderr_overflowed: bool,
}

/// Typed errors from `run_with_timeout`.
///
/// `Timeout` and `Aborted` both mean the child was terminated by the
/// runner; `Io` covers failures to spawn, poll, or read from the child.
#[derive(Debug)]
pub enum McpSubprocessError {
    /// Process did not finish within the timeout; SIGTERM was sent, and
    /// the child was force-killed if it was still alive after the 5 s
    /// grace window. The `tool` and `observed_secs` fields match the MCP
    /// `SubprocessTimeout` error data shape. `observed_secs` is the
    /// timeout budget in seconds as reported by `run_with_timeout` (the
    /// configured limit, not a measured elapsed time).
    Timeout {
        tool: String,
        observed_secs: u64,
    },
    /// The async layer fired the cooperative-cancellation flag (client
    /// disconnected); SIGTERM was sent and the child reaped. The `tool`
    /// field lets callers distinguish "client gave up" from "tool exceeded
    /// its budget" — both terminate the child, but only the timeout case
    /// suggests bumping the env-var.
    Aborted { tool: String },
    /// I/O error spawning or communicating with the subprocess.
    Io(std::io::Error),
}

impl McpSubprocessError {
    /// Translate this error into the `rmcp::ErrorData` sent over the wire.
    ///
    /// - `Timeout` and `Aborted` use error code `-32000` and attach a JSON
    ///   payload whose `"type"` field (`"SubprocessTimeout"` /
    ///   `"SubprocessAborted"`) tells the two apart; both carry `"tool"`,
    ///   and the timeout payload also carries `"observed_secs"`.
    /// - `Io` uses `ErrorCode::INTERNAL_ERROR` with no payload.
    pub fn into_mcp_error(self) -> rmcp::ErrorData {
        // Work out the three constructor arguments per variant, then
        // build the wire error in one place.
        let (code, message, data) = match self {
            Self::Timeout { tool, observed_secs } => {
                let message = format!(
                    "subprocess for tool {tool:?} timed out after {observed_secs}s. \
                     SIGTERM sent; caller may retry with a larger \
                     DROIDSAW_MCP_SUBPROCESS_TIMEOUT_SEC value."
                );
                let payload = serde_json::json!({
                    "type": "SubprocessTimeout",
                    "tool": tool,
                    "observed_secs": observed_secs,
                });
                (rmcp::model::ErrorCode(-32000), message, Some(payload))
            }
            Self::Aborted { tool } => {
                let message = format!(
                    "subprocess for tool {tool:?} aborted: client disconnected. \
                     SIGTERM sent; the abort signal fired before the wall-clock \
                     timeout was reached."
                );
                let payload = serde_json::json!({
                    "type": "SubprocessAborted",
                    "tool": tool,
                });
                (rmcp::model::ErrorCode(-32000), message, Some(payload))
            }
            Self::Io(e) => (
                rmcp::model::ErrorCode::INTERNAL_ERROR,
                format!("subprocess I/O error: {e}"),
                None,
            ),
        };
        rmcp::ErrorData::new(code, message, data)
    }
}

/// Run `program` with `args` under a wall-clock timeout.
///
/// stderr is redirected to a `NamedTempFile` during execution; after the
/// process exits (or is killed) the last `STDERR_TAIL_CAP` bytes are read
/// back and returned. stdout is collected in memory (callers should bound
/// it via structured output flags like `--json --quiet`).
///
/// # Platform notes
/// SIGTERM is a POSIX signal. On non-Unix platforms the implementation
/// falls back to `Child::kill()` (unconditional termination).
///
/// # Cancellation safety
/// This function is **synchronous** and blocks the calling thread. It MUST
/// be called from inside `tokio::task::spawn_blocking`. Callers pass an
/// optional [`AbortFlag`] (`None` to disable cooperative cancellation).
/// The poll-wait loop checks the flag on its 250 ms tick alongside the
/// deadline; on `true` it SIGTERMs the child (5 s grace → SIGKILL,
/// matching the timeout path) and returns [`McpSubprocessError::Aborted`].
/// The async layer at the call site (`audit` handler) wires
/// `CancellationToken::cancelled()` → flag-set via a watcher task.
pub fn run_with_timeout(
    program: &str,
    args: &[OsString],
    tool: &str,
    timeout_secs: u64,
    abort: Option<&AtomicBool>,
) -> Result<SubprocessOutput, McpSubprocessError> {
    // Write stderr to a temp file so we can tail it without buffering the
    // full stream in memory while the process runs. Use try_clone() so
    // the child gets an independent file description into the same inode.
    let mut stderr_file =
        NamedTempFile::new().map_err(McpSubprocessError::Io)?;
    let stderr_for_child = stderr_file
        .as_file()
        .try_clone()
        .map_err(McpSubprocessError::Io)?;

    let mut child = Command::new(program)
        .args(args)
        .stdout(Stdio::piped())
        .stderr(Stdio::from(stderr_for_child))
        .spawn()
        .map_err(McpSubprocessError::Io)?;

    // `Instant + Duration` is unrepresentable only past ~292 years from
    // now (i64-ns overflow); `timeout_secs` is operator-supplied so we
    // saturate rather than trust a checked path. The fallback (the
    // far-future `Instant::now()`) is functionally equivalent to "no
    // timeout" for any practical caller.
    let deadline = Instant::now()
        .checked_add(Duration::from_secs(timeout_secs))
        .unwrap_or_else(|| {
            #[allow(
                clippy::arithmetic_side_effects,
                reason = "1h fallback under impossible-in-practice Instant overflow"
            )]
            { Instant::now() + Duration::from_secs(60 * 60) }
        });
    let poll_interval = Duration::from_millis(250);

    // Poll-wait loop: check if the child has exited on each interval.
    // The 250 ms granularity means at most ~250 ms overshoot on top of
    // `timeout_secs`, and at most ~250 ms latency on the cooperative
    // abort signal.
    let status = loop {
        match child.try_wait().map_err(McpSubprocessError::Io)? {
            Some(s) => break s,
            None => {
                let aborted = abort.is_some_and(|f| f.load(Ordering::Relaxed));
                if Instant::now() >= deadline || aborted {
                    // SIGTERM first; give 5 s grace before SIGKILL. The
                    // same kill ladder serves both the timeout and the
                    // cooperative-abort path; only the returned error
                    // discriminates so callers can tell which fired.
                    terminate_child(&mut child);
                    // Same overflow rationale as the outer deadline.
                    let kill_deadline = Instant::now()
                        .checked_add(Duration::from_secs(5))
                        .unwrap_or_else(|| {
                            #[allow(
                                clippy::arithmetic_side_effects,
                                reason = "60s fallback under impossible-in-practice Instant overflow"
                            )]
                            { Instant::now() + Duration::from_secs(60) }
                        });
                    loop {
                        match child.try_wait().map_err(McpSubprocessError::Io)? {
                            Some(_) => break,
                            None => {
                                if Instant::now() >= kill_deadline {
                                    // Best-effort fallback after the 5 s grace
                                    // window: the process is wedged enough that
                                    // we cannot do better than ignore kill/wait
                                    // failures. Both calls return `io::Result`,
                                    // but there is no recovery path here.
                                    drop(child.kill());
                                    drop(child.wait());
                                    break;
                                }
                                std::thread::sleep(Duration::from_millis(100));
                            }
                        }
                    }
                    if aborted {
                        return Err(McpSubprocessError::Aborted {
                            tool: tool.to_owned(),
                        });
                    }
                    return Err(McpSubprocessError::Timeout {
                        tool: tool.to_owned(),
                        observed_secs: timeout_secs,
                    });
                }
                std::thread::sleep(poll_interval);
            }
        }
    };

    // Collect stdout (in memory — callers use --json to bound this).
    let stdout = match child.stdout.take() {
        Some(mut s) => {
            let mut buf = Vec::new();
            s.read_to_end(&mut buf).map_err(McpSubprocessError::Io)?;
            buf
        }
        None => Vec::new(),
    };

    // Flush + tail stderr from the temp file. Flush failure on a
    // tempfile would imply a critical I/O error that the subsequent
    // `read_tail` will surface; we don't need to short-circuit here.
    drop(stderr_file.flush());
    let (stderr_tail, stderr_overflowed) = read_tail(&mut stderr_file, STDERR_TAIL_CAP)
        .unwrap_or((Vec::new(), false));

    Ok(SubprocessOutput {
        status,
        stdout,
        stderr_tail,
        stderr_overflowed,
    })
}

/// Read the last `cap` bytes of `file`, returning `(bytes, overflowed)`.
/// `overflowed` is true when the file length exceeded `cap`.
fn read_tail(file: &mut NamedTempFile, cap: u64) -> std::io::Result<(Vec<u8>, bool)> {
    let len = file.as_file().metadata()?.len();
    let overflowed = len > cap;
    let start = if overflowed { len.saturating_sub(cap) } else { 0 };
    file.seek(SeekFrom::Start(start))?;
    #[allow(
        clippy::as_conversions,
        clippy::cast_possible_truncation,
        reason = "PROOF: read_len = min(len, cap) where cap is the caller-provided stderr cap (default 64 KiB at call site `run_with_timeout`, hard-bounded by `DROIDSAW_MCP_SUBPROCESS_STDERR_CAP`). Since cap fits in u64 and is always set to ≤ usize::MAX on supported 64-bit targets, the (u64 -> usize) widen is lossless. The variable `read_len` is then passed to Vec::with_capacity (a hint), so even if a hypothetical 32-bit target truncates, the subsequent read_to_end would re-grow correctly."
    )]
    let read_len = (len.saturating_sub(start)) as usize;
    let mut buf = Vec::with_capacity(read_len);
    file.read_to_end(&mut buf)?;
    Ok((buf, overflowed))
}

/// Ask `child` to stop: SIGTERM on Unix, `Child::kill()` elsewhere.
///
/// Delivery is best-effort; the result of the signal / kill call is
/// ignored. The function does not wait for the child to exit, so the
/// caller must keep polling and reap it.
fn terminate_child(child: &mut std::process::Child) {
    #[cfg(unix)]
    {
        #[allow(
            clippy::as_conversions,
            clippy::cast_possible_wrap,
            reason = "PROOF: u32 -> i32 cast of a Unix PID for libc::kill(pid_t, sig). pid_t is i32 on every Unix std::process supports, and real PIDs (Linux PID_MAX_LIMIT=4_194_304; macOS max 99_999) are far below i32::MAX, so the value is unchanged."
        )]
        let target_pid = child.id() as i32;
        // SAFETY: `target_pid` is the OS-assigned PID of a child we
        // spawned. Sending SIGTERM is well-defined, and harmless if the
        // process has already exited (ESRCH is ignored).
        let _ = unsafe { libc::kill(target_pid, libc::SIGTERM) };
    }
    #[cfg(not(unix))]
    {
        // No SIGTERM equivalent here; terminate unconditionally.
        drop(child.kill());
    }
}

/// Convenience wrapper: same as `run_with_timeout` but converts the result
/// into the `std::process::Output` shape expected by callers that were
/// previously using `Command::output()`.
///
/// stderr is the tail (up to 64 KiB), not the full stream.
pub fn run_command_with_timeout(
    program: &str,
    args: &[OsString],
    tool: &str,
    timeout_secs: u64,
    abort: Option<&AtomicBool>,
) -> Result<std::process::Output, McpSubprocessError> {
    let out = run_with_timeout(program, args, tool, timeout_secs, abort)?;
    Ok(std::process::Output {
        status: out.status,
        stdout: out.stdout,
        stderr: out.stderr_tail,
    })
}