droidsaw 1.0.0

DROIDSAW — unified Android reverse engineering CLI. Hermes, DEX, APK signing. JSON output, MCP server. Bytecode is not a security layer.
Documentation
//! Shared semgrep invoke + persist helper.
//!
//! Both the MCP `audit` handler (modular-mode `Full | Semgrep`) and the
//! CLI `scan semgrep --persist` dispatch call this. Splits the
//! "extract → invoke → write_semgrep_db" chain so that source extraction
//! stays in [`crate::commands::semgrep`] (the pre-existing entry point)
//! and the subprocess + DB-write half is shared between the two callers.
//!
//! Invariant: the displayed `command` hint and the actual subprocess
//! argv are composed from the same [`SemgrepArgs`] value. The hint uses
//! [`crate::semgrep::compose_config_args`] (lossy `String`, suitable for
//! JSON embedding); the subprocess uses [`crate::semgrep::compose_argv`]
//! (byte-preserving `OsString`). Callers that want a hint string compose
//! it themselves; this helper only invokes.
//!
//! Schema invariant: when invoked, the helper opens `db_path` and
//! ensures the findings schema exists before any INSERT. This makes the
//! `scan semgrep --persist` path self-contained — it does not require a
//! prior `audit` run to have created the schema. For the MCP audit
//! caller, `write_findings_db_with_run` has already created the schema,
//! so the second `ensure_findings_db_schema` call is a no-op
//! (`CREATE TABLE IF NOT EXISTS` + `CREATE VIEW IF NOT EXISTS`).

use std::ffi::OsString;
use std::path::Path;

use serde_json::Value;

use crate::semgrep::SemgrepArgs;

/// Invoke the `semgrep` subprocess against `output_dir` (the directory
/// that [`crate::commands::semgrep`] populated with extracted source),
/// then persist parsed results into the findings DB at `db_path`.
///
/// Returns the inner `semgrep_scan` JSON value the audit envelope
/// already wraps. Shape (preserved for backward compat with the MCP
/// audit handler's prior inline implementation):
///
/// - On success: `{"ran": true, "results_persisted": N, "high_severity": M, "db_table": "semgrep_results"}`
/// - On subprocess error: `{"ran": false, "error": "<message>"}`
/// - On semgrep binary missing from PATH: `{"ran": false, "command": "<hint>"}`
///
/// `high_severity` counts entries of the top-level `results` array in
/// semgrep's JSON output whose `extra.severity` is exactly `"ERROR"`;
/// it is `0` when stdout is not parseable JSON or has no such array.
///
/// The `db_table` label preserves the prior audit-handler value
/// (`"semgrep_results"`); rows actually land in the `findings` table —
/// queryable via the `semgrep_hits` view created by
/// [`crate::commands::ensure_findings_db_schema`].
///
/// # Arguments
///
/// - `output_dir`: directory handed to semgrep as the scan target (a
///   trailing `/` is appended).
/// - `sg_args`: source of both the displayed hint and the real argv.
/// - `db_path`: findings database; its schema is ensured before writing.
/// - `abort`: cancellation flag forwarded to the timeout-aware runner;
///   only consulted when the `mcp` feature is enabled.
///
/// # Errors
///
/// Returns `Err` when semgrep argument composition fails or when
/// [`crate::commands::ensure_findings_db_schema`] fails. A failure to
/// run the subprocess is *not* an `Err`: it is reported in-band as
/// `{"ran": false, "error": ...}`.
pub fn run_and_persist(
    output_dir: &Path,
    sg_args: &SemgrepArgs,
    db_path: &Path,
    abort: Option<&std::sync::atomic::AtomicBool>,
) -> anyhow::Result<Value> {
    // `abort` is consumed only when the `mcp` feature is on (the
    // CLI-only build path uses unconditional `Command::output` and has
    // no cooperative-cancellation surface). Acknowledge it here so the
    // non-mcp build doesn't lint on the unused parameter.
    #[cfg(not(feature = "mcp"))]
    let _ = abort;
    // Lossy hint string — for the `{"ran": false, "command": ...}` JSON
    // envelope. JSON strings are utf-8 by spec; U+FFFD substitution on
    // non-utf8 paths is acceptable (and documented) here.
    let hint_args = crate::semgrep::compose_config_args(sg_args)
        .map_err(|e| anyhow::anyhow!("semgrep arg composition: {e}"))?;

    if !which_in_path("semgrep") {
        let cmd_hint = format!(
            "semgrep {} {}/",
            hint_args.join(" "),
            output_dir.display()
        );
        return Ok(serde_json::json!({"ran": false, "command": cmd_hint}));
    }

    // Byte-preserving argv — for the actual subprocess. Preserves non-utf8
    // path bytes that `display().to_string()` would corrupt with U+FFFD.
    let mut argv: Vec<OsString> = crate::semgrep::compose_argv(sg_args)
        .map_err(|e| anyhow::anyhow!("semgrep arg composition: {e}"))?;
    // The scan target is built from the raw `OsStr` (not `display()`), so
    // the directory semgrep receives is byte-for-byte the one that was
    // populated, even when the path is not valid UTF-8. For UTF-8 paths
    // this yields exactly the same argument as the lossy form would.
    let mut scan_target = output_dir.as_os_str().to_os_string();
    scan_target.push("/");
    argv.push(scan_target);
    argv.push(OsString::from("--json"));
    argv.push(OsString::from("--quiet"));

    // Use timeout-aware subprocess runner when the `mcp` feature is active.
    // Sends SIGTERM at DROIDSAW_MCP_SUBPROCESS_TIMEOUT_SEC (default 600 s),
    // then SIGKILL after 5 s grace. Caps stderr to 64 KiB. The raw
    // `Command::output()` fallback applies only when the crate is built
    // without the `mcp` feature (CLI-only builds).
    #[cfg(feature = "mcp")]
    let result = {
        let timeout_secs = crate::mcp::subprocess::subprocess_timeout_secs();
        crate::mcp::subprocess::run_command_with_timeout(
            "semgrep",
            &argv,
            "semgrep",
            timeout_secs,
            abort,
        )
        .map_err(|e| anyhow::anyhow!("{}", e.into_mcp_error().message))
    };

    #[cfg(not(feature = "mcp"))]
    let result = std::process::Command::new("semgrep")
        .args(&argv)
        .output()
        .map_err(anyhow::Error::new);

    match result {
        Ok(out) => {
            let stdout_str = String::from_utf8_lossy(&out.stdout);
            // Schema-first: ensure findings + views exist before INSERT
            // lands. Idempotent for the audit caller (already created by
            // write_findings_db_with_run); load-bearing for the
            // scan-semgrep-persist caller (this is the first DB write).
            crate::commands::ensure_findings_db_schema(db_path)?;
            // A failed write is reported as zero persisted rows rather
            // than as an error, keeping the envelope shape stable.
            // NOTE(review): this hides DB write failures from the caller
            // — confirm that best-effort persistence is intended.
            let results_persisted =
                crate::commands::write_semgrep_db(&stdout_str, db_path).unwrap_or(0);
            // Count findings semgrep itself labelled `ERROR`; anything
            // unparseable or missing contributes zero.
            let high_severity: usize = serde_json::from_str::<Value>(&stdout_str)
                .ok()
                .and_then(|v| {
                    v.get("results").and_then(Value::as_array).map(|a| {
                        a.iter()
                            .filter(|r| {
                                r.get("extra")
                                    .and_then(|e| e.get("severity"))
                                    .and_then(Value::as_str)
                                    == Some("ERROR")
                            })
                            .count()
                    })
                })
                .unwrap_or(0);
            Ok(serde_json::json!({
                "ran": true,
                "results_persisted": results_persisted,
                "high_severity": high_severity,
                "db_table": "semgrep_results",
            }))
        }
        // Spawn / timeout / runner failures are surfaced in-band so the
        // caller's envelope still serializes.
        Err(e) => Ok(serde_json::json!({"ran": false, "error": e.to_string()})),
    }
}

/// Report whether any directory listed in the `PATH` environment
/// variable contains a file called `name`.
///
/// The check is `Path::is_file` on `<dir>/<name>` for each `PATH`
/// entry; execute permission is not inspected. Returns `false` when
/// `PATH` is unset. Mirrors the prior inline `which_in_path` from
/// `mcp/mod.rs` (now removed in favour of this shared helper).
fn which_in_path(name: &str) -> bool {
    match std::env::var_os("PATH") {
        Some(search_path) => {
            for dir in std::env::split_paths(&search_path) {
                let candidate = dir.join(name);
                if candidate.is_file() {
                    // First hit wins; later entries need not be probed.
                    return true;
                }
            }
            false
        }
        None => false,
    }
}