// droidsaw 2.0.0
//
// DROIDSAW — unified Android reverse engineering CLI. Hermes, DEX, APK signing. JSON output, MCP server. Bytecode is not a security layer.
// Documentation
//! Shared trufflehog extract + invoke + persist helper.
//!
//! Both the MCP `audit` handler (modular-mode `Full | Trufflehog`) and
//! the CLI `audit --mode=full | --mode=trufflehog` dispatch call this.
//! Splits the "dump strings → invoke trufflehog filesystem → parse →
//! write_credentials_db" chain into one place so the two callers cannot
//! drift on:
//!
//! - which arguments are passed to the trufflehog subprocess,
//! - what JSON envelope is returned (the gauge consumed by
//!   `detectors.trufflehog`),
//! - what counter semantics live in `credentials_written` /
//!   `hit_count` / `verified_count`,
//! - what the `note` field says.
//!
//! ## Envelope shape (stable for downstream parsers)
//!
//! - Success: `{"ran": true, "hit_count": N, "verified_count": M,
//!   "unverified_count": N - M, "credentials_written": K,
//!   "db_table": "credentials", "note": "..."}`
//! - Binary missing or no strings produced: `{"ran": false,
//!   "strings_file": "...", "command": "..."}`
//! - Subprocess error: `{"ran": false, "error": "..."}`
//!
//! ## Schema invariant
//!
//! The helper opens `db_path` only when `write_credentials_db` is
//! called on successful subprocess output. `write_credentials_db` runs
//! its own `CREATE TABLE IF NOT EXISTS` so the CLI's first invocation
//! against a fresh `./droidsaw-<stem>.db` is self-contained.

use std::path::Path;

use serde_json::Value;

use crate::context::CrossLayerContext;

/// Default floor on extracted string length for the trufflehog pass.
///
/// Default minimum string length passed to
/// [`crate::commands::trufflehog`] when extracting strings for the
/// subprocess input. Mirrors the prior MCP-inline default
/// (`mcp/mod.rs:1354`) and the `droidsaw scan trufflehog --min-length`
/// default. 8 bytes is short enough to surface short tokens
/// (`sk-` prefixes etc.) and long enough to suppress noise.
pub const DEFAULT_MIN_LENGTH: usize = 8;

/// Run trufflehog filesystem mode against strings extracted from
/// `ctx`, persist parsed credential hits to `db_path`, and return the
/// JSON envelope describing what happened.
///
/// # Parameters
///
/// - `ctx`: source of the strings, handed to
///   [`crate::commands::trufflehog`] for extraction.
/// - `min_length`: floor on extracted string length; pass
///   [`DEFAULT_MIN_LENGTH`] unless you have a specific reason to differ
///   from the MCP / `scan trufflehog` precedent.
/// - `db_path`: receives credential rows via
///   [`crate::commands::write_credentials_db`] on subprocess success.
///   The helper does NOT create the path's parent directories — callers
///   pass a path whose parent already exists (per-input temp dir for
///   MCP, `./droidsaw-<stem>.db` for CLI default).
/// - `abort`: the cooperative-cancellation flag from MCP's
///   `spawn_blocking` lifecycle. CLI callers pass `None`. Ignored on
///   builds without the `mcp` feature.
/// - `strings_file_seed`: lets callers override the tempfile path used
///   to hold extracted strings. The MCP path passes a timestamped name
///   to allow concurrent audits to coexist; CLI callers can pass `None`
///   to let the helper synthesize one. A caller-supplied path is never
///   unlinked by this function.
///
/// # Returns
///
/// Always `Ok(envelope)` once the strings file has been written:
///
/// - `{"ran": true, ...counters...}` when the subprocess ran,
/// - `{"ran": false, "strings_file", "command"}` when `trufflehog` is
///   not on `PATH` or extraction produced no non-empty lines,
/// - `{"ran": false, "error"}` when spawning/running the subprocess
///   failed.
///
/// When `strings_file_seed` is `None` the helper-owned tempfile is
/// unlinked as this function returns, so the `strings_file` path in a
/// `ran: false` envelope no longer exists by the time the caller reads
/// it.
///
/// # Errors
///
/// Returns `Err` only if creating the helper-owned tempfile, string
/// extraction, or writing the strings file fails. Subprocess failures
/// are reported inside the envelope instead, and a failing
/// `write_credentials_db` is treated as zero rows written.
pub fn run_and_persist(
    ctx: &CrossLayerContext,
    min_length: usize,
    db_path: &Path,
    abort: Option<&std::sync::atomic::AtomicBool>,
    strings_file_seed: Option<&Path>,
) -> anyhow::Result<Value> {
    // Acknowledge the abort param on builds without the mcp feature —
    // the no-mcp subprocess fallback below uses `Command::output()`
    // and has no cooperative-cancellation surface to consult.
    #[cfg(not(feature = "mcp"))]
    let _ = abort;

    // Step 1: extract strings to a tempfile. The MCP path's
    // `state.lock()` ceremony is collapsed away because the helper
    // takes `&CrossLayerContext` directly.
    //
    // When `strings_file_seed` is None we own the tempfile lifetime via
    // `tempfile::NamedTempFile` — its `Drop` unlinks the file when this
    // function returns. The seed-passing path (used by MCP for
    // concurrent-audit isolation) hands us a caller-owned path; we do
    // NOT unlink it on return so the caller can decide cleanup.
    // The guard binding exists purely to keep the handle alive until
    // the end of the function.
    let _tempfile_guard: Option<tempfile::NamedTempFile>;
    let strings_file: std::path::PathBuf = match strings_file_seed {
        Some(p) => {
            _tempfile_guard = None;
            p.to_path_buf()
        }
        None => {
            let tf = create_strings_tempfile()?;
            let p = tf.path().to_path_buf();
            _tempfile_guard = Some(tf);
            p
        }
    };

    let mut buf: Vec<u8> = Vec::new();
    crate::commands::trufflehog(ctx, min_length, &mut buf)?;
    // Count non-empty lines only. `slice::split` always yields at least
    // one (possibly empty) sub-slice — even for an empty buffer — so a
    // bare `.count()` can never be 0 and the "no strings produced" gate
    // in step 2 would be unreachable.
    let strings_written = buf
        .split(|&b| b == b'\n')
        .filter(|line| !line.is_empty())
        .count();
    std::fs::write(&strings_file, &buf)?;

    // Human-readable reproduction hint returned in the not-ran
    // envelope. It intentionally stays as-is; note that the spawned
    // process below additionally receives `--no-update`.
    let th_cmd = format!(
        "trufflehog filesystem {} --json --no-verification",
        strings_file.display()
    );

    // Step 2: binary-on-PATH gate. Mirrors the MCP-inline check; the
    // "no strings written" branch matches the prior precedent too
    // (an empty input file would produce zero hits regardless).
    if !which_in_path("trufflehog") || strings_written == 0 {
        return Ok(serde_json::json!({
            "ran": false,
            "strings_file": strings_file.display().to_string(),
            "command": th_cmd,
        }));
    }

    // Step 3: subprocess invocation. mcp-feature build uses the
    // timeout-aware runner from `crate::mcp::subprocess` (SIGTERM at
    // `DROIDSAW_MCP_SUBPROCESS_TIMEOUT_SEC`, then SIGKILL after 5 s
    // grace). The non-mcp CLI-only build uses `Command::output()` —
    // mirrors `crate::semgrep::run::run_and_persist` precedent.
    let th_args: Vec<std::ffi::OsString> = vec![
        "filesystem".into(),
        strings_file.as_os_str().into(),
        "--json".into(),
        "--no-update".into(),
        "--no-verification".into(),
    ];

    #[cfg(feature = "mcp")]
    let result = {
        let timeout_secs = crate::mcp::subprocess::subprocess_timeout_secs();
        crate::mcp::subprocess::run_command_with_timeout(
            "trufflehog",
            &th_args,
            "trufflehog",
            timeout_secs,
            abort,
        )
        .map_err(|e| anyhow::anyhow!("{}", e.into_mcp_error().message))
    };

    #[cfg(not(feature = "mcp"))]
    let result = std::process::Command::new("trufflehog")
        .args(&th_args)
        .output()
        .map_err(anyhow::Error::new);

    match result {
        Ok(out) => {
            let stdout_str = String::from_utf8_lossy(&out.stdout);
            // Best-effort persistence: a DB write failure is reported
            // as zero rows written rather than failing the audit.
            let cred_count =
                crate::commands::write_credentials_db(&stdout_str, db_path).unwrap_or(0);
            // One hit per non-blank stdout line.
            let hit_count = stdout_str
                .lines()
                .filter(|l| !l.trim().is_empty())
                .count();
            // Only consult the DB when something was written; any
            // open/query failure degrades to a verified count of 0.
            let verified_count = if cred_count > 0 {
                rusqlite::Connection::open_with_flags(
                    db_path,
                    rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY,
                )
                .and_then(|db| {
                    db.query_row(
                        "SELECT COUNT(*) FROM credentials WHERE verified=1",
                        [],
                        |r| r.get::<_, i64>(0),
                    )
                })
                .unwrap_or(0)
            } else {
                0
            };
            #[allow(clippy::as_conversions, clippy::cast_possible_wrap, reason = "display-only unverified-secret count; saturating_sub is exact here (hit_count >= verified_count by construction). Compute outside the json! macro so the `as` lint allow can attach to the let-binding (attribute-on-expression is unstable).")]
            let unverified_count = (hit_count as i64).saturating_sub(verified_count);
            Ok(serde_json::json!({
                "ran": true,
                "hit_count": hit_count,
                "verified_count": verified_count,
                "unverified_count": unverified_count,
                "credentials_written": cred_count,
                "db_table": "credentials",
                "note": if verified_count == 0 {
                    "all hits unverified (pattern matches only) — query credentials table and confirm manually"
                } else {
                    "verified=1 rows are live secrets; verified=0 are pattern matches requiring confirmation"
                },
            }))
        }
        // Spawn/timeout/abort failures are surfaced in-band so callers
        // always get an envelope once extraction succeeded.
        Err(e) => Ok(serde_json::json!({"ran": false, "error": e.to_string()})),
    }
}

/// Report whether a regular file called `name` sits directly inside
/// any directory listed in the `PATH` environment variable.
///
/// Returns `false` when `PATH` is unset. Mirrors
/// `crate::semgrep::run::which_in_path` and the prior MCP-inline
/// `which_in_path`; kept here so the trufflehog module is
/// self-contained.
fn which_in_path(name: &str) -> bool {
    let search_path = match std::env::var_os("PATH") {
        Some(value) => value,
        None => return false,
    };

    for dir in std::env::split_paths(&search_path) {
        let candidate = dir.join(name);
        if candidate.is_file() {
            return true;
        }
    }

    false
}

/// Build the tempfile that holds the strings-extraction dump, named
/// `droidsaw-strings-*.txt`.
///
/// The returned `NamedTempFile` unlinks its on-disk path when dropped,
/// so cleanup happens once the handle goes out of scope after
/// subprocess + parse complete. Kept as a named fn so the Drop
/// contract has a focused regression test (see
/// `strings_tempfile_unlinks_on_drop` below).
fn create_strings_tempfile() -> std::io::Result<tempfile::NamedTempFile> {
    let mut builder = tempfile::Builder::new();
    let configured = builder.prefix("droidsaw-strings-").suffix(".txt");
    configured.tempfile()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Guards the Drop contract of `create_strings_tempfile`: the file
    /// must exist while the `NamedTempFile` handle is alive, carry the
    /// `droidsaw-strings-` prefix and `.txt` suffix, and be gone from
    /// disk once the handle is dropped.
    ///
    /// Swapping `NamedTempFile` for a raw `PathBuf` (e.g., going back
    /// to `temp_dir + format!`) would reopen the leak surface; that
    /// change trips this test either at compile time (signature
    /// mismatch on `create_strings_tempfile`) or at run time (file
    /// persists after Drop).
    #[test]
    fn strings_tempfile_unlinks_on_drop() {
        let handle = create_strings_tempfile().expect("create tempfile");
        let on_disk = handle.path().to_path_buf();

        // While the handle is alive the file must be present.
        assert!(
            on_disk.exists(),
            "strings tempfile not on disk at create: {}",
            on_disk.display()
        );

        // Naming is part of the contract: operational tooling locates
        // these dumps by prefix and extension.
        let basename = on_disk
            .file_name()
            .and_then(|name| name.to_str())
            .unwrap_or_default();
        assert!(
            basename.starts_with("droidsaw-strings-"),
            "expected prefix preserved for ops-grep affordance; got {basename}"
        );
        assert!(
            basename.ends_with(".txt"),
            "expected .txt suffix preserved; got {basename}"
        );

        // Dropping the handle must remove the file.
        drop(handle);
        assert!(
            !on_disk.exists(),
            "strings tempfile NOT unlinked on Drop — leak fix regressed: {}",
            on_disk.display()
        );
    }
}