droidsaw 2.0.0

DROIDSAW — unified Android reverse engineering CLI. Hermes, DEX, APK signing. JSON output, MCP server. Bytecode is not a security layer.
Documentation
use serde_json::{json, Value};

use crate::context::CrossLayerContext;

use super::{meta, progress};

/// Extracts decompiled source from the loaded APK into a directory tree and
/// returns a JSON envelope describing what was written.
///
/// This function does not invoke semgrep itself; it only composes a
/// `command` hint string that the caller can run against the output.
///
/// Layout produced under the output directory:
/// * `hermes/fn_{fid:06}_{name}.js` — one file per Hermes function with a
///   non-zero `size` that survives decode / CFG / SSA construction
///   (only when `ctx.hbc` is present).
/// * `dex{N}/{class}.java` — one file per class yielded by
///   `classes_to_decompile`, for each DEX (1-based `N`).
///
/// # Parameters
/// * `ctx` — cross-layer context providing the APK, optional Hermes bundle
///   and parsed DEX files.
/// * `output` — destination directory; when `None`, defaults to
///   `./droidsaw-semgrep-<apk file stem>` (stem falls back to `unknown`).
/// * `semgrep_args` — forwarded to `compose_config_args` to build the
///   `command` hint.
///
/// # Returns
/// A JSON object with `output_dir`, `files_written`, `hermes_functions`,
/// `dex_classes`, `bytes_written`, `command`, `findings` and `_meta`.
///
/// # Errors
/// Fails when `require_apk` fails, when a directory cannot be created, when
/// a Hermes `.js` file cannot be written, or when `compose_config_args`
/// rejects `semgrep_args`. Failures writing individual DEX `.java` files
/// are deliberately ignored (see the best-effort write below).
#[allow(clippy::arithmetic_side_effects, clippy::as_conversions, reason = "HBC slice bounds same as `decompile()`; `i + 1` usize+1 bounded by ctx.dex.len(). Display-only counters (hermes_count/dex_count) are handled via `saturating_*` in the display-only commit.")]
pub fn semgrep(
    ctx: &CrossLayerContext,
    output: Option<&std::path::Path>,
    semgrep_args: &crate::semgrep::SemgrepArgs,
) -> anyhow::Result<Value> {
    // RAII drain guard: closes the I/O-`?`-before-explicit-drain gap.
    // The function explicitly drains at end and embeds findings in the
    // returned JSON, but several `?` ops (fs::create_dir_all, fs::write,
    // require_apk, compose_config_args) sit between the per-function
    // optimize() loop and the explicit drain. SIGPIPE / ENOSPC / any
    // I/O failure strands findings unless this guard's Drop fires.
    let _drain_guard = crate::context::HermesFindingDrainGuard::install_discard();

    // Resolve the output directory: caller-supplied, or derived from the
    // APK's file stem.
    let path = std::path::PathBuf::from(&ctx.require_apk()?.path);
    let default_out = std::path::PathBuf::from(format!(
        "./droidsaw-semgrep-{}",
        path.file_stem().and_then(|s| s.to_str()).unwrap_or("unknown")
    ));
    let out_dir = output.unwrap_or(&default_out);
    std::fs::create_dir_all(out_dir)?;

    // Running totals reported in the returned JSON.
    let mut hermes_count = 0usize;
    let mut dex_count = 0usize;
    let mut bytes_written: u64 = 0;

    // ---- Hermes layer: decompile every function to its own .js file ----
    if let Some(hbc_owned) = ctx.hbc.as_ref() {
        let hbc = hbc_owned.hbc();
        let hbc_dir = out_dir.join("hermes");
        std::fs::create_dir_all(&hbc_dir)?;
        let hbc_data = hbc_owned.bytes();

        for fid in 0..hbc.function_count {
            let f = hbc.function_get(fid);
            // Functions with no bytecode produce no file.
            if f.size == 0 {
                continue;
            }
            // Out-of-range name ids degrade to an empty name rather than failing.
            let fname = if f.name_id < hbc.string_count {
                hbc.string_as_str_or_empty(f.name_id).into_owned()
            } else {
                String::new()
            };
            let safe_name = sanitize_filename(&fname);

            #[allow(
                clippy::cast_possible_truncation,
                reason = "PROOF: bounded by hbc_data.len() via .min(); usize→u64→usize roundtrip lossless on every supported target."
            )]
            let end = (u64::from(f.offset) + u64::from(f.size)).min(hbc_data.len() as u64) as usize;
            // `code` reaches up to 256 bytes past the function's end (clamped
            // to the buffer) and is handed to `Cfg::build`; `decode_slice`
            // is the exact function body.
            // NOTE(review): presumably the slack lets the CFG builder read
            // trailing tables that live after the body — TODO confirm.
            let code_end = (end + 256).min(hbc_data.len());
            // Any function whose range is invalid, or that fails a pipeline
            // stage below, is skipped silently (no file, no count).
            let Some(code) = hbc_data.get(f.offset as usize..code_end) else {
                continue;
            };
            let Some(decode_slice) = hbc_data.get(f.offset as usize..end) else {
                continue;
            };
            let Ok(instructions) = droidsaw_hermes::decompile::decode::decode_function(
                decode_slice,
                hbc.opcode_version(),
            ) else {
                continue;
            };
            // Copy the function's exception table into the CFG builder's type.
            let mut exc_handlers = Vec::new();
            for i in 0..hbc.function_exception_count(fid) {
                let eh = hbc.function_exception_get(fid, i);
                exc_handlers.push(droidsaw_hermes::decompile::cfg::ExcHandler {
                    start: eh.start,
                    end: eh.end,
                    target: eh.target,
                });
            }
            let Ok(cfg) = droidsaw_hermes::decompile::cfg::Cfg::build(&instructions, &exc_handlers, code) else {
                continue;
            };
            let Ok(ssa) = droidsaw_hermes::decompile::ssa::build_ssa(&cfg, f.frame_size) else {
                continue;
            };
            // Lookup callbacks handed to the optimizer / emitter. Each one
            // tolerates out-of-range ids by returning a placeholder.
            let get_str = |id: u32| -> String {
                if id < hbc.string_count {
                    hbc.string_as_str_or_empty(id).into_owned()
                } else {
                    format!("<{id}>")
                }
            };
            let get_literal = |a: u8, b: u32, c: u32, d: u32| -> (u8, u32, i32, f64) {
                let v = hbc.literal_get(a, b, c, d);
                (v.tag, v.str_id, v.ival, v.dval)
            };
            let get_shape = |i: u32| -> (u32, u32) {
                match hbc.object_shape_get(i) {
                    Some(s) => (s.key_buffer_offset, s.num_props),
                    None => (0, 0),
                }
            };
            let get_func_name = |fid2: u32| -> String {
                if fid2 < hbc.function_count {
                    let fi = hbc.function_get(fid2);
                    if fi.name_id < hbc.string_count {
                        return hbc.string_as_str_or_empty(fi.name_id).into_owned();
                    }
                }
                String::new()
            };
            let get_bigint = |idx: u32| -> Option<String> { hbc.bigint_as_str(idx) };
            let ssa = droidsaw_hermes::decompile::optimize::optimize(
                ssa,
                &get_str,
                &get_literal,
                &get_shape,
                &get_func_name,
                &get_bigint,
            );
            // Map of block id -> exception handler id, for blocks that have one.
            let exc_map: std::collections::BTreeMap<u32, u32> = cfg
                .blocks
                .values()
                .filter_map(|b| b.exc_handler.map(|h| (b.id, h)))
                .collect();
            // Same lookup as `fname` above; this copy is moved into the
            // structuring pass.
            let fname_for_emit = if f.name_id < hbc.string_count {
                hbc.string_as_str_or_empty(f.name_id).into_owned()
            } else {
                String::new()
            };
            let structured = droidsaw_hermes::decompile::structure::structure_function_with_exc(
                &ssa,
                fname_for_emit,
                f.param_count,
                f.flags,
                &exc_map,
            );
            let js = droidsaw_hermes::decompile::emit::emit_js(&structured, &get_str);

            // The zero-padded function id keeps file names unique even when
            // sanitized names repeat or are empty.
            let file_path = hbc_dir.join(format!("fn_{fid:06}_{safe_name}.js"));
            // DISPLAY-ONLY: `files_written` / `hermes_functions` JSON fields.
            bytes_written = bytes_written.saturating_add(js.len() as u64);
            // Unlike the DEX path below, a failed write here aborts the call.
            std::fs::write(file_path, js)?;
            hermes_count = hermes_count.saturating_add(1);
        }
    }

    // ---- DEX layer: decompile every class to its own .java file ----
    let apk = ctx.require_apk()?;
    use rayon::prelude::*;
    use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
    // `zip` pairs each parsed DEX with its raw bytes from the APK; iteration
    // stops at the shorter of the two collections.
    for ((i, dex), apk_dex) in ctx.dex.iter().enumerate().zip(apk.dex.iter()) {
        let dex_dir = out_dir.join(format!("dex{}", i + 1));
        std::fs::create_dir_all(&dex_dir)?;
        let data = &apk_dex.data;
        // Bulk-path: build TypeToClassDefMap + EnumInlineMap ONCE per DEX
        // and reuse across every class. The single-class `decompile_class`
        // entry point passes `None` for ttm, which falls through to an
        // O(n_classes) linear scan inside `is_synthetic_bridge_ctor` —
        // called per-method per-class → O(classes² × methods) aggregate.
        // Measured as the dominant superlinear cost on large Play APKs
        // (43k-class chatgpt: ~28 min extract under the scan vs expected
        // few minutes with the O(1) ttm lookup).
        let ttm = droidsaw_dex::classes::TypeToClassDefMap::build(dex);
        let enum_inlines = droidsaw_dex::classes::EnumInlineMap::build(dex, data, &ttm);
        // Per-class decompile is pure over shared-reference context
        // (&dex, &data, &cd, &ttm, &enum_inlines) and writes to its own
        // output file at a disjoint path — embarrassingly parallel.
        // rayon's work-stealing thread pool handles scheduling; the
        // .par_bridge() adapter converts the non-indexable iterator
        // from classes_to_decompile into a parallel iterator.
        // Counters are AtomicU64/AtomicUsize to keep the post-loop
        // display-only JSON totals consistent with the serial path.
        let dex_count_atomic = AtomicUsize::new(0);
        let bytes_written_atomic = AtomicU64::new(0);
        // Amortize r8_inversion::build_trampoline_census across the
        // par_bridge per-class loop.
        // Without the guard: census was being rebuilt per class through
        // decompile_class_ext on every call. On the --mode=full hot path
        // this dominated CPU. Built once per DEX + shared across rayon
        // workers (TrampolineCensus is Sync).
        let census = droidsaw_dex::r8_inversion::build_trampoline_census(dex);
        droidsaw_dex::classes::classes_to_decompile(dex)
            .par_bridge()
            .for_each(|(_, cd)| {
                // Classes whose type descriptor cannot be resolved are skipped.
                let Ok(desc) = dex.get_type_descriptor(cd.class_idx) else {
                    return;
                };
                // Turn `Lcom/example/Foo;` into `com_example_Foo`.
                let safe = desc
                    .trim_start_matches('L')
                    .trim_end_matches(';')
                    .replace('/', "_");
                // NOTE(review): the sanitizer caps names at 64 characters and
                // there is no per-class id in the file name, so classes that
                // share a long package prefix may map to the same path and
                // overwrite each other — verify whether that is acceptable.
                let safe = sanitize_filename(&safe);
                let source = droidsaw_dex::classes::decompile_class_ext_with_census(
                    dex,
                    data,
                    cd,
                    Some(&enum_inlines),
                    Some(&ttm),
                    Some(&census),
                );
                let file_path = dex_dir.join(format!("{safe}.java"));
                // DISPLAY-ONLY: `files_written` / `dex_classes` JSON fields.
                bytes_written_atomic.fetch_add(source.len() as u64, Ordering::Relaxed);
                // WHY: best-effort write for DISPLAY-ONLY filesystem mirror
                // (counters are the load-bearing output); drop is explicit.
                // NOTE(review): both counters advance even when this write
                // fails, so the totals can exceed what is actually on disk.
                drop(std::fs::write(file_path, source));
                dex_count_atomic.fetch_add(1, Ordering::Relaxed);
            });
        // Fold this DEX's parallel totals into the function-wide totals.
        bytes_written = bytes_written.saturating_add(bytes_written_atomic.load(Ordering::Relaxed));
        dex_count = dex_count.saturating_add(dex_count_atomic.load(Ordering::Relaxed));
    }

    // Drain decompile-time HermesFinding emissions accumulated across
    // the per-function `optimize::optimize()` calls. The channel is
    // thread-local; without this drain the findings would either leak
    // into the next parse on the same blocking-pool worker (tokio
    // `spawn_blocking` thread reuse) or, in single-shot CLI runs, never
    // surface. Translate to common::Finding and embed in the returned
    // JSON envelope so operator-facing tooling can consume them.
    let hermes_findings = CrossLayerContext::drain_hermes_findings();

    progress!(
        "wrote {:?} hermes functions, {:?} dex classes to {:?}",
        hermes_count,
        dex_count,
        out_dir
    );

    // Compose the JSON `command` hint from the user's --rules / --no-auto
    // flags + DROIDSAW_SEMGREP_RULES env. Backward compat: callers passing
    // SemgrepArgs::default() with no env produce `semgrep --config auto <dir>/`,
    // identical to the prior hardcoded form.
    let composed = crate::semgrep::compose_config_args(semgrep_args)
        .map_err(|e| anyhow::anyhow!("semgrep arg composition: {e}"))?;
    // The hint is a plain space-joined string; arguments and the directory
    // are not shell-quoted.
    let cmd_hint = format!(
        "semgrep {} {}/",
        composed.join(" "),
        out_dir.display()
    );

    let out = json!({
        "output_dir": out_dir.display().to_string(),
        // DISPLAY-ONLY: JSON sum of two file-counters (each bounded by
        // actual writes; saturating defends against pathological inputs).
        "files_written": hermes_count.saturating_add(dex_count),
        "hermes_functions": hermes_count,
        "dex_classes": dex_count,
        "bytes_written": bytes_written,
        "command": cmd_hint,
        "findings": hermes_findings,
        "_meta": meta(
            1,
            false,
            "source extracted — run the returned `command` to scan with semgrep",
            &["audit-full", "audit-light", "strings", "xrefs"],
        ),
    });
    Ok(out)
}

/// CLI `scan semgrep` entry point — wraps [`semgrep`] (extract) and
/// optionally chains the shared
/// [`crate::semgrep::run_and_persist`] helper when `persist` is set.
///
/// Default (`persist = false`): identical to calling [`semgrep`]
/// directly — extract source + return a `command` hint string. Backward
/// compatible with consumers that parse the hint and run semgrep
/// themselves.
///
/// `persist = true`: also invokes `semgrep` against the extracted
/// source and writes hits to a SQLite findings DB at `db_path` (or a
/// derived default path next to the input). Returns an extended JSON
/// envelope with the original extraction fields plus `db_path` and a
/// nested `semgrep_scan` object — the same shape the audit handler
/// produces when its mode runs semgrep.
pub fn scan_semgrep(
    ctx: &CrossLayerContext,
    output: Option<&std::path::Path>,
    semgrep_args: &crate::semgrep::SemgrepArgs,
    persist: bool,
    db: Option<&std::path::Path>,
) -> anyhow::Result<Value> {
    let extracted = semgrep(ctx, output, semgrep_args)?;
    if !persist {
        return Ok(extracted);
    }

    let output_dir = extracted
        .get("output_dir")
        .and_then(|v| v.as_str())
        .ok_or_else(|| anyhow::anyhow!("scan_semgrep: extract step did not return output_dir"))?
        .to_string();

    // Default DB path mirrors `audit`'s convention of one DB per input
    // basename. `--db <path>` overrides if the operator wants a shared
    // DB across multiple semgrep runs (e.g. corpus aggregation).
    let db_path: std::path::PathBuf = match db {
        Some(p) => p.to_path_buf(),
        None => {
            let input = std::path::PathBuf::from(&ctx.require_apk()?.path);
            let stem = input
                .file_stem()
                .and_then(|s| s.to_str())
                .unwrap_or("unknown");
            std::path::PathBuf::from(format!("./droidsaw-{stem}.db"))
        }
    };

    let scan = crate::semgrep::run_and_persist(
        std::path::Path::new(&output_dir),
        semgrep_args,
        &db_path,
        None,
    )?;

    let mut merged = extracted.as_object().cloned().unwrap_or_default();
    merged.insert("semgrep_scan".into(), scan);
    merged.insert(
        "db_path".into(),
        serde_json::json!(db_path.display().to_string()),
    );
    Ok(serde_json::Value::Object(merged))
}

/// Converts an arbitrary name into a string that is safe to embed in a
/// single file-name component.
///
/// Every character that is not alphanumeric (Unicode-aware), `_` or `-` is
/// replaced by `_`. The result is limited to the first 64 characters of the
/// input and, additionally, to 192 UTF-8 bytes.
///
/// The byte limit exists because a 64-*character* cap alone allows up to
/// 256 bytes when the name consists of 4-byte code points, which exceeds the
/// 255-byte file-name limit common to mainstream filesystems once callers
/// add their prefix and extension. 192 bytes equals 64 three-byte characters,
/// so output is unchanged for any input made of code points of at most
/// three bytes (all ASCII and the whole Basic Multilingual Plane).
fn sanitize_filename(s: &str) -> String {
    const MAX_CHARS: usize = 64;
    const MAX_BYTES: usize = 192;

    let mut out = String::with_capacity(s.len().min(MAX_BYTES));
    for c in s.chars().take(MAX_CHARS) {
        let mapped = if c.is_alphanumeric() || c == '_' || c == '-' {
            c
        } else {
            '_'
        };
        // Stop on a character boundary before the byte budget is exceeded.
        if out.len().saturating_add(mapped.len_utf8()) > MAX_BYTES {
            break;
        }
        out.push(mapped);
    }
    out
}