droidsaw 2.0.0 - Docs.rs

use droidsaw_common::Finding;
use serde_json::{json, Value};

use crate::analysis;
use crate::context::CrossLayerContext;

use super::{credentials_fp, honeycomb_fp, meta, progress};

#[allow(
    clippy::arithmetic_side_effects,
    clippy::as_conversions,
    clippy::cast_possible_wrap,
    clippy::cast_possible_truncation,
    reason = "DEX struct-field indices widen u32→usize for `.get()` bounds-checked lookup; `i + 1` / `dex_idx + 1` dex-layer labels bounded by ctx.dex.len(); `j as i64` / `len() as i64` casts widen for SQLite bind params (realistic row counts ≪ i64::MAX). Row counters (string_rows/class_rows/etc.) are display-only and handled in the saturating commit."
)]
pub fn export(ctx: &CrossLayerContext, output: &str) -> anyhow::Result<Value> {
    // RAII drain guard: hbc.string_get / function_get emit hermes
    // findings on adversarial input. Without a drain on Drop, findings
    // leak into the next bundle on the same blocking-pool worker.
    let _drain_guard = crate::context::HermesFindingDrainGuard::install_discard();

    let mut db = rusqlite::Connection::open(output)?;

    db.execute_batch(
        "
        CREATE TABLE IF NOT EXISTS strings (id INTEGER, kind TEXT, value TEXT, length INTEGER, layer TEXT, PRIMARY KEY (id, layer));
        CREATE TABLE IF NOT EXISTS functions (id INTEGER, name TEXT, param_count INTEGER, offset INTEGER, size INTEGER, layer TEXT, PRIMARY KEY (id, layer));
        CREATE TABLE IF NOT EXISTS classes (id INTEGER, name TEXT, superclass TEXT, layer TEXT, PRIMARY KEY (id, layer));
        CREATE TABLE IF NOT EXISTS edges (caller_id INTEGER, caller_layer TEXT, callee_id INTEGER, callee_layer TEXT, bridge_type TEXT, PRIMARY KEY (caller_id, caller_layer, callee_id, callee_layer));
        CREATE VIRTUAL TABLE IF NOT EXISTS strings_fts USING fts5(value, content=strings, content_rowid=rowid);
    ",
    )?;

    let tx = db.transaction()?;

    let mut string_rows = 0u64;
    let mut function_rows = 0u64;
    let mut class_rows = 0u64;
    let mut edge_rows = 0u64;

    if let Some(hbc_owned) = ctx.hbc.as_ref() {
        let hbc = hbc_owned.hbc();
        for i in 0..hbc.string_count {
            // Index-build: corrupt entries get skipped so the audit
            // index has only well-formed rows. Typed signal preserved
            // via the predecessor stream's `HermesFinding` channel
            // for OOR (still emitted by `string_get`).
            let kind = match hbc.string_get(i) {
                Ok(Some(sd)) => match sd.kind {
                    0 => "literal",
                    1 => "ident",
                    _ => "other",
                },
                Ok(None) | Err(_) => continue,
            };
            let value = hbc.string_as_str_or_empty(i);
            tx.execute(
                "INSERT OR REPLACE INTO strings VALUES (?1,?2,?3,?4,'hbc')",
                rusqlite::params![i, kind, &*value, value.len() as i64],
            )?;
            // DISPLAY-ONLY: row counter for `row_counts` JSON field.
            string_rows = string_rows.saturating_add(1);
        }
        for fid in 0..hbc.function_count {
            let f = hbc.function_get(fid);
            let name = if f.name_id < hbc.string_count {
                hbc.string_as_str_or_empty(f.name_id).into_owned()
            } else {
                String::new()
            };
            tx.execute(
                "INSERT OR REPLACE INTO functions VALUES (?1,?2,?3,?4,?5,'hbc')",
                rusqlite::params![fid, name, f.param_count, f.offset, f.size],
            )?;
            function_rows = function_rows.saturating_add(1);
        }
    }

    for (i, dex) in ctx.dex.iter().enumerate() {
        let layer = format!("dex{}", i + 1);
        for (j, entry) in dex.strings.iter().enumerate() {
            let s = entry.as_str_lossy();
            tx.execute(
                "INSERT OR REPLACE INTO strings VALUES (?1,'dex',?2,?3,?4)",
                rusqlite::params![j as i64, s, s.len() as i64, layer],
            )?;
            string_rows = string_rows.saturating_add(1);
        }
        for (j, cd) in dex.class_defs.iter().enumerate() {
            // Shadow gate: the `classes` table is keyed by the enumerate
            // position `j`, so a duplicate-`class_idx` shadow row gets a
            // distinct key and BOTH rows persist — amplifying class counts
            // and introducing `class_idx`-join ambiguity for downstream
            // SQLite consumers (xrefs queries, finding correlation). Skip
            // the shadow row before the INSERT so the export reflects the
            // first-wins canonical class that `class_def_for_type` pins.
            if dex.class_def_is_shadowed(j) {
                continue;
            }
            let name = dex
                .type_descriptors
                .get(cd.class_idx.0 as usize)
                .cloned()
                .unwrap_or_default();
            let super_name = cd
                .superclass_idx
                .and_then(|s| dex.type_descriptors.get(s.0 as usize))
                .cloned()
                .unwrap_or_default();
            tx.execute(
                "INSERT OR REPLACE INTO classes VALUES (?1,?2,?3,?4)",
                rusqlite::params![j as i64, name, super_name, layer],
            )?;
            class_rows = class_rows.saturating_add(1);
        }
        for (j, m) in dex.methods.iter().enumerate() {
            let name = dex
                .strings
                .get(m.name_idx.0 as usize)
                .map(|e| e.as_str_lossy().to_string())
                .unwrap_or_default();
            tx.execute(
                "INSERT OR REPLACE INTO functions VALUES (?1,?2,0,0,0,?3)",
                rusqlite::params![j as i64, name, layer],
            )?;
            function_rows = function_rows.saturating_add(1);
        }
    }

    if let Some(hbc_owned) = ctx.hbc.as_ref() {
        let hbc = hbc_owned.hbc();
        let hbc_data = hbc_owned.bytes();
        let scan = droidsaw_hermes::scanner::scan_parsed(hbc, hbc_data);
        let bridge = analysis::bridge::BridgeResolver::resolve(ctx);

        let nm_id =
            (0..hbc.string_count).find(|&i| hbc.string_as_str_or_empty(i) == "NativeModules");
        if let Some(nm_id) = nm_id
            && let Some(ref_fids) = scan.string_refs.get(&nm_id)
        {
            for &fid in ref_fids {
                for (&str_id, fids) in &scan.string_refs {
                    if fids.contains(&fid) && str_id != nm_id {
                        let method_name = hbc.string_as_str_or_empty(str_id).into_owned();
                        // Legacy by-method-name lookup — this xref export
                        // builder has only the JS-side method-name string
                        // from the string-table scan, no (module, method)
                        // pair, so the per-(module, method) `bridge.mappings`
                        // index can't be keyed against it.
                        if let Some(targets) = bridge.by_method.get(&method_name) {
                            for (dex_idx, m_idx) in targets {
                                let layer = format!("dex{}", dex_idx + 1);
                                tx.execute(
                                    "INSERT OR IGNORE INTO edges VALUES (?1,'hbc',?2,?3,'bridge')",
                                    rusqlite::params![fid, m_idx.0, layer],
                                )?;
                                edge_rows = edge_rows.saturating_add(1);
                            }
                        }
                    }
                }
            }
        }
    }

    tx.commit()?;
    db.execute("INSERT INTO strings_fts(strings_fts) VALUES('rebuild')", [])?;

    progress!("exported to {output:?}");
    let out = json!({
        "output": output,
        "tables": ["strings", "functions", "classes", "edges", "strings_fts"],
        "row_counts": {
            "strings": string_rows,
            "functions": function_rows,
            "classes": class_rows,
            "edges": edge_rows,
        },
        "_meta": meta(
            1,
            false,
            "query with `sqlite3 <output>`; strings_fts is a full-text search index over strings.value",
            &["audit", "info", "sbom"],
        ),
    });
    Ok(out)
}

/// Write a slice of findings to a SQLite database.
///
/// Schema:
/// - `findings(id, severity, layer, id_tag, detail, cwe)` — structured rows
/// - `findings_fts` — FTS5 content table over `detail`, ranked by BM25
///
/// Returns the path written and the row count.
/// Write findings, credentials, and xrefs to a single audit SQLite database.
///
/// Tables:
/// Creates (or opens) the audit SQLite database and ensures all tables exist:
/// - `findings(severity, layer, id_tag, detail, cwe)` + `findings_fts` BM25
/// - `credentials(detector, raw, verified, extra)` + `credentials_fts` BM25
/// - `xrefs(layer, string_value, function_id, function_name)` + `xrefs_fts` BM25
/// - `taint_flows(layer, func_id, source_type, sink_type, severity, cwe)` + `taint_flows_fts` BM25
///
/// `taint_flows` is separate from `xrefs`: xrefs are string→function edges
/// (what strings appear in what functions), while taint_flows are structured
/// source→sink propagation results (user input reaching a dangerous call).
/// They answer different questions and are best queried independently.
/// Schema revision recorded in `schema_meta`. Bumped whenever
/// `migrate_findings_schema_to_current` adds a migration step.
///
/// **v2** — threat-model foundations: adds new columns on
/// `findings` (`adversary_profile_relevance`, `completeness`, `source_api`,
/// `source_api_args`, `resolution`) plus three new tables (`third_parties`,
/// `acquisition_metadata`, `declared_claims`). Strictly additive: every v1
/// reader continues to read its old columns at the old indices.
///
/// **v3** — credentials-view extension: drops + recreates the
/// `credentials` view so the documented audit-response queries
/// (`SELECT detector,raw,verified FROM credentials WHERE verified=1`)
/// actually parse + execute. The new projection exposes
/// `detector` / `raw` / `verified` columns via `json_extract` over the
/// underlying `findings.extra` JSON. Strictly additive at the SQL level
/// (DROP VIEW + CREATE VIEW preserves the underlying findings rows).
///
/// **v4** — modular-modes incremental DB: adds `run_id`,
/// `mode`, `signature_hash` columns to `findings` (all nullable to keep
/// the v1/v2/v3 reader backward-compatible) plus a partial UNIQUE index
/// `findings_signature_hash_uniq` on `signature_hash WHERE signature_hash
/// IS NOT NULL`. Powers the `audit --mode=<basic|full|semgrep|trufflehog>`
/// CLI surface and the matching MCP `mode`/`update_db` params: re-running
/// the same mode against an existing DB upserts by `signature_hash`
/// (`INSERT OR IGNORE` returns 0 inserts on duplicate identity); running a
/// different mode adds rows without clobbering. Strictly additive: legacy
/// `write_findings_db` callers leave `run_id`/`mode` NULL and the partial
/// index ignores those rows, so legacy readers are unaffected.
///
/// **v5** — taint offset wiring: adds `source_offset` and
/// `sink_offset` (both nullable `INTEGER`) to the `taint_flows` table.
/// DEX taint findings populate both from the Dalvik instruction address at
/// the source invoke and the sink invoke respectively. HBC and bridge-seeded
/// findings leave them NULL by design (see `hbc_taint.rs` doc-comment).
/// Strictly additive: existing `taint_flows` readers see the new columns
/// as NULL until re-audited with the v5 binary.
///
/// **v6** — cross-layer taint composite: adds the
/// `cross_layer_taint_flows` table + `cross_layer_taint_flows_fts` FTS5
/// virtual table + two indexes (`idx_cross_layer_bridge` on `(js_module,
/// js_method)`, `idx_cross_layer_severity` on `(severity)`) + the
/// `cross_layer_taint_critical` view. One row per
/// `CROSS_LAYER_TAINT_FLOW` finding stitched by `cross_layer_stitch.rs`:
/// columns project the `CrossLayerTaintFinding<MethodIdx>` payload that
/// rides `Finding.extra` (js_module / js_method / dex_idx / method_idx /
/// js_func_id / native_func_id / native_class_descriptor /
/// native_method_signature / source_type / sink_type / severity / cwe).
/// Strictly additive: legacy `taint_flows` rows unchanged; pre-v6
/// readers ignore the new table.
pub const FINDINGS_SCHEMA_REV: i64 = 6;

/// Apply additive schema migrations on `db` so it is at the current
/// `FINDINGS_SCHEMA_REV`. Reads `schema_meta` to determine the starting
/// revision; runs every step strictly greater than that revision; records
/// the new revision in `schema_meta` on success.
///
/// Called by `write_findings_db` after the initial `CREATE TABLE IF NOT
/// EXISTS` batch creates the v1 tables. The migrations land additively:
/// `ADD COLUMN` for findings; `CREATE TABLE IF NOT EXISTS` for the new
/// threat-model tables. Reads of the v1 schema continue to work unchanged.
pub fn migrate_findings_schema_to_current(db: &mut rusqlite::Connection) -> anyhow::Result<()> {
    db.execute(
        "CREATE TABLE IF NOT EXISTS schema_meta (
            schema_rev INTEGER PRIMARY KEY,
            applied_at TEXT NOT NULL
        )",
        [],
    )?;
    let current: i64 = db.query_row(
        "SELECT COALESCE(MAX(schema_rev), 0) FROM schema_meta",
        [],
        |r| r.get(0),
    )?;
    if current >= FINDINGS_SCHEMA_REV {
        return Ok(());
    }

    let tx = db.transaction()?;

    if current < 2 {
        // Findings columns — additive. ALTER TABLE … ADD COLUMN has no IF NOT
        // EXISTS form in SQLite, so we guard via PRAGMA table_info to keep the
        // call idempotent across re-runs that somehow saw schema_meta blanked.
        if !findings_has_column(&tx, "adversary_profile_relevance")? {
            tx.execute_batch(
                "ALTER TABLE findings ADD COLUMN adversary_profile_relevance TEXT NOT NULL DEFAULT '[]';
                 ALTER TABLE findings ADD COLUMN completeness               TEXT NOT NULL DEFAULT 'present';
                 ALTER TABLE findings ADD COLUMN source_api                 TEXT;
                 ALTER TABLE findings ADD COLUMN source_api_args            TEXT;
                 ALTER TABLE findings ADD COLUMN resolution                 TEXT;",
            )?;
        }

        // New threat-model tables. CREATE TABLE IF NOT EXISTS is idempotent,
        // so a second migration pass leaves the tables alone.
        tx.execute_batch(
            "CREATE TABLE IF NOT EXISTS third_parties (
                rowid               INTEGER PRIMARY KEY,
                identity            TEXT NOT NULL,
                version             TEXT,
                first_seen_location TEXT,
                jurisdictions       TEXT NOT NULL DEFAULT '[]'
            );
            CREATE TABLE IF NOT EXISTS acquisition_metadata (
                rowid              INTEGER PRIMARY KEY CHECK (rowid = 1),
                source_kind        TEXT NOT NULL DEFAULT 'unknown',
                operator           TEXT,
                authority_ref      TEXT,
                acquired_at        TEXT,
                pre_analysis_hash  TEXT
            );
            CREATE TABLE IF NOT EXISTS declared_claims (
                rowid      INTEGER PRIMARY KEY,
                source     TEXT NOT NULL,
                category   TEXT NOT NULL,
                declared   TEXT NOT NULL,
                hash       TEXT NOT NULL
            );",
        )?;
    }

    if current < 3 {
        // Replace the v2 credentials view (which projected only
        // `rowid, severity, confidence, detail, extra`) with a v3 view
        // that exposes the documented `detector`/`raw`/`verified` columns
        // via `json_extract` over the underlying `findings.extra` JSON.
        // The trufflehog ingestion path (`write_credentials_db`) writes
        // `extra = {"detector":..., "raw":..., "verified":...}`; v3 makes
        // those queryable by name to match the audit-response query
        // examples emitted at `mcp/mod.rs:768/770/801`.
        tx.execute_batch(
            "DROP VIEW IF EXISTS credentials;
             CREATE VIEW credentials AS
                 SELECT rowid, severity, confidence, detail,
                        json_extract(extra, '$.detector') AS detector,
                        json_extract(extra, '$.raw')      AS raw,
                        json_extract(extra, '$.verified') AS verified,
                        extra
                 FROM findings WHERE source = 'trufflehog';",
        )?;
    }

    if current < 4 {
        // Modular-modes columns. Nullable so legacy `write_findings_db`
        // (which doesn't know about runs/modes) can keep inserting
        // without re-tagging every row. PRAGMA-guard each column for
        // idempotency (a partially-applied v4 leaves us free to re-run).
        if !findings_has_column(&tx, "run_id")? {
            tx.execute_batch("ALTER TABLE findings ADD COLUMN run_id TEXT;")?;
        }
        if !findings_has_column(&tx, "mode")? {
            tx.execute_batch("ALTER TABLE findings ADD COLUMN mode TEXT;")?;
        }
        if !findings_has_column(&tx, "signature_hash")? {
            tx.execute_batch("ALTER TABLE findings ADD COLUMN signature_hash TEXT;")?;
        }
        // Partial UNIQUE index: legacy NULL-signature rows (from the
        // `write_findings_db` shim, threat-model envelope writer, etc.)
        // coexist freely; only modular-mode rows participate in
        // upsert-by-identity. `IF NOT EXISTS` makes the call idempotent
        // across re-runs that saw the column-add already applied.
        tx.execute_batch(
            "CREATE UNIQUE INDEX IF NOT EXISTS findings_signature_hash_uniq \
             ON findings(signature_hash) WHERE signature_hash IS NOT NULL;",
        )?;
    }

    if current < 5 {
        // Taint-offset wiring: add nullable source_offset / sink_offset to
        // taint_flows for existing DBs. Fresh DBs get the columns from the
        // CREATE TABLE DDL; only v1–v4 DBs that already have taint_flows
        // need ALTER TABLE. Skip entirely when taint_flows doesn't exist
        // yet — it will be created with the correct schema by write_findings_db.
        if taint_flows_has_column(&tx, "func_id")? {
            // Table exists (has at least one old column). Add offset columns.
            if !taint_flows_has_column(&tx, "source_offset")? {
                tx.execute_batch("ALTER TABLE taint_flows ADD COLUMN source_offset INTEGER;")?;
            }
            if !taint_flows_has_column(&tx, "sink_offset")? {
                tx.execute_batch("ALTER TABLE taint_flows ADD COLUMN sink_offset INTEGER;")?;
            }
        }
        // If taint_flows doesn't exist yet, skip: write_findings_db will
        // CREATE it with source_offset + sink_offset already in the DDL.
    }

    if current < 6 {
        // Cross-layer taint composite: new table + FTS5 + 2 indexes + view.
        // Every DDL is `IF NOT EXISTS` so a fresh DB (which gets the same
        // batch from WRITE_FINDINGS_DB_SCHEMA on first creation) is a no-op
        // here.
        tx.execute_batch(CROSS_LAYER_TAINT_FLOWS_SCHEMA)?;
    }

    tx.execute(
        "INSERT INTO schema_meta(schema_rev, applied_at) VALUES (?1, ?2)",
        rusqlite::params![FINDINGS_SCHEMA_REV, chrono::Utc::now().to_rfc3339()],
    )?;
    tx.commit()?;
    Ok(())
}

/// DDL batch for the v6 cross-layer-taint-flows surface. Shared between the
/// fresh-DB path (embedded in [`WRITE_FINDINGS_DB_SCHEMA`]) and the v5→v6
/// migration step in [`migrate_findings_schema_to_current`]. Every statement
/// is `IF NOT EXISTS` so applying twice is a no-op.
const CROSS_LAYER_TAINT_FLOWS_SCHEMA: &str = "
    CREATE TABLE IF NOT EXISTS cross_layer_taint_flows (
        rowid                    INTEGER PRIMARY KEY,
        js_module                TEXT NOT NULL,
        js_method                TEXT NOT NULL,
        dex_idx                  INTEGER NOT NULL,
        method_idx               INTEGER NOT NULL,
        js_func_id               INTEGER NOT NULL,
        native_func_id           INTEGER NOT NULL,
        native_class_descriptor  TEXT,
        native_method_signature  TEXT,
        source_type              TEXT NOT NULL,
        sink_type                TEXT NOT NULL,
        severity                 TEXT NOT NULL,
        cwe                      INTEGER
    );
    CREATE VIRTUAL TABLE IF NOT EXISTS cross_layer_taint_flows_fts
        USING fts5(js_module, js_method, native_class_descriptor,
                   native_method_signature, source_type, sink_type,
                   content=cross_layer_taint_flows, content_rowid=rowid,
                   tokenize='porter unicode61');
    CREATE INDEX IF NOT EXISTS idx_cross_layer_bridge
        ON cross_layer_taint_flows(js_module, js_method);
    CREATE INDEX IF NOT EXISTS idx_cross_layer_severity
        ON cross_layer_taint_flows(severity);
    CREATE VIEW IF NOT EXISTS cross_layer_taint_critical AS
        SELECT rowid, js_module, js_method, native_class_descriptor,
               native_method_signature, source_type, sink_type, severity, cwe
        FROM cross_layer_taint_flows
        WHERE severity IN ('Critical', 'High')
        ORDER BY CASE severity WHEN 'Critical' THEN 0 WHEN 'High' THEN 1 END,
                 rowid;
";

fn findings_has_column(
    db: &rusqlite::Connection,
    column: &str,
) -> rusqlite::Result<bool> {
    let mut stmt = db.prepare("PRAGMA table_info(findings)")?;
    let mut rows = stmt.query([])?;
    while let Some(row) = rows.next()? {
        let name: String = row.get(1)?;
        if name == column {
            return Ok(true);
        }
    }
    Ok(false)
}

fn taint_flows_has_column(
    db: &rusqlite::Connection,
    column: &str,
) -> rusqlite::Result<bool> {
    let mut stmt = db.prepare("PRAGMA table_info(taint_flows)")?;
    let mut rows = stmt.query([])?;
    while let Some(row) = rows.next()? {
        let name: String = row.get(1)?;
        if name == column {
            return Ok(true);
        }
    }
    Ok(false)
}

/// Stable finding-identity hash used by the v4 modular-modes upsert path.
///
/// The identity tuple is `(layer, id_tag, source, detail)`. Fields fed
/// through ASCII labels (`Layer::as_str`, `Source::as_str`) so a finding's
/// hash does not depend on `Debug`-formatting (which is allowed to change
/// freely for diagnostics). Severity is intentionally NOT part of the
/// identity — re-classification under a calibration update should still
/// upsert as the same finding rather than landing twice.
///
/// Fields are joined with a NUL byte (`0x00`) which can never appear in
/// any production input (id_tag is a Rust identifier, layer/source are
/// closed lowercase enums, detail is human-readable text). Any input
/// that managed to embed a NUL would still produce a deterministic
/// hash; collisions across distinct finding tuples require finding two
/// inputs that differ only in NUL placement, which is not feasible
/// against SHA-256.
///
/// SHA-256 hex output is 64 characters, sized for a TEXT column without
/// growing past sqlite's row-size budget for the rest of `findings`.
pub fn finding_signature_hash(f: &Finding) -> String {
    use sha2::{Digest, Sha256};
    let mut h = Sha256::new();
    h.update(f.layer.as_str().as_bytes());
    h.update(b"\0");
    h.update(f.id.as_bytes());
    h.update(b"\0");
    h.update(f.source.as_str().as_bytes());
    h.update(b"\0");
    h.update(f.detail.as_bytes());
    let digest = h.finalize();
    let mut out = String::with_capacity(64);
    for byte in digest.iter() {
        // Two-hex-char render. `format!` would allocate; this preallocates.
        let hi = byte >> 4;
        let lo = byte & 0x0f;
        out.push(hex_nibble(hi));
        out.push(hex_nibble(lo));
    }
    out
}

#[inline]
pub(super) const fn hex_nibble(n: u8) -> char {
    #[allow(clippy::arithmetic_side_effects, reason = "every arithmetic expression below is bounded by the match arm. `0..=9` ⇒ `b'0' + n ∈ 0x30..=0x39` (no u8 overflow). `10..=15` ⇒ `b'a' + n ∈ 0x6b..=0x70` then `- 10` ⇒ `0x61..=0x66` (no u8 overflow on either step). `_` arm returns a sentinel since `unreachable!()` would trip the panic-deny lint; callers feed `byte >> 4` or `byte & 0x0f` so this arm is never reached.")]
    #[allow(clippy::as_conversions, reason = "PROOF: u8 → char widening; the arm guards confine the u8 to the 0x30..=0x39 (digits) or 0x61..=0x66 (lowercase hex) range, which are all valid ASCII code points. `as char` is the canonical byte-to-ASCII-char conversion.")]
    match n {
        0..=9 => (b'0' + n) as char,
        10..=15 => (b'a' + (n - 10)) as char,
        _ => '?',
    }
}

/// Modular-modes incremental-DB writer: appends `findings` to the audit
/// SQLite at `output` under the supplied `run_id` / `mode` tags, with
/// upsert-by-identity semantics.
///
/// - **`update_db = true` (default for `audit --mode=<mode>`)**: opens
///   the DB additively. `INSERT OR IGNORE` against the partial UNIQUE
///   index on `signature_hash` means re-running the same mode does not
///   accumulate duplicate rows. Running a different mode adds new rows;
///   prior modes' rows remain untouched (the index's IGNORE collision
///   set is bounded to actual signature collisions, not mode-keyed).
///
/// - **`update_db = false` (`--no-update-db`)**: clears prior rows
///   filtered by `(mode == self.mode)` before inserting the new rows.
///   When `mode` is `None`, clears every row in `findings` (legacy
///   one-shot mode — equivalent to `write_findings_db` semantics on a
///   cleared file). The supporting tables (`finding_xrefs`,
///   `taint_flows`, etc.) are not touched here; the caller is
///   responsible for re-populating them.
///
/// Returns the count of newly-inserted rows (i.e. excluding rows
/// suppressed by the UNIQUE-index IGNORE on duplicate identity). The
/// `findings_fts` virtual table is rebuilt at the end so FTS reads see
/// the freshly-inserted rows.
pub fn write_findings_db_with_run(
    findings: &[Finding],
    output: &std::path::Path,
    run_id: Option<&str>,
    mode: Option<&str>,
    update_db: bool,
) -> anyhow::Result<usize> {
    let mut db = rusqlite::Connection::open(output)?;
    ensure_findings_schema(&mut db)?;

    if !update_db {
        // One-shot semantics. If `mode` is set, scope the delete; else
        // clear the whole table (legacy `write_findings_db` shape).
        match mode {
            Some(m) => {
                db.execute("DELETE FROM findings WHERE mode = ?1", rusqlite::params![m])?;
            }
            None => {
                db.execute("DELETE FROM findings", [])?;
            }
        }
    }

    let tx = db.transaction()?;
    let mut inserted = 0usize;
    {
        let mut stmt = tx.prepare(
            "INSERT OR IGNORE INTO findings \
             (severity, layer, id_tag, gauge_class, source, confidence, detail, cwe, extra, run_id, mode, signature_hash) \
             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
        )?;
        for f in findings {
            let sig = finding_signature_hash(f);
            let n = stmt.execute(rusqlite::params![
                format!("{:?}", f.severity),
                format!("{:?}", f.layer),
                f.id,
                f.gauge_class.as_str(),
                f.source.as_str(),
                f.confidence.as_str(),
                f.detail,
                f.cwe,
                f.extra,
                run_id,
                mode,
                sig,
            ])?;
            // `INSERT OR IGNORE` returns 0 on conflict-suppress, 1 on
            // actual insert. Saturating-add is exact for any realistic
            // findings.len() < usize::MAX.
            //
            // PROOF: `n` is `INSERT OR IGNORE` result, in {0, 1}; bounded.
            #[allow(clippy::arithmetic_side_effects, reason = "`n` is `INSERT OR IGNORE` result, in {0, 1}; bounded.")]
            { inserted += n; }
        }
    }
    tx.commit()?;
    db.execute("INSERT INTO findings_fts(findings_fts) VALUES('rebuild')", [])?;
    Ok(inserted)
}

/// Apply the `write_findings_db` schema-create batch to `db`. Factored
/// out so `write_findings_db_with_run` can opt into the same DDL on a
/// fresh DB without duplicating the SQL string.
fn ensure_findings_schema(db: &mut rusqlite::Connection) -> anyhow::Result<()> {
    db.execute_batch(WRITE_FINDINGS_DB_SCHEMA)?;
    migrate_findings_schema_to_current(db)?;
    Ok(())
}

/// Path-taking wrapper around `ensure_findings_schema` for callers
/// that hold a `&Path` rather than an open `Connection`. Used by
/// `crate::semgrep::run::run_and_persist` to ensure the findings table
/// plus `semgrep_hits` / `semgrep_hotspots` views exist before
/// [`write_semgrep_db`] inserts into them. Idempotent — safe to call
/// against a DB that already has the schema (every DDL is
/// `CREATE … IF NOT EXISTS` and the migration table tracks applied revs).
pub fn ensure_findings_db_schema(output: &std::path::Path) -> anyhow::Result<()> {
    let mut db = rusqlite::Connection::open(output)?;
    ensure_findings_schema(&mut db)
}

/// Schema-create batch shared by `write_findings_db` (legacy shim) and
/// `write_findings_db_with_run` (modular-modes path). Kept verbatim to
/// `IF NOT EXISTS` form for idempotency across both callers.
const WRITE_FINDINGS_DB_SCHEMA: &str =
    "
        CREATE TABLE IF NOT EXISTS findings (
            rowid          INTEGER PRIMARY KEY,
            severity       TEXT NOT NULL,
            layer          TEXT NOT NULL,
            id_tag         TEXT NOT NULL,
            gauge_class    TEXT NOT NULL DEFAULT 'Semantic',
            source         TEXT NOT NULL DEFAULT 'manifest',
            confidence     TEXT NOT NULL DEFAULT 'unverified',
            dismiss_reason TEXT,
            detail         TEXT NOT NULL,
            cwe            INTEGER,
            extra          TEXT
        );
        CREATE VIRTUAL TABLE IF NOT EXISTS findings_fts
            USING fts5(detail, id_tag, source, content=findings, content_rowid=rowid, tokenize='porter unicode61');

        CREATE VIEW IF NOT EXISTS credentials AS
            SELECT rowid, severity, confidence, detail,
                   json_extract(extra, '$.detector') AS detector,
                   json_extract(extra, '$.raw')      AS raw,
                   json_extract(extra, '$.verified') AS verified,
                   extra
            FROM findings WHERE source = 'trufflehog';

        CREATE TABLE IF NOT EXISTS xrefs (
            rowid         INTEGER PRIMARY KEY,
            layer         TEXT NOT NULL,
            string_value  TEXT NOT NULL,
            function_id   INTEGER,
            function_name TEXT
        );
        CREATE VIRTUAL TABLE IF NOT EXISTS xrefs_fts
            USING fts5(string_value, function_name, content=xrefs, content_rowid=rowid, tokenize='porter unicode61');

        CREATE TABLE IF NOT EXISTS taint_flows (
            rowid          INTEGER PRIMARY KEY,
            layer          TEXT NOT NULL,
            func_id        INTEGER NOT NULL,
            source_type    TEXT NOT NULL,
            sink_type      TEXT NOT NULL,
            severity       TEXT NOT NULL,
            cwe            INTEGER,
            source_offset  INTEGER,
            sink_offset    INTEGER
        );
        CREATE VIRTUAL TABLE IF NOT EXISTS taint_flows_fts
            USING fts5(source_type, sink_type, content=taint_flows, content_rowid=rowid, tokenize='porter unicode61');

        CREATE TABLE IF NOT EXISTS cross_layer_taint_flows (
            rowid                    INTEGER PRIMARY KEY,
            js_module                TEXT NOT NULL,
            js_method                TEXT NOT NULL,
            dex_idx                  INTEGER NOT NULL,
            method_idx               INTEGER NOT NULL,
            js_func_id               INTEGER NOT NULL,
            native_func_id           INTEGER NOT NULL,
            native_class_descriptor  TEXT,
            native_method_signature  TEXT,
            source_type              TEXT NOT NULL,
            sink_type                TEXT NOT NULL,
            severity                 TEXT NOT NULL,
            cwe                      INTEGER
        );
        CREATE VIRTUAL TABLE IF NOT EXISTS cross_layer_taint_flows_fts
            USING fts5(js_module, js_method, native_class_descriptor,
                       native_method_signature, source_type, sink_type,
                       content=cross_layer_taint_flows, content_rowid=rowid,
                       tokenize='porter unicode61');
        CREATE INDEX IF NOT EXISTS idx_cross_layer_bridge
            ON cross_layer_taint_flows(js_module, js_method);
        CREATE INDEX IF NOT EXISTS idx_cross_layer_severity
            ON cross_layer_taint_flows(severity);
        CREATE VIEW IF NOT EXISTS cross_layer_taint_critical AS
            SELECT rowid, js_module, js_method, native_class_descriptor,
                   native_method_signature, source_type, sink_type, severity, cwe
            FROM cross_layer_taint_flows
            WHERE severity IN ('Critical', 'High')
            ORDER BY CASE severity WHEN 'Critical' THEN 0 WHEN 'High' THEN 1 END,
                     rowid;

        CREATE VIEW IF NOT EXISTS semgrep_hits AS
            SELECT rowid, severity, id_tag, detail,
                   json_extract(extra, '$.class_name') AS class_name,
                   json_extract(extra, '$.check_id') AS check_id,
                   json_extract(extra, '$.message') AS message
            FROM findings WHERE source = 'semgrep';

        CREATE TABLE IF NOT EXISTS finding_xrefs (
            rowid         INTEGER PRIMARY KEY,
            finding_rowid INTEGER NOT NULL,
            layer         TEXT NOT NULL,
            string_value  TEXT NOT NULL,
            function_name TEXT NOT NULL
        );
        CREATE VIRTUAL TABLE IF NOT EXISTS finding_xrefs_fts
            USING fts5(string_value, function_name, content=finding_xrefs, content_rowid=rowid, tokenize='porter unicode61');

        CREATE VIEW IF NOT EXISTS actionable_findings AS
            SELECT rowid, severity, id_tag, source, confidence, detail, cwe
            FROM findings
            WHERE severity IN ('Critical', 'High') AND gauge_class = 'Semantic'
              AND confidence != 'dismissed'
            ORDER BY CASE severity WHEN 'Critical' THEN 0 WHEN 'High' THEN 1 END, rowid;

        CREATE VIEW IF NOT EXISTS dismissed_findings AS
            SELECT rowid, severity, id_tag, source, detail, dismiss_reason
            FROM findings WHERE confidence = 'dismissed';

        CREATE VIEW IF NOT EXISTS confirmed_findings AS
            SELECT rowid, severity, id_tag, source, detail, cwe
            FROM findings WHERE confidence = 'confirmed';

        CREATE VIEW IF NOT EXISTS confidence_summary AS
            SELECT source, confidence, COUNT(*) AS n
            FROM findings GROUP BY source, confidence ORDER BY source, confidence;

        CREATE VIEW IF NOT EXISTS finding_context AS
            SELECT f.rowid AS finding_rowid, f.severity, f.id_tag, f.detail,
                   fx.string_value, fx.function_name, fx.layer
            FROM findings f
            JOIN finding_xrefs fx ON f.rowid = fx.finding_rowid
            ORDER BY f.severity, fx.string_value;

        CREATE VIEW IF NOT EXISTS finding_urls AS
            SELECT DISTINCT f.id_tag, f.severity, fx.string_value AS url
            FROM findings f
            JOIN finding_xrefs fx ON f.rowid = fx.finding_rowid
            WHERE fx.string_value LIKE 'http%' OR fx.string_value LIKE '%://%'
            ORDER BY f.severity, fx.string_value;

        CREATE VIEW IF NOT EXISTS taint_critical AS
            SELECT func_id, source_type, sink_type, severity, cwe, layer
            FROM taint_flows
            WHERE severity IN ('Critical', 'High')
            ORDER BY CASE severity WHEN 'Critical' THEN 0 WHEN 'High' THEN 1 END;

        CREATE VIEW IF NOT EXISTS semgrep_hotspots AS
            SELECT class_name, COUNT(*) AS n, GROUP_CONCAT(DISTINCT check_id) AS rules
            FROM semgrep_results
            GROUP BY class_name
            HAVING n > 1
            ORDER BY n DESC;

        CREATE VIEW IF NOT EXISTS audit_summary AS
            SELECT
                (SELECT COUNT(*) FROM findings) AS total_findings,
                (SELECT COUNT(*) FROM findings WHERE severity IN ('Critical','High') AND gauge_class='Semantic' AND confidence!='dismissed') AS actionable,
                (SELECT COUNT(*) FROM findings WHERE source='taint') AS taint_flows,
                (SELECT COUNT(*) FROM findings WHERE source='trufflehog') AS credential_hits,
                (SELECT COUNT(*) FROM findings WHERE source='semgrep') AS semgrep_hits,
                (SELECT COUNT(*) FROM findings WHERE confidence='confirmed') AS confirmed,
                (SELECT COUNT(*) FROM findings WHERE confidence='dismissed') AS dismissed,
                (SELECT COUNT(*) FROM finding_xrefs) AS finding_xrefs;
        ";

/// Legacy single-shot findings writer. Backed by the modular-modes
/// upsert path with `(run_id=None, mode=None, update_db=true)`, which
/// matches the prior "open or create the DB, append findings"
/// semantics. Rows inserted via this path leave `run_id` and `mode`
/// NULL and DO carry a `signature_hash` (so callers that later switch
/// to `write_findings_db_with_run` get upsert-by-identity for free).
///
/// Returns the number of newly-inserted rows. Repeated calls with the
/// same `findings` slice on the same `output` path return 0 on
/// subsequent calls because the partial UNIQUE index suppresses
/// re-inserts of identical signature hashes — a strictly stronger
/// guarantee than the prior "always reports findings.len()" shape, but
/// useful for callers that want the actual delta.
pub fn write_findings_db(
    findings: &[Finding],
    output: &std::path::Path,
) -> anyhow::Result<usize> {
    write_findings_db_with_run(findings, output, None, None, true)
}

/// Detector-local FP suppressor for trufflehog NDJSON ingestion.
///
/// Returns `true` if the `(detector, raw)` pair is a known false-positive
/// shape that should be dropped before insertion into the `findings`
/// table. Returns `false` (admit the row) for every detector this
/// function does not have an explicit calibration for — i.e. the default
/// is fail-open, NOT fail-closed.
///
/// **Why this lives at ingestion, not as a CLI flag.** Trufflehog's
/// upstream regex for the Square detector matches the substring `sq0`
/// followed by 30-43 chars of base64-ish content (`[A-Za-z0-9_-]`).
/// On a 4.8 MB DEX with thousands of long base64-encoded resource
/// identifiers (Facebook Audience Network's `assets/audience_network.dex`
/// is the motivating case), substring hits are inevitable and
/// meaningless: a class-metadata entry like `Lcom/foo/sq0abcXYZ...`
/// trips the upstream regex but is not a Square API key.
///
/// The fix is detector-local regex anchoring per Square's documented
/// key shapes (Square API documentation):
///
/// - Production access token: `^sq0atp-[A-Za-z0-9_-]{43}$` (50 chars total).
/// - Production refresh token: `^sq0rtp-[A-Za-z0-9_-]{43}$` (50 chars total).
/// - Sandbox access token: `^sandbox-sq0[a-z]tb-[A-Za-z0-9_-]{22}$`.
/// - Application ID: `^sq0idp-[A-Za-z0-9_-]{22}$` (29 chars total).
///
/// A `Raw` value emitted by trufflehog under `DetectorName: "Square"`
/// that does NOT match any of those four anchored shapes is a
/// substring-shaped FP and is suppressed.
///
/// Siblings (Circle, Honeycomb) extend this function with their own
/// detector branches as they land. The shape is intentionally
/// extension-friendly: each detector gets its own branch, no shared
/// state across detectors.
fn trufflehog_is_known_fp(
    detector: &str,
    raw: &str,
    square_set: &regex::RegexSet,
) -> bool {
    match detector {
        "Square" => !square_set.is_match(raw),
        _ => false,
    }
}

/// Build the anchored Square-key regex set used by
/// [`trufflehog_is_known_fp`]. Constructed once per
/// [`write_credentials_db`] call, outside the per-row loop. Returns the
/// typed `regex::Error` via `?` (typed `Err` on regex-eval failures,
/// no `.unwrap()` on regex construction).
fn square_key_regex_set() -> Result<regex::RegexSet, regex::Error> {
    // Square API key shapes per Square's documentation. Anchored on
    // both ends so substring `sq0` hits inside other strings (DEX class
    // metadata, base64-encoded resource identifiers) are rejected.
    regex::RegexSet::new([
        // Production access token: sq0atp- + 43 chars.
        r"^sq0atp-[A-Za-z0-9_-]{43}$",
        // Production refresh token: sq0rtp- + 43 chars.
        r"^sq0rtp-[A-Za-z0-9_-]{43}$",
        // Sandbox access token: sandbox-sq0[a-z]tb- + 22 chars.
        // The internal `[a-z]` covers documented variants (atb/rtb).
        r"^sandbox-sq0[a-z]tb-[A-Za-z0-9_-]{22}$",
        // Application ID: sq0idp- + 22 chars.
        r"^sq0idp-[A-Za-z0-9_-]{22}$",
    ])
}

/// Append trufflehog NDJSON output to the `credentials` table in the audit DB.
/// Append trufflehog NDJSON output as findings with source='trufflehog'.
///
/// Detector-local FP suppression applies before insertion. Four filters
/// run in sequence per line:
///
/// 1. `trufflehog_is_known_fp` (Square) — substring-shaped FPs are
///    dropped silently and do NOT contribute to the returned count.
/// 2. `credentials_fp::classify_circle_raw_shape` (Circle) — Circle
///    /CircleCI hits whose `Raw` matches an EIP-55 Ethereum address or
///    ASN.1 DER ECDSA signature prefix are downgraded Critical → Info
///    with `id_tag` suffix `_FP_<SHAPE>` and `extra.fp_shape` recorded.
/// 3. `honeycomb_fp` (Honeycomb) — Honeycomb hits whose `Raw` matches
///    a Java class-name shape (`Honeycomb[A-Z][A-Za-z0-9_$]*`) are
///    downgraded Critical → Info with `id_tag` suffix `_FP_ANDROID_CLASS`.
///    Honeycomb hits whose `Raw` matches a documented key anchor (32
///    lowercase hex / `hc[abc]i[ack]_<28-alphanum>` ingest key) are
///    re-promoted to Critical, overriding row-1a Low downgrade,
///    with `extra.anchor` recording which shape matched.
/// 4. [`crate::detectors::calibrate`] (detector-pattern-noise-calibration)
///    — per-line `(detector, raw)` pair routed through:
///    - `Calibration::Skip` — line dropped (class-name FP shapes).
///    - `Calibration::Emit(CalibratedSeverity::{Critical, Low})`.
///
/// Real-positive surface preserved: `Verified: true` findings bypass the
/// Circle + Honeycomb FP shape suppressors + calibration entirely;
/// verified credentials always emit at Critical.
pub fn write_credentials_db(
    trufflehog_ndjson: &str,
    output: &std::path::Path,
) -> anyhow::Result<usize> {
    use crate::detectors::{calibrate, Calibration};

    // Pre-compile the Square anchored-prefix regex set once outside the
    // row loop. `?` propagates any regex-construction error typed.
    let square_set = square_key_regex_set()?;

    let mut db = rusqlite::Connection::open(output)?;
    let tx = db.transaction()?;
    let mut count = 0usize;
    {
        let mut stmt = tx.prepare(
            "INSERT INTO findings (severity, layer, id_tag, gauge_class, source, confidence, detail, cwe, extra) \
             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)",
        )?;
        for line in trufflehog_ndjson.lines() {
            let Ok(v): Result<serde_json::Value, _> = serde_json::from_str(line) else {
                continue;
            };
            let detector = v.get("DetectorName").and_then(|s| s.as_str()).unwrap_or("unknown");
            let raw = v.get("Raw").and_then(|s| s.as_str()).unwrap_or("");
            // Detector-local FP suppression. Known substring-shaped FPs
            // (e.g. Square's `sq0` substring without the full anchored
            // prefix) are dropped before insertion.
            if trufflehog_is_known_fp(detector, raw, &square_set) {
                continue;
            }
            // Trufflehog's `Verified: bool` field — survives the trip into
            // `extra` JSON so the v3 `credentials` view's `verified` column
            // (json_extract(extra,'$.verified')) returns 1/0 for the
            // documented `WHERE verified=1` query.
            let verified = v.get("Verified").and_then(|s| s.as_bool()).unwrap_or(false);

            // Filter 2: Circle/CircleCI FP shape — downgrade Critical → Info
            // with shape tag. Only applies when verified=false (active-verification
            // confirmation overrides any shape heuristic).
            let circle_fp_shape = if !verified && credentials_fp::is_circle_detector(detector) {
                credentials_fp::classify_circle_raw_shape(raw)
            } else {
                None
            };

            // Filter 3: Honeycomb FP shape + real-key anchor. Two outcomes:
            //   - class-name shape → Info downgrade with `_FP_ANDROID_CLASS` tag
            //   - real-key anchor → Critical re-promotion (overrides row-1a Low)
            // Verified=true bypasses both. Anchor is checked first; if a raw
            // matches the anchor we keep it Critical even though the class-name
            // gate would not match anyway.
            let honeycomb_fp_shape = if !verified && honeycomb_fp::is_honeycomb_detector(detector) {
                honeycomb_fp::classify_honeycomb_raw_shape(raw)
            } else {
                None
            };
            let honeycomb_anchor = if !verified && honeycomb_fp::is_honeycomb_detector(detector) {
                honeycomb_fp::classify_honeycomb_raw_anchor(raw)
            } else {
                None
            };

            // Filter 4 (compose with circle + honeycomb): #5 multi-detector
            // calibration runs for non-Circle non-Honeycomb-anchor non-verified
            // hits. Verified bypasses all filters.
            let (severity, id_tag, detail, extra) = if let Some(shape) = circle_fp_shape {
                // Circle FP shape: Info with shape tag (overrides #5 for these hits)
                let id_tag = format!(
                    "CREDENTIAL_{}_FP_{}",
                    detector.to_uppercase(),
                    shape.as_tag()
                );
                let detail = format!(
                    "{}: pattern match downgraded — raw matches well-known FP shape ({})",
                    detector,
                    shape.as_tag()
                );
                let extra = serde_json::json!({
                    "detector": detector,
                    "raw": raw,
                    "verified": verified,
                    "fp_shape": shape.as_tag(),
                })
                .to_string();
                ("Info", id_tag, detail, extra)
            } else if let Some(shape) = honeycomb_fp_shape {
                // Honeycomb class-name FP: Info with shape tag (overrides #5
                // row-1a Low). Mirrors the Circle `_FP_<SHAPE>` convention —
                // preserves the audit trail rather than dropping the line.
                let id_tag = format!(
                    "CREDENTIAL_{}_FP_{}",
                    detector.to_uppercase(),
                    shape.as_tag()
                );
                let detail = format!(
                    "{}: pattern match downgraded — raw matches AOSP/SDK class identifier shape ({})",
                    detector,
                    shape.as_tag()
                );
                let extra = serde_json::json!({
                    "detector": detector,
                    "raw": raw,
                    "verified": verified,
                    "fp_shape": shape.as_tag(),
                })
                .to_string();
                ("Info", id_tag, detail, extra)
            } else if let Some(anchor) = honeycomb_anchor {
                // Honeycomb real-key anchor: Critical re-promotion (overrides
                // #5's row-1a blanket Low downgrade for the documented key
                // shapes). The detectors module's docstring (`src/detectors/mod.rs:18-23`)
                // explicitly anticipates this re-promotion.
                let id_tag = format!("CREDENTIAL_{}", detector.to_uppercase());
                let detail = format!(
                    "{}: pattern match on credential material (unverified; raw matches documented key anchor {})",
                    detector,
                    anchor.as_tag()
                );
                let extra = serde_json::json!({
                    "detector": detector,
                    "raw": raw,
                    "verified": verified,
                    "anchor": anchor.as_tag(),
                })
                .to_string();
                ("Critical", id_tag, detail, extra)
            } else if verified {
                // Verified=true: Critical regardless of calibration.
                let id_tag = format!("CREDENTIAL_{}", detector.to_uppercase());
                let detail = format!(
                    "{}: pattern match on credential material (verified)",
                    detector
                );
                let extra = serde_json::json!({
                    "detector": detector,
                    "raw": raw,
                    "verified": true,
                })
                .to_string();
                ("Critical", id_tag, detail, extra)
            } else {
                // Unverified non-Circle: run through #5 multi-detector calibration.
                match calibrate(detector, raw) {
                    Calibration::Skip => continue,
                    Calibration::Emit(sev) => {
                        let id_tag = format!("CREDENTIAL_{}", detector.to_uppercase());
                        let detail = format!(
                            "{}: pattern match on credential material (unverified)",
                            detector
                        );
                        let extra = serde_json::json!({
                            "detector": detector,
                            "raw": raw,
                            "verified": false,
                        })
                        .to_string();
                        (sev.as_str(), id_tag, detail, extra)
                    }
                }
            };


            stmt.execute(rusqlite::params![
                severity, "Apk", &id_tag, "Semantic", "trufflehog", "unverified",
                &detail, Option::<u16>::None, &extra,
            ])?;
            // DISPLAY-ONLY: returned row count; saturates to stay monotonic.
            count = count.saturating_add(1);
        }
    }
    tx.commit()?;
    db.execute("INSERT INTO findings_fts(findings_fts) VALUES('rebuild')", [])?;
    Ok(count)
}


/// Append xref rows to the audit DB.
/// `entries` is a slice of (layer, string_value, function_id, function_name).
pub fn write_xrefs_db(
    entries: &[(String, String, Option<u32>, String)],
    output: &std::path::Path,
) -> anyhow::Result<usize> {
    let mut db = rusqlite::Connection::open(output)?;
    let tx = db.transaction()?;
    {
        let mut stmt = tx.prepare(
            "INSERT INTO xrefs (layer, string_value, function_id, function_name) VALUES (?1, ?2, ?3, ?4)",
        )?;
        for (layer, sv, fid, fname) in entries {
            stmt.execute(rusqlite::params![layer, sv, fid, fname])?;
        }
    }
    tx.commit()?;
    db.execute("INSERT INTO xrefs_fts(xrefs_fts) VALUES('rebuild')", [])?;
    Ok(entries.len())
}

/// Persist semgrep JSON results as findings with source='semgrep'.
pub fn write_semgrep_db(
    semgrep_json: &str,
    output: &std::path::Path,
) -> anyhow::Result<usize> {
    let parsed: serde_json::Value = serde_json::from_str(semgrep_json)?;
    let results = parsed
        .get("results")
        .and_then(|v| v.as_array())
        .cloned()
        .unwrap_or_default();
    let mut db = rusqlite::Connection::open(output)?;
    let tx = db.transaction()?;
    let mut count = 0usize;
    {
        let mut stmt = tx.prepare(
            "INSERT INTO findings (severity, layer, id_tag, gauge_class, source, confidence, detail, cwe, extra) \
             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)",
        )?;
        for r in &results {
            let check_id = r["check_id"].as_str().unwrap_or("");
            let path = r.get("path").and_then(Value::as_str).unwrap_or("");
            let class_name = std::path::Path::new(path)
                .file_stem()
                .and_then(|s| s.to_str())
                .map(|s| s.replace('_', "."));
            let start_line = r.get("start").and_then(|v| v.get("line")).and_then(Value::as_i64);
            let end_line = r.get("end").and_then(|v| v.get("line")).and_then(Value::as_i64);
            let extra = r.get("extra");
            let sg_severity = extra
                .and_then(|e| e.get("severity"))
                .and_then(Value::as_str)
                .unwrap_or("INFO");
            let message = extra
                .and_then(|e| e.get("message"))
                .and_then(Value::as_str)
                .unwrap_or("");
            let metadata = extra.and_then(|e| e.get("metadata"));
            let cwe_str = metadata
                .and_then(|m| m.get("cwe"))
                .and_then(Value::as_array)
                .map(|arr| arr.iter().filter_map(Value::as_str).collect::<Vec<_>>().join("; "));
            let category = metadata.and_then(|m| m.get("category")).and_then(Value::as_str);

            let severity = match sg_severity {
                "ERROR" => "High",
                "WARNING" => "Medium",
                _ => "Info",
            };
            let id_tag = format!("SEMGREP_{}", check_id);
            let detail = format!("{}: {} ({})", check_id, message, class_name.as_deref().unwrap_or(path));
            let extra = serde_json::json!({
                "check_id": check_id, "path": path, "class_name": class_name,
                "start_line": start_line, "end_line": end_line,
                "message": message, "cwe": cwe_str, "category": category,
            }).to_string();

            stmt.execute(rusqlite::params![
                severity, "Dex", &id_tag, "Semantic", "semgrep", "unverified",
                &detail, Option::<u16>::None, &extra,
            ])?;
            // DISPLAY-ONLY: returned row count; saturates to stay monotonic.
            count = count.saturating_add(1);
        }
    }
    tx.commit()?;
    if count > 0 {
        db.execute("INSERT INTO findings_fts(findings_fts) VALUES('rebuild')", [])?;
    }
    Ok(count)
}

/// Triage a finding: confirm or dismiss.
pub fn triage_finding(
    db_path: &std::path::Path,
    rowid: i64,
    action: &str,
    reason: Option<&str>,
) -> anyhow::Result<serde_json::Value> {
    let db = rusqlite::Connection::open(db_path)?;
    match action {
        "confirm" => {
            db.execute(
                "UPDATE findings SET confidence = 'confirmed' WHERE rowid = ?1",
                rusqlite::params![rowid],
            )?;
        }
        "dismiss" => {
            let reason = reason.unwrap_or("dismissed by analyst");
            db.execute(
                "UPDATE findings SET confidence = 'dismissed', dismiss_reason = ?1 WHERE rowid = ?2",
                rusqlite::params![reason, rowid],
            )?;
        }
        _ => anyhow::bail!("action must be 'confirm' or 'dismiss'"),
    }
    // Return the updated finding
    let mut stmt = db.prepare(
        "SELECT rowid, severity, id_tag, source, confidence, dismiss_reason, detail FROM findings WHERE rowid = ?1",
    )?;
    let row = stmt.query_row(rusqlite::params![rowid], |r| {
        Ok(serde_json::json!({
            "rowid": r.get::<_, i64>(0)?,
            "severity": r.get::<_, String>(1)?,
            "id_tag": r.get::<_, String>(2)?,
            "source": r.get::<_, String>(3)?,
            "confidence": r.get::<_, String>(4)?,
            "dismiss_reason": r.get::<_, Option<String>>(5)?,
            "detail": r.get::<_, String>(6)?,
        }))
    })?;
    Ok(row)
}

/// Persist finding-scoped xrefs to the audit DB.
///
/// For each High/Critical finding, extracts the Dalvik class descriptor
/// embedded in the `detail` string (pattern `Lcom/foo/Bar;`) and writes
/// every string constant loaded by any method of that class as a
/// `finding_xrefs` row, joined to the finding via its DB rowid.
///
/// **Root-cause fix:** DEX_TAINT_FLOW / BRIDGE_TAINT_FLOW findings
/// previously stored no class descriptor in their detail string —
/// only `func #{:#x}` — so `target_classes` was always empty and the
/// function returned 0.  The fix adds `class_descriptor` (the containing
/// method's Dalvik descriptor) to the `TaintFinding` struct and embeds it
/// in the detail string via `taint_finding_to_finding`, so the regex
/// extraction here works for taint-flow findings as well as manifest/APK
/// findings that always carried descriptors.
///
/// # Failure modes
/// - `require_apk` returns `Err` → propagated (no APK context, no DEX bytes).
/// - `Xrefs::build` fails for one DEX layer → that layer is skipped.
/// - No High/Critical findings with extractable class refs → returns `Ok(0)`.
///
// PROOF: `i + 1` dex-layer label bounded by ctx.dex.len() ≤ isize::MAX.
#[allow(clippy::arithmetic_side_effects, clippy::as_conversions, reason = "`i + 1` dex-layer label bounded by ctx.dex.len() ≤ isize::MAX.")]
pub fn write_finding_xrefs_db(
    ctx: &CrossLayerContext,
    _findings: &[Finding],
    output: &std::path::Path,
) -> anyhow::Result<usize> {
    use std::collections::HashMap;

    // Regex matches Dalvik class descriptors embedded in detail strings:
    //   "... in Lcom/foo/Bar; ..."  →  cap[1] = "com/foo/Bar"
    let class_ref_re = regex::Regex::new(r"L([a-zA-Z0-9_$/]+);")?;

    // ── Step 1: read finding rowids and extract target classes from DB ──────
    // class_to_rowids: dotted class name → list of DB rowids for findings
    // that mention that class.  One class can match multiple findings (e.g.
    // two taint flows in the same class), and one finding can add multiple
    // classes.
    let mut db = rusqlite::Connection::open(output)?;
    let mut class_to_rowids: HashMap<String, Vec<i64>> = HashMap::new();

    {
        let mut stmt = db.prepare(
            "SELECT rowid, detail FROM findings WHERE severity IN ('High', 'Critical')",
        )?;
        let mut rows = stmt.query([])?;
        while let Some(row) = rows.next()? {
            let rowid: i64 = row.get(0)?;
            let detail: String = row.get(1)?;
            for cap in class_ref_re.captures_iter(&detail) {
                let class_dotted = cap[1].replace('/', ".");
                class_to_rowids.entry(class_dotted).or_default().push(rowid);
            }
        }
    }

    if class_to_rowids.is_empty() {
        // No High/Critical findings carry extractable class descriptors.
        return Ok(0);
    }

    // ── Step 2: walk xrefs per dex layer and insert finding_xrefs rows ─────
    let apk = ctx.require_apk()?;
    let tx = db.transaction()?;
    let mut count = 0usize;
    {
        let mut stmt = tx.prepare(
            "INSERT INTO finding_xrefs (finding_rowid, layer, string_value, function_name) \
             VALUES (?1, ?2, ?3, ?4)",
        )?;

        for ((li, dex), apk_dex) in ctx.dex.iter().enumerate().zip(apk.dex.iter()) {
            let label = format!("dex{}", li + 1);
            let raw = &apk_dex.data;
            let xrefs = match droidsaw_dex::xrefs::Xrefs::build(dex, raw) {
                Ok(x) => x,
                Err(_) => continue,
            };
            // For each (string, method) pair: if the method's class appears in
            // class_to_rowids, emit a finding_xrefs row for each matching finding.
            for (s, methods) in &xrefs.string_to_methods {
                for m in methods {
                    let class_dotted = m.class.trim_start_matches('L')
                        .trim_end_matches(';')
                        .replace('/', ".");
                    let Some(rowids) = class_to_rowids.get(&class_dotted) else {
                        continue;
                    };
                    let func_name = format!("{}->{}{}", m.class, m.name, m.proto);
                    for &rowid in rowids {
                        stmt.execute(rusqlite::params![rowid, &label, s, &func_name])?;
                        count = count.saturating_add(1);
                    }
                }
            }
        }
    }
    tx.commit()?;
    if count > 0 {
        db.execute(
            "INSERT INTO finding_xrefs_fts(finding_xrefs_fts) VALUES('rebuild')",
            [],
        )?;
    }
    Ok(count)
}

/// Append taint flow rows to the audit DB.
/// Each row records a source→sink edge: the layer, function ID, source type,
/// sink type, severity, and CWE. Use `db_query` with FTS5 to search:
///   `SELECT * FROM taint_flows_fts WHERE taint_flows_fts MATCH 'SqlExecute'`
pub fn write_taint_flows_db(
    findings: &[Finding],
    output: &std::path::Path,
) -> anyhow::Result<usize> {
    let mut db = rusqlite::Connection::open(output)?;
    let tx = db.transaction()?;
    let mut count = 0usize;
    {
        let mut stmt = tx.prepare(
            "INSERT INTO taint_flows \
             (layer, func_id, source_type, sink_type, severity, cwe, source_offset, sink_offset) \
             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
        )?;
        for f in findings.iter().filter(|f| matches!(f.id.as_str(), "DEX_TAINT_FLOW" | "BRIDGE_TAINT_FLOW" | "HBC_TAINT_FLOW")) {
            // Source/sink come from the detail string; func_id rides the
            // typed `Finding.func_id` field (set by the 3 taint factories
            // via `.with_func()`). Round-tripping the func_id through
            // `(func #{:#x}; ...)` text was lossy on the `;` suffix —
            // the v2 fix reads it directly off the typed column. See
            // `tests/db_taint_flows_func_id.rs` for the regression.
            //
            // source_offset / sink_offset are extracted from Finding.extra JSON
            // (set by `taint_finding_to_finding` for DEX_TAINT_FLOW; NULL for
            // HBC_TAINT_FLOW and BRIDGE_TAINT_FLOW which have no instruction addr).
            let (source_type, sink_type) = parse_taint_source_sink(&f.detail);
            let func_id = f.func_id.unwrap_or(0);
            let (source_offset, sink_offset): (Option<i64>, Option<i64>) = f.extra
                .as_deref()
                .and_then(|s| serde_json::from_str::<serde_json::Value>(s).ok())
                .map(|v| {
                    let src = v.get("source_offset").and_then(|x| x.as_i64());
                    let snk = v.get("sink_offset").and_then(|x| x.as_i64());
                    (src, snk)
                })
                .unwrap_or((None, None));
            stmt.execute(rusqlite::params![
                format!("{:?}", f.layer),
                func_id,
                source_type,
                sink_type,
                format!("{:?}", f.severity),
                f.cwe,
                source_offset,
                sink_offset,
            ])?;
            // DISPLAY-ONLY: returned row count.
            count = count.saturating_add(1);
        }
    }
    tx.commit()?;
    if count > 0 {
        db.execute("INSERT INTO taint_flows_fts(taint_flows_fts) VALUES('rebuild')", [])?;
    }
    Ok(count)
}

/// Append cross-layer taint composite rows to the audit DB.
///
/// One row per `CROSS_LAYER_TAINT_FLOW` `Finding` produced by
/// [`crate::analysis::cross_layer_stitch`]. The full
/// `CrossLayerTaintFinding<MethodIdx>` payload rides
/// `Finding.extra` (composite_to_finding writes it via
/// `serde_json::to_string`); this writer deserializes it back symmetrically
/// and projects each field into a typed column. Findings with absent or
/// unparseable `extra` are skipped (no panic — pre-Phase-5 DBs sometimes
/// carried bytes that were trimmed; survival over panic is the
/// hardening contract).
///
/// Use `db_query` with FTS5 to search across modules / methods / sinks /
/// native targets:
///   `SELECT * FROM cross_layer_taint_flows_fts WHERE cross_layer_taint_flows_fts MATCH 'Storage'`
///
/// Wraps the insert loop in `db.transaction()` — rusqlite's canonical
/// explicit-BEGIN/COMMIT pattern, avoiding the 100× fsync penalty of
/// per-row autocommit. The `cross_layer_taint_flows_fts` virtual table
/// is rebuilt at the end so FTS reads see the freshly-inserted rows.
///
/// Returns the count of inserted rows (excluding skipped ones).
pub fn write_cross_layer_taint_flows_db(
    findings: &[Finding],
    output: &std::path::Path,
) -> anyhow::Result<usize> {
    use droidsaw_common::cross_layer_taint::CrossLayerTaintFinding;
    use droidsaw_dex::ids::MethodIdx;

    let mut db = rusqlite::Connection::open(output)?;
    let tx = db.transaction()?;
    let mut count = 0usize;
    {
        let mut stmt = tx.prepare(
            "INSERT INTO cross_layer_taint_flows \
             (js_module, js_method, dex_idx, method_idx, js_func_id, \
              native_func_id, native_class_descriptor, native_method_signature, \
              source_type, sink_type, severity, cwe) \
             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
        )?;
        for f in findings.iter().filter(|f| f.id.as_str() == "CROSS_LAYER_TAINT_FLOW") {
            // Parser-path read: Finding.extra is attacker-untrusted only
            // in the sense that the same byte channel carries other findings'
            // serde payloads; stitcher-emitted composites are guaranteed
            // round-trippable by the debug-time gauge in `composite_to_finding`.
            // Silently skipping malformed entries matches the hardening
            // discipline: survival over panic on the audit-DB write path.
            let Some(extra) = f.extra.as_deref() else {
                continue;
            };
            let Ok(c) = serde_json::from_str::<CrossLayerTaintFinding<MethodIdx>>(extra) else {
                continue;
            };
            // Variant-tag string for source/sink columns — matches the
            // `{:?}.split('{').next()` shape used by `composite_to_finding`
            // when building Finding.detail, so FTS searches on either
            // detail or the column return the same hits.
            let source_type = format!("{:?}", c.js_source);
            let source_type = source_type.split('{').next().unwrap_or("Unknown").trim();
            let sink_type = format!("{:?}", c.native_sink);
            let sink_type = sink_type.split('{').next().unwrap_or("Unknown").trim();
            // dex_idx is a usize from the multi-DEX bundle index; saturating
            // to i64::MAX is safe-by-construction since realistic apks carry
            // a handful of DEX entries (Android packaging caps at 100).
            let dex_idx_i64 = i64::try_from(c.bridge.dex_idx).unwrap_or(i64::MAX);
            stmt.execute(rusqlite::params![
                c.bridge.js_module.as_str(),
                c.bridge.js_method.as_str(),
                dex_idx_i64,
                i64::from(c.bridge.method_idx.0),
                c.js_func_id,
                c.native_func_id,
                c.native_class_descriptor.as_deref(),
                c.native_method_signature.as_deref(),
                source_type,
                sink_type,
                format!("{:?}", c.severity),
                c.cwe,
            ])?;
            count = count.saturating_add(1);
        }
    }
    tx.commit()?;
    if count > 0 {
        db.execute(
            "INSERT INTO cross_layer_taint_flows_fts(cross_layer_taint_flows_fts) VALUES('rebuild')",
            [],
        )?;
    }
    Ok(count)
}

/// Parse the source/sink strings out of a taint finding's detail field.
/// Handles all three detail-string shapes emitted by the factory functions:
/// - "intra-method taint: SourceType → SinkType (func #0x1234; ...)"
/// - "bridge taint: ReactBridgeParam[method] → SinkType (func #0x1234)"
/// - "hbc taint: JsParam → SinkType (func #0x1234)"
///
/// Note: func_id is NOT re-parsed from the `(func #...)` substring here
/// — that round-trip is lossy on the `;` suffix and would silently drop
/// to 0. The typed `Finding.func_id` field carries the value; this
/// helper is source/sink-only.
pub(super) fn parse_taint_source_sink(detail: &str) -> (String, String) {
    let inner = detail
        .strip_prefix("intra-method taint: ")
        .or_else(|| detail.strip_prefix("bridge taint: "))
        .or_else(|| detail.strip_prefix("hbc taint: "))
        .unwrap_or(detail);
    let (flow_part, _) = inner.split_once(" (func").unwrap_or((inner, ""));
    let (source, sink) = flow_part.split_once(" → ").unwrap_or((flow_part, "Unknown"));
    (source.trim().to_string(), sink.trim().to_string())
}
#[cfg(test)]
mod trufflehog_fp_tests {
    use super::{square_key_regex_set, trufflehog_is_known_fp};

    #[test]
    fn square_real_production_access_token_admitted() {
        // Real Square production access token shape: `sq0atp-` + 43
        // chars from `[A-Za-z0-9_-]`. Must NOT be flagged as an FP.
        let set = square_key_regex_set().expect("regex set");
        let real = "sq0atp-AbCdEfGhIjKlMnOpQrStUvWxYz0123456789_-aBcDe";
        assert_eq!(real.len(), 7 + 43, "test fixture length sanity");
        assert!(
            !trufflehog_is_known_fp("Square", real, &set),
            "real production access token must be admitted"
        );
    }

    #[test]
    fn square_real_production_refresh_token_admitted() {
        let set = square_key_regex_set().expect("regex set");
        let real = "sq0rtp-AbCdEfGhIjKlMnOpQrStUvWxYz0123456789_-aBcDe";
        assert_eq!(real.len(), 7 + 43);
        assert!(!trufflehog_is_known_fp("Square", real, &set));
    }

    #[test]
    fn square_real_sandbox_access_token_admitted() {
        // Sandbox shape: `sandbox-sq0[a-z]tb-` + 22 chars.
        let set = square_key_regex_set().expect("regex set");
        let real = "sandbox-sq0atb-AbCdEfGhIjKlMnOpQrStUv";
        // 8 + 4 + 3 + 22 = 37 chars total ("sandbox-" = 8, "sq0" = 3, "atb-" = 4, body = 22).
        assert_eq!(real.len(), 8 + 3 + 4 + 22);
        assert!(!trufflehog_is_known_fp("Square", real, &set));
    }

    #[test]
    fn square_real_application_id_admitted() {
        // Application ID shape: `sq0idp-` + 22 chars.
        let set = square_key_regex_set().expect("regex set");
        let real = "sq0idp-AbCdEfGhIjKlMnOpQrStUv";
        assert_eq!(real.len(), 7 + 22);
        assert!(!trufflehog_is_known_fp("Square", real, &set));
    }

    #[test]
    fn square_substring_in_class_metadata_rejected() {
        // The motivating FP: `sq0` substring inside a longer
        // base64-shaped string (no `sq0[a-z]tp-` prefix anchor).
        // Trufflehog's upstream regex would match this; our anchored
        // filter drops it.
        let set = square_key_regex_set().expect("regex set");
        let fp = "sq0abcXYZdefGHI012345jkl_-mnopqr"; // no `-` after `sq0[a-z]tp/idp`
        assert!(
            trufflehog_is_known_fp("Square", fp, &set),
            "substring-shaped FP must be suppressed"
        );
    }

    #[test]
    fn square_wrong_length_rejected() {
        // Right prefix, wrong body length → not a real Square key.
        let set = square_key_regex_set().expect("regex set");
        // sq0atp- needs exactly 43 chars after the dash; this has 10.
        let too_short = "sq0atp-AbCdEfGhIj";
        assert!(trufflehog_is_known_fp("Square", too_short, &set));
        // sq0idp- needs exactly 22 chars; this has 30.
        let too_long = "sq0idp-AbCdEfGhIjKlMnOpQrStUvWxYz0123456789";
        assert!(trufflehog_is_known_fp("Square", too_long, &set));
    }

    #[test]
    fn square_anchored_no_substring_match() {
        // Anchored regex must reject content that contains a valid-looking
        // Square key with leading/trailing junk (not a full-string match).
        let set = square_key_regex_set().expect("regex set");
        let leading = "junk_sq0atp-AbCdEfGhIjKlMnOpQrStUvWxYz0123456789_-aBcDe";
        let trailing = "sq0atp-AbCdEfGhIjKlMnOpQrStUvWxYz0123456789_-aBcDe_junk";
        assert!(trufflehog_is_known_fp("Square", leading, &set));
        assert!(trufflehog_is_known_fp("Square", trailing, &set));
    }

    #[test]
    fn other_detectors_pass_through() {
        // Non-Square detectors are NOT subject to Square's anchored
        // filter; they pass through unchanged until siblings extend
        // the dispatch.
        let set = square_key_regex_set().expect("regex set");
        assert!(!trufflehog_is_known_fp("AWS", "AKIAIOSFODNN7EXAMPLE", &set));
        assert!(!trufflehog_is_known_fp("PrivateKey", "-----BEGIN", &set));
        assert!(!trufflehog_is_known_fp("Slack", "xoxb-1234", &set));
        // Unknown detector — fail-open per the function contract.
        assert!(!trufflehog_is_known_fp("UnknownDetector", "anything", &set));
    }
}

#[cfg(test)]
mod shadow_gate_tests {
    use super::export;
    use crate::analysis::dup_class_fixture;
    use crate::context::CrossLayerContext;

    /// The SQLite `classes` table is keyed by the enumerate position
    /// `j`, so a duplicate-`class_idx` pair gets two distinct keys and
    /// — without the shadow gate — BOTH rows persist via
    /// `INSERT OR REPLACE`. This is the highest-stakes site: a persisted
    /// shadow row amplifies class counts and introduces `class_idx`-join
    /// ambiguity for downstream consumers. The gate must skip the shadow
    /// row before its INSERT, leaving exactly one `classes` row.
    #[test]
    fn export_persists_only_canonical_class_row() {
        let dex = dup_class_fixture::for_export();
        let ctx = CrossLayerContext {
            path: "test://dup-class".to_string(),
            apk: None,
            hbc: None,
            hbc_parse_error: None,
            dex: vec![dex],
            dex_direct_bytes: None,
            loaded_split_names: Vec::new(),
            hermes_findings: Vec::new(),
            permissive_recovery: droidsaw_apk::PermissiveRecoveryOpts::default(),
        };

        let tmp = tempfile::NamedTempFile::new().expect("temp db");
        let db_path = tmp.path().to_str().expect("utf-8 path").to_string();
        export(&ctx, &db_path).expect("export succeeds");

        let conn = rusqlite::Connection::open(&db_path).expect("open db");
        let class_count: i64 = conn
            .query_row(
                "SELECT COUNT(*) FROM classes WHERE layer = 'dex1'",
                [],
                |row| row.get(0),
            )
            .expect("count classes");
        assert_eq!(
            class_count, 1,
            "only the canonical (first-wins) class row may persist; \
             the shadow duplicate-class_idx row must be gated out"
        );
    }
}