// droidsaw 1.0.0 - Docs.rs

// SPDX-License-Identifier: BSD-3-Clause

//! Canonical `AuditEnvelope` — the ONE response shape for all `audit` paths.
//!
//! Both the CLI `audit` command (via [`crate::commands::audit_light_with_mode`])
//! and the MCP `audit` tool (via `run_core_audit_blocking`) produce an
//! `AuditEnvelope`.  The intentional divergence between the two adapters is
//! in **which optional fields are populated**, not in the shape itself:
//!
//! | Field             | CLI (`audit_light`) | MCP (`audit`)      |
//! |-------------------|---------------------|--------------------|
//! | `findings`        | full inline list    | empty (in DB)      |
//! | `top_findings`    | empty               | ≤5 Critical/High   |
//! | `truncated`       | false               | true when `top_findings` < total High+ |
//! | `db_path`         | `None`              | `Some("…")`        |
//! | `db_queries`      | `None`              | `Some({…})`        |
//! | `detectors`       | `Some({…})`         | `None`             |
//! | `timings_ms`      | `None`              | `Some({…})`        |
//!
//! `schema_version` is carried on every response so downstream consumers can
//! gate on breaking changes (see `threat_model::envelope::SCHEMA_VERSION` for
//! the bump policy; this is the *response* version, independent of the
//! evidence-envelope wire format).

use std::collections::BTreeMap;

use droidsaw_common::Finding;
use serde::{Deserialize, Serialize};
use serde_json::Value;

/// Current `AuditEnvelope` schema version.
///
/// This is the value carried in the `schema_version` field of
/// [`AuditEnvelope`], which downstream consumers read to gate on breaking
/// changes.
///
/// Bump only for **breaking** field changes (removed field, renamed field,
/// changed type). New optional fields are additive and do not require a bump.
pub const AUDIT_ENVELOPE_VERSION: u32 = 1;

/// Per-APK shape statistics derived from the `CrossLayerContext` the audit
/// pipeline already builds.  Surfaced in the envelope so corpus-sweep scripts
/// can read these gauges directly from the audit JSON instead of attempting
/// to scrape them out of the `meta` block (which only holds navigation
/// metadata).
///
/// Three field names (`hbc_function_count`, `dex_methods_total`,
/// `dex_classes_total`) align with comparable Java/Android decompiler bench
/// schemas (`functions_decompiled`, `methods_emitted`, `classes_emitted`)
/// to enable cross-tool join queries.  Semantic note: the droidsaw counts
/// are "definitions present in parsed DEX/HBC", not "what decompilation
/// actually rendered" — droidsaw audit does not decompile.
///
/// `None` is emitted when the context did not originate from a container
/// input.  Additive field — old consumers that do not know about
/// `apk_summary` simply ignore it.
///
/// The derived `Default` yields `has_hbc = false` and every counter at `0`.
/// Field declaration order is the JSON key order produced by serde, so
/// append new fields rather than reordering existing ones.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ApkSummary {
    /// `true` when a Hermes bytecode bundle was found in the APK container.
    pub has_hbc: bool,
    /// Byte length of the embedded HBC bundle, or `0` when `has_hbc` is
    /// `false`.
    pub hbc_bytes: u64,
    /// Number of functions declared in the parsed HBC bundle (jadx-aligned:
    /// mirrors `functions_decompiled`).  `0` when `has_hbc` is `false`.
    pub hbc_function_count: u32,
    /// Number of DEX entries in the APK container (classes.dex,
    /// classes2.dex, …).  For standalone `.dex` input this is always `1`.
    ///
    /// NOTE(review): the struct-level doc states that no summary is emitted
    /// for non-container input — confirm whether a standalone `.dex` run
    /// actually produces an `ApkSummary` at all.
    pub dex_count: u32,
    /// Sum of the raw byte lengths of all DEX entries.
    pub dex_total_bytes: u64,
    /// Total method-definition count across all DEX entries (sum of
    /// `direct_methods + virtual_methods` in every parsed class_data).
    /// Jadx-aligned: mirrors `methods_emitted`.  Distinct from the
    /// `method_ids` pool size, which also counts external references.
    pub dex_methods_total: u64,
    /// Total class-definition count across all DEX entries (sum of
    /// `class_defs.len()`).  Jadx-aligned: mirrors `classes_emitted`.
    pub dex_classes_total: u64,
}

/// One canonical response shape for every `audit` invocation, whether issued
/// via the CLI or MCP transport.
///
/// See module-level doc for the intentional-divergence table.
///
/// Serialization notes: every `Option` field below carries
/// `skip_serializing_if = "Option::is_none"`, so a `None` value is omitted
/// from the emitted JSON entirely rather than written as `null`.  All other
/// fields are always written.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuditEnvelope {
    /// Envelope schema version (always [`AUDIT_ENVELOPE_VERSION`]).
    pub schema_version: u32,

    /// Full finding list. Populated by CLI; empty for MCP (findings live in
    /// the sqlite DB at `db_path`).
    pub findings: Vec<Finding>,

    /// Post-dedup finding count (matches `SELECT COUNT(*) FROM findings`).
    pub finding_count: u64,

    /// Pre-dedup raw count from the detector pipeline.  The gap
    /// `findings_emitted - finding_count` is the number of duplicates the
    /// signature-hash UNIQUE index collapsed.
    pub findings_emitted: u64,

    /// Count of confirmed taint-flow findings present in `.findings`,
    /// i.e. those with `id ∈ {HBC_TAINT_FLOW, DEX_TAINT_FLOW,
    /// BRIDGE_TAINT_FLOW}`. These are the cross-layer source→sink flows
    /// the taint pipeline asserts.
    ///
    /// Deliberately excludes `JNI_TAINTED_NATIVE_CALL` (different class
    /// — tainted data reached native code, but the sink behavior is
    /// opaque without symbol resolution) and
    /// `BRIDGE_RESOLUTION_AMBIGUOUS` (resolution ambiguity, not a
    /// confirmed flow). Operators wanting either category should filter
    /// `.findings` directly.
    ///
    /// Both CLI and MCP audit paths populate this from
    /// [`AuditEnvelope::count_taint_flow_findings`] over the same
    /// finding set, so the value is identical regardless of transport.
    pub taint_flow_count: u64,

    /// Severity histogram keyed on the `Debug` name of
    /// [`droidsaw_common::Severity`] (e.g. `"Critical"`, `"High"`).
    /// Stored in a `BTreeMap`, so keys serialize in sorted order.
    pub severity_summary: BTreeMap<String, u64>,

    /// Capped projection (≤ 5) of Critical/High findings for MCP consumers
    /// that want immediate signal without a follow-up `query` call.
    /// Empty when the CLI path is used (full list is in `findings`).
    pub top_findings: Vec<Value>,

    /// `true` when `top_findings` is a truncated view of the full set of
    /// Critical and High findings (i.e., there are more than 5 such
    /// findings).
    pub truncated: bool,

    /// Path to the sqlite DB written by this audit run.
    /// `None` for the CLI path (no DB is written).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub db_path: Option<String>,

    /// Pre-built SQL queries for common follow-up patterns.
    /// Populated by the MCP path; `None` for CLI.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub db_queries: Option<Value>,

    /// Number of finding-xref rows written.  `None` for CLI.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub finding_xrefs_written: Option<u64>,

    /// Per-detector status summary.  Populated by CLI; `None` for MCP
    /// (MCP reports detector results inline under `trufflehog` / `semgrep`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub detectors: Option<Value>,

    /// Trufflehog subprocess result (both paths).  `None` when mode doesn't
    /// run trufflehog (`basic`, `semgrep`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub trufflehog: Option<Value>,

    /// Semgrep result (both paths).  `None` when mode doesn't run semgrep.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub semgrep: Option<Value>,

    /// Per-phase wall-clock timing (milliseconds).  Populated by MCP;
    /// `None` for CLI (CLI callers read progress from stderr).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timings_ms: Option<Value>,

    /// Per-APK shape statistics (HBC presence, byte size and function count;
    /// DEX entry count, total bytes, method and class totals — see
    /// [`ApkSummary`]).  Populated by both the CLI and MCP audit paths when
    /// the input was an APK / AAB / XAPK container.  `None` for raw HBC / DEX
    /// inputs and for old envelopes deserialized from before this field
    /// existed (the explicit `default` makes the absent-key case `None`).
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub apk_summary: Option<ApkSummary>,

    /// Navigation metadata for agent consumers: count, truncation flag,
    /// usage hint, and related subcommand names.
    pub meta: AuditMeta,
}

impl AuditEnvelope {
    /// Canonical tally behind [`Self::taint_flow_count`]: how many entries
    /// of `findings` are confirmed taint flows.
    ///
    /// A finding is counted when its `id` is exactly one of
    /// `HBC_TAINT_FLOW`, `DEX_TAINT_FLOW` or `BRIDGE_TAINT_FLOW`; every
    /// other ID is ignored.  The field-level doc on `taint_flow_count`
    /// explains why only these IDs are in scope.
    ///
    /// Both the CLI and the MCP audit paths MUST call this helper instead
    /// of keeping their own filter lists — the earlier duplicated lists
    /// left the CLI path reporting `0` while MCP counted correctly.
    #[allow(
        clippy::as_conversions,
        reason = "PROOF: filter().count() returns usize bounded by findings.len() ≤ usize::MAX; usize→u64 widening is lossless on every supported 64-bit target."
    )]
    pub fn count_taint_flow_findings(findings: &[Finding]) -> u64 {
        // The complete, closed set of IDs that denote a confirmed flow.
        const TAINT_FLOW_IDS: [&str; 3] =
            ["HBC_TAINT_FLOW", "DEX_TAINT_FLOW", "BRIDGE_TAINT_FLOW"];

        let confirmed = findings
            .iter()
            .map(|finding| finding.id.as_str())
            .filter(|id| TAINT_FLOW_IDS.contains(id))
            .count();

        confirmed as u64
    }
}

/// Navigation + pagination metadata embedded in every `AuditEnvelope`.
///
/// All fields except `thread_pool_size` are required when deserializing;
/// `thread_pool_size` falls back to `0` when the key is absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AuditMeta {
    /// Number of items in the primary result array (`findings` for CLI,
    /// `top_findings` for MCP).
    pub count: u64,
    /// True when the primary result array was cut by a limit.
    pub truncated: bool,
    /// Plain-text agent hint for the next useful action.
    pub hint: String,
    /// 2-4 sibling subcommand names the agent can chain into.
    pub related: Vec<String>,
    /// Rayon thread pool size used during this audit. Lets an agent
    /// consuming the JSON verify the audit ran with the intended
    /// concurrency (e.g. that `--single-thread` was honored). Always
    /// equal to `rayon::current_num_threads()` at audit-emit time. A
    /// value of 0 deserialized from an older envelope indicates the
    /// field was absent — treat as unknown.
    ///
    /// `#[serde(default)]` is what supplies that `0` when the key is
    /// missing from the input JSON.
    #[serde(default)]
    pub thread_pool_size: usize,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Smallest valid envelope: no findings, every optional field `None`.
    /// Individual tests mutate the fields they care about.
    fn minimal_envelope() -> AuditEnvelope {
        AuditEnvelope {
            schema_version: AUDIT_ENVELOPE_VERSION,
            findings: vec![],
            finding_count: 0,
            findings_emitted: 0,
            taint_flow_count: 0,
            severity_summary: BTreeMap::new(),
            top_findings: vec![],
            truncated: false,
            db_path: None,
            db_queries: None,
            finding_xrefs_written: None,
            detectors: None,
            trufflehog: None,
            semgrep: None,
            timings_ms: None,
            apk_summary: None,
            meta: AuditMeta {
                count: 0,
                truncated: false,
                hint: "test".to_string(),
                related: vec![],
                thread_pool_size: 1,
            },
        }
    }

    // ── round-trip: CLI shape (findings inline, no db_path) ───────────────

    #[test]
    fn cli_shape_roundtrip() {
        let env = minimal_envelope();
        let json = serde_json::to_string(&env).expect("serialize");
        let back: AuditEnvelope = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(back.schema_version, AUDIT_ENVELOPE_VERSION);
        assert!(back.findings.is_empty());
        assert!(back.db_path.is_none());
        assert!(back.db_queries.is_none());
        assert!(!back.truncated);
    }

    // ── round-trip: MCP shape (db_path + top_findings, no inline findings) ─

    #[test]
    fn mcp_shape_roundtrip() {
        let mut env = minimal_envelope();
        env.finding_count = 12;
        env.findings_emitted = 15;
        env.taint_flow_count = 3;
        env.severity_summary.insert("High".to_string(), 8);
        env.severity_summary.insert("Critical".to_string(), 4);
        env.top_findings = vec![
            serde_json::json!({"severity": "Critical", "id": "HARDCODED_KEY", "detail": "...", "cwe": 321}),
        ];
        env.truncated = true;
        env.db_path = Some("/tmp/droidsaw-audit-abc.db".to_string());
        env.db_queries = Some(serde_json::json!({"all_high": "SELECT ..."}));
        env.finding_xrefs_written = Some(42);
        env.timings_ms = Some(serde_json::json!({"core_total": 1200}));
        env.meta = AuditMeta {
            count: 1,
            truncated: true,
            hint: "pair with query for full list".to_string(),
            related: vec!["query".to_string(), "investigate".to_string()],
            thread_pool_size: 1,
        };

        let json = serde_json::to_string(&env).expect("serialize");

        // Verify key MCP-specific fields survive round-trip.
        let back: AuditEnvelope = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(back.schema_version, AUDIT_ENVELOPE_VERSION);
        assert_eq!(back.finding_count, 12);
        assert_eq!(back.findings_emitted, 15);
        assert_eq!(back.taint_flow_count, 3);
        assert_eq!(back.severity_summary.get("Critical").copied(), Some(4));
        assert_eq!(back.top_findings.len(), 1);
        assert!(back.truncated);
        assert_eq!(back.db_path.as_deref(), Some("/tmp/droidsaw-audit-abc.db"));
        assert!(back.db_queries.is_some());
        assert_eq!(back.finding_xrefs_written, Some(42));
        assert!(back.timings_ms.is_some());
        assert_eq!(back.meta.count, 1);
        assert!(back.meta.truncated);
    }

    // ── count_taint_flow_findings: canonical taint accounting ─────────────

    #[test]
    fn count_taint_flow_findings_matches_three_ids_only() {
        use droidsaw_common::{Finding, Layer, Severity};
        let findings = vec![
            Finding::new("HBC_TAINT_FLOW", Layer::Hbc, Severity::Critical, "eval"),
            Finding::new("DEX_TAINT_FLOW", Layer::Dex, Severity::High, "ipc"),
            Finding::new("DEX_TAINT_FLOW", Layer::Dex, Severity::High, "fs"),
            Finding::new("BRIDGE_TAINT_FLOW", Layer::Dex, Severity::High, "bridge"),
            // Out-of-scope IDs that must NOT be counted:
            Finding::new("JNI_TAINTED_NATIVE_CALL", Layer::Dex, Severity::Medium, "jni"),
            Finding::new("BRIDGE_RESOLUTION_AMBIGUOUS", Layer::Dex, Severity::Info, "amb"),
            Finding::new("HARDCODED_KEY", Layer::Apk, Severity::Critical, "key"),
        ];
        let n = AuditEnvelope::count_taint_flow_findings(&findings);
        assert_eq!(
            n, 4,
            "must count exactly the three TAINT_FLOW ids (HBC, DEX, BRIDGE); \
             JNI_TAINTED_NATIVE_CALL and BRIDGE_RESOLUTION_AMBIGUOUS excluded by design"
        );
    }

    #[test]
    fn count_taint_flow_findings_zero_on_empty_input() {
        assert_eq!(AuditEnvelope::count_taint_flow_findings(&[]), 0);
    }

    #[test]
    fn count_taint_flow_findings_zero_on_non_taint_findings_only() {
        use droidsaw_common::{Finding, Layer, Severity};
        let findings = vec![
            Finding::new("HARDCODED_KEY", Layer::Apk, Severity::Critical, "x"),
            Finding::new("V1_MANIFEST_MISMATCH", Layer::Apk, Severity::High, "y"),
        ];
        assert_eq!(AuditEnvelope::count_taint_flow_findings(&findings), 0);
    }

    // ── schema_version field appears in JSON ──────────────────────────────

    #[test]
    fn schema_version_present_in_json() {
        let env = minimal_envelope();
        let json = serde_json::to_string(&env).expect("serialize");
        // Compare the parsed value against the constant instead of matching a
        // hard-coded `"schema_version":1` substring: the test then survives a
        // version bump and cannot be satisfied by a longer number such as 10.
        let parsed: serde_json::Value =
            serde_json::from_str(&json).expect("emitted JSON must parse");
        assert_eq!(
            parsed
                .get("schema_version")
                .and_then(serde_json::Value::as_u64),
            Some(u64::from(AUDIT_ENVELOPE_VERSION)),
            "schema_version must appear in JSON; got: {json}",
        );
    }

    // ── optional fields skip_serializing_if ──────────────────────────────

    #[test]
    fn none_optional_fields_omitted_from_json() {
        let env = minimal_envelope();
        let json = serde_json::to_string(&env).expect("serialize");
        // Fields with skip_serializing_if = Option::is_none should be absent.
        assert!(!json.contains("\"db_path\""), "db_path must be absent when None");
        assert!(!json.contains("\"db_queries\""), "db_queries must be absent when None");
        assert!(
            !json.contains("\"finding_xrefs_written\""),
            "finding_xrefs_written must be absent when None"
        );
        assert!(!json.contains("\"detectors\""), "detectors must be absent when None");
        assert!(!json.contains("\"timings_ms\""), "timings_ms must be absent when None");
        assert!(!json.contains("\"trufflehog\""), "trufflehog must be absent when None");
        assert!(!json.contains("\"semgrep\""), "semgrep must be absent when None");
        assert!(!json.contains("\"apk_summary\""), "apk_summary must be absent when None");
    }

    // ── backward compat: envelopes written before additive fields existed ─

    #[test]
    fn legacy_envelope_without_additive_fields_parses() {
        // `apk_summary` is already absent from the minimal envelope's JSON
        // (it is `None`); additionally strip `meta.thread_pool_size` to mimic
        // an envelope emitted before either field was introduced.
        let mut value = serde_json::to_value(minimal_envelope()).expect("serialize");
        let meta = value
            .get_mut("meta")
            .and_then(serde_json::Value::as_object_mut)
            .expect("meta must serialize as a JSON object");
        assert!(
            meta.remove("thread_pool_size").is_some(),
            "thread_pool_size must be emitted by current envelopes"
        );

        let back: AuditEnvelope = serde_json::from_value(value).expect("deserialize");
        assert!(back.apk_summary.is_none(), "absent apk_summary must read as None");
        assert_eq!(
            back.meta.thread_pool_size, 0,
            "absent thread_pool_size must fall back to 0 (unknown)"
        );
    }

    // ── symmetric: CLI emits, MCP can parse ──────────────────────────────

    #[test]
    fn cli_emits_mcp_parses() {
        // Simulate a CLI emission (findings inline, no db_path).
        let mut env = minimal_envelope();
        env.finding_count = 1;
        env.findings_emitted = 1;
        env.severity_summary.insert("High".to_string(), 1);
        env.detectors = Some(serde_json::json!({"semgrep": {"status": "skipped_by_mode"}}));
        env.meta = AuditMeta {
            count: 1,
            truncated: false,
            hint: "filter by severity via jq".to_string(),
            related: vec!["export".to_string(), "audit".to_string()],
            thread_pool_size: 1,
        };

        let cli_json = serde_json::to_string(&env).expect("serialize");

        // MCP side deserializes the same JSON (no db_path, has detectors).
        let mcp_view: AuditEnvelope = serde_json::from_str(&cli_json).expect("deserialize");
        assert_eq!(mcp_view.schema_version, AUDIT_ENVELOPE_VERSION);
        assert_eq!(mcp_view.finding_count, 1);
        assert!(mcp_view.db_path.is_none());
        assert!(mcp_view.detectors.is_some());
    }

    // ── apk_summary: round-trip with populated values ─────────────────

    #[test]
    fn apk_summary_roundtrip() {
        let mut env = minimal_envelope();
        env.apk_summary = Some(ApkSummary {
            has_hbc: true,
            hbc_bytes: 1234,
            hbc_function_count: 42,
            dex_count: 2,
            dex_total_bytes: 5678,
            dex_methods_total: 80_000,
            dex_classes_total: 5_500,
        });

        let json = serde_json::to_string(&env).expect("serialize");

        // The field must be present in JSON when Some.
        assert!(json.contains("\"apk_summary\""), "apk_summary must appear in JSON when Some");

        let back: AuditEnvelope = serde_json::from_str(&json).expect("deserialize");
        let summary = back.apk_summary.expect("apk_summary must survive round-trip");
        assert!(summary.has_hbc, "has_hbc must survive round-trip");
        assert_eq!(summary.hbc_bytes, 1234, "hbc_bytes must survive round-trip");
        assert_eq!(summary.hbc_function_count, 42, "hbc_function_count must survive round-trip");
        assert_eq!(summary.dex_count, 2, "dex_count must survive round-trip");
        assert_eq!(summary.dex_total_bytes, 5678, "dex_total_bytes must survive round-trip");
        assert_eq!(summary.dex_methods_total, 80_000, "dex_methods_total must survive round-trip");
        assert_eq!(summary.dex_classes_total, 5_500, "dex_classes_total must survive round-trip");
    }

    // ── symmetric: MCP emits, CLI can parse ──────────────────────────────

    #[test]
    fn mcp_emits_cli_parses() {
        let mut env = minimal_envelope();
        env.finding_count = 5;
        env.findings_emitted = 7;
        env.db_path = Some("/tmp/test.db".to_string());
        env.top_findings = vec![serde_json::json!({"severity": "High", "id": "FOO"})];
        env.truncated = false;
        env.timings_ms = Some(serde_json::json!({"core_total": 800}));
        env.meta = AuditMeta {
            count: 1,
            truncated: false,
            hint: "pair with query".to_string(),
            related: vec!["query".to_string()],
            thread_pool_size: 1,
        };

        let mcp_json = serde_json::to_string(&env).expect("serialize");

        // CLI side deserializes the MCP JSON (has db_path, no inline findings).
        let cli_view: AuditEnvelope = serde_json::from_str(&mcp_json).expect("deserialize");
        assert_eq!(cli_view.schema_version, AUDIT_ENVELOPE_VERSION);
        assert_eq!(cli_view.finding_count, 5);
        assert_eq!(cli_view.db_path.as_deref(), Some("/tmp/test.db"));
        assert!(cli_view.findings.is_empty());
        assert_eq!(cli_view.top_findings.len(), 1);
    }
}