// droidsaw 2.0.0 - Docs.rs

#![cfg_attr(not(test), allow(clippy::as_conversions, reason = "PROOF (4 sites): all `as` casts in this module are `MethodIdx (u32 newtype) .0 as usize` or `usize as u32` (raw_idx enumeration of dex.method_ids). The widening direction is lossless on 64-bit (droidsaw's supported target set); the narrowing direction is bounded by the DEX format's u32-cap on `method_ids_size` per spec (existing per-site `cast_possible_truncation` allow at line ~600 already encodes this PROOF). `.get()` handles any OOB by returning None."))]

use std::collections::{BTreeMap, BTreeSet};
use ahash::AHashMap;
use rustc_hash::{FxHashMap, FxHashSet};
use droidsaw_dex::DexFile;
use droidsaw_dex::ids::MethodIdx;
use droidsaw_dex::ssa::{SsaBody, VarId};
use droidsaw_dex::opcodes::Opcode;
use droidsaw_dex::decode::{parse_class_data, parse_code_item, CodeItem, PoolIndex};
use droidsaw_dex::cfg::Cfg;
pub use droidsaw_common::analysis::{TaintSource, TaintSink, TaintFinding};
use droidsaw_common::finding::Layer;

/// Per-call cache for interprocedural callee builds.
///
/// Maps `(owner_dex, code_off)` to a shared `(CodeItem, Cfg, SsaBody)`
/// triple.
///
/// `interproc_inner` rebuilds `(CodeItem, Cfg, SsaBody)` for every
/// distinct callee it follows. Within a single top-level
/// `run_interprocedural` call, the fixpoint loop (`while changed`)
/// can revisit the same Invoke instruction multiple times as taint
/// propagates through phis, AND fan-in from different invoke sites
/// can reach the same callee through different chains. Both
/// scenarios re-trigger parse → CFG → SSA construction on bytes
/// that haven't changed.
///
/// The cache keys on `(owner_dex, code_off)` — `code_off` uniquely
/// identifies a method's bytecode within its DEX file. Values are
/// `Rc<...>` so a cache lookup can clone the handle and drop the
/// cache borrow before passing `&mut BuildCache` into the recursive
/// call. The recursion is single-threaded (one rayon worker owns
/// the whole chain), so `Rc` is sufficient — no `Arc` overhead.
///
/// `AHashMap` rather than `FxHashMap`: `code_off` is read from parsed
/// DEX content and is therefore attacker-controllable. A crafted DEX
/// with many methods at `code_off`s that collide under FxHash would
/// bucket-bloat the cache and degrade per-call O(1) to O(N). AHash's
/// randomized seed defeats this; the cost (~1.2-1.5× slower per lookup)
/// is negligible compared to the parse+CFG+SSA construction the cache
/// amortizes.
type BuildCache = AHashMap<(usize, u32), std::rc::Rc<(CodeItem, Cfg, SsaBody)>>;

/// Maximum number of entries a single `BuildCache` may hold (2^12 = 4096).
///
/// Every entry owns an `Rc<(CodeItem, Cfg, SsaBody)>`, which is a
/// non-trivial allocation (MB-scale on large methods). Left uncapped, an
/// adversarial DEX exposing many unique callee `code_off`s through the
/// interproc fixpoint could grow the cache into GB territory — an
/// OOM-class DoS path.
///
/// Once the cap is reached, new entries are simply not inserted: the
/// cache turns best-effort and later lookups redo the parse/CFG/SSA work,
/// which is correct but slower. The cache is per-call, so its lifetime is
/// bounded by `run_interprocedural`.
///
/// Sizing: the realistic upper bound on production input is low thousands
/// of unique callees per interproc call, so 4096 leaves an order of
/// magnitude of headroom over observed production behaviour. Follows the
/// `MAX_AXML_FINDINGS = 1024` pattern in
/// `droidsaw-apk/src/binary_xml.rs:155`.
const MAX_BUILD_CACHE_ENTRIES: usize = 1 << 12;

/// Maximum number of entries `ClassAnalysis::cha_cache` may hold
/// (2^16 = 65536).
///
/// An entry is a `((u32, u32), Option<(usize, u32)>)` — ~32 bytes plus
/// HashMap overhead — so a full cache costs roughly 2-4 MB. The key is
/// `(class_id, method_name_id)`; both are interned u32s sourced from
/// parsed DEX content, so the cache size is bounded by
/// `class_count * method_name_count` of the loaded APK. On the production
/// corpus that product sits in the 10⁴-10⁵ range for large APKs; an
/// adversarial DEX could push it higher (unique method names per class).
/// Once the cap is reached, new entries are not inserted and CHA lookups
/// redo the candidate iteration, which is correct but slower.
///
/// Note: the cap bounds *memory*, not *work*. Past the cap, every lookup
/// re-iterates the `by_method_name` candidates — CPU-class amplification
/// on adversarial input. Total work is bounded by the audit-level wall
/// deadline (`ParseBudget::deadline` in `droidsaw-common/src/budget.rs`),
/// which is the load-bearing CPU bound for adversarial workloads. The two
/// cooperate: this cap removes the OOM path, the budget removes the
/// runaway-CPU path.
const MAX_CHA_CACHE_ENTRIES: usize = 1 << 16;

/// Finding id surfaced by the audit pipeline when `cha_cache` hits its
/// entry-count cap (`MAX_CHA_CACHE_ENTRIES`) during a run.
///
/// Info-severity summary: a non-zero count signals that CHA-resolved
/// virtual-call findings past the cap point came from re-iteration
/// rather than from the cache (still correct, no semantic regression).
/// Mirrors the `AXML_FINDINGS_TRUNCATED` discipline from
/// `droidsaw-apk/src/binary_xml.rs:153`.
pub const CHA_CACHE_ENTRY_CAP_HIT: &str = "CHA_CACHE_ENTRY_CAP_HIT";

/// Finding id surfaced when one or more `BuildCache` inserts were skipped
/// during a run because the cache had reached its entry-count cap
/// (`MAX_BUILD_CACHE_ENTRIES`). See `CHA_CACHE_ENTRY_CAP_HIT` for the
/// discipline rationale.
pub const BUILD_CACHE_ENTRY_CAP_HIT: &str = "BUILD_CACHE_ENTRY_CAP_HIT";

/// Cross-DEX class analysis: code index + class hierarchy + method implementations.
/// Built once by `DexTaintAnalysis::collect_unified_code_index` and reused across
/// all taint analysis passes for the same APK.
pub struct ClassAnalysis {
    /// (caller_dex, MethodIdx) → (owner_dex, code_off) for O(log n) callee lookup.
    pub code_index: BTreeMap<(usize, MethodIdx), (usize, u32)>,
    /// class_desc → direct superclass_desc (first occurrence wins across DEX files).
    pub superclass: BTreeMap<String, String>,
    /// (class_desc, method_name) → (owner_dex, code_off) — concrete implementations only.
    pub method_impls: BTreeMap<(String, String), (usize, u32)>,
    /// Intern table: class descriptor → compact u32 id. Populated for every
    /// class_desc that appears as either a subtype or supertype in the
    /// `supertypes` closure. Lets downstream structures key on u32 instead
    /// of String → is_subtype's inner loop runs on ~5-cycle integer hashes
    /// rather than ~35-cycle string hashes + memcmp. See `supertypes`
    /// docstring for the measured CPU cost of string keys.
    ///
    /// DETERMINISM: This hash map does not flow into evidence envelope output.
    /// Iteration order is internal-only: `class_id` is used only for O(1) intern
    /// lookups in `virtual_single_impl` and `collect_unified_code_index`. Output
    /// from `DexTaintAnalysis` flows through `Vec<TaintFinding>` → findings DB →
    /// SQL `ORDER BY (severity, layer, id_tag, rowid)` in `produce_unsigned_envelope`.
    /// If output usage is ever added, convert to BTreeMap or sort before serialization.
    /// See `droidsaw/src/threat_model/envelope.rs:13` for the canonical discipline.
    ///
    /// DoS: String key, attacker-controlled (class descriptors come from
    /// parsed DEX). Uses `AHashMap` for randomized hash seeding — FxHash
    /// is bijective on integer keys but vulnerable to crafted-string
    /// collision attacks. AHash's per-process seed prevents that.
    pub class_id: AHashMap<String, u32>,
    /// Intern table: method_name → compact u32 id. Populated alongside
    /// `by_method_name`. Lets `cha_cache` key on (u32, u32)
    /// instead of (&str, &str) — no string hashing on the cache lookup
    /// hot path. Same determinism rationale as `class_id`: internal-only,
    /// never serialized. Same DoS rationale as `class_id`: method names
    /// come from parsed DEX (attacker-controlled) — AHash, not FxHash.
    pub method_name_id: AHashMap<String, u32>,
    /// Secondary index: method_name → list of (class_id, dex, code_off)
    /// that implement that method name. Class id is the interned u32 from
    /// `class_id`. Avoids the O(n_total_methods) full scan in
    /// `virtual_single_impl` — without this index, CHA dispatch resolution
    /// dominates audit CPU on large Play APKs.
    pub by_method_name: BTreeMap<String, Vec<(u32, usize, u32)>>,
    /// CHA-result memoization: caches `virtual_single_impl` results, shared
    /// across rayon workers for the lifetime of a single audit.
    ///
    /// Key is `(declared_class_id, method_name_id)` — both u32 interned in
    /// `ClassAnalysis::class_id` / `ClassAnalysis::method_name_id`. Value is
    /// the cached `Option<(owner_dex, code_off)>` — exactly what
    /// `virtual_single_impl` would return.
    ///
    /// Lives on `ClassAnalysis` itself behind an `RwLock`. Real APKs hit the
    /// same `(declared_class, method_name)` pair from many method-task
    /// workers (e.g. `Cursor.getString`, `Intent.getStringExtra`, `Log.d`
    /// queried while taint-analysing every method that calls them).
    /// `virtual_single_impl` is the hot path in large-APK audits; a per-call
    /// cache reduces within-call redundancy, and promoting the cache to
    /// `ClassAnalysis` lets warmed entries serve all workers.
    ///
    /// Inserts stop once the map holds `MAX_CHA_CACHE_ENTRIES` entries;
    /// skipped inserts are counted in `cha_cache_truncations`.
    ///
    /// DETERMINISM: This cache does not flow into evidence-envelope output.
    /// Same rationale as `class_id` / `supertypes`. See those docstrings.
    pub cha_cache: std::sync::RwLock<FxHashMap<(u32, u32), Option<(usize, u32)>>>,
    /// Transitive supertype closure per class id, including the class itself.
    /// `is_subtype(cd, target)` is one FxHashMap+FxHashSet lookup on u32
    /// pairs — no string comparisons.
    ///
    /// DETERMINISM: This FxHashMap (and its FxHashSet values) does not flow
    /// into evidence envelope output. Iteration is internal-only: `supertypes`
    /// is queried only by `is_subtype_id` during CHA dispatch resolution, whose
    /// result is a bool — not serialized. If output usage is ever added, sort the
    /// set into a Vec<u32> before serialization. See envelope.rs:13 for the
    /// canonical avoid-iteration-order discipline.
    pub supertypes: FxHashMap<u32, FxHashSet<u32>>,
    /// Methods skipped during `collect_unified_code_index` because
    /// `get_string` or `get_type_descriptor` returned `None` (corrupt
    /// class/method pool entry). Zero on well-formed DEX input.
    pub corrupted_methods_skipped: usize,
    /// Count of `cha_cache` inserts that were skipped because the cap
    /// (`MAX_CHA_CACHE_ENTRIES`) was hit. Atomic because rayon workers
    /// concurrently traverse `virtual_single_impl`. Read once per audit
    /// after the par_iter completes; if > 0 the audit pipeline emits a
    /// `CHA_CACHE_ENTRY_CAP_HIT` Finding mirroring the
    /// `AXML_FINDINGS_TRUNCATED` discipline.
    pub cha_cache_truncations: std::sync::atomic::AtomicUsize,
    /// Count of `BuildCache` inserts skipped because the cap
    /// (`MAX_BUILD_CACHE_ENTRIES`) was hit, summed across every per-method
    /// `run_interprocedural` call. Per-call caches are not visible to the
    /// caller of `run_interprocedural` directly, so we accumulate via
    /// this shared atomic. Emits `BUILD_CACHE_ENTRY_CAP_HIT` Finding if > 0.
    pub build_cache_truncations: std::sync::atomic::AtomicUsize,
}

impl ClassAnalysis {
    /// u32-keyed subtype check: is `class_id` equal to `target_id`, or is
    /// `target_id` in the recorded supertype set of `class_id`?
    ///
    /// Called from `virtual_single_impl` where both sides have already been
    /// interned. Integer hash + integer set contains — no string comparisons
    /// on the hot path. A class id with no `supertypes` entry is a subtype
    /// only of itself.
    #[inline]
    fn is_subtype_id(&self, class_id: u32, target_id: u32) -> bool {
        class_id == target_id
            || self
                .supertypes
                .get(&class_id)
                .is_some_and(|ancestors| ancestors.contains(&target_id))
    }

    /// Class Hierarchy Analysis: find the unique concrete implementation of
    /// `method_name` in `declared_class` or any of its subclasses.
    ///
    /// Returns `Some((owner_dex, code_off))` only when exactly one concrete
    /// implementation exists — monomorphic call site, safe to follow.
    /// Returns `None` for abstract or polymorphic call sites, and when
    /// either `declared_class` or `method_name` is absent from the intern
    /// tables.
    ///
    /// Results are memoized in `cha_cache` up to `MAX_CHA_CACHE_ENTRIES`
    /// entries; inserts skipped because of the cap are counted in
    /// `cha_cache_truncations`.
    pub fn virtual_single_impl(
        &self,
        declared_class: &str,
        method_name: &str,
    ) -> Option<(usize, u32)> {
        // Intern both sides before consulting the cache. `class_id` and
        // `method_name_id` were populated at ClassAnalysis construction.
        // A miss on either ID table means the call site references a
        // class/method that no loaded DEX implements — return None
        // without touching the cache (matches pre-cache behavior).
        let &declared_id = self.class_id.get(declared_class)?;
        let &method_id = self.method_name_id.get(method_name)?;
        let key = (declared_id, method_id);

        // Read path: hot. Take the read lock, fetch, drop the lock before
        // any potential write. `RwLock` allows many concurrent readers,
        // which matches the hit-heavy steady state after warm-up across
        // the rayon par_iter workers. A poisoned lock is treated as a miss.
        if let Ok(cache) = self.cha_cache.read()
            && let Some(&cached) = cache.get(&key)
        {
            return cached;
        }

        // Cache miss — do the iteration. `by_method_name` is keyed by the
        // raw string, not the interned id, because the construction-time
        // lookup is a one-time cost and switching to u32 keys would not
        // change the candidate iteration shape.
        let candidates = self.by_method_name.get(method_name)?;
        let mut implementations = candidates
            .iter()
            .filter(|&&(cd_id, _, _)| self.is_subtype_id(cd_id, declared_id))
            .map(|&(_, dex_idx, code_off)| (dex_idx, code_off));
        // Pull at most two matches: exactly one means monomorphic; zero
        // (abstract) or a second match (polymorphic) both resolve to None.
        // The scan stops as soon as the second match is seen.
        let result = match (implementations.next(), implementations.next()) {
            (Some(only), None) => Some(only),
            _ => None,
        };

        // Write path: take the write lock briefly. Lost-update races on
        // the same key are harmless — CHA is a pure function of the
        // immutable `ClassAnalysis`, so racing writers always insert the
        // same value. We don't `expect` the lock: a poisoned lock on a
        // pure-memoization cache is a recoverable miss, not a crash path.
        // Entry-count cap (`MAX_CHA_CACHE_ENTRIES`): when full, skip the
        // insert rather than evict. Subsequent lookups re-do the candidate
        // iteration — correct but slower. Bounds adversarial input (many
        // unique class/method-name pairs) from ballooning the cache.
        if let Ok(mut cache) = self.cha_cache.write() {
            // A racing worker may have stored this key between our read and
            // write locks. Re-inserting an existing key does not grow the
            // map, so it must not be reported as a cap-induced skip —
            // otherwise a run whose distinct keys fit the cap exactly could
            // raise a spurious, race-dependent truncation count.
            if cache.len() < MAX_CHA_CACHE_ENTRIES || cache.contains_key(&key) {
                cache.insert(key, result);
            } else {
                // Cap hit — record the skip so the audit pipeline can emit
                // a CHA_CACHE_ENTRY_CAP_HIT Finding.
                self.cha_cache_truncations
                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            }
        }
        result
    }
}

/// Result of a DEX taint-analysis pass: the findings it produced plus
/// per-run telemetry. Returned by the `DexTaintAnalysis::run*` entry points.
pub struct DexTaintAnalysis {
    /// Taint findings produced by the pass.
    pub findings: Vec<TaintFinding>,
    /// Number of invoke instructions skipped because `get_string` or
    /// `get_type_descriptor` returned `None` (corrupt method pool entry).
    /// Non-zero only on adversarial/corrupt DEX input; zero on well-formed
    /// files. Callers can surface this in audit telemetry.
    pub corrupted_methods_skipped: usize,
    /// Per-finding union of seed positions that propagated to the tainted
    /// operand at the sink, mapped via the `seed_positions` argument passed to
    /// [`DexTaintAnalysis::run_interprocedural_bridge`]. Index-aligned
    /// with `findings`. Empty for non-bridge callers (`run`,
    /// `run_with_seeds`, plain `run_interprocedural`).
    ///
    /// Each position is the JS-side arg index (`param_vars[i]` slot
    /// shifted so position=0 corresponds to JS arg0; the implicit
    /// `this` slot at `param_vars[0]` is dropped by the caller before
    /// the seed_positions map is built). Downstream analysis intersects
    /// this against `TaintSink::NativeModuleArg.arg_positions` from
    /// the HBC side to validate composable cross-layer flows.
    pub bridge_sink_reachable_positions: Vec<BTreeSet<usize>>,
}

// NOTE: the comment block below documents `classify_source`, which is
// defined further down in this file. It is deliberately a plain comment
// (not `///`) so rustdoc does not attach it to `ACC_NATIVE`, the next item.
//
// Classify a method call as a taint source given the method name and
// receiver class descriptor.
//
// **Covered sources.** Intent extras, `SharedPreferences` reads,
// `ContentProvider`/`Cursor` column reads, HTTP response body
// (OkHttp/Retrofit `ResponseBody`), `InputStream`/`Reader`/`Scanner`
// reads, and `EditText`/`TextView`/`SearchView` user-input widgets.
//
// **Not covered (coverage gaps).** The following source classes are not
// currently recognised and will return `None`:
// - `ClipboardManager` get (clipboard read — `getPrimaryClip`,
//   `getText`): a common inter-app data exfil channel.
// - `TelephonyManager` / `SmsManager` inbound SMS body reads.
// - `LocationManager` / `FusedLocationClient` location fixes
//   (latitude/longitude as a taint source).
// - `Camera` / `MediaRecorder` capture callbacks.
// - `BluetoothSocket` / `BluetoothGatt` characteristic reads.
// - `BroadcastReceiver.onReceive` intent extras (covered at the
//   component level only when the intent is accessed via `getExtra*`
//   — a `BroadcastReceiver` subclass that reads its argument directly
//   is not detected).

/// DEX `access_flags` bit indicating a method's implementation lives on
/// the native side of the JNI boundary. Per the DEX format spec
/// (`access_flags` encoding for `encoded_method`); matches the
/// `ACC_NATIVE` constant in JVM specifications.
const ACC_NATIVE: u32 = 0x0100;

/// Collect the `MethodIdx` of every method in `dex` whose `access_flags`
/// have the `ACC_NATIVE` bit set. These are the per-DEX JNI handoff
/// points the JNI-bridges minimum-tier gate surfaces in the
/// [`TaintSink::NativeMethod`] sink emit.
///
/// Each `class_def` is visited once and its `class_data` is parsed from
/// `raw`; both `direct_methods` and `virtual_methods` are inspected.
/// A class_def contributes nothing when:
/// - it is a shadowed duplicate-`class_idx` row. An attacker controls the
///   shadow row's class_data, so admitting it would make the set reflect
///   rows that `class_def_for_type` never resolves; skipping keeps the
///   first-wins canonical view.
/// - its `class_data_off` is 0 (interface placeholder / corrupted record).
/// - `parse_class_data` fails. Such errors are silently skipped — the
///   caller's `dex.parse_errors` channel surfaces structurally-corrupt
///   class_data for analyst review elsewhere; this helper only enumerates
///   the well-formed native methods.
///
/// An empty `BTreeSet` is the natural "no native methods" sentinel and
/// makes downstream `.contains(&m_idx)` a no-op gate.
pub fn collect_native_methods(dex: &DexFile, raw: &[u8]) -> BTreeSet<MethodIdx> {
    // Offsets of the class_data records worth parsing: canonical
    // (non-shadowed) rows first, then rows that actually carry class_data.
    let class_data_offsets = dex
        .class_defs
        .iter()
        .enumerate()
        .filter(|&(row, _)| !dex.class_def_is_shadowed(row))
        .map(|(_, def)| def.class_data_off)
        .filter(|&off| off != 0);

    let mut native_methods = BTreeSet::new();
    for class_data_off in class_data_offsets {
        // Unparseable class_data is skipped, not reported, by design.
        if let Ok(class_data) = parse_class_data(raw, class_data_off) {
            native_methods.extend(
                class_data
                    .direct_methods
                    .iter()
                    .chain(class_data.virtual_methods.iter())
                    .filter(|method| method.access_flags & ACC_NATIVE != 0)
                    .map(|method| method.method_idx),
            );
        }
    }
    native_methods
}

/// Run [`collect_native_methods`] once per DEX in `dex_files` and return
/// the results as a `Vec<BTreeSet<MethodIdx>>` index-aligned with
/// `dex_files`.
///
/// The caller computes this once, above its per-method loop, and threads
/// the slice through `run_interprocedural` / `run_interprocedural_bridge`
/// into `interproc_inner`. Because it re-parses every class_data in the
/// APK it must stay hoisted out of the per-source loop: calling it per
/// source is O(sources × class_data) and hangs large multidex apps. The
/// recursive per-callee walks then gate-check via `.contains(&m_idx)` in
/// O(log N) against the precomputed set.
///
/// `dex_data` is index-aligned with `dex_files`; on a length mismatch
/// the missing-tail DEXes contribute empty sets (treated as "no native
/// methods to gate" by the caller).
pub fn collect_native_methods_per_dex(
    dex_files: &[DexFile],
    dex_data: &[&[u8]],
) -> Vec<BTreeSet<MethodIdx>> {
    let mut per_dex = Vec::with_capacity(dex_files.len());
    for (slot, dex) in dex_files.iter().enumerate() {
        // A DEX with no matching byte slice gets the empty-set sentinel.
        let native_methods = dex_data
            .get(slot)
            .map(|bytes| collect_native_methods(dex, bytes))
            .unwrap_or_default();
        per_dex.push(native_methods);
    }
    per_dex
}

/// Classify a method call as a taint source given the method name and
/// receiver class descriptor.
///
/// Matching is by exact method name plus, for every family except Intent
/// extras, a substring test on `class_desc`. Arms are tried top to bottom,
/// so a name shared by two families (e.g. `getString`) resolves to the
/// first family whose class test passes. Keys/URIs/endpoints in the
/// returned source are left as empty strings.
///
/// **Covered sources.** Intent extras, `SharedPreferences` reads,
/// `ContentProvider`/`Cursor` column reads, HTTP response body
/// (OkHttp/Retrofit `ResponseBody`), `InputStream`/`Reader`/`Scanner`
/// reads, and `EditText`/`TextView`/`SearchView` user-input widgets.
///
/// **Not covered (coverage gaps).** The following source classes are not
/// currently recognised and will return `None`:
/// - `ClipboardManager` get (clipboard read — `getPrimaryClip`,
///   `getText`): a common inter-app data exfil channel.
/// - `TelephonyManager` / `SmsManager` inbound SMS body reads.
/// - `LocationManager` / `FusedLocationClient` location fixes
///   (latitude/longitude as a taint source).
/// - `Camera` / `MediaRecorder` capture callbacks.
/// - `BluetoothSocket` / `BluetoothGatt` characteristic reads.
/// - `BroadcastReceiver.onReceive` intent extras (covered at the
///   component level only when the intent is accessed via `getExtra*`
///   — a `BroadcastReceiver` subclass that reads its argument directly
///   is not detected).
///
/// The dispatch table for covered sources is the `match method_name`
/// block below; extend it to close gaps.
fn classify_source(method_name: &str, class_desc: &str) -> Option<TaintSource> {
    // True when the receiver descriptor contains any of `needles`.
    let receiver_mentions =
        |needles: &[&str]| needles.iter().any(|&needle| class_desc.contains(needle));

    let source = match method_name {
        // Intent extras — most common entry point for inter-component taint.
        "getStringExtra" | "getIntExtra" | "getLongExtra" | "getBooleanExtra"
        | "getBundleExtra" | "getParcelableExtra" | "getSerializableExtra" => {
            TaintSource::IntentExtra { key: String::new() }
        }

        // SharedPreferences
        "getString" | "getInt" | "getLong" | "getBoolean" | "getFloat"
            if receiver_mentions(&["SharedPreferences", "Editor"]) =>
        {
            TaintSource::SharedPreferencesRead { key: String::new() }
        }

        // ContentProvider / Cursor column reads
        "getString" | "getInt" | "getLong" | "getBlob" if receiver_mentions(&["Cursor"]) => {
            TaintSource::ContentProviderQuery { uri: String::new() }
        }

        // HTTP response body (OkHttp / Retrofit)
        "string" | "bytes" | "byteStream" | "charStream"
            if receiver_mentions(&["ResponseBody", "Response"]) =>
        {
            TaintSource::NetworkResponse { endpoint: String::new() }
        }

        // InputStream / Reader
        "readLine" | "read" | "readUTF" | "readAllBytes"
            if receiver_mentions(&["InputStream", "Reader", "Scanner"]) =>
        {
            TaintSource::FileRead { path_pattern: String::new() }
        }

        // EditText / user input widgets
        "getText" | "getQuery"
            if receiver_mentions(&["EditText", "TextView", "SearchView"]) =>
        {
            TaintSource::UserInput
        }

        // Unrecognised name, or recognised name on an unrelated receiver.
        _ => return None,
    };
    Some(source)
}

/// Classify a method call as a taint sink given the method name and
/// receiver class descriptor.
///
/// Matching is by exact method name plus a substring test on `class_desc`;
/// arms are tried top to bottom, so a name shared by two families (e.g.
/// `update`) resolves to the first family whose class test passes.
///
/// **Covered sinks.** `Runtime.exec` / `ProcessBuilder` (command
/// execution), `WebView.loadUrl` / `loadData` / `loadDataWithBaseURL`
/// (open-redirect/XSS), `SQLiteDatabase` / `rawQuery` (SQL injection),
/// `android/util/Log` (logcat leak), `Method.invoke` (reflection),
/// `ContentProvider` / `ContentResolver` insert/update/delete (IPC
/// boundary crossing), OkHttp/Retrofit outbound requests (network
/// exfiltration), and `Cipher`/`Mac`/`MessageDigest` init/update/doFinal
/// (tainted crypto material).
///
/// **Deliberately not a sink.** Stream/writer *data* writes (`write` /
/// `writeBytes` / `println` on `OutputStream` / `Writer` / `FileChannel`)
/// return `None` — see the NOTE inside the match. Tainted file *paths*
/// are handled separately by `classify_path_sink`.
///
/// **Not covered (coverage gaps).** The following sink classes are not
/// currently recognised and will return `None`:
/// - `ClipboardManager` set (`setPrimaryClip`, `setText`): tainted data
///   placed on the clipboard is readable by any foreground app.
/// - `sendBroadcast` / `sendOrderedBroadcast`: intent-based exfiltration
///   to third-party receivers.
/// - `Socket` / `SSLSocket` raw network writes: neither the
///   `getOutputStream().write` chain nor the constructor
///   `new Socket(host, port)` as an exfil signal is recognised.
/// - `SharedPreferences.Editor.putString` / `putInt` etc. (tainted data
///   persisted to shared prefs for later cross-process read).
/// - `NotificationManager.notify` (tainted data surfaced in system UI /
///   notification shade).
///
/// The dispatch table for covered sinks is the `match method_name` block
/// below; extend it to close gaps.
fn classify_sink(method_name: &str, class_desc: &str) -> Option<TaintSink> {
    match method_name {
        // Command execution
        "exec" if class_desc.contains("Runtime") || class_desc.contains("ProcessBuilder") => {
            Some(TaintSink::RuntimeExec)
        }

        // WebView navigation — open redirect / XSS
        "loadUrl" | "loadData" | "loadDataWithBaseURL"
            if class_desc.contains("WebView") =>
        {
            Some(TaintSink::WebViewLoadUrl)
        }

        // SQL
        "execSQL" | "rawQuery" | "executeQuery" | "compileStatement"
            if class_desc.contains("SQLite")
                || class_desc.contains("Database")
                || class_desc.contains("Cursor") =>
        {
            Some(TaintSink::SqlExecute)
        }

        // Android Log — sensitive data leaks to logcat. The exact
        // descriptor `Landroid/util/Log;` is matched by this substring
        // test, so no separate equality check is needed.
        "d" | "e" | "w" | "v" | "i" | "wtf" if class_desc.contains("android/util/Log") => {
            Some(TaintSink::LogOutput)
        }

        // Reflection — taint flowing into dynamic invocation
        "invoke"
            if class_desc.contains("Method")
                || class_desc.contains("reflect") =>
        {
            Some(TaintSink::ReflectionInvoke { class: String::new() })
        }

        // NOTE: a tainted value reaching a stream/writer *data*-write
        // (`write`/`writeBytes`/`println` on OutputStream/Writer/FileChannel)
        // is NOT a sink here. That pattern is the read-then-write file-copy
        // idiom — the taint is the DATA, never the path — so it does not
        // substantiate CWE-22 (path traversal). Real path traversal is
        // detected by `classify_path_sink` below, which fires only when the
        // taint is in the PATH argument of a file-open call.

        // ContentProvider insert/update (data crossing component boundary)
        "insert" | "update" | "delete"
            if class_desc.contains("ContentProvider")
                || class_desc.contains("ContentResolver") =>
        {
            Some(TaintSink::ContentProviderInsert { uri: String::new() })
        }

        // HTTP request body (OkHttp RequestBody, Retrofit @Body)
        "newCall" | "execute" | "enqueue"
            if class_desc.contains("OkHttpClient")
                || class_desc.contains("Call")
                || class_desc.contains("Retrofit") =>
        {
            Some(TaintSink::HttpRequest { method: String::new() })
        }

        // Crypto — tainted key/IV material. `update` reaches this arm only
        // when the ContentProvider arm above did not match the receiver.
        "init" | "doFinal" | "update"
            if class_desc.contains("Cipher")
                || class_desc.contains("Mac")
                || class_desc.contains("MessageDigest") =>
        {
            Some(TaintSink::CryptoInput { operation: method_name.to_string() })
        }

        _ => None,
    }
}

/// Recognize a file-OPEN operation and return the **logical argument
/// positions** (0-based, receiver excluded) that carry the file PATH, or
/// `None` when the `(class_desc, method_name)` pair is not a known
/// path-taking call. Class descriptors are matched exactly.
///
/// A tainted value in one of the returned positions means the attacker
/// steers *where* a file is opened/written — CWE-22 path traversal. The
/// caller gates the [`TaintSink::FilePathTraversal`] sink on the taint
/// landing in exactly these positions, so the CWE-22 claim is substantiated
/// (the path is tainted), not assumed from a mere read→write data flow (that
/// data-side pattern is the benign file-copy idiom and is deliberately not a
/// sink — see `classify_sink`).
///
/// Coverage is the write-side `java.io` / `java.nio.file` open surface. The
/// path is conventionally the first argument; `new File(parent, child)`
/// carries two. Trailing mode/append/charset/options arguments are
/// deliberately excluded.
///
/// Range-form invokes (`invoke-*/range`, >5 registers) never reach this
/// function: the enclosing match arm at the two call sites admits only
/// `InvokeVirtual`/`InvokeInterface`/`InvokeStatic`/`InvokeDirect`, so the
/// caller's `matches!(insn.op, Opcode::InvokeStatic)` static-vs-non-static
/// test is sound *because* the opcode set is pre-filtered upstream. TRAP:
/// widening that arm to admit `invoke-static/range` (a distinct opcode)
/// without also updating the static test would misclassify a ranged static
/// call as non-static and shift the path-arg mapping by one — revisit both
/// together.
fn classify_path_sink(method_name: &str, class_desc: &str) -> Option<&'static [usize]> {
    // Path in the first argument only.
    const FIRST_ARG: &[usize] = &[0];
    // Path split across the first two arguments.
    const FIRST_TWO_ARGS: &[usize] = &[0, 1];

    match (class_desc, method_name) {
        // `new File(path)` / `new File(parent, child)` — both segments are path.
        ("Ljava/io/File;", "<init>") => Some(FIRST_TWO_ARGS),

        // Write-side stream/writer constructors — `new FileOutputStream(path[,
        // append])`, `new FileWriter(path[, …])`, `new RandomAccessFile(path,
        // mode)`, `new PrintWriter(path)`, … — path at arg 0; trailing
        // mode/append/charset excluded.
        (
            "Ljava/io/FileOutputStream;"
            | "Ljava/io/FileWriter;"
            | "Ljava/io/RandomAccessFile;"
            | "Ljava/io/PrintWriter;"
            | "Ljava/io/PrintStream;",
            "<init>",
        ) => Some(FIRST_ARG),

        // java.nio.file write-side static APIs — `Files.write(path, …)`,
        // `Files.newOutputStream(path, …)`, `Files.createFile(path)`, … — path
        // at arg 0, data/options follow.
        (
            "Ljava/nio/file/Files;",
            "write" | "newOutputStream" | "newBufferedWriter" | "createFile"
            | "createDirectories",
        ) => Some(FIRST_ARG),

        // `Paths.get(first, …)` (static) and `path.resolve(child)` (instance) —
        // the attacker-steered path segment is logical arg 0.
        ("Ljava/nio/file/Paths;", "get") => Some(FIRST_ARG),
        ("Ljava/nio/file/Path;", "resolve" | "resolveSibling") => Some(FIRST_ARG),

        _ => None,
    }
}

/// Translate an invoke operand's index in `ssa_insn.uses` into its **logical
/// argument position** (0-based, receiver excluded).
///
/// `InvokeStatic` has no receiver, so the use index *is* the argument
/// position. Every other invoke form carries the receiver at `uses[0]`, so
/// the argument position is one less. Returns `None` for the receiver slot
/// of a non-static call (it is not a logical argument).
///
/// INVARIANT (load-bearing): `uses` is one entry per Dalvik *register slot*,
/// not per logical argument. A wide argument (`long`/`double`) occupies two
/// adjacent slots, so this slot→argument mapping is exact only when no wide
/// argument precedes the position of interest. The path positions returned
/// by `classify_path_sink` are 0 everywhere except `new File(parent, child)`,
/// which also returns position 1; the existing analysis treats the argument
/// before it (`parent`) as a reference, never a wide. A future path-sink
/// whose path follows a `long`/`double` argument would break this and must
/// widen the mapping first.
fn logical_arg_index(is_static: bool, use_idx: usize) -> Option<usize> {
    // Number of leading `uses` slots taken by the receiver: 0 for static
    // calls, 1 otherwise. `checked_sub` yields `None` exactly for the
    // receiver slot itself.
    let receiver_slots = usize::from(!is_static);
    use_idx.checked_sub(receiver_slots)
}

impl DexTaintAnalysis {
    /// Taint analysis over a single SSA body with nothing pre-tainted.
    ///
    /// Delegates to `run_with_seeds` with an empty seed map, so the only
    /// taints are the ones discovered while walking `ssa`.
    pub fn run(dex: &DexFile, ssa: &SsaBody, layer: Layer) -> Self {
        let no_seeds = BTreeMap::new();
        Self::run_with_seeds(dex, ssa, layer, no_seeds)
    }

    /// Adapter from a plain `BTreeMap<VarId, TaintSource>` seed map (e.g. one
    /// produced by bridge seeding) to the `(TaintSource, Option<u32>)` shape
    /// that `run_with_seeds` and `run_interprocedural` accept.
    ///
    /// Every seed gets a source address of `None`: pre-seeded parameters have
    /// no invoke-site address to point at.
    pub fn seeds_from_sources(
        sources: BTreeMap<VarId, TaintSource>,
    ) -> BTreeMap<VarId, (TaintSource, Option<u32>)> {
        let mut seeds = BTreeMap::new();
        for (var, source) in sources {
            seeds.insert(var, (source, None));
        }
        seeds
    }

    /// Same as `run` but pre-seeds the taint map before analysis begins.
    /// Used for bridged taint: @ReactMethod parameter VarIds are seeded as
    /// `TaintSource::ReactBridgeParam` so JS-controlled inputs are tracked
    /// through the Java body to dangerous sinks.
    ///
    /// Seeds carry `(TaintSource, Option<u32>)` — the second element is the
    /// source instruction address (Dalvik code-unit offset). `None` for
    /// parameter-seeded sources (bridge taint) that have no invoke address.
    pub fn run_with_seeds(
        dex: &DexFile,
        ssa: &SsaBody,
        layer: Layer,
        seeds: BTreeMap<VarId, (TaintSource, Option<u32>)>,
    ) -> Self {
        // taints: VarId → (source, source_addr)
        let mut taints: BTreeMap<VarId, (TaintSource, Option<u32>)> = seeds;
        let mut findings = Vec::new();
        let mut corrupted_methods_skipped: usize = 0;

        let mut changed = true;
        while changed {
            changed = false;

            for block in ssa.blocks.values() {
                // Propagate through Phis — if any operand is tainted, the
                // phi destination inherits the taint.
                for phi in &block.phis {
                    for arg_var in phi.operands.values() {
                        // phi destination inherits taint including source addr.
                        if let Some(entry) = taints.get(arg_var).cloned()
                            && !taints.contains_key(&phi.dst)
                        {
                            taints.insert(phi.dst.clone(), entry);
                            changed = true;
                        }
                    }
                }

                for ssa_insn in &block.insns {
                    let insn = &ssa_insn.insn;

                    match insn.op {
                        Opcode::InvokeVirtual
                        | Opcode::InvokeInterface
                        | Opcode::InvokeStatic
                        | Opcode::InvokeDirect => {
                            if let Some(PoolIndex::Method(m_idx)) = insn.pool_idx
                                && let Some(m_id) = dex.methods.get(m_idx.0 as usize)
                            {
                                let Ok(method_name) = dex.get_string(m_id.name_idx) else {
                                    corrupted_methods_skipped =
                                        corrupted_methods_skipped.saturating_add(1);
                                    continue;
                                };
                                let method_name = method_name.to_string();
                                let Ok(class_desc) =
                                    dex.get_type_descriptor(m_id.class_idx)
                                else {
                                    corrupted_methods_skipped =
                                        corrupted_methods_skipped.saturating_add(1);
                                    continue;
                                };
                                let class_desc = class_desc.to_string();

                                // Source tagging on the destination register.
                                if let Some(dst) = &ssa_insn.dst {
                                    if let Some(src) =
                                        classify_source(&method_name, &class_desc)
                                        && !taints.contains_key(dst)
                                    {
                                        // Record the address of this source invoke.
                                        taints.insert(dst.clone(), (src, Some(insn.addr)));
                                        changed = true;
                                    }

                                    // Propagate from any tainted argument (carry addr).
                                    for use_var in &ssa_insn.uses {
                                        if let Some(entry) = taints.get(use_var).cloned()
                                            && !taints.contains_key(dst)
                                        {
                                            taints.insert(dst.clone(), entry);
                                            changed = true;
                                        }
                                    }
                                }

                                // Sink detection: any tainted argument reaching a sink.
                                if let Some(sink) =
                                    classify_sink(&method_name, &class_desc)
                                {
                                    for use_var in &ssa_insn.uses {
                                        if let Some((source, src_addr)) = taints.get(use_var) {
                                            findings.push(TaintFinding {
                                                source: source.clone(),
                                                sink: sink.clone(),
                                                layer,
                                                func_id: m_idx.0,
                                                // class_descriptor / method_signature are
                                                // filled in by the caller (commands/mod.rs)
                                                // once it resolves the containing method via
                                                // method_key_for_idx — they are not available
                                                // inside the per-instruction walker.
                                                class_descriptor: None,
                                                method_signature: None,
                                                source_offset: *src_addr,
                                                sink_offset: Some(insn.addr),
                                            });
                                        }
                                    }
                                }

                                // Path-traversal sink: fire ONLY when the taint
                                // is in a PATH argument of a file-open call, so
                                // CWE-22 is substantiated, not assumed. `uses` is
                                // [receiver?, arg0, arg1, …]; InvokeStatic has no
                                // receiver, so logical arg index = use index
                                // (non-static drops the leading receiver slot).
                                if let Some(path_positions) =
                                    classify_path_sink(&method_name, &class_desc)
                                {
                                    let is_static = matches!(insn.op, Opcode::InvokeStatic);
                                    for (use_idx, use_var) in ssa_insn.uses.iter().enumerate() {
                                        let Some(logical) = logical_arg_index(is_static, use_idx)
                                        else {
                                            continue;
                                        };
                                        if !path_positions.contains(&logical) {
                                            continue;
                                        }
                                        if let Some((source, src_addr)) = taints.get(use_var) {
                                            findings.push(TaintFinding {
                                                source: source.clone(),
                                                sink: TaintSink::FilePathTraversal {
                                                    path_pattern: String::new(),
                                                },
                                                layer,
                                                func_id: m_idx.0,
                                                class_descriptor: None,
                                                method_signature: None,
                                                source_offset: *src_addr,
                                                sink_offset: Some(insn.addr),
                                            });
                                        }
                                    }
                                }
                            }
                        }

                        // Array element or field assignment: propagate taint
                        // from the value register to the result if there is one.
                        Opcode::Aput
                        | Opcode::AputWide
                        | Opcode::AputObject
                        | Opcode::Iput
                        | Opcode::IputWide
                        | Opcode::IputObject
                        | Opcode::Sput
                        | Opcode::SputObject => {
                            if let Some(dst) = &ssa_insn.dst {
                                for use_var in &ssa_insn.uses {
                                    if let Some(entry) = taints.get(use_var).cloned()
                                        && !taints.contains_key(dst)
                                    {
                                        taints.insert(dst.clone(), entry);
                                        changed = true;
                                    }
                                }
                            }
                        }

                        _ => {
                            // Arithmetic / move / aget: propagate taint from
                            // any used register to the destination.
                            if let Some(dst) = &ssa_insn.dst {
                                for use_var in &ssa_insn.uses {
                                    if let Some(entry) = taints.get(use_var).cloned()
                                        && !taints.contains_key(dst)
                                    {
                                        taints.insert(dst.clone(), entry);
                                        changed = true;
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        Self {
            findings,
            corrupted_methods_skipped,
            bridge_sink_reachable_positions: Vec::new(),
        }
    }

    /// Build a `ClassAnalysis` from all DEX files.
    ///
    /// Pass 1: walk all class data, collecting
    ///   (class_descriptor, method_name) → (dex_idx, code_off)   (method_impls)
    ///   class_descriptor → superclass_descriptor                (superclass hierarchy)
    ///   method_name → [(class_id, dex_idx, code_off)]           (by_method_name)
    /// Pass 2: for every MethodIdx in every DEX method pool, resolve its
    ///   (class_desc, method_name) strings and look up in the pass-1 map,
    ///   producing the final (caller_dex, MethodIdx) → (owner_dex, code_off) table.
    /// Pass 3: intern every class that takes part in the superclass chain and
    ///   precompute, per class, the set of ids of the class itself and all of
    ///   its transitive superclasses.
    ///
    /// `dex_files` and `dex_data` are walked in lockstep in pass 1 (`zip`), so
    /// entry `i` of `dex_data` must be the raw bytes of `dex_files[i]`.
    ///
    /// Methods are keyed by `(class_descriptor, method_name)` only, with
    /// first-wins semantics: a later method that maps to an already-present
    /// key is ignored in both `method_impls` and `by_method_name`.
    ///
    /// String/descriptor lookups that fail are counted in
    /// `corrupted_methods_skipped` and the affected row is skipped.
    ///
    /// The returned `ClassAnalysis` supports both O(log n) callee lookup
    /// (code_index) and CHA-based single-implementation resolution for
    /// virtual/interface dispatch (method_impls + superclass).
    pub fn collect_unified_code_index(
        dex_files: &[DexFile],
        dex_data: &[&[u8]],
    ) -> ClassAnalysis {
        // Pass 1: (class_desc, method_name) → (dex_idx, code_off)
        // and class_desc → direct_superclass_desc
        // and method_name → [(class_id, dex_idx, code_off)]
        let mut method_impls: BTreeMap<(String, String), (usize, u32)> = BTreeMap::new();
        let mut superclass: BTreeMap<String, String> = BTreeMap::new();
        // class_id is built lazily as we see class descriptors; every
        // class_desc that either appears in a class_def or as a super
        // gets an id. Saturating id counter — 4B classes is absurd, so
        // u32 capacity is more than enough; .saturating_add prevents a
        // panic on the pathological case.
        let mut class_id: AHashMap<String, u32> = AHashMap::with_capacity(1024);
        let mut next_id: u32 = 0;
        let mut method_name_id: AHashMap<String, u32> = AHashMap::with_capacity(4096);
        let mut next_method_id: u32 = 0;
        let mut by_method_name: BTreeMap<String, Vec<(u32, usize, u32)>> = BTreeMap::new();
        let mut corrupted_methods_skipped: usize = 0;

        for ((dex_idx, dex), &data) in dex_files.iter().enumerate().zip(dex_data.iter()) {
            for (class_defs_idx, cd) in dex.class_defs.iter().enumerate() {
                // Shadow gate: a duplicate-`class_idx` shadow row would
                // register the same method under a SECOND interned
                // `class_id` in `by_method_name`, producing double tuples
                // for one logical method and method-id ambiguity in the
                // CHA / cross-layer taint stitcher. Skip shadowed rows so
                // the index reflects the first-wins canonical class set.
                if dex.class_def_is_shadowed(class_defs_idx) {
                    continue;
                }
                let Ok(class_desc) = dex.get_type_descriptor(cd.class_idx) else {
                    corrupted_methods_skipped =
                        corrupted_methods_skipped.saturating_add(1);
                    continue;
                };
                let class_desc = class_desc.to_string();
                // Record direct superclass when present (first-wins).
                if let Some(super_idx) = cd.superclass_idx
                    && let Ok(super_desc) = dex.get_type_descriptor(super_idx)
                {
                    superclass.entry(class_desc.clone()).or_insert_with(|| super_desc.to_string());
                }
                // No class data → nothing to index for this class.
                if cd.class_data_off == 0 {
                    continue;
                }
                let Ok(class_data) = parse_class_data(data, cd.class_data_off) else {
                    continue;
                };
                for m in class_data.direct_methods.iter().chain(class_data.virtual_methods.iter()) {
                    // code_off == 0: the method has no code item to point at.
                    if m.code_off == 0 {
                        continue;
                    }
                    let Some(m_id) = dex.methods.get(m.method_idx.0 as usize) else {
                        continue;
                    };
                    let Ok(method_name) = dex.get_string(m_id.name_idx) else {
                        corrupted_methods_skipped =
                            corrupted_methods_skipped.saturating_add(1);
                        continue;
                    };
                    let method_name = method_name.to_string();
                    // First-wins: only a genuinely new key is recorded, and
                    // only then is the secondary index updated. Testing the
                    // entry for vacancy (rather than comparing the stored
                    // value with this row) keeps a repeated row with the same
                    // (dex_idx, code_off) from being pushed into
                    // `by_method_name` a second time.
                    let std::collections::btree_map::Entry::Vacant(slot) =
                        method_impls.entry((class_desc.clone(), method_name.clone()))
                    else {
                        continue;
                    };
                    slot.insert((dex_idx, m.code_off));

                    // Intern class_desc to a compact u32 id. First sight
                    // of the descriptor assigns the next id.
                    let cd_id = match class_id.get(&class_desc) {
                        Some(&id) => id,
                        None => {
                            let id = next_id;
                            next_id = next_id.saturating_add(1);
                            class_id.insert(class_desc.clone(), id);
                            id
                        }
                    };
                    // Intern method_name (mirrors class_id pattern).
                    // Built lazily: first sight assigns next_method_id.
                    if !method_name_id.contains_key(&method_name) {
                        method_name_id.insert(method_name.clone(), next_method_id);
                        next_method_id = next_method_id.saturating_add(1);
                    }
                    by_method_name
                        .entry(method_name)
                        .or_default()
                        .push((cd_id, dex_idx, m.code_off));
                }
            }
        }

        // Pass 2: for each DEX's method pool, resolve strings → look up pass-1 map
        let mut code_index: BTreeMap<(usize, MethodIdx), (usize, u32)> = BTreeMap::new();
        for (dex_idx, dex) in dex_files.iter().enumerate() {
            for (raw_idx, m_id) in dex.methods.iter().enumerate() {
                let Ok(class_desc) = dex.get_type_descriptor(m_id.class_idx) else {
                    corrupted_methods_skipped =
                        corrupted_methods_skipped.saturating_add(1);
                    continue;
                };
                let class_desc = class_desc.to_string();
                let Ok(method_name) = dex.get_string(m_id.name_idx) else {
                    corrupted_methods_skipped =
                        corrupted_methods_skipped.saturating_add(1);
                    continue;
                };
                let method_name = method_name.to_string();
                // Method references with no pass-1 implementation get no
                // code_index entry.
                if let Some(&resolved) = method_impls.get(&(class_desc, method_name)) {
                    #[allow(
                        clippy::cast_possible_truncation,
                        reason = "PROOF: raw_idx enumerates dex.method_ids (a DEX method pool); DEX format caps method_ids_size at u32::MAX, so usize→u32 is lossless on every supported target."
                    )]
                    let m_idx = raw_idx as u32;
                    code_index.insert((dex_idx, MethodIdx(m_idx)), resolved);
                }
            }
        }

        // Pass 3: intern every class_desc that participates in the superclass
        // chain, then precompute transitive supertype sets keyed by u32 ids.
        // is_subtype becomes O(1) integer hash + integer set contains —
        // no string comparisons anywhere on the hot path. Pre-interning
        // phase: every key AND every value in `superclass` needs an id.
        for (child, parent) in &superclass {
            if !class_id.contains_key(child) {
                let id = next_id;
                next_id = next_id.saturating_add(1);
                class_id.insert(child.clone(), id);
            }
            if !class_id.contains_key(parent) {
                let id = next_id;
                next_id = next_id.saturating_add(1);
                class_id.insert(parent.clone(), id);
            }
        }
        let mut supertypes: FxHashMap<u32, FxHashSet<u32>> =
            FxHashMap::with_capacity_and_hasher(superclass.len(), Default::default());
        for class_desc in superclass.keys() {
            let Some(&class_id_val) = class_id.get(class_desc) else { continue; };
            // The set always contains the class itself, then each ancestor
            // reached by following `superclass` links.
            let mut set: FxHashSet<u32> = FxHashSet::default();
            set.insert(class_id_val);
            let mut current = class_desc.as_str();
            // Cycle-safe: any pathological DEX with a superclass cycle would
            // otherwise loop forever. Bound by the class hierarchy size.
            let mut steps = 0usize;
            while steps < superclass.len() {
                match superclass.get(current) {
                    Some(parent) => {
                        let Some(&parent_id) = class_id.get(parent) else { break; };
                        if !set.insert(parent_id) {
                            break; // already seen → cycle or converged
                        }
                        current = parent.as_str();
                    }
                    None => break,
                }
                steps = steps.saturating_add(1);
            }
            supertypes.insert(class_id_val, set);
        }

        ClassAnalysis {
            code_index, superclass, method_impls, class_id, method_name_id,
            by_method_name,
            cha_cache: std::sync::RwLock::new(FxHashMap::default()),
            supertypes, corrupted_methods_skipped,
            cha_cache_truncations: std::sync::atomic::AtomicUsize::new(0),
            build_cache_truncations: std::sync::atomic::AtomicUsize::new(0),
        }
    }

    /// Interprocedural taint: follows invoke-direct, invoke-static, and
    /// monomorphic invoke-virtual/invoke-interface edges (via CHA) across all
    /// DEX files (depth-limited). Callee lookup uses the unified cross-DEX
    /// ClassAnalysis so methods defined in a different DEX are followed.
    ///
    /// Seeds carry `(TaintSource, Option<u32>)` — see `seeds_from_sources` for
    /// the bridge-seeding convenience constructor.
    #[allow(clippy::too_many_arguments, reason = "Nine-arg signature: the ordering (dex set, data, analysis, current dex, SSA body, layer, seeds, native-methods set, depth) mirrors `interproc_inner`'s recursive contract and collapsing it into a context struct would force callers to name intermediate bindings.")]
    pub fn run_interprocedural(
        dex_files: &[DexFile],
        dex_data: &[&[u8]],
        class_analysis: &ClassAnalysis,
        current_dex: usize,
        ssa: &SsaBody,
        layer: Layer,
        seeds: BTreeMap<VarId, (TaintSource, Option<u32>)>,
        native_methods_per_dex: &[BTreeSet<MethodIdx>],
        depth: u8,
    ) -> Self {
        let mut visited: BTreeSet<(usize, MethodIdx)> = BTreeSet::new();
        let mut build_cache: BuildCache = AHashMap::default();
        let empty_seed_positions: BTreeMap<VarId, u8> = BTreeMap::new();
        let (findings, _, corrupted_methods_skipped, _) = Self::interproc_inner(
            dex_files, dex_data, class_analysis, current_dex,
            ssa, layer, seeds, &empty_seed_positions,
            native_methods_per_dex,
            depth, &mut visited, &mut build_cache,
        );
        Self {
            findings,
            corrupted_methods_skipped,
            bridge_sink_reachable_positions: Vec::new(),
        }
    }

    /// Bridge-flavored entry point. Same shape as
    /// [`Self::run_interprocedural`] plus a `seed_positions: BTreeMap<VarId, u8>`
    /// argument keyed on the same VarIds as `seeds`, carrying the JS-side
    /// arg position each seed represents (0-indexed within the JS Call's
    /// argument list; the implicit `this` slot at `param_vars[0]` is
    /// dropped by the caller before this map is built).
    ///
    /// Returns the same shape as `run_interprocedural` plus a
    /// per-finding `bridge_sink_reachable_positions: Vec<BTreeSet<usize>>`
    /// vector, index-aligned with `findings`. Downstream analysis
    /// intersects this against `TaintSink::NativeModuleArg.arg_positions`
    /// from the HBC side to validate composable cross-layer flows.
    ///
    /// Only the bridge-seeded entry method's own sink emits get
    /// non-empty position-sets. Sinks reached through interprocedural
    /// recursion into callees get empty sets (the seed-position tag is
    /// not threaded through the recursive call; downstream analysis
    /// handles empty intersections sensibly).
    #[allow(clippy::too_many_arguments, reason = "Ten-arg signature mirrors run_interprocedural plus seed_positions. Collapsing into a context struct would force callers to name intermediate bindings.")]
    pub fn run_interprocedural_bridge(
        dex_files: &[DexFile],
        dex_data: &[&[u8]],
        class_analysis: &ClassAnalysis,
        current_dex: usize,
        ssa: &SsaBody,
        layer: Layer,
        seeds: BTreeMap<VarId, (TaintSource, Option<u32>)>,
        seed_positions: BTreeMap<VarId, u8>,
        native_methods_per_dex: &[BTreeSet<MethodIdx>],
        depth: u8,
    ) -> Self {
        let mut visited: BTreeSet<(usize, MethodIdx)> = BTreeSet::new();
        let mut build_cache: BuildCache = AHashMap::default();
        let (findings, _, corrupted_methods_skipped, bridge_sink_reachable_positions) =
            Self::interproc_inner(
                dex_files, dex_data, class_analysis, current_dex,
                ssa, layer, seeds, &seed_positions,
                native_methods_per_dex,
                depth,
                &mut visited, &mut build_cache,
            );
        Self {
            findings,
            corrupted_methods_skipped,
            bridge_sink_reachable_positions,
        }
    }

    /// Recursive interprocedural worker. Returns
    /// `(findings, return_taint, corrupted_methods_skipped, bridge_sink_reachable_positions)`.
    ///
    /// The `return_taint` element carries `(TaintSource, Option<u32>)` so
    /// the caller can stage it as `pending_return_taint` with the callee's
    /// source addr preserved.
    ///
    /// The `bridge_sink_reachable_positions` vector is index-aligned
    /// with `findings`. Empty entries (default) are pushed for
    /// findings emitted by recursive callee calls; non-empty entries
    /// are populated only for sinks in the current method's body when
    /// `seed_positions` is non-empty (the bridge-flavored caller via
    /// `run_interprocedural_bridge`).
    #[allow(clippy::too_many_arguments, clippy::type_complexity, reason = "Twelve-arg signature: recursive worker threading full interprocedural context + a mutable visited set for cycle-break + a build cache for (CodeItem, Cfg, SsaBody) reuse + the per-VarId seed_positions map for bridge seeding + the per-DEX native-methods set for the JNI sink gate. Return type carries (findings, return_taint, corrupted_skipped, sink_reachable_positions) — a return struct would force all call sites (including the recursive self-call) to name the fields.")]
    fn interproc_inner(
        dex_files: &[DexFile],
        dex_data: &[&[u8]],
        class_analysis: &ClassAnalysis,
        current_dex: usize,
        ssa: &SsaBody,
        layer: Layer,
        seeds: BTreeMap<VarId, (TaintSource, Option<u32>)>,
        seed_positions: &BTreeMap<VarId, u8>,
        native_methods_per_dex: &[BTreeSet<MethodIdx>],
        depth: u8,
        visited: &mut BTreeSet<(usize, MethodIdx)>,
        build_cache: &mut BuildCache,
    ) -> (Vec<TaintFinding>, Option<(TaintSource, Option<u32>)>, usize, Vec<BTreeSet<usize>>) {
        let code_index = &class_analysis.code_index;
        let Some(dex) = dex_files.get(current_dex) else {
            // Caller-contract violation: out-of-range dex index. Bail
            // with empty findings rather than panic.
            return (Vec::new(), None, 0, Vec::new());
        };
        // taints: VarId → (source, source_addr)
        let mut taints: BTreeMap<VarId, (TaintSource, Option<u32>)> = seeds;
        // Per-VarId set of original seed VarIds that
        // propagated to it. Initialized from seed_positions keys
        // (each seed VarId reaches itself); mirrors taints' propagation
        // at every site below. Stays empty for non-bridge callers
        // (seed_positions is empty).
        let mut seed_reach: BTreeMap<VarId, BTreeSet<VarId>> = seed_positions
            .keys()
            .map(|v| (v.clone(), std::iter::once(v.clone()).collect()))
            .collect();
        // Parallel to all_findings: each entry carries the
        // sink_reachable_seed_positions set for the corresponding
        // finding. Pushed in lockstep so sort+dedup at the end can
        // operate on packed (finding, positions) pairs.
        let mut all_findings: Vec<(TaintFinding, BTreeSet<usize>)> = Vec::new();
        let mut return_taint: Option<(TaintSource, Option<u32>)> = None;
        let mut corrupted_methods_skipped: usize = 0;

        let mut changed = true;
        while changed {
            changed = false;

            for block in ssa.blocks.values() {
                // Phi propagation (carries source addr along).
                for phi in &block.phis {
                    for arg_var in phi.operands.values() {
                        if let Some(entry) = taints.get(arg_var).cloned()
                            && !taints.contains_key(&phi.dst)
                        {
                            taints.insert(phi.dst.clone(), entry);
                            // Phi destination unions
                            // every tainted arg's reach set.
                            if let Some(src_set) = seed_reach.get(arg_var).cloned() {
                                seed_reach.entry(phi.dst.clone()).or_default().extend(src_set);
                            }
                            changed = true;
                        }
                    }
                }

                // `pending_return_taint` carries the callee's return taint
                // from an invoke to the immediately-following MoveResult.
                let mut pending_return_taint: Option<(TaintSource, Option<u32>)> = None;

                for ssa_insn in &block.insns {
                    let insn = &ssa_insn.insn;

                    // Detect tainted return values.
                    if matches!(insn.op, Opcode::Return | Opcode::ReturnObject | Opcode::ReturnWide) {
                        for use_var in &ssa_insn.uses {
                            if let Some(entry) = taints.get(use_var)
                                && return_taint.is_none()
                            {
                                return_taint = Some(entry.clone());
                            }
                        }
                        pending_return_taint = None;
                        continue;
                    }

                    // MoveResult: inherit return taint from the preceding invoke.
                    if matches!(insn.op, Opcode::MoveResult | Opcode::MoveResultObject | Opcode::MoveResultWide) {
                        if let (Some(dst), Some(entry)) = (&ssa_insn.dst, pending_return_taint.take())
                            && !taints.contains_key(dst)
                        {
                            taints.insert(dst.clone(), entry);
                            changed = true;
                        }
                        continue;
                    }

                    // Any other opcode clears the pending slot.
                    pending_return_taint = None;

                    match insn.op {
                        Opcode::InvokeVirtual
                        | Opcode::InvokeInterface
                        | Opcode::InvokeStatic
                        | Opcode::InvokeDirect => {
                            if let Some(PoolIndex::Method(m_idx)) = insn.pool_idx
                                && let Some(m_id) = dex.methods.get(m_idx.0 as usize)
                            {
                                let Ok(method_name) = dex.get_string(m_id.name_idx) else {
                                    corrupted_methods_skipped =
                                        corrupted_methods_skipped.saturating_add(1);
                                    continue;
                                };
                                let method_name = method_name.to_string();
                                let Ok(class_desc) =
                                    dex.get_type_descriptor(m_id.class_idx)
                                else {
                                    corrupted_methods_skipped =
                                        corrupted_methods_skipped.saturating_add(1);
                                    continue;
                                };
                                let class_desc = class_desc.to_string();

                                // JNI gate: a tainted argument flowing into a
                                // method declared with the ACC_NATIVE access
                                // flag crosses an analyzer-opaque boundary —
                                // the implementation lives on the native side
                                // of JNI and is not visible to this engine.
                                // Emit a TaintSink::NativeMethod finding so
                                // the gap is analyst-visible; the existing
                                // classify_sink branch below still fires
                                // (additive — most native methods will not
                                // match the Java-method dispatch table).
                                let is_native = native_methods_per_dex
                                    .get(current_dex)
                                    .is_some_and(|set| set.contains(&m_idx));
                                if is_native {
                                    for use_var in &ssa_insn.uses {
                                        if let Some((source, src_addr)) = taints.get(use_var) {
                                            let finding = TaintFinding {
                                                source: source.clone(),
                                                sink: TaintSink::NativeMethod {
                                                    class: class_desc.clone(),
                                                    method: method_name.clone(),
                                                },
                                                layer,
                                                func_id: m_idx.0,
                                                class_descriptor: None,
                                                method_signature: None,
                                                source_offset: *src_addr,
                                                sink_offset: Some(insn.addr),
                                            };
                                            all_findings.push((finding, BTreeSet::new()));
                                        }
                                    }
                                }

                                // Sink detection: any tainted argument reaching a sink.
                                if let Some(sink) = classify_sink(&method_name, &class_desc) {
                                    for use_var in &ssa_insn.uses {
                                        if let Some((source, src_addr)) = taints.get(use_var) {
                                            // Derive the set of JS-side
                                            // arg positions that propagated to this
                                            // tainted operand. Empty when
                                            // seed_positions is empty (non-bridge
                                            // caller) or when no seed-tagged VarId
                                            // reached this use_var.
                                            let positions: BTreeSet<usize> = seed_reach
                                                .get(use_var)
                                                .map(|reach| reach
                                                    .iter()
                                                    .filter_map(|v| seed_positions.get(v).map(|&p| p as usize))
                                                    .collect())
                                                .unwrap_or_default();
                                            let finding = TaintFinding {
                                                source: source.clone(),
                                                sink: sink.clone(),
                                                layer,
                                                func_id: m_idx.0,
                                                class_descriptor: None,
                                                method_signature: None,
                                                source_offset: *src_addr,
                                                sink_offset: Some(insn.addr),
                                            };
                                            all_findings.push((finding, positions));
                                        }
                                    }
                                }

                                // Path-traversal sink: fire ONLY when the taint is
                                // in a PATH argument of a file-open call, so CWE-22
                                // is substantiated, not assumed. `uses` is
                                // [receiver?, arg0, …]; InvokeStatic has no receiver,
                                // so logical arg index = use index (non-static drops
                                // the leading receiver slot).
                                if let Some(path_positions) =
                                    classify_path_sink(&method_name, &class_desc)
                                {
                                    let is_static = matches!(insn.op, Opcode::InvokeStatic);
                                    for (use_idx, use_var) in ssa_insn.uses.iter().enumerate() {
                                        let Some(logical) = logical_arg_index(is_static, use_idx)
                                        else {
                                            continue;
                                        };
                                        if !path_positions.contains(&logical) {
                                            continue;
                                        }
                                        if let Some((source, src_addr)) = taints.get(use_var) {
                                            let positions: BTreeSet<usize> = seed_reach
                                                .get(use_var)
                                                .map(|reach| reach
                                                    .iter()
                                                    .filter_map(|v| seed_positions.get(v).map(|&p| p as usize))
                                                    .collect())
                                                .unwrap_or_default();
                                            let finding = TaintFinding {
                                                source: source.clone(),
                                                sink: TaintSink::FilePathTraversal {
                                                    path_pattern: String::new(),
                                                },
                                                layer,
                                                func_id: m_idx.0,
                                                class_descriptor: None,
                                                method_signature: None,
                                                source_offset: *src_addr,
                                                sink_offset: Some(insn.addr),
                                            };
                                            all_findings.push((finding, positions));
                                        }
                                    }
                                }

                                // Source classification: invoke opcodes have def=None
                                // in the SSA (no dst register), so taint from library
                                // calls must flow through pending_return_taint →
                                // MoveResult, exactly like interprocedural return taint.
                                // We stage it here; the followable block below may
                                // override with callee_ret if we recurse.
                                if let Some(src) = classify_source(&method_name, &class_desc) {
                                    pending_return_taint = Some((src, Some(insn.addr)));
                                }

                                // Interprocedural callee resolution:
                                // - invoke-direct / invoke-static: index lookup (exact)
                                // - invoke-virtual / invoke-interface: CHA single-impl
                                let callee_loc: Option<(usize, u32)> = if depth > 0
                                    && !visited.contains(&(current_dex, m_idx))
                                {
                                    match insn.op {
                                        Opcode::InvokeDirect | Opcode::InvokeStatic => {
                                            code_index.get(&(current_dex, m_idx)).copied()
                                        }
                                        Opcode::InvokeVirtual | Opcode::InvokeInterface => {
                                            // CHA: follow only if the declared class has exactly
                                            // one concrete implementation in the loaded DEX set.
                                            class_analysis.virtual_single_impl(
                                                &class_desc,
                                                &method_name,
                                            )
                                        }
                                        _ => None,
                                    }
                                } else {
                                    None
                                };

                                if let Some((owner_dex, callee_code_off)) = callee_loc {
                                    let tainted_args: Vec<(usize, (TaintSource, Option<u32>))> =
                                        ssa_insn.uses.iter().enumerate().filter_map(|(i, use_var)| {
                                            taints.get(use_var).map(|entry| (i, entry.clone()))
                                        }).collect();

                                    if !tainted_args.is_empty()
                                        && let Some(&callee_data) = dex_data.get(owner_dex)
                                    {
                                        let cache_key = (owner_dex, callee_code_off);
                                        // Cache lookup or build. Cloning the Rc is cheap
                                        // (refcount bump) and releases the cache's &mut
                                        // borrow so the recursive call below can take
                                        // `build_cache` again. On parse/CFG/SSA failure
                                        // the cache stays empty for this key — the failed
                                        // callee is re-attempted on the next visit (rare;
                                        // real-DEX build failures are typically permanent,
                                        // but re-trying is correct and cheap).
                                        let entry: Option<std::rc::Rc<(CodeItem, Cfg, SsaBody)>> =
                                            if let Some(hit) = build_cache.get(&cache_key) {
                                                Some(std::rc::Rc::clone(hit))
                                            } else if let Ok(callee_code) = parse_code_item(callee_data, callee_code_off)
                                                && let Ok(callee_cfg) = Cfg::build(&callee_code)
                                                && let Ok(callee_ssa) = SsaBody::build(&callee_code, &callee_cfg)
                                            {
                                                let rc = std::rc::Rc::new((callee_code, callee_cfg, callee_ssa));
                                                // Entry-count cap (`MAX_BUILD_CACHE_ENTRIES`): once full,
                                                // skip the insert and let subsequent lookups re-do the
                                                // parse+CFG+SSA work. Correct (cache is best-effort)
                                                // but slower. Bounds adversarial input where many
                                                // unique callees would otherwise balloon the cache.
                                                if build_cache.len() < MAX_BUILD_CACHE_ENTRIES {
                                                    build_cache.insert(cache_key, std::rc::Rc::clone(&rc));
                                                } else {
                                                    // Cap hit — record the skip on the shared atomic
                                                    // so the audit pipeline can emit a
                                                    // BUILD_CACHE_ENTRY_CAP_HIT Finding.
                                                    class_analysis
                                                        .build_cache_truncations
                                                        .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
                                                }
                                                Some(rc)
                                            } else {
                                                None
                                            };
                                        if let Some(entry) = entry {
                                            let callee_ssa = &entry.2;
                                            let param_seeds: BTreeMap<VarId, (TaintSource, Option<u32>)> =
                                                tainted_args.into_iter().filter_map(|(arg_i, src_entry)| {
                                                    callee_ssa.param_vars.get(arg_i).map(|v| (v.clone(), src_entry))
                                                }).collect();
                                            visited.insert((current_dex, m_idx));
                                            // Don't thread seed_positions
                                            // across the recursive call. Callee findings
                                            // get empty position-sets — downstream
                                            // analysis handles empty intersections sensibly.
                                            let empty_seed_positions: BTreeMap<VarId, u8> = BTreeMap::new();
                                            let (callee_findings, callee_ret, callee_skipped, callee_positions) =
                                                Self::interproc_inner(
                                                    dex_files, dex_data, class_analysis,
                                                    owner_dex,
                                                    callee_ssa, layer,
                                                    // WHY: `callee_loc` is only `Some` when `depth > 0` (gate where it is computed above), so the `depth.saturating_sub(1)` argument below never saturates — it is an exact decrement.
                                                    param_seeds, &empty_seed_positions,
                                                    native_methods_per_dex,
                                                    depth.saturating_sub(1), visited,
                                                    build_cache,
                                                );
                                            visited.remove(&(current_dex, m_idx));
                                            all_findings.extend(
                                                callee_findings
                                                    .into_iter()
                                                    .zip(callee_positions.into_iter())
                                            );
                                            corrupted_methods_skipped = corrupted_methods_skipped.saturating_add(callee_skipped);
                                            // Callee return taint overrides source
                                            // classification (callee is authoritative).
                                            if let Some(ret) = callee_ret {
                                                pending_return_taint = Some(ret);
                                            }
                                        }
                                    }
                                }
                            }
                        }

                        Opcode::Aput
                        | Opcode::AputWide
                        | Opcode::AputObject
                        | Opcode::Iput
                        | Opcode::IputWide
                        | Opcode::IputObject
                        | Opcode::Sput
                        | Opcode::SputObject => {
                            if let Some(dst) = &ssa_insn.dst {
                                for use_var in &ssa_insn.uses {
                                    if let Some(entry) = taints.get(use_var).cloned()
                                        && !taints.contains_key(dst)
                                    {
                                        taints.insert(dst.clone(), entry);
                                        changed = true;
                                    }
                                }
                            }
                        }

                        _ => {
                            if let Some(dst) = &ssa_insn.dst {
                                for use_var in &ssa_insn.uses {
                                    if let Some(entry) = taints.get(use_var).cloned()
                                        && !taints.contains_key(dst)
                                    {
                                        taints.insert(dst.clone(), entry);
                                        changed = true;
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        // Deduplicate: the fixed-point loop re-visits sink instructions on each
        // iteration once taint is established, generating duplicate (source, sink,
        // func_id) triples. Sort + dedup operates on (TaintFinding, positions)
        // pairs — same (source, sink, func_id) entries always carry the same
        // monotone-grown positions set (seed_reach only grows), so dedup is
        // shape-stable.
        all_findings.sort();
        all_findings.dedup();

        let (findings, positions): (Vec<TaintFinding>, Vec<BTreeSet<usize>>) =
            all_findings.into_iter().unzip();
        (findings, return_taint, corrupted_methods_skipped, positions)
    }
}

#[cfg(test)]
mod path_sink_tests {
    //! Unit coverage for the FileRead→FileWrite precision fix.
    //!
    //! The `FileWrite` data-write sink is retired because it mislabeled the
    //! benign read-then-write file-copy idiom as CWE-22. In its place,
    //! `classify_path_sink` + `logical_arg_index` substantiate CWE-22 only
    //! when taint reaches the PATH argument of a file-open call. These tests
    //! pin the classifier tables and the use-index → argument-index mapping;
    //! end-to-end behaviour on real bytecode is validated separately by the
    //! corpus re-sweep (the `FileWrite` Medium population must drop to ~0 and
    //! `FilePathTraversal` must appear).

    use super::*;

    #[test]
    fn data_write_methods_are_no_longer_a_sink() {
        // (method, class) pairs for the retired data-write firings. These
        // calls carry the DATA being written, not a path — not CWE-22, and
        // therefore not a sink.
        let retired_data_writes = [
            ("write", "Ljava/io/FileOutputStream;"),
            ("write", "Ljava/io/OutputStream;"),
            ("writeBytes", "Ljava/io/DataOutputStream;"),
            ("writeUTF", "Ljava/io/DataOutputStream;"),
            ("println", "Ljava/io/PrintWriter;"),
            ("write", "Ljava/io/BufferedWriter;"),
        ];
        for (m, c) in retired_data_writes {
            let verdict = classify_sink(m, c);
            assert!(
                verdict.is_none(),
                "{m} on {c} must not be a sink (retired data-write firing)"
            );
        }
    }

    #[test]
    fn file_open_calls_are_path_sinks_with_correct_positions() {
        // `new File(path)` / `new File(parent, child)`: both segments are
        // path material, so both logical positions are reported.
        assert_eq!(classify_path_sink("<init>", "Ljava/io/File;"), Some(&[0, 1][..]));

        // Write-side stream/writer constructors: the path is logical arg 0;
        // any trailing mode/append argument is excluded.
        let write_side_ctors = [
            "Ljava/io/FileOutputStream;",
            "Ljava/io/FileWriter;",
            "Ljava/io/RandomAccessFile;",
            "Ljava/io/PrintWriter;",
            "Ljava/io/PrintStream;",
        ];
        for c in write_side_ctors {
            assert_eq!(classify_path_sink("<init>", c), Some(&[0][..]), "ctor {c}");
        }

        // java.nio.file write-side static APIs on `Files`.
        let files_static_apis = [
            "write",
            "newOutputStream",
            "newBufferedWriter",
            "createFile",
            "createDirectories",
        ];
        for m in files_static_apis {
            assert_eq!(
                classify_path_sink(m, "Ljava/nio/file/Files;"),
                Some(&[0][..]),
                "Files.{m}"
            );
        }

        // Path construction / resolution helpers: Paths.get, Path.resolve,
        // Path.resolveSibling.
        let path_builders = [
            ("get", "Ljava/nio/file/Paths;"),
            ("resolve", "Ljava/nio/file/Path;"),
            ("resolveSibling", "Ljava/nio/file/Path;"),
        ];
        for (m, c) in path_builders {
            assert_eq!(classify_path_sink(m, c), Some(&[0][..]));
        }
    }

    #[test]
    fn non_file_open_calls_are_not_path_sinks() {
        let not_path_sinks = [
            // Data-write methods are not path sinks.
            ("write", "Ljava/io/FileOutputStream;"),
            ("println", "Ljava/io/PrintWriter;"),
            // `<init>` on a non-file class is not a path sink.
            ("<init>", "Ljava/lang/StringBuilder;"),
            // `get` on a non-Paths class is not a path sink (HashMap.get, etc.).
            ("get", "Ljava/util/HashMap;"),
        ];
        for (m, c) in not_path_sinks {
            assert_eq!(classify_path_sink(m, c), None);
        }
    }

    #[test]
    fn logical_arg_index_maps_invoke_kinds_correctly() {
        const STATIC_INVOKE: bool = true;
        const RECEIVER_INVOKE: bool = false;

        // Static: there is no receiver, so the use index IS the argument
        // position. `Files.write(path, data)` has path at uses[0] → logical 0.
        assert_eq!(logical_arg_index(STATIC_INVOKE, 0), Some(0));
        assert_eq!(logical_arg_index(STATIC_INVOKE, 1), Some(1));

        // Non-static / <init>: uses[0] is the receiver (no logical index) and
        // real arguments shift down by one. `new File(path)` has path at
        // uses[1] → logical 0.
        assert_eq!(
            logical_arg_index(RECEIVER_INVOKE, 0),
            None,
            "receiver slot is not an argument"
        );
        assert_eq!(logical_arg_index(RECEIVER_INVOKE, 1), Some(0), "first real arg");
        assert_eq!(logical_arg_index(RECEIVER_INVOKE, 2), Some(1), "second real arg");
    }
}

#[cfg(test)]
mod cap_tests {
    //! Structural tests for the entry-count cap on `cha_cache`: when the
    //! cache is already full, a lookup must skip the insert and bump
    //! `cha_cache_truncations`. This is the gate-coverage counterpart to the
    //! production `CHA_CACHE_ENTRY_CAP_HIT` / `BUILD_CACHE_ENTRY_CAP_HIT`
    //! Findings emitted by the audit pipeline (only the CHA side is driven
    //! by the tests in this module).
    //!
    //! Building a real adversarial DEX with enough unique
    //! (class, method-name) pairs to overflow the cache would need a
    //! synthetic-DEX builder, so the tests work at the data-structure
    //! boundary instead: pre-fill `cha_cache` to the cap, call
    //! `virtual_single_impl`, and check that
    //! - the cap-skip branch is reached (counter advances),
    //! - the cache size stays at the cap (no over-cap growth),
    //! - the lookup still returns the correct result (re-iteration path).

    use super::*;
    use std::sync::atomic::Ordering;

    /// Builds the smallest `ClassAnalysis` that can answer
    /// `virtual_single_impl("Lfoo;", "bar")`.
    ///
    /// Class `"Lfoo;"` and method name `"bar"` are both interned as id 0,
    /// class 0 lists itself as its only supertype, and `by_method_name`
    /// holds a single candidate tuple `(0, 0, 42)` — presumably
    /// `(class_id, owner_dex, code_off)`; the tests below expect the lookup
    /// to resolve to `(owner_dex, code_off) = (0, 42)`. All caches and
    /// counters start empty / zero.
    fn make_minimal_class_analysis() -> ClassAnalysis {
        let class_id: AHashMap<String, u32> =
            std::iter::once(("Lfoo;".to_string(), 0)).collect();
        let method_name_id: AHashMap<String, u32> =
            std::iter::once(("bar".to_string(), 0)).collect();
        let by_method_name: BTreeMap<String, Vec<(u32, usize, u32)>> =
            BTreeMap::from([("bar".to_string(), vec![(0, 0, 42)])]);

        // Class 0's supertype set contains only itself.
        let self_only: FxHashSet<u32> = std::iter::once(0).collect();
        let supertypes: FxHashMap<u32, FxHashSet<u32>> =
            std::iter::once((0, self_only)).collect();

        ClassAnalysis {
            // Lookup tables the CHA query reads.
            class_id,
            method_name_id,
            by_method_name,
            supertypes,
            // Indexes left empty in this fixture.
            code_index: BTreeMap::new(),
            superclass: BTreeMap::new(),
            method_impls: BTreeMap::new(),
            // Cache and counters in their initial state.
            cha_cache: std::sync::RwLock::new(FxHashMap::default()),
            corrupted_methods_skipped: 0,
            cha_cache_truncations: std::sync::atomic::AtomicUsize::new(0),
            build_cache_truncations: std::sync::atomic::AtomicUsize::new(0),
        }
    }

    #[test]
    fn cha_cache_cap_skip_increments_truncations_counter() {
        let ca = make_minimal_class_analysis();
        let truncations = || ca.cha_cache_truncations.load(Ordering::Relaxed);
        let cached_entries = || ca.cha_cache.read().expect("test: read lock").len();

        // Fill the cache to exactly MAX_CHA_CACHE_ENTRIES using synthetic
        // keys (1, 0), (2, 0), … which cannot collide with the lookup key
        // (0, 0) used below. Writing through the RwLock mirrors how the
        // cap-skip branch observes a "full" cache in production. The write
        // guard is scoped so it is released before the lookups.
        {
            let mut cache = ca.cha_cache.write().expect("test setup: lock");
            cache.extend((1..=MAX_CHA_CACHE_ENTRIES).map(|i| {
                #[allow(clippy::cast_possible_truncation, reason = "test-only synthetic keys; i <= MAX_CHA_CACHE_ENTRIES = 65536 < u32::MAX")]
                let key = (i as u32, 0u32);
                (key, None)
            }));
            assert_eq!(cache.len(), MAX_CHA_CACHE_ENTRIES);
        }

        // Sanity: nothing has been truncated before the first lookup.
        assert_eq!(truncations(), 0);

        // A lookup that misses the pre-filled cache must take the cap-skip
        // branch: the answer is still computed from the `by_method_name`
        // candidates, the counter moves to 1, and the cache does not grow.
        let resolved = ca.virtual_single_impl("Lfoo;", "bar");
        assert_eq!(
            resolved, Some((0, 42)),
            "virtual_single_impl must still return the correct (owner_dex, code_off) \
             via re-iteration when the cache is at cap"
        );
        assert_eq!(
            truncations(), 1,
            "cap-skip branch should have incremented the truncations counter"
        );
        assert_eq!(
            cached_entries(),
            MAX_CHA_CACHE_ENTRIES,
            "cache must stay at the cap — over-cap growth would defeat the bound"
        );

        // Repeating the same lookup: the key was never cached, so the
        // cap-skip fires again and is counted independently.
        let _ = ca.virtual_single_impl("Lfoo;", "bar");
        assert_eq!(
            truncations(), 2,
            "every cap-skip should advance the counter, not just the first"
        );
    }

    #[test]
    fn cha_cache_below_cap_inserts_without_truncation() {
        // Negative control: with room in the cache the counter must stay at
        // 0 and the result must be cached. Without this, a regression that
        // unconditionally bumps the counter would go unnoticed.
        let ca = make_minimal_class_analysis();

        let resolved = ca.virtual_single_impl("Lfoo;", "bar");
        assert_eq!(resolved, Some((0, 42)));

        let truncated = ca.cha_cache_truncations.load(Ordering::Relaxed);
        assert_eq!(
            truncated, 0,
            "no cap-skip should fire when cache has room"
        );

        let cached = ca.cha_cache.read().expect("test: read lock").len();
        assert_eq!(
            cached, 1,
            "single lookup should produce one cache entry"
        );
    }
}

#[cfg(test)]
mod native_method_tests {
    //! Coverage for the JNI-bridges minimum-tier helper
    //! (`collect_native_methods`) against a vendored APK fixture.
    //!
    //! The working premise is that a real shipped APK contains classes that
    //! declare native methods, so running the helper over every DEX in the
    //! fixture shows the enumeration walks the full DEX and surfaces
    //! ACC_NATIVE entries. When the fixture is present and parseable the test
    //! fails closed if no native method is found at all — either the helper
    //! regressed or the fixture was repacked, and both deserve a look.
    use super::*;
    use droidsaw_apk::Apk;
    use std::path::Path;

    /// Loads `../droidsaw-bench/probe/crosstaint_rn.apk` (relative to this
    /// crate's manifest directory), the same probe location the workspace
    /// uses for the crosstaint_rn fixture.
    ///
    /// Returns `None` when the file does not exist, and also when it exists
    /// but `Apk::parse` rejects it; callers treat both as "fixture
    /// unavailable".
    fn vendored_probe_apk() -> Option<Apk> {
        let mut fixture = Path::new(env!("CARGO_MANIFEST_DIR")).to_path_buf();
        fixture.extend(["..", "droidsaw-bench", "probe", "crosstaint_rn.apk"]);

        if fixture.exists() {
            Apk::parse(&fixture).ok()
        } else {
            None
        }
    }

    #[test]
    fn collect_native_methods_finds_framework_native_methods() {
        // No usable fixture → skip instead of failing, so the suite still
        // runs in workspaces that lack the probe directory.
        let Some(apk) = vendored_probe_apk() else {
            return;
        };

        let mut total_native = 0_usize;
        for (idx, dex_entry) in apk.dex.iter().enumerate() {
            // DEX entries that fail to parse are ignored; only parseable
            // ones contribute to the count and the range check.
            if let Ok(dex) = droidsaw_dex::parser::DexFile::parse(&dex_entry.data, None) {
                let native = collect_native_methods(&dex, &dex_entry.data);
                total_native = total_native.saturating_add(native.len());

                // Each reported index must address a real slot in this
                // DEX's method pool.
                let pool_len = dex.methods.len();
                for m_idx in &native {
                    assert!(
                        (m_idx.0 as usize) < pool_len,
                        "dex {idx}: native MethodIdx {m_idx:?} out of method-pool range ({} entries)",
                        pool_len,
                    );
                }
            }
        }

        // The React Native fixture is expected to declare native methods
        // somewhere across its DEX files. Zero means the helper is broken
        // or the fixture was repacked without them.
        assert!(
            total_native > 0,
            "expected vendored RN APK to contain at least one framework ACC_NATIVE method",
        );
    }
}

#[cfg(test)]
mod shadow_gate_tests {
    //! Tests for the duplicate-`class_def` ("shadow row") gate: when two
    //! `class_def` rows share a `class_idx`, only the first (canonical) row
    //! may contribute to the native-method set and to the unified code
    //! index. Fixtures come from `dup_class_fixture`.
    use super::*;
    use crate::analysis::dup_class_fixture;

    /// `collect_native_methods` unions ACC_NATIVE method indexes across
    /// `class_def` rows. The fixture has a NON-native canonical (first) row
    /// and a shadow (second) row that declares a native method. Without the
    /// gate, the shadow row's native method would be attributed to the
    /// class; with it, row 1 is skipped and the result is the canonical
    /// row's (empty) native set.
    #[test]
    fn collect_native_methods_skips_shadow_row() {
        // Canonical row: method idx 0, non-native.
        // Shadow row:    method idx 1, native.
        let fixture = dup_class_fixture::with_native_method_rows(0, false, 1, true);
        let native = collect_native_methods(&fixture.dex, &fixture.raw);

        let shadow_method = MethodIdx(1);
        assert!(
            !native.contains(&shadow_method),
            "shadow row's native method must not be attributed to the canonical class"
        );
        assert!(
            native.is_empty(),
            "canonical (first-wins) row is non-native, so the native set is empty; \
             got {native:?}"
        );
    }

    /// `collect_unified_code_index` registers each `(class, method)` under
    /// an interned `class_id` in `by_method_name`. An ungated shadow row
    /// would be registered as well, under a second `class_id`. The gate
    /// drops the shadow `class_def`, so only the canonical row's method is
    /// indexed.
    #[test]
    fn unified_code_index_skips_shadow_row() {
        // Neither row is native and both carry code (the fixture emits a
        // non-zero code_off), so both would reach by_method_name
        // registration. The rows use DISTINCT method names ("canonMethod" /
        // "shadowMethod"), which makes a missing gate visible as an extra
        // index entry.
        let fixture = dup_class_fixture::with_native_method_rows(0, false, 1, false);

        let dex_files = vec![fixture.dex];
        let dex_data: Vec<&[u8]> = vec![fixture.raw.as_slice()];
        let analysis = DexTaintAnalysis::collect_unified_code_index(&dex_files, &dex_data);
        let indexed = &analysis.by_method_name;

        assert!(
            indexed.contains_key("canonMethod"),
            "canonical (first-wins) row's method must still be indexed"
        );
        assert!(
            !indexed.contains_key("shadowMethod"),
            "shadow row's method name must not be indexed; the gate drops the shadow class_def"
        );
    }
}