#![cfg_attr(not(test), allow(clippy::as_conversions, reason = "PROOF (4 sites): all `as` casts in this module are `MethodIdx (u32 newtype) .0 as usize` or `usize as u32` (raw_idx enumeration of dex.method_ids). The widening direction is lossless on 64-bit (droidsaw's supported target set); the narrowing direction is bounded by the DEX format's u32-cap on `method_ids_size` per spec (existing per-site `cast_possible_truncation` allow at line ~600 already encodes this PROOF). `.get()` handles any OOB by returning None."))]
use std::collections::{BTreeMap, BTreeSet};
use ahash::AHashMap;
use rustc_hash::{FxHashMap, FxHashSet};
use droidsaw_dex::DexFile;
use droidsaw_dex::ids::MethodIdx;
use droidsaw_dex::ssa::{SsaBody, VarId};
use droidsaw_dex::opcodes::Opcode;
use droidsaw_dex::decode::{parse_class_data, parse_code_item, CodeItem, PoolIndex};
use droidsaw_dex::cfg::Cfg;
pub use droidsaw_common::analysis::{TaintSource, TaintSink, TaintFinding};
use droidsaw_common::finding::Layer;
/// Per-call cache for interprocedural callee builds.
///
/// `interproc_inner` rebuilds `(CodeItem, Cfg, SsaBody)` for every
/// distinct callee it follows. Within a single top-level
/// `run_interprocedural` call, the fixpoint loop (`while changed`)
/// can revisit the same Invoke instruction multiple times as taint
/// propagates through phis, AND fan-in from different invoke sites
/// can reach the same callee through different chains. Both
/// scenarios re-trigger parse → CFG → SSA construction on bytes
/// that haven't changed.
///
/// The cache keys on `(owner_dex, code_off)` — code_off uniquely
/// identifies a method's bytecode within its DEX file. Values are
/// `Rc<...>` so a cache lookup can clone the handle and drop the
/// cache borrow before passing `&mut BuildCache` into the recursive
/// call. The recursion is single-threaded (one rayon worker owns
/// the whole chain), so `Rc` is sufficient — no `Arc` overhead.
///
/// The number of entries is capped; see `MAX_BUILD_CACHE_ENTRIES` for
/// the cap value and what happens once it is reached.
///
/// `AHashMap` rather than `FxHashMap`: `code_off` is read from parsed
/// DEX content, attacker-controllable. A crafted DEX with many methods
/// at code_offs that collide under FxHash would bucket-bloat the cache
/// and degrade per-call O(1) to O(N). AHash's randomized seed defeats
/// this; the cost (~1.2-1.5× slower per lookup) is negligible compared
/// to the parse+CFG+SSA construction the cache amortizes.
type BuildCache = AHashMap<(usize, u32), std::rc::Rc<(CodeItem, Cfg, SsaBody)>>;
/// Entry-count cap on `BuildCache`.
///
/// Each entry holds a `Rc<(CodeItem, Cfg, SsaBody)>` — a non-trivial
/// allocation per entry (MB-scale on large methods). Without a cap, an
/// adversarial DEX with many unique callee `code_off`s reached via the
/// interproc fixpoint can balloon the cache into GB territory — an
/// OOM-class DoS path.
///
/// On cap hit, new entries skip insertion (the cache becomes best-effort
/// beyond N); subsequent lookups re-do parse/CFG/SSA work, which is
/// correct but slower. The cache is per-call, so its lifetime is bounded
/// by `run_interprocedural`.
///
/// Realistic upper bound on production input: low thousands of unique
/// callees per interproc call. N=4096 leaves an order of magnitude of
/// headroom over observed production behaviour. Mirrors the
/// `MAX_AXML_FINDINGS = 1024` pattern in
/// `droidsaw-apk/src/binary_xml.rs:155`.
const MAX_BUILD_CACHE_ENTRIES: usize = 4096;
/// Entry-count cap on `ClassAnalysis::cha_cache`.
///
/// Each entry is a `((u32, u32), Option<(usize, u32)>)` — ~32 bytes per
/// entry plus HashMap overhead, so at N=65536 the cache uses ~2-4 MB.
/// The key is `(class_id, method_name_id)` — both interned u32s sourced
/// from parsed DEX content, so the cache size is bounded by
/// `class_count * method_name_count` of the loaded APK. The
/// production-corpus class × method-name product is in the 10⁴-10⁵ range
/// for large APKs; an adversarial DEX could push this higher (unique
/// method names per class).
///
/// On cap hit, new entries skip insertion; CHA lookups re-do the
/// candidate iteration, which is correct but slower.
///
/// Note: the cap bounds *memory*, not *work*. Past N, every lookup
/// re-iterates `by_method_name` candidates — CPU-class amplification on
/// adversarial input. The total work is bounded by the audit-level wall
/// deadline (`ParseBudget::deadline` in `droidsaw-common/src/budget.rs`),
/// which is the load-bearing CPU bound for adversarial workloads. This
/// cap and the budget cooperate: the cap removes the OOM path, the
/// budget removes the runaway-CPU path.
const MAX_CHA_CACHE_ENTRIES: usize = 65536;
/// Finding id surfaced by the audit pipeline when `cha_cache` hits its
/// entry-count cap during a run.
///
/// Info-severity summary. A non-zero count means that CHA lookups made
/// after the cap was reached were answered by re-iterating the candidates
/// instead of from the cache; the resolved results are still correct (no
/// semantic regression), only slower to obtain. Mirrors the
/// `AXML_FINDINGS_TRUNCATED` discipline from
/// `droidsaw-apk/src/binary_xml.rs:153`.
pub const CHA_CACHE_ENTRY_CAP_HIT: &str = "CHA_CACHE_ENTRY_CAP_HIT";
/// Finding id surfaced when one or more per-call `BuildCache` instances
/// hit their entry-count cap (`MAX_BUILD_CACHE_ENTRIES`) during a run.
/// See `CHA_CACHE_ENTRY_CAP_HIT` for the discipline rationale.
pub const BUILD_CACHE_ENTRY_CAP_HIT: &str = "BUILD_CACHE_ENTRY_CAP_HIT";
/// Cross-DEX class analysis: code index + class hierarchy + method implementations.
/// Built once by `DexTaintAnalysis::collect_unified_code_index` and reused across
/// all taint analysis passes for the same APK.
pub struct ClassAnalysis {
/// (caller_dex, MethodIdx) → (owner_dex, code_off) for O(log n) callee lookup.
pub code_index: BTreeMap<(usize, MethodIdx), (usize, u32)>,
/// class_desc → direct superclass_desc (first occurrence wins across DEX files).
pub superclass: BTreeMap<String, String>,
/// (class_desc, method_name) → (owner_dex, code_off) — concrete implementations only.
pub method_impls: BTreeMap<(String, String), (usize, u32)>,
/// Intern table: class descriptor → compact u32 id. Populated for every
/// class_desc that appears as either a subtype or supertype in the
/// `supertypes` closure. Lets downstream structures key on u32 instead
/// of String → is_subtype's inner loop runs on ~5-cycle integer hashes
/// rather than ~35-cycle string hashes + memcmp.
///
/// DETERMINISM: This map does not flow into evidence envelope output.
/// Iteration order is internal-only: `class_id` is used only for O(1) intern
/// lookups in `virtual_single_impl` and `collect_unified_code_index`. Output
/// from `DexTaintAnalysis` flows through `Vec<TaintFinding>` → findings DB →
/// SQL `ORDER BY (severity, layer, id_tag, rowid)` in `produce_unsigned_envelope`.
/// If output usage is ever added, convert to BTreeMap or sort before serialization.
/// See `droidsaw/src/threat_model/envelope.rs:13` for the canonical discipline.
///
/// DoS: String key, attacker-controlled (class descriptors come from
/// parsed DEX). Uses `AHashMap` for randomized hash seeding — FxHash
/// is bijective on integer keys but vulnerable to crafted-string
/// collision attacks. AHash's per-process seed prevents that.
pub class_id: AHashMap<String, u32>,
/// Intern table: method_name → compact u32 id. Populated alongside
/// `by_method_name`. Lets `cha_cache` key on (u32, u32) instead of
/// (&str, &str) — no string hashing on the cache lookup hot path
/// beyond the two intern lookups. Same determinism rationale as
/// `class_id`: internal-only, never serialized. Same DoS rationale as
/// `class_id`: method names come from parsed DEX (attacker-controlled)
/// — AHash, not FxHash.
pub method_name_id: AHashMap<String, u32>,
/// Secondary index: method_name → list of (class_id, dex, code_off)
/// that implement that method name. Class id is the interned u32 from
/// `class_id`. Avoids the O(n_total_methods) full scan in
/// `virtual_single_impl`; without this index, CHA dispatch resolution
/// dominates audit CPU on large Play APKs.
pub by_method_name: BTreeMap<String, Vec<(u32, usize, u32)>>,
/// Memoization of `virtual_single_impl` results, shared across rayon
/// workers for the lifetime of a single audit.
///
/// Key is `(declared_class_id, method_name_id)` — both u32 interned in
/// `class_id` / `method_name_id`. Value is the cached
/// `Option<(owner_dex, code_off)>` — exactly what `virtual_single_impl`
/// would return.
///
/// Lives on `ClassAnalysis` itself behind an `RwLock`. Real APKs hit the
/// same `(declared_class, method_name)` pair from many method-task
/// workers (e.g. `Cursor.getString`, `Intent.getStringExtra`, `Log.d`
/// queried while taint-analysing every method that calls them).
/// `virtual_single_impl` is the hot path in large-APK audits; a per-call
/// cache would only reduce within-call redundancy, whereas keeping the
/// cache on `ClassAnalysis` lets warmed entries serve all workers.
/// Inserts stop once `MAX_CHA_CACHE_ENTRIES` is reached; skipped inserts
/// are counted in `cha_cache_truncations`.
///
/// DETERMINISM: This cache does not flow into evidence-envelope output.
/// Same rationale as `class_id` / `supertypes`. See those docstrings.
pub cha_cache: std::sync::RwLock<FxHashMap<(u32, u32), Option<(usize, u32)>>>,
/// Transitive supertype closure per class id, including the class itself.
/// `is_subtype(cd, target)` is one FxHashMap+FxHashSet lookup on u32
/// pairs — no string comparisons.
///
/// DETERMINISM: This FxHashMap (and its FxHashSet values) does not flow
/// into evidence envelope output. Iteration is internal-only: `supertypes`
/// is queried only by `is_subtype_id` during CHA dispatch resolution, whose
/// result is a bool — not serialized. If output usage is ever added, sort the
/// set into a Vec<u32> before serialization. See envelope.rs:13 for the
/// canonical avoid-iteration-order discipline.
pub supertypes: FxHashMap<u32, FxHashSet<u32>>,
/// Methods skipped during `collect_unified_code_index` because
/// `get_string` or `get_type_descriptor` returned `None` (corrupt
/// class/method pool entry). Zero on well-formed DEX input.
pub corrupted_methods_skipped: usize,
/// Count of `cha_cache` inserts that were skipped because the cap
/// (`MAX_CHA_CACHE_ENTRIES`) was hit. Atomic because rayon workers
/// concurrently traverse `virtual_single_impl`. Read once per audit
/// after the par_iter completes; if > 0 the audit pipeline emits a
/// `CHA_CACHE_ENTRY_CAP_HIT` Finding mirroring the
/// `AXML_FINDINGS_TRUNCATED` discipline.
pub cha_cache_truncations: std::sync::atomic::AtomicUsize,
/// Count of `BuildCache` inserts skipped because the cap
/// (`MAX_BUILD_CACHE_ENTRIES`) was hit, summed across every per-method
/// `run_interprocedural` call. Per-call caches are not visible to the
/// caller of `run_interprocedural` directly, so we accumulate via
/// this shared atomic. Emits `BUILD_CACHE_ENTRY_CAP_HIT` Finding if > 0.
pub build_cache_truncations: std::sync::atomic::AtomicUsize,
}
impl ClassAnalysis {
    /// Subtype test on interned class ids: `true` when `class_id` equals
    /// `target_id` or `target_id` appears in the precomputed supertype set
    /// of `class_id`. Pure integer hashing — no string work on this path.
    #[inline]
    fn is_subtype_id(&self, class_id: u32, target_id: u32) -> bool {
        class_id == target_id
            || self
                .supertypes
                .get(&class_id)
                .is_some_and(|ancestors| ancestors.contains(&target_id))
    }

    /// Class Hierarchy Analysis for a virtual call site.
    ///
    /// Looks for concrete implementations of `method_name` on
    /// `declared_class` or any of its subtypes and returns
    /// `Some((owner_dex, code_off))` only when exactly one exists
    /// (monomorphic site, safe to follow). Returns `None` when there is no
    /// implementation, when more than one matches (polymorphic site), or
    /// when either name is absent from the intern tables.
    ///
    /// Results are memoized in `cha_cache` under the interned
    /// `(class id, method-name id)` pair.
    pub fn virtual_single_impl(
        &self,
        declared_class: &str,
        method_name: &str,
    ) -> Option<(usize, u32)> {
        // Both names must already be interned. An unknown name means no
        // loaded DEX implements it, so bail out without touching the cache.
        let declared_id = *self.class_id.get(declared_class)?;
        let method_id = *self.method_name_id.get(method_name)?;
        let key = (declared_id, method_id);

        // Hot read path. The guard lives only inside the closure, so the
        // read lock is released before any write below. A poisoned lock is
        // treated as a plain miss.
        let memoized = self
            .cha_cache
            .read()
            .ok()
            .and_then(|cache| cache.get(&key).copied());
        if let Some(hit) = memoized {
            return hit;
        }

        // Miss: walk the implementations registered under this method name
        // and keep those whose owner is a subtype of the declared class. At
        // most two matches are pulled — a second one already proves the
        // site is polymorphic.
        let candidates = self.by_method_name.get(method_name)?;
        let mut matching = candidates
            .iter()
            .filter(|&&(owner_id, _, _)| self.is_subtype_id(owner_id, declared_id))
            .map(|&(_, dex_idx, code_off)| (dex_idx, code_off));
        let result = match (matching.next(), matching.next()) {
            (Some(only), None) => Some(only),
            _ => None,
        };

        // Brief write. Racing writers for the same key insert the same
        // value (the computation depends only on immutable state), so a
        // lost update is harmless. A poisoned lock simply skips the
        // memoization. When the cache is full the insert is skipped (no
        // eviction) and the skip is counted so the audit pipeline can
        // report CHA_CACHE_ENTRY_CAP_HIT.
        if let Ok(mut cache) = self.cha_cache.write() {
            let has_room = cache.len() < MAX_CHA_CACHE_ENTRIES;
            if has_room {
                cache.insert(key, result);
            } else {
                self.cha_cache_truncations
                    .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            }
        }
        result
    }
}
/// Result of a DEX taint-analysis run: the source→sink findings plus
/// telemetry about invoke instructions that had to be skipped.
pub struct DexTaintAnalysis {
/// Taint findings collected by the run: one entry per tainted operand
/// observed at a recognised sink.
pub findings: Vec<TaintFinding>,
/// Number of invoke instructions skipped because `get_string` or
/// `get_type_descriptor` returned `None` (corrupt method pool entry).
/// Non-zero only on adversarial/corrupt DEX input; zero on well-formed
/// files. Callers can surface this in audit telemetry.
pub corrupted_methods_skipped: usize,
/// Per-finding union of seed positions that propagated to the tainted
/// operand at the sink, mapped via the `seed_positions` argument passed to
/// [`DexTaintAnalysis::run_interprocedural_bridge`]. Index-aligned
/// with `findings`. Empty for non-bridge callers (`run`,
/// `run_with_seeds`, plain `run_interprocedural`).
///
/// Each position is the JS-side arg index (`param_vars[i]` slot
/// shifted so position=0 corresponds to JS arg0; the implicit
/// `this` slot at `param_vars[0]` is dropped by the caller before
/// the seed_positions map is built). Downstream analysis intersects
/// this against `TaintSink::NativeModuleArg.arg_positions` from
/// the HBC side to validate composable cross-layer flows.
pub bridge_sink_reachable_positions: Vec<BTreeSet<usize>>,
}
// NOTE: the paragraphs in this plain-comment block describe the
// `classify_source` function defined further down in this file, not
// `ACC_NATIVE`. They are kept here as ordinary comments so they are not
// rendered as part of this constant's documentation.
//
// Classify a method call as a taint source given the method name and
// receiver class descriptor.
//
// **Covered sources.** Intent extras, `SharedPreferences` reads,
// `ContentProvider`/`Cursor` column reads, HTTP response body
// (OkHttp/Retrofit `ResponseBody`), `InputStream`/`Reader`/`Scanner`
// reads, and `EditText`/`TextView`/`SearchView` user-input widgets.
//
// **Not covered (coverage gaps).** The following source classes are not
// currently recognised and will return `None`:
// - `ClipboardManager` get (clipboard read — `getPrimaryClip`,
//   `getText`): a common inter-app data exfil channel.
// - `TelephonyManager` / `SmsManager` inbound SMS body reads.
// - `LocationManager` / `FusedLocationClient` location fixes
//   (latitude/longitude as a taint source).
// - `Camera` / `MediaRecorder` capture callbacks.
// - `BluetoothSocket` / `BluetoothGatt` characteristic reads.
// - `BroadcastReceiver.onReceive` intent extras (covered at the
//   component level only when the intent is accessed via `getExtra*`
//   — a `BroadcastReceiver` subclass that reads its argument directly
//   is not detected).

/// DEX `access_flags` bit indicating a method's implementation lives on
/// the native side of the JNI boundary. Per the DEX format spec
/// (`access_flags` encoding for `encoded_method`); matches the
/// `ACC_NATIVE` constant in JVM specifications.
const ACC_NATIVE: u32 = 0x0100;
/// Collect the `MethodIdx` of every method in `dex` whose `access_flags`
/// have the `ACC_NATIVE` bit set. These are the per-DEX JNI handoff points
/// the JNI-bridges minimum-tier gate surfaces in the
/// [`TaintSink::NativeMethod`] sink emit.
///
/// Each non-shadowed `class_def` with a non-zero `class_data_off` is parsed
/// once and both its direct and virtual methods are inspected. Rows that
/// are skipped contribute nothing:
/// - shadowed rows (duplicate `class_idx`), so the result reflects the
///   first-wins canonical view rather than attacker-supplied shadow data;
/// - rows with `class_data_off == 0` (no class data);
/// - rows whose `class_data` fails to parse — the error is deliberately
///   swallowed here; structurally corrupt class data is reported through
///   the caller's `dex.parse_errors` channel instead.
///
/// An empty set is the natural "no native methods" result and turns a
/// downstream `.contains(&m_idx)` gate into a no-op.
pub fn collect_native_methods(dex: &DexFile, raw: &[u8]) -> BTreeSet<MethodIdx> {
    let mut native_methods = BTreeSet::new();
    for (row, class_def) in dex.class_defs.iter().enumerate() {
        // Shadow gate first, then the "no class data" gate.
        if dex.class_def_is_shadowed(row) || class_def.class_data_off == 0 {
            continue;
        }
        if let Ok(class_data) = parse_class_data(raw, class_def.class_data_off) {
            let declared = class_data
                .direct_methods
                .iter()
                .chain(class_data.virtual_methods.iter());
            native_methods.extend(
                declared
                    .filter(|method| method.access_flags & ACC_NATIVE != 0)
                    .map(|method| method.method_idx),
            );
        }
    }
    native_methods
}
/// Run [`collect_native_methods`] once per DEX and return the results as a
/// `Vec<BTreeSet<MethodIdx>>` index-aligned with `dex_files`.
///
/// The caller is expected to compute this once, above its per-method loop,
/// and thread the slice through `run_interprocedural` /
/// `run_interprocedural_bridge` down to `interproc_inner`. Every call
/// re-parses all class_data in the APK, so it must stay hoisted out of the
/// per-source loop: calling it per source is O(sources × class_data) and
/// hangs large multidex apps. The recursive per-callee walks then only
/// gate-check via `.contains(&m_idx)` in O(log N) against the precomputed
/// set.
///
/// `dex_data` is index-aligned with `dex_files`. If it is shorter, the
/// DEXes without a matching byte slice get an empty set (treated by the
/// caller as "no native methods to gate").
pub fn collect_native_methods_per_dex(
    dex_files: &[DexFile],
    dex_data: &[&[u8]],
) -> Vec<BTreeSet<MethodIdx>> {
    let mut per_dex = Vec::with_capacity(dex_files.len());
    for (idx, dex) in dex_files.iter().enumerate() {
        let native = dex_data
            .get(idx)
            .map_or_else(BTreeSet::new, |raw| collect_native_methods(dex, raw));
        per_dex.push(native);
    }
    per_dex
}
/// Classify a method call as a taint source given the method name and
/// receiver class descriptor.
///
/// Matching is by exact method name plus, for most arms, a substring test
/// on `class_desc`. The descriptive payload of the returned variant
/// (`key`, `uri`, `endpoint`, `path_pattern`) is left empty here.
///
/// **Covered sources.** Intent extras, `SharedPreferences` reads,
/// `ContentProvider`/`Cursor` column reads, HTTP response body
/// (OkHttp/Retrofit `ResponseBody`), `InputStream`/`Reader`/`Scanner`
/// reads, and `EditText`/`TextView`/`SearchView` user-input widgets.
///
/// **Not covered (coverage gaps).** The following source classes are not
/// currently recognised and will return `None`:
/// - `ClipboardManager` get (clipboard read — `getPrimaryClip`,
///   `getText`): a common inter-app data exfil channel.
/// - `TelephonyManager` / `SmsManager` inbound SMS body reads.
/// - `LocationManager` / `FusedLocationClient` location fixes
///   (latitude/longitude as a taint source).
/// - `Camera` / `MediaRecorder` capture callbacks.
/// - `BluetoothSocket` / `BluetoothGatt` characteristic reads.
/// - `BroadcastReceiver.onReceive` intent extras (covered at the
///   component level only when the intent is accessed via `getExtra*`
///   — a `BroadcastReceiver` subclass that reads its argument directly
///   is not detected).
///
/// The dispatch table for covered sources is the `match method_name`
/// block below; extend it to close gaps.
fn classify_source(method_name: &str, class_desc: &str) -> Option<TaintSource> {
    // True when the receiver descriptor contains any of the given fragments.
    let receiver_mentions =
        |needles: &[&str]| needles.iter().any(|&needle| class_desc.contains(needle));

    let source = match method_name {
        // Intent extras — most common entry point for inter-component taint.
        "getStringExtra" | "getIntExtra" | "getLongExtra" | "getBooleanExtra"
        | "getBundleExtra" | "getParcelableExtra" | "getSerializableExtra" => {
            TaintSource::IntentExtra { key: String::new() }
        }
        // SharedPreferences reads. Must stay ahead of the Cursor arm: the
        // method names overlap and the first passing guard wins.
        "getString" | "getInt" | "getLong" | "getBoolean" | "getFloat"
            if receiver_mentions(&["SharedPreferences", "Editor"]) =>
        {
            TaintSource::SharedPreferencesRead { key: String::new() }
        }
        // ContentProvider / Cursor column reads.
        "getString" | "getInt" | "getLong" | "getBlob" if receiver_mentions(&["Cursor"]) => {
            TaintSource::ContentProviderQuery { uri: String::new() }
        }
        // HTTP response body (OkHttp / Retrofit).
        "string" | "bytes" | "byteStream" | "charStream"
            if receiver_mentions(&["ResponseBody", "Response"]) =>
        {
            TaintSource::NetworkResponse { endpoint: String::new() }
        }
        // InputStream / Reader / Scanner reads.
        "readLine" | "read" | "readUTF" | "readAllBytes"
            if receiver_mentions(&["InputStream", "Reader", "Scanner"]) =>
        {
            TaintSource::FileRead { path_pattern: String::new() }
        }
        // EditText and other user-input widgets.
        "getText" | "getQuery" if receiver_mentions(&["EditText", "TextView", "SearchView"]) => {
            TaintSource::UserInput
        }
        _ => return None,
    };
    Some(source)
}
/// Classify a method call as a taint sink given the method name and
/// receiver class descriptor.
///
/// Matching is by exact method name plus a test on `class_desc` (substring
/// for every arm; the `Log` arm additionally accepts the exact descriptor).
///
/// **Covered sinks.** `exec` on `Runtime` / `ProcessBuilder` receivers
/// (command execution), `WebView.loadUrl` / `loadData` /
/// `loadDataWithBaseURL` (open-redirect/XSS), `SQLiteDatabase` /
/// `rawQuery` etc. (SQL injection), `android/util/Log` (logcat leak),
/// `Method.invoke` (reflection), `ContentProvider` / `ContentResolver`
/// insert/update/delete (IPC boundary crossing), OkHttp/Retrofit outbound
/// requests (network exfiltration), and `Cipher`/`Mac`/`MessageDigest`
/// init/update/doFinal (tainted crypto material).
///
/// **Deliberately not a sink.** A tainted value reaching a stream/writer
/// *data* write (`write`/`writeBytes`/`println` on
/// `OutputStream`/`Writer`/`FileChannel`) is not reported here. That is the
/// read-then-write file-copy idiom — the taint is the data, never the path
/// — so it does not substantiate CWE-22. Path traversal is detected by
/// `classify_path_sink`, which fires only when the taint is in the PATH
/// argument of a file-open call.
///
/// **Not covered (coverage gaps).** The following sink classes are not
/// currently recognised and will return `None`:
/// - `ClipboardManager` set (`setPrimaryClip`, `setText`): tainted data
///   placed on the clipboard is readable by any foreground app.
/// - `sendBroadcast` / `sendOrderedBroadcast`: intent-based exfiltration
///   to third-party receivers.
/// - `Socket` / `SSLSocket` raw network writes, and the constructor
///   `new Socket(host, port)` as an exfil signal.
/// - `SharedPreferences.Editor.putString` / `putInt` etc. (tainted data
///   persisted to shared prefs for later cross-process read).
/// - `NotificationManager.notify` (tainted data surfaced in system UI /
///   notification shade).
///
/// The dispatch table for covered sinks is the `match method_name` block
/// below; extend it to close gaps.
fn classify_sink(method_name: &str, class_desc: &str) -> Option<TaintSink> {
    // True when the receiver descriptor contains any of the given fragments.
    let receiver_mentions =
        |needles: &[&str]| needles.iter().any(|&needle| class_desc.contains(needle));

    let sink = match method_name {
        // Command execution.
        "exec" if receiver_mentions(&["Runtime", "ProcessBuilder"]) => TaintSink::RuntimeExec,
        // WebView navigation — open redirect / XSS.
        "loadUrl" | "loadData" | "loadDataWithBaseURL" if receiver_mentions(&["WebView"]) => {
            TaintSink::WebViewLoadUrl
        }
        // SQL.
        "execSQL" | "rawQuery" | "executeQuery" | "compileStatement"
            if receiver_mentions(&["SQLite", "Database", "Cursor"]) =>
        {
            TaintSink::SqlExecute
        }
        // Android Log — sensitive data leaks to logcat.
        "d" | "e" | "w" | "v" | "i" | "wtf"
            if receiver_mentions(&["android/util/Log"])
                || class_desc == "Landroid/util/Log;" =>
        {
            TaintSink::LogOutput
        }
        // Reflection — taint flowing into dynamic invocation.
        "invoke" if receiver_mentions(&["Method", "reflect"]) => {
            TaintSink::ReflectionInvoke { class: String::new() }
        }
        // Stream/writer data writes are intentionally absent here; see the
        // "Deliberately not a sink" section of the doc comment.
        //
        // ContentProvider insert/update/delete (data crossing a component
        // boundary). "update" also appears in the crypto arm; a failed
        // guard here falls through to it.
        "insert" | "update" | "delete"
            if receiver_mentions(&["ContentProvider", "ContentResolver"]) =>
        {
            TaintSink::ContentProviderInsert { uri: String::new() }
        }
        // HTTP request (OkHttp RequestBody, Retrofit @Body).
        "newCall" | "execute" | "enqueue"
            if receiver_mentions(&["OkHttpClient", "Call", "Retrofit"]) =>
        {
            TaintSink::HttpRequest { method: String::new() }
        }
        // Crypto — tainted key/IV material.
        "init" | "doFinal" | "update"
            if receiver_mentions(&["Cipher", "Mac", "MessageDigest"]) =>
        {
            TaintSink::CryptoInput { operation: method_name.to_owned() }
        }
        _ => return None,
    };
    Some(sink)
}
/// Identify a file-OPEN call and report which **logical argument positions**
/// (0-based, receiver excluded) carry the file PATH.
///
/// Taint landing in one of the returned positions means the attacker steers
/// *where* a file is opened or written — CWE-22 path traversal. The caller
/// gates the [`TaintSink::FilePathTraversal`] sink on exactly these
/// positions, so the claim is substantiated (the path itself is tainted)
/// rather than inferred from a read→write data flow, which is the benign
/// file-copy idiom and is deliberately not a sink (see `classify_sink`).
///
/// Coverage is the write-side `java.io` / `java.nio.file` open surface,
/// matched on exact method name and exact class descriptor. The path is
/// conventionally the first argument; `new File(parent, child)` carries
/// two. Trailing mode/append/charset/options arguments are excluded on
/// purpose. Returns `None` for anything else.
///
/// Range-form invokes (`invoke-*/range`, >5 registers) never reach this
/// function: the enclosing match arm at the two call sites admits only
/// `InvokeVirtual`/`InvokeInterface`/`InvokeStatic`/`InvokeDirect`, so the
/// caller's `matches!(insn.op, Opcode::InvokeStatic)` static-vs-non-static
/// test is sound *because* the opcode set is pre-filtered upstream. TRAP:
/// widening that arm to admit `invoke-static/range` (a distinct opcode)
/// without also updating the static test would misclassify a ranged static
/// call as non-static and shift the path-arg mapping by one — revisit both
/// together.
fn classify_path_sink(method_name: &str, class_desc: &str) -> Option<&'static [usize]> {
    /// Path is the first logical argument only.
    const FIRST_ARG: &[usize] = &[0];
    /// `new File(parent, child)`: both segments are path material.
    const FIRST_TWO_ARGS: &[usize] = &[0, 1];

    match (class_desc, method_name) {
        // `new File(path)` / `new File(parent, child)`.
        ("Ljava/io/File;", "<init>") => Some(FIRST_TWO_ARGS),
        // Write-side stream/writer constructors: `new FileOutputStream(path[,
        // append])`, `new FileWriter(path[, …])`, `new RandomAccessFile(path,
        // mode)`, `new PrintWriter(path)`, … — trailing mode/append/charset
        // arguments are not path positions.
        (
            "Ljava/io/FileOutputStream;"
            | "Ljava/io/FileWriter;"
            | "Ljava/io/RandomAccessFile;"
            | "Ljava/io/PrintWriter;"
            | "Ljava/io/PrintStream;",
            "<init>",
        ) => Some(FIRST_ARG),
        // java.nio.file write-side static APIs: `Files.write(path, …)`,
        // `Files.newOutputStream(path, …)`, `Files.createFile(path)`, … —
        // data/options follow the path.
        (
            "Ljava/nio/file/Files;",
            "write" | "newOutputStream" | "newBufferedWriter" | "createFile"
            | "createDirectories",
        ) => Some(FIRST_ARG),
        // `Paths.get(first, …)` (static) and `path.resolve(child)` (instance):
        // the attacker-steered segment is logical arg 0.
        ("Ljava/nio/file/Paths;", "get") => Some(FIRST_ARG),
        ("Ljava/nio/file/Path;", "resolve" | "resolveSibling") => Some(FIRST_ARG),
        _ => None,
    }
}
/// Translate an invoke operand's index in `ssa_insn.uses` into its **logical
/// argument position** (0-based, receiver excluded).
///
/// A static invoke has no receiver, so the use index already *is* the
/// argument position. Every other invoke form carries the receiver at
/// `uses[0]`, so the argument position is the use index minus one, and the
/// receiver slot itself yields `None` (it is not a logical argument).
///
/// INVARIANT (load-bearing): `uses` holds one entry per Dalvik *register
/// slot*, not per logical argument. A wide argument (`long`/`double`)
/// occupies two adjacent slots, so this slot→argument mapping is exact only
/// when no wide argument precedes the position of interest. Every path
/// argument in `classify_path_sink` is logical position 0, preceded only by
/// reference args (`File`/`Path`/`String` receiver-or-none) — never a wide.
/// A future path-sink whose path follows a `long`/`double` argument would
/// break this and must widen the mapping first.
fn logical_arg_index(is_static: bool, use_idx: usize) -> Option<usize> {
    // Number of leading `uses` slots taken by the receiver: 0 for a static
    // call, 1 otherwise. The checked subtraction maps the receiver slot of
    // a non-static call to `None`.
    let receiver_slots = usize::from(!is_static);
    use_idx.checked_sub(receiver_slots)
}
impl DexTaintAnalysis {
/// Analyse one method's SSA body with no pre-seeded taints.
///
/// Equivalent to calling `run_with_seeds` with an empty seed map: the only
/// taints are the ones the analysis discovers itself.
pub fn run(dex: &DexFile, ssa: &SsaBody, layer: Layer) -> Self {
    let no_seeds = BTreeMap::new();
    Self::run_with_seeds(dex, ssa, layer, no_seeds)
}
/// Convenience adapter: lift a plain `BTreeMap<VarId, TaintSource>` seed map
/// (e.g. from bridge seeding) into the `(TaintSource, Option<u32>)` form
/// that `run_with_seeds` and `run_interprocedural` expect.
///
/// Every entry gets a source address of `None`: pre-seeded parameters have
/// no invoke-site address to record.
pub fn seeds_from_sources(
    sources: BTreeMap<VarId, TaintSource>,
) -> BTreeMap<VarId, (TaintSource, Option<u32>)> {
    let mut seeds = BTreeMap::new();
    for (var, source) in sources {
        seeds.insert(var, (source, None));
    }
    seeds
}
/// Same as `run` but pre-seeds the taint map before analysis begins.
/// Used for bridged taint: @ReactMethod parameter VarIds are seeded as
/// `TaintSource::ReactBridgeParam` so JS-controlled inputs are tracked
/// through the Java body to dangerous sinks.
///
/// Seeds carry `(TaintSource, Option<u32>)` — the second element is the
/// source instruction address (Dalvik code-unit offset). `None` for
/// parameter-seeded sources (bridge taint) that have no invoke address.
pub fn run_with_seeds(
    dex: &DexFile,
    ssa: &SsaBody,
    layer: Layer,
    seeds: BTreeMap<VarId, (TaintSource, Option<u32>)>,
) -> Self {
    // taints: VarId → (source, source_addr). Starts as the caller's seeds
    // and only ever grows: an already-tainted VarId is never overwritten
    // (first writer wins), so the fixpoint below is monotone and terminates.
    let mut taints: BTreeMap<VarId, (TaintSource, Option<u32>)> = seeds;
    let mut findings = Vec::new();
    // Counts invoke instructions whose method name / class descriptor could
    // not be resolved from the DEX string/type pools.
    let mut corrupted_methods_skipped: usize = 0;
    let mut changed = true;
    while changed {
        changed = false;
        for block in ssa.blocks.values() {
            // Propagate through Phis — if any operand is tainted, the phi
            // destination inherits the taint (including the source addr).
            // The `contains_key` test runs first so the entry is only
            // cloned when it will actually be inserted.
            for phi in &block.phis {
                for arg_var in phi.operands.values() {
                    if !taints.contains_key(&phi.dst)
                        && let Some(entry) = taints.get(arg_var).cloned()
                    {
                        taints.insert(phi.dst.clone(), entry);
                        changed = true;
                    }
                }
            }
            for ssa_insn in &block.insns {
                let insn = &ssa_insn.insn;
                match insn.op {
                    Opcode::InvokeVirtual
                    | Opcode::InvokeInterface
                    | Opcode::InvokeStatic
                    | Opcode::InvokeDirect => {
                        if let Some(PoolIndex::Method(m_idx)) = insn.pool_idx
                            && let Some(m_id) = dex.methods.get(m_idx.0 as usize)
                        {
                            // Unresolvable name/descriptor: count it and move
                            // on to the next instruction.
                            let Ok(method_name) = dex.get_string(m_id.name_idx) else {
                                corrupted_methods_skipped =
                                    corrupted_methods_skipped.saturating_add(1);
                                continue;
                            };
                            let method_name = method_name.to_string();
                            let Ok(class_desc) = dex.get_type_descriptor(m_id.class_idx)
                            else {
                                corrupted_methods_skipped =
                                    corrupted_methods_skipped.saturating_add(1);
                                continue;
                            };
                            let class_desc = class_desc.to_string();
                            // Source tagging on the destination register.
                            // NOTE(review): `interproc_inner` documents that
                            // invoke instructions carry no SSA dst (results
                            // arrive via MoveResult). If that holds for every
                            // SsaBody, this branch never fires — confirm.
                            if let Some(dst) = &ssa_insn.dst {
                                if let Some(src) = classify_source(&method_name, &class_desc)
                                    && !taints.contains_key(dst)
                                {
                                    // Record the address of this source invoke.
                                    taints.insert(dst.clone(), (src, Some(insn.addr)));
                                    changed = true;
                                }
                                // Propagate from any tainted argument (carry addr).
                                for use_var in &ssa_insn.uses {
                                    if !taints.contains_key(dst)
                                        && let Some(entry) = taints.get(use_var).cloned()
                                    {
                                        taints.insert(dst.clone(), entry);
                                        changed = true;
                                    }
                                }
                            }
                            // Sink detection: any tainted argument reaching a sink.
                            if let Some(sink) = classify_sink(&method_name, &class_desc) {
                                for use_var in &ssa_insn.uses {
                                    if let Some((source, src_addr)) = taints.get(use_var) {
                                        findings.push(TaintFinding {
                                            source: source.clone(),
                                            sink: sink.clone(),
                                            layer,
                                            func_id: m_idx.0,
                                            // class_descriptor / method_signature are
                                            // filled in by the caller (commands/mod.rs)
                                            // once it resolves the containing method via
                                            // method_key_for_idx — they are not available
                                            // inside the per-instruction walker.
                                            class_descriptor: None,
                                            method_signature: None,
                                            source_offset: *src_addr,
                                            sink_offset: Some(insn.addr),
                                        });
                                    }
                                }
                            }
                            // Path-traversal sink: fire ONLY when the taint
                            // is in a PATH argument of a file-open call, so
                            // CWE-22 is substantiated, not assumed. `uses` is
                            // [receiver?, arg0, arg1, …]; InvokeStatic has no
                            // receiver, so logical arg index = use index
                            // (non-static drops the leading receiver slot).
                            if let Some(path_positions) =
                                classify_path_sink(&method_name, &class_desc)
                            {
                                let is_static = matches!(insn.op, Opcode::InvokeStatic);
                                for (use_idx, use_var) in ssa_insn.uses.iter().enumerate() {
                                    let Some(logical) = logical_arg_index(is_static, use_idx)
                                    else {
                                        continue;
                                    };
                                    if !path_positions.contains(&logical) {
                                        continue;
                                    }
                                    if let Some((source, src_addr)) = taints.get(use_var) {
                                        findings.push(TaintFinding {
                                            source: source.clone(),
                                            sink: TaintSink::FilePathTraversal {
                                                path_pattern: String::new(),
                                            },
                                            layer,
                                            func_id: m_idx.0,
                                            class_descriptor: None,
                                            method_signature: None,
                                            source_offset: *src_addr,
                                            sink_offset: Some(insn.addr),
                                        });
                                    }
                                }
                            }
                        }
                    }
                    // Array element or field assignment: propagate taint
                    // from the value register to the result if there is one.
                    Opcode::Aput
                    | Opcode::AputWide
                    | Opcode::AputObject
                    | Opcode::Iput
                    | Opcode::IputWide
                    | Opcode::IputObject
                    | Opcode::Sput
                    | Opcode::SputObject => {
                        if let Some(dst) = &ssa_insn.dst {
                            for use_var in &ssa_insn.uses {
                                if !taints.contains_key(dst)
                                    && let Some(entry) = taints.get(use_var).cloned()
                                {
                                    taints.insert(dst.clone(), entry);
                                    changed = true;
                                }
                            }
                        }
                    }
                    _ => {
                        // Arithmetic / move / aget: propagate taint from
                        // any used register to the destination.
                        if let Some(dst) = &ssa_insn.dst {
                            for use_var in &ssa_insn.uses {
                                if !taints.contains_key(dst)
                                    && let Some(entry) = taints.get(use_var).cloned()
                                {
                                    taints.insert(dst.clone(), entry);
                                    changed = true;
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    // The fixpoint loop walks every instruction once per pass, so a sink
    // whose argument is already tainted is re-reported on each pass.
    // Collapse the repeats the same way `interproc_inner` does (sort + dedup)
    // so both entry points return one finding per distinct flow.
    findings.sort();
    findings.dedup();
    Self {
        findings,
        corrupted_methods_skipped,
        bridge_sink_reachable_positions: Vec::new(),
    }
}
/// Build a `ClassAnalysis` from all DEX files in three passes.
///
/// Pass 1: walk all class data, collecting
///   (class_descriptor, method_name) → (dex_idx, code_off)   (method_impls)
///   class_descriptor → superclass_descriptor                (superclass hierarchy)
///   method_name → [(class_id, dex_idx, code_off)]           (by_method_name)
/// Pass 2: for every MethodIdx in every DEX method pool, resolve its
///   (class_desc, method_name) strings and look up in the pass-1 map,
///   producing the final (caller_dex, MethodIdx) → (owner_dex, code_off) table.
/// Pass 3: intern every descriptor in the superclass relation and precompute
///   each class's transitive supertype id set.
///
/// `dex_files` and `dex_data` are walked in lockstep in pass 1 (`zip`), so
/// `dex_data[i]` is expected to be the raw bytes of `dex_files[i]`; any
/// trailing element without a partner is not visited by pass 1.
///
/// The returned `ClassAnalysis` supports both O(log n) callee lookup
/// (code_index) and CHA-based single-implementation resolution for
/// virtual/interface dispatch (method_impls + superclass). Its
/// `corrupted_methods_skipped` counts class/method entries whose name or
/// descriptor could not be resolved from the DEX pools.
pub fn collect_unified_code_index(
    dex_files: &[DexFile],
    dex_data: &[&[u8]],
) -> ClassAnalysis {
    // Pass 1: (class_desc, method_name) → (dex_idx, code_off)
    //         and class_desc → direct_superclass_desc
    //         and method_name → [(class_id, dex_idx, code_off)]
    let mut method_impls: BTreeMap<(String, String), (usize, u32)> = BTreeMap::new();
    let mut superclass: BTreeMap<String, String> = BTreeMap::new();
    // class_id is built lazily as we see class descriptors; every
    // class_desc that either appears in a class_def or as a super
    // gets an id. Saturating id counter — 4B classes is absurd, so
    // u32 capacity is more than enough; .saturating_add prevents a
    // panic on the pathological case.
    let mut class_id: AHashMap<String, u32> = AHashMap::with_capacity(1024);
    let mut next_id: u32 = 0;
    let mut method_name_id: AHashMap<String, u32> = AHashMap::with_capacity(4096);
    let mut next_method_id: u32 = 0;
    let mut by_method_name: BTreeMap<String, Vec<(u32, usize, u32)>> = BTreeMap::new();
    let mut corrupted_methods_skipped: usize = 0;
    for ((dex_idx, dex), &data) in dex_files.iter().enumerate().zip(dex_data.iter()) {
        for (class_defs_idx, cd) in dex.class_defs.iter().enumerate() {
            // Shadow gate: a duplicate-`class_idx` shadow row would
            // register the same method under a SECOND interned
            // `class_id` in `by_method_name`, producing double tuples
            // for one logical method and method-id ambiguity in the
            // CHA / cross-layer taint stitcher. Skip shadowed rows so
            // the index reflects the first-wins canonical class set.
            if dex.class_def_is_shadowed(class_defs_idx) {
                continue;
            }
            let Ok(class_desc) = dex.get_type_descriptor(cd.class_idx) else {
                corrupted_methods_skipped = corrupted_methods_skipped.saturating_add(1);
                continue;
            };
            let class_desc = class_desc.to_string();
            // Record direct superclass when present (first definition wins).
            if let Some(super_idx) = cd.superclass_idx
                && let Ok(super_desc) = dex.get_type_descriptor(super_idx)
            {
                superclass
                    .entry(class_desc.clone())
                    .or_insert_with(|| super_desc.to_string());
            }
            // No class data → no method bodies to index.
            if cd.class_data_off == 0 {
                continue;
            }
            let Ok(class_data) = parse_class_data(data, cd.class_data_off) else {
                continue;
            };
            for m in class_data
                .direct_methods
                .iter()
                .chain(class_data.virtual_methods.iter())
            {
                // code_off == 0 means the method has no body (abstract/native).
                if m.code_off == 0 {
                    continue;
                }
                let Some(m_id) = dex.methods.get(m.method_idx.0 as usize) else {
                    continue;
                };
                let Ok(method_name) = dex.get_string(m_id.name_idx) else {
                    corrupted_methods_skipped = corrupted_methods_skipped.saturating_add(1);
                    continue;
                };
                let method_name = method_name.to_string();
                // First-wins: only a genuinely vacant (class, name) slot is
                // registered, both in `method_impls` and in the secondary
                // index. Testing vacancy directly (rather than comparing the
                // stored tuple with the current one) keeps a repeated entry
                // with an identical (dex_idx, code_off) from being pushed
                // into `by_method_name` a second time.
                let std::collections::btree_map::Entry::Vacant(slot) =
                    method_impls.entry((class_desc.clone(), method_name.clone()))
                else {
                    continue;
                };
                slot.insert((dex_idx, m.code_off));
                // Intern class_desc to a compact u32 id. First sight
                // of the descriptor assigns the next id.
                let cd_id = match class_id.get(&class_desc) {
                    Some(&id) => id,
                    None => {
                        let id = next_id;
                        next_id = next_id.saturating_add(1);
                        class_id.insert(class_desc.clone(), id);
                        id
                    }
                };
                // Intern method_name (mirrors class_id pattern).
                // Built lazily: first sight assigns next_method_id.
                if !method_name_id.contains_key(&method_name) {
                    method_name_id.insert(method_name.clone(), next_method_id);
                    next_method_id = next_method_id.saturating_add(1);
                }
                by_method_name
                    .entry(method_name)
                    .or_default()
                    .push((cd_id, dex_idx, m.code_off));
            }
        }
    }
    // Pass 2: for each DEX's method pool, resolve strings → look up pass-1 map
    let mut code_index: BTreeMap<(usize, MethodIdx), (usize, u32)> = BTreeMap::new();
    for (dex_idx, dex) in dex_files.iter().enumerate() {
        for (raw_idx, m_id) in dex.methods.iter().enumerate() {
            let Ok(class_desc) = dex.get_type_descriptor(m_id.class_idx) else {
                corrupted_methods_skipped = corrupted_methods_skipped.saturating_add(1);
                continue;
            };
            let class_desc = class_desc.to_string();
            let Ok(method_name) = dex.get_string(m_id.name_idx) else {
                corrupted_methods_skipped = corrupted_methods_skipped.saturating_add(1);
                continue;
            };
            let method_name = method_name.to_string();
            if let Some(&resolved) = method_impls.get(&(class_desc, method_name)) {
                #[allow(
                    clippy::cast_possible_truncation,
                    reason = "PROOF: raw_idx enumerates dex.method_ids (a DEX method pool); DEX format caps method_ids_size at u32::MAX, so usize→u32 is lossless on every supported target."
                )]
                let m_idx = raw_idx as u32;
                code_index.insert((dex_idx, MethodIdx(m_idx)), resolved);
            }
        }
    }
    // Pass 3: intern every class_desc that participates in the superclass
    // chain, then precompute transitive supertype sets keyed by u32 ids.
    // is_subtype becomes O(1) integer hash + integer set contains —
    // no string comparisons anywhere on the hot path. Pre-interning
    // phase: every key AND every value in `superclass` needs an id
    // (child first, then parent, matching id-assignment order).
    for (child, parent) in &superclass {
        for desc in [child, parent] {
            if !class_id.contains_key(desc) {
                class_id.insert(desc.clone(), next_id);
                next_id = next_id.saturating_add(1);
            }
        }
    }
    let mut supertypes: FxHashMap<u32, FxHashSet<u32>> =
        FxHashMap::with_capacity_and_hasher(superclass.len(), Default::default());
    for class_desc in superclass.keys() {
        let Some(&class_id_val) = class_id.get(class_desc) else {
            continue;
        };
        // Each set contains the class itself plus every ancestor reachable
        // through `superclass`.
        let mut set: FxHashSet<u32> = FxHashSet::default();
        set.insert(class_id_val);
        let mut current = class_desc.as_str();
        // Cycle-safe: any pathological DEX with a superclass cycle would
        // otherwise loop forever. Bound by the class hierarchy size.
        for _ in 0..superclass.len() {
            let Some(parent) = superclass.get(current) else {
                break;
            };
            let Some(&parent_id) = class_id.get(parent) else {
                break;
            };
            if !set.insert(parent_id) {
                break; // already seen → cycle or converged
            }
            current = parent.as_str();
        }
        supertypes.insert(class_id_val, set);
    }
    ClassAnalysis {
        code_index,
        superclass,
        method_impls,
        class_id,
        method_name_id,
        by_method_name,
        cha_cache: std::sync::RwLock::new(FxHashMap::default()),
        supertypes,
        corrupted_methods_skipped,
        cha_cache_truncations: std::sync::atomic::AtomicUsize::new(0),
        build_cache_truncations: std::sync::atomic::AtomicUsize::new(0),
    }
}
/// Interprocedural taint entry point (non-bridge flavor).
///
/// Delegates to `interproc_inner`, which follows invoke-direct,
/// invoke-static, and monomorphic invoke-virtual/invoke-interface edges
/// (via CHA) across all DEX files, limited by `depth`. Callees are resolved
/// through the unified cross-DEX `ClassAnalysis`, so methods defined in a
/// different DEX than the caller are followed too.
///
/// Seeds carry `(TaintSource, Option<u32>)` — see `seeds_from_sources` for
/// the bridge-seeding convenience constructor. This entry point passes no
/// seed positions, and the returned `bridge_sink_reachable_positions` is
/// always empty.
#[allow(clippy::too_many_arguments, reason = "Nine-arg signature: the ordering (dex set, data, analysis, current dex, SSA body, layer, seeds, native-methods set, depth) mirrors `interproc_inner`'s recursive contract and collapsing it into a context struct would force callers to name intermediate bindings.")]
pub fn run_interprocedural(
    dex_files: &[DexFile],
    dex_data: &[&[u8]],
    class_analysis: &ClassAnalysis,
    current_dex: usize,
    ssa: &SsaBody,
    layer: Layer,
    seeds: BTreeMap<VarId, (TaintSource, Option<u32>)>,
    native_methods_per_dex: &[BTreeSet<MethodIdx>],
    depth: u8,
) -> Self {
    // Fresh per-call state: the in-progress call-site set used for cycle
    // breaking and the callee build cache both live only for this run.
    let mut in_progress = BTreeSet::<(usize, MethodIdx)>::new();
    let mut callee_builds: BuildCache = AHashMap::default();
    // No JS-side positions to track for a plain interprocedural run.
    let no_seed_positions = BTreeMap::<VarId, u8>::new();
    let outcome = Self::interproc_inner(
        dex_files,
        dex_data,
        class_analysis,
        current_dex,
        ssa,
        layer,
        seeds,
        &no_seed_positions,
        native_methods_per_dex,
        depth,
        &mut in_progress,
        &mut callee_builds,
    );
    // The entry method's return taint and the per-finding position sets are
    // not part of this entry point's result.
    let (findings, _return_taint, corrupted_methods_skipped, _positions) = outcome;
    Self {
        bridge_sink_reachable_positions: Vec::new(),
        corrupted_methods_skipped,
        findings,
    }
}
/// Bridge-flavored entry point. Takes everything
/// [`Self::run_interprocedural`] takes, plus `seed_positions`: a
/// `BTreeMap<VarId, u8>` keyed on the same VarIds as `seeds`, giving the
/// JS-side arg position each seed represents (0-indexed within the JS
/// Call's argument list; the implicit `this` slot at `param_vars[0]` is
/// dropped by the caller before this map is built).
///
/// The result has the same shape as `run_interprocedural`'s, except that
/// `bridge_sink_reachable_positions: Vec<BTreeSet<usize>>` is populated and
/// index-aligned with `findings`. Downstream analysis intersects it against
/// `TaintSink::NativeModuleArg.arg_positions` from the HBC side to validate
/// composable cross-layer flows.
///
/// Only sinks in the bridge-seeded entry method's own body can carry a
/// non-empty position set. Sinks reached through interprocedural recursion
/// into callees get empty sets (the seed-position tag is not threaded
/// through the recursive call; downstream analysis handles empty
/// intersections sensibly).
#[allow(clippy::too_many_arguments, reason = "Ten-arg signature mirrors run_interprocedural plus seed_positions. Collapsing into a context struct would force callers to name intermediate bindings.")]
pub fn run_interprocedural_bridge(
    dex_files: &[DexFile],
    dex_data: &[&[u8]],
    class_analysis: &ClassAnalysis,
    current_dex: usize,
    ssa: &SsaBody,
    layer: Layer,
    seeds: BTreeMap<VarId, (TaintSource, Option<u32>)>,
    seed_positions: BTreeMap<VarId, u8>,
    native_methods_per_dex: &[BTreeSet<MethodIdx>],
    depth: u8,
) -> Self {
    // Fresh per-call state, exactly as in the non-bridge entry point.
    let mut in_progress = BTreeSet::<(usize, MethodIdx)>::new();
    let mut callee_builds: BuildCache = AHashMap::default();
    let outcome = Self::interproc_inner(
        dex_files,
        dex_data,
        class_analysis,
        current_dex,
        ssa,
        layer,
        seeds,
        &seed_positions,
        native_methods_per_dex,
        depth,
        &mut in_progress,
        &mut callee_builds,
    );
    // The entry method's own return taint is not reported to the caller.
    let (findings, _return_taint, corrupted_methods_skipped, bridge_sink_reachable_positions) =
        outcome;
    Self {
        bridge_sink_reachable_positions,
        corrupted_methods_skipped,
        findings,
    }
}
/// Recursive interprocedural worker. Returns
/// `(findings, return_taint, corrupted_methods_skipped, bridge_sink_reachable_positions)`.
///
/// Runs a taint fixpoint over `ssa` (a method body belonging to
/// `dex_files[current_dex]`), starting from `seeds`, and recurses into
/// resolvable callees while `depth > 0`.
///
/// Parameters:
/// - `dex_files` / `dex_data`: the loaded DEX set and its raw bytes;
///   `dex_data[owner_dex]` is handed to `parse_code_item` when a callee is built.
/// - `class_analysis`: supplies `code_index` (direct/static lookup),
///   `virtual_single_impl` (CHA for virtual/interface) and the
///   `build_cache_truncations` counter.
/// - `seeds`: initial `VarId → (source, source_addr)` taints.
/// - `seed_positions`: JS-side arg position per seed VarId; empty for
///   non-bridge callers and for every recursive call.
/// - `native_methods_per_dex`: per-DEX set of method indices treated as
///   native by the JNI gate.
/// - `visited`: call-site keys `(current_dex, m_idx)` currently being
///   recursed into; an invoke whose key is present is not followed.
/// - `build_cache`: reuse of `(CodeItem, Cfg, SsaBody)` per `(owner_dex, code_off)`.
///
/// The `return_taint` element carries `(TaintSource, Option<u32>)` so
/// the caller can stage it as `pending_return_taint` with the callee's
/// source addr preserved. It is the first tainted operand of a return
/// instruction encountered; later ones do not replace it.
///
/// The `bridge_sink_reachable_positions` vector is index-aligned
/// with `findings`. Empty entries (default) are pushed for
/// findings emitted by recursive callee calls; non-empty entries
/// are populated only for sinks in the current method's body when
/// `seed_positions` is non-empty (the bridge-flavored caller via
/// `run_interprocedural_bridge`).
#[allow(clippy::too_many_arguments, clippy::type_complexity, reason = "Twelve-arg signature: recursive worker threading full interprocedural context + a mutable visited set for cycle-break + a build cache for (CodeItem, Cfg, SsaBody) reuse + the per-VarId seed_positions map for bridge seeding + the per-DEX native-methods set for the JNI sink gate. Return type carries (findings, return_taint, corrupted_skipped, sink_reachable_positions) — a return struct would force all call sites (including the recursive self-call) to name the fields.")]
fn interproc_inner(
dex_files: &[DexFile],
dex_data: &[&[u8]],
class_analysis: &ClassAnalysis,
current_dex: usize,
ssa: &SsaBody,
layer: Layer,
seeds: BTreeMap<VarId, (TaintSource, Option<u32>)>,
seed_positions: &BTreeMap<VarId, u8>,
native_methods_per_dex: &[BTreeSet<MethodIdx>],
depth: u8,
visited: &mut BTreeSet<(usize, MethodIdx)>,
build_cache: &mut BuildCache,
) -> (Vec<TaintFinding>, Option<(TaintSource, Option<u32>)>, usize, Vec<BTreeSet<usize>>) {
let code_index = &class_analysis.code_index;
let Some(dex) = dex_files.get(current_dex) else {
// Caller-contract violation: out-of-range dex index. Bail
// with empty findings rather than panic.
return (Vec::new(), None, 0, Vec::new());
};
// taints: VarId → (source, source_addr). Entries are only ever added,
// never overwritten (every insert below is guarded by `!contains_key`).
let mut taints: BTreeMap<VarId, (TaintSource, Option<u32>)> = seeds;
// Per-VarId set of original seed VarIds that
// propagated to it. Initialized from seed_positions keys
// (each seed VarId reaches itself). Stays empty for non-bridge
// callers (seed_positions is empty).
// NOTE(review): in the code below only the phi branch extends
// `seed_reach`; the MoveResult, Aput/Iput/Sput and default arms insert
// into `taints` without touching it. So a seed that reaches a sink
// through a move/arithmetic def yields an empty position set — confirm
// whether that is intended.
let mut seed_reach: BTreeMap<VarId, BTreeSet<VarId>> = seed_positions
.keys()
.map(|v| (v.clone(), std::iter::once(v.clone()).collect()))
.collect();
// Parallel to all_findings: each entry carries the
// sink_reachable_seed_positions set for the corresponding
// finding. Pushed in lockstep so sort+dedup at the end can
// operate on packed (finding, positions) pairs.
let mut all_findings: Vec<(TaintFinding, BTreeSet<usize>)> = Vec::new();
let mut return_taint: Option<(TaintSource, Option<u32>)> = None;
let mut corrupted_methods_skipped: usize = 0;
// Fixpoint: repeat full passes over every block until a pass adds no
// new taint.
let mut changed = true;
while changed {
changed = false;
for block in ssa.blocks.values() {
// Phi propagation (carries source addr along).
for phi in &block.phis {
for arg_var in phi.operands.values() {
if let Some(entry) = taints.get(arg_var).cloned()
&& !taints.contains_key(&phi.dst)
{
taints.insert(phi.dst.clone(), entry);
// Copy the tainting operand's reach set onto the phi
// destination.
// NOTE(review): because of the `!contains_key(&phi.dst)`
// guard this runs only for the first tainted operand
// seen, so the reach sets of other tainted operands are
// not unioned in — confirm.
if let Some(src_set) = seed_reach.get(arg_var).cloned() {
seed_reach.entry(phi.dst.clone()).or_default().extend(src_set);
}
changed = true;
}
}
}
// `pending_return_taint` carries the callee's return taint
// from an invoke to the immediately-following MoveResult.
// It is reset at the start of every block.
let mut pending_return_taint: Option<(TaintSource, Option<u32>)> = None;
for ssa_insn in &block.insns {
let insn = &ssa_insn.insn;
// Detect tainted return values. Only the first tainted return
// operand found is kept.
if matches!(insn.op, Opcode::Return | Opcode::ReturnObject | Opcode::ReturnWide) {
for use_var in &ssa_insn.uses {
if let Some(entry) = taints.get(use_var)
&& return_taint.is_none()
{
return_taint = Some(entry.clone());
}
}
pending_return_taint = None;
continue;
}
// MoveResult: inherit return taint from the preceding invoke.
// `take()` empties the pending slot whether or not the taint is applied.
if matches!(insn.op, Opcode::MoveResult | Opcode::MoveResultObject | Opcode::MoveResultWide) {
if let (Some(dst), Some(entry)) = (&ssa_insn.dst, pending_return_taint.take())
&& !taints.contains_key(dst)
{
taints.insert(dst.clone(), entry);
changed = true;
}
continue;
}
// Any other opcode clears the pending slot.
pending_return_taint = None;
match insn.op {
Opcode::InvokeVirtual
| Opcode::InvokeInterface
| Opcode::InvokeStatic
| Opcode::InvokeDirect => {
if let Some(PoolIndex::Method(m_idx)) = insn.pool_idx
&& let Some(m_id) = dex.methods.get(m_idx.0 as usize)
{
// Unresolvable method name / class descriptor: count the
// skip and move on to the next instruction.
let Ok(method_name) = dex.get_string(m_id.name_idx) else {
corrupted_methods_skipped =
corrupted_methods_skipped.saturating_add(1);
continue;
};
let method_name = method_name.to_string();
let Ok(class_desc) =
dex.get_type_descriptor(m_id.class_idx)
else {
corrupted_methods_skipped =
corrupted_methods_skipped.saturating_add(1);
continue;
};
let class_desc = class_desc.to_string();
// JNI gate: a tainted argument flowing into a
// method declared with the ACC_NATIVE access
// flag crosses an analyzer-opaque boundary —
// the implementation lives on the native side
// of JNI and is not visible to this engine.
// Emit a TaintSink::NativeMethod finding so
// the gap is analyst-visible; the existing
// classify_sink branch below still fires
// (additive — most native methods will not
// match the Java-method dispatch table).
// Membership is looked up in the CURRENT dex's set, keyed by
// the call-site method index. These findings always carry an
// empty position set.
let is_native = native_methods_per_dex
.get(current_dex)
.is_some_and(|set| set.contains(&m_idx));
if is_native {
for use_var in &ssa_insn.uses {
if let Some((source, src_addr)) = taints.get(use_var) {
let finding = TaintFinding {
source: source.clone(),
sink: TaintSink::NativeMethod {
class: class_desc.clone(),
method: method_name.clone(),
},
layer,
func_id: m_idx.0,
class_descriptor: None,
method_signature: None,
source_offset: *src_addr,
sink_offset: Some(insn.addr),
};
all_findings.push((finding, BTreeSet::new()));
}
}
}
// Sink detection: any tainted argument reaching a sink.
if let Some(sink) = classify_sink(&method_name, &class_desc) {
for use_var in &ssa_insn.uses {
if let Some((source, src_addr)) = taints.get(use_var) {
// Derive the set of JS-side
// arg positions that propagated to this
// tainted operand. Empty when
// seed_positions is empty (non-bridge
// caller) or when no seed-tagged VarId
// reached this use_var.
let positions: BTreeSet<usize> = seed_reach
.get(use_var)
.map(|reach| reach
.iter()
.filter_map(|v| seed_positions.get(v).map(|&p| p as usize))
.collect())
.unwrap_or_default();
let finding = TaintFinding {
source: source.clone(),
sink: sink.clone(),
layer,
func_id: m_idx.0,
class_descriptor: None,
method_signature: None,
source_offset: *src_addr,
sink_offset: Some(insn.addr),
};
all_findings.push((finding, positions));
}
}
}
// Path-traversal sink: fire ONLY when the taint is
// in a PATH argument of a file-open call, so CWE-22
// is substantiated, not assumed. `uses` is
// [receiver?, arg0, …]; InvokeStatic has no receiver,
// so logical arg index = use index (non-static drops
// the leading receiver slot).
if let Some(path_positions) =
classify_path_sink(&method_name, &class_desc)
{
let is_static = matches!(insn.op, Opcode::InvokeStatic);
for (use_idx, use_var) in ssa_insn.uses.iter().enumerate() {
let Some(logical) = logical_arg_index(is_static, use_idx)
else {
continue;
};
if !path_positions.contains(&logical) {
continue;
}
if let Some((source, src_addr)) = taints.get(use_var) {
// Same seed-position derivation as the generic sink
// branch above.
let positions: BTreeSet<usize> = seed_reach
.get(use_var)
.map(|reach| reach
.iter()
.filter_map(|v| seed_positions.get(v).map(|&p| p as usize))
.collect())
.unwrap_or_default();
let finding = TaintFinding {
source: source.clone(),
sink: TaintSink::FilePathTraversal {
path_pattern: String::new(),
},
layer,
func_id: m_idx.0,
class_descriptor: None,
method_signature: None,
source_offset: *src_addr,
sink_offset: Some(insn.addr),
};
all_findings.push((finding, positions));
}
}
}
// Source classification: invoke opcodes have def=None
// in the SSA (no dst register), so taint from library
// calls must flow through pending_return_taint →
// MoveResult, exactly like interprocedural return taint.
// We stage it here; the followable block below may
// override with callee_ret if we recurse.
if let Some(src) = classify_source(&method_name, &class_desc) {
pending_return_taint = Some((src, Some(insn.addr)));
}
// Interprocedural callee resolution:
// - invoke-direct / invoke-static: index lookup (exact)
// - invoke-virtual / invoke-interface: CHA single-impl
// Skipped entirely when the depth budget is exhausted or this
// call-site key is already on the recursion stack.
let callee_loc: Option<(usize, u32)> = if depth > 0
&& !visited.contains(&(current_dex, m_idx))
{
match insn.op {
Opcode::InvokeDirect | Opcode::InvokeStatic => {
code_index.get(&(current_dex, m_idx)).copied()
}
Opcode::InvokeVirtual | Opcode::InvokeInterface => {
// CHA: follow only if the declared class has exactly
// one concrete implementation in the loaded DEX set.
class_analysis.virtual_single_impl(
&class_desc,
&method_name,
)
}
_ => None,
}
} else {
None
};
if let Some((owner_dex, callee_code_off)) = callee_loc {
// (use index, taint entry) for every tainted operand of the
// invoke. The use index is later used directly as the index
// into the callee's `param_vars`.
let tainted_args: Vec<(usize, (TaintSource, Option<u32>))> =
ssa_insn.uses.iter().enumerate().filter_map(|(i, use_var)| {
taints.get(use_var).map(|entry| (i, entry.clone()))
}).collect();
// Only recurse when at least one argument is tainted and the
// owning DEX's bytes are available.
if !tainted_args.is_empty()
&& let Some(&callee_data) = dex_data.get(owner_dex)
{
let cache_key = (owner_dex, callee_code_off);
// Cache lookup or build. Cloning the Rc is cheap
// (refcount bump) and releases the cache's &mut
// borrow so the recursive call below can take
// `build_cache` again. On parse/CFG/SSA failure
// the cache stays empty for this key — the failed
// callee is re-attempted on the next visit (rare;
// real-DEX build failures are typically permanent,
// but re-trying is correct and cheap).
let entry: Option<std::rc::Rc<(CodeItem, Cfg, SsaBody)>> =
if let Some(hit) = build_cache.get(&cache_key) {
Some(std::rc::Rc::clone(hit))
} else if let Ok(callee_code) = parse_code_item(callee_data, callee_code_off)
&& let Ok(callee_cfg) = Cfg::build(&callee_code)
&& let Ok(callee_ssa) = SsaBody::build(&callee_code, &callee_cfg)
{
let rc = std::rc::Rc::new((callee_code, callee_cfg, callee_ssa));
// Entry-count cap (`MAX_BUILD_CACHE_ENTRIES`): once full,
// skip the insert and let subsequent lookups re-do the
// parse+CFG+SSA work. Correct (cache is best-effort)
// but slower. Bounds adversarial input where many
// unique callees would otherwise balloon the cache.
if build_cache.len() < MAX_BUILD_CACHE_ENTRIES {
build_cache.insert(cache_key, std::rc::Rc::clone(&rc));
} else {
// Cap hit — record the skip on the shared atomic
// so the audit pipeline can emit a
// BUILD_CACHE_ENTRY_CAP_HIT Finding.
class_analysis
.build_cache_truncations
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
}
Some(rc)
} else {
None
};
if let Some(entry) = entry {
let callee_ssa = &entry.2;
// Seed the callee: tainted use index i maps to
// `param_vars[i]`; indices with no matching param var are
// dropped.
let param_seeds: BTreeMap<VarId, (TaintSource, Option<u32>)> =
tainted_args.into_iter().filter_map(|(arg_i, src_entry)| {
callee_ssa.param_vars.get(arg_i).map(|v| (v.clone(), src_entry))
}).collect();
// Mark this call site as in progress for the duration of the
// recursive call; it is removed again right after.
// NOTE(review): the key is the caller-side (current_dex, m_idx)
// reference, not (owner_dex, callee_code_off) — presumably
// recursion through a differently-keyed reference to the same
// method is bounded by `depth` alone; confirm.
visited.insert((current_dex, m_idx));
// Don't thread seed_positions
// across the recursive call. Callee findings
// get empty position-sets — downstream
// analysis handles empty intersections sensibly.
let empty_seed_positions: BTreeMap<VarId, u8> = BTreeMap::new();
let (callee_findings, callee_ret, callee_skipped, callee_positions) =
Self::interproc_inner(
dex_files, dex_data, class_analysis,
owner_dex,
callee_ssa, layer,
param_seeds, &empty_seed_positions,
native_methods_per_dex,
// WHY: this branch is only reachable when the `depth > 0`
// test that computed `callee_loc` passed, so
// saturating_sub(1) never actually saturates here.
depth.saturating_sub(1), visited,
build_cache,
);
visited.remove(&(current_dex, m_idx));
// Callee findings are merged into this method's list together
// with their (index-aligned) position sets.
all_findings.extend(
callee_findings
.into_iter()
.zip(callee_positions.into_iter())
);
corrupted_methods_skipped = corrupted_methods_skipped.saturating_add(callee_skipped);
// Callee return taint overrides source
// classification (callee is authoritative).
if let Some(ret) = callee_ret {
pending_return_taint = Some(ret);
}
}
}
}
}
}
// Array element or field store: if the instruction has a dst, it
// inherits the taint of the first tainted operand.
Opcode::Aput
| Opcode::AputWide
| Opcode::AputObject
| Opcode::Iput
| Opcode::IputWide
| Opcode::IputObject
| Opcode::Sput
| Opcode::SputObject => {
if let Some(dst) = &ssa_insn.dst {
for use_var in &ssa_insn.uses {
if let Some(entry) = taints.get(use_var).cloned()
&& !taints.contains_key(dst)
{
taints.insert(dst.clone(), entry);
changed = true;
}
}
}
}
// Every other opcode: dst (if any) inherits the taint of the first
// tainted operand.
_ => {
if let Some(dst) = &ssa_insn.dst {
for use_var in &ssa_insn.uses {
if let Some(entry) = taints.get(use_var).cloned()
&& !taints.contains_key(dst)
{
taints.insert(dst.clone(), entry);
changed = true;
}
}
}
}
}
}
}
}
// Deduplicate: the fixed-point loop re-visits sink instructions on each
// iteration once taint is established, generating duplicate findings.
// Sort + dedup operates on whole (TaintFinding, positions) pairs, so two
// entries collapse only when the finding AND its position set compare
// equal.
// NOTE(review): this relies on a given finding always being emitted with
// the same position set on every pass; that appears to hold because a
// VarId's `seed_reach` entry is written only at the moment the VarId is
// first tainted — confirm if seed_reach propagation is ever extended.
all_findings.sort();
all_findings.dedup();
let (findings, positions): (Vec<TaintFinding>, Vec<BTreeSet<usize>>) =
all_findings.into_iter().unzip();
(findings, return_taint, corrupted_methods_skipped, positions)
}
}
#[cfg(test)]
mod path_sink_tests {
    //! Unit coverage for the FileRead→FileWrite precision fix. The
    //! `FileWrite` data-write sink is retired because it mislabeled the
    //! benign read-then-write file-copy idiom as CWE-22. In its place,
    //! `classify_path_sink` + `logical_arg_index` substantiate CWE-22 only
    //! when the taint reaches the PATH argument of a file-open call. The
    //! end-to-end behaviour on real bytecode is validated by the corpus
    //! re-sweep (the `FileWrite` Medium population must drop to ~0 and
    //! `FilePathTraversal` must appear).
    use super::*;

    /// Stream/writer data-write methods carry the DATA, not a path, so
    /// `classify_sink` must not classify any of them.
    #[test]
    fn data_write_methods_are_no_longer_a_sink() {
        let retired_data_writes: [(&str, &str); 6] = [
            ("write", "Ljava/io/FileOutputStream;"),
            ("write", "Ljava/io/OutputStream;"),
            ("writeBytes", "Ljava/io/DataOutputStream;"),
            ("writeUTF", "Ljava/io/DataOutputStream;"),
            ("println", "Ljava/io/PrintWriter;"),
            ("write", "Ljava/io/BufferedWriter;"),
        ];
        for (m, c) in retired_data_writes {
            let verdict = classify_sink(m, c);
            assert!(
                verdict.is_none(),
                "{m} on {c} must not be a sink (retired data-write firing)"
            );
        }
    }

    /// File-open constructors and `java.nio.file` entry points report the
    /// expected path-argument positions.
    #[test]
    fn file_open_calls_are_path_sinks_with_correct_positions() {
        // `new File(path)` / `new File(parent, child)` — both segments are path.
        assert_eq!(classify_path_sink("<init>", "Ljava/io/File;"), Some(&[0, 1][..]));

        // Write-side stream/writer ctors — path at arg 0, trailing mode excluded.
        let write_side_ctors: [&str; 5] = [
            "Ljava/io/FileOutputStream;",
            "Ljava/io/FileWriter;",
            "Ljava/io/RandomAccessFile;",
            "Ljava/io/PrintWriter;",
            "Ljava/io/PrintStream;",
        ];
        write_side_ctors.into_iter().for_each(|c| {
            assert_eq!(classify_path_sink("<init>", c), Some(&[0][..]), "ctor {c}");
        });

        // java.nio.file write-side static APIs.
        let files_statics: [&str; 5] = [
            "write",
            "newOutputStream",
            "newBufferedWriter",
            "createFile",
            "createDirectories",
        ];
        files_statics.into_iter().for_each(|m| {
            assert_eq!(
                classify_path_sink(m, "Ljava/nio/file/Files;"),
                Some(&[0][..]),
                "Files.{m}"
            );
        });

        // Paths.get / Path.resolve / Path.resolveSibling.
        for (method, class) in [
            ("get", "Ljava/nio/file/Paths;"),
            ("resolve", "Ljava/nio/file/Path;"),
            ("resolveSibling", "Ljava/nio/file/Path;"),
        ] {
            assert_eq!(classify_path_sink(method, class), Some(&[0][..]));
        }
    }

    /// Anything that is not a file-open call yields no path positions.
    #[test]
    fn non_file_open_calls_are_not_path_sinks() {
        let not_path_sinks: [(&str, &str); 4] = [
            // Data-write methods are not path sinks.
            ("write", "Ljava/io/FileOutputStream;"),
            ("println", "Ljava/io/PrintWriter;"),
            // `<init>` on a non-file class is not a path sink.
            ("<init>", "Ljava/lang/StringBuilder;"),
            // `get` on a non-Paths class is not a path sink (HashMap.get, etc.).
            ("get", "Ljava/util/HashMap;"),
        ];
        for (method, class) in not_path_sinks {
            assert_eq!(classify_path_sink(method, class), None);
        }
    }

    /// `logical_arg_index` maps a use index to an argument position,
    /// accounting for the receiver slot of non-static invokes.
    #[test]
    fn logical_arg_index_maps_invoke_kinds_correctly() {
        // Static: no receiver — use index IS the argument position. So
        // `Files.write(path, data)` has path at uses[0] → logical 0.
        for (use_idx, expected) in [(0, Some(0)), (1, Some(1))] {
            assert_eq!(logical_arg_index(true, use_idx), expected);
        }
        // Non-static / <init>: receiver at uses[0] (logical None), args shift by
        // one. So `new File(path)` has path at uses[1] → logical 0.
        let receiver = logical_arg_index(false, 0);
        assert_eq!(receiver, None, "receiver slot is not an argument");
        let first = logical_arg_index(false, 1);
        assert_eq!(first, Some(0), "first real arg");
        let second = logical_arg_index(false, 2);
        assert_eq!(second, Some(1), "second real arg");
    }
}
#[cfg(test)]
mod cap_tests {
//! Structural tests that the entry-count caps on `cha_cache` +
//! `build_cache_truncations` fire correctly when driven past their
//! limits. These are the gate-coverage counterpart to the production
//! `CHA_CACHE_ENTRY_CAP_HIT` / `BUILD_CACHE_ENTRY_CAP_HIT` Findings
//! emitted by the audit pipeline.
//!
//! Constructing a real adversarial DEX with > 65536 unique
//! (class, method-name) pairs is non-trivial (would need a
//! synthetic-DEX builder). These tests cover the failure mode at the
//! data-structure boundary: with a pre-filled `cha_cache` at the
//! cap, the next `virtual_single_impl` insert is skipped and the
//! truncations counter advances by exactly one. Verifies:
//! - the cap-skip branch is reached (counter > 0)
//! - the cache size remains at the cap (no over-cap growth)
//! - the function still returns a correct result (re-iteration path)
use super::*;
use std::sync::atomic::Ordering;
/// Build the smallest `ClassAnalysis` that can drive
/// `virtual_single_impl`. It interns one class, `"Lfoo;"` (id 0), and one
/// method name, `"bar"` (id 0). It registers a single `bar` candidate for
/// class id 0 at `(owner_dex=0, code_off=42)`, with a supertype set for
/// class id 0 containing only itself. Every other table is empty and both
/// truncation counters start at zero.
fn make_minimal_class_analysis() -> ClassAnalysis {
    // Interning tables: exactly one class and one method name.
    let mut class_id: AHashMap<String, u32> = AHashMap::new();
    class_id.insert(String::from("Lfoo;"), 0);
    let mut method_name_id: AHashMap<String, u32> = AHashMap::new();
    method_name_id.insert(String::from("bar"), 0);

    // One candidate implementation of `bar`: (class_id, dex_idx, code_off).
    let by_method_name: BTreeMap<String, Vec<(u32, usize, u32)>> =
        BTreeMap::from([(String::from("bar"), vec![(0, 0, 42)])]);

    // Class 0 is its own (only) supertype.
    let own_id_only: FxHashSet<u32> = std::iter::once(0u32).collect();
    let supertypes: FxHashMap<u32, FxHashSet<u32>> =
        std::iter::once((0u32, own_id_only)).collect();

    ClassAnalysis {
        class_id,
        method_name_id,
        by_method_name,
        supertypes,
        code_index: BTreeMap::new(),
        superclass: BTreeMap::new(),
        method_impls: BTreeMap::new(),
        cha_cache: std::sync::RwLock::new(FxHashMap::default()),
        corrupted_methods_skipped: 0,
        cha_cache_truncations: std::sync::atomic::AtomicUsize::new(0),
        build_cache_truncations: std::sync::atomic::AtomicUsize::new(0),
    }
}
#[test]
fn cha_cache_cap_skip_increments_truncations_counter() {
    let ca = make_minimal_class_analysis();
    // Convenience readers for the two observables this test tracks.
    let truncations = || ca.cha_cache_truncations.load(Ordering::Relaxed);
    let cache_len = || ca.cha_cache.read().expect("test: read lock").len();

    // Pre-fill the cache to exactly MAX_CHA_CACHE_ENTRIES with synthetic
    // keys (i, 0) for i >= 1, which cannot collide with the test's lookup
    // key (0, 0). Writing through the RwLock gives the cap-skip branch the
    // same "full" cache it would see in production. The write guard is
    // scoped so it is released before `virtual_single_impl` runs.
    {
        let mut cache = ca.cha_cache.write().expect("test setup: lock");
        for i in 1..=MAX_CHA_CACHE_ENTRIES {
            #[allow(clippy::cast_possible_truncation, reason = "test-only synthetic keys; i <= MAX_CHA_CACHE_ENTRIES = 65536 < u32::MAX")]
            let key = (i as u32, 0u32);
            cache.insert(key, None);
        }
        assert_eq!(cache.len(), MAX_CHA_CACHE_ENTRIES);
    }

    // Sanity: nothing has been truncated before the first lookup.
    assert_eq!(truncations(), 0);

    // First lookup misses the pre-filled cache. Expected: the insert is
    // skipped (cache stays at the cap), the truncations counter becomes 1,
    // and the answer is still computed from the `by_method_name` candidates.
    let result = ca.virtual_single_impl("Lfoo;", "bar");
    assert_eq!(
        result, Some((0, 42)),
        "virtual_single_impl must still return the correct (owner_dex, code_off) \
         via re-iteration when the cache is at cap"
    );
    assert_eq!(
        truncations(), 1,
        "cap-skip branch should have incremented the truncations counter"
    );
    assert_eq!(
        cache_len(),
        MAX_CHA_CACHE_ENTRIES,
        "cache must stay at the cap — over-cap growth would defeat the bound"
    );

    // Second lookup with the same key: the cache is still full, so the skip
    // is recorded again. Each cap-skip is counted independently.
    let _ = ca.virtual_single_impl("Lfoo;", "bar");
    assert_eq!(
        truncations(), 2,
        "every cap-skip should advance the counter, not just the first"
    );
}
#[test]
fn cha_cache_below_cap_inserts_without_truncation() {
    // Control case for the cap-skip test: starting from an empty cache,
    // a lookup must be cached and must leave the truncations counter at
    // zero. Without this, an implementation that bumped the counter on
    // every call would still pass the at-cap test.
    let analysis = make_minimal_class_analysis();

    let resolved = analysis.virtual_single_impl("Lfoo;", "bar");
    assert_eq!(resolved, Some((0, 42)));

    let skipped = analysis.cha_cache_truncations.load(Ordering::Relaxed);
    assert_eq!(
        skipped, 0,
        "no cap-skip should fire when cache has room"
    );

    let cached = analysis.cha_cache.read().expect("test: read lock").len();
    assert_eq!(
        cached, 1,
        "single lookup should produce one cache entry"
    );
}
}
#[cfg(test)]
mod native_method_tests {
    //! Exercises `collect_native_methods` against a vendored APK
    //! fixture (`droidsaw-bench/probe/crosstaint_rn.apk`, resolved
    //! relative to this crate's manifest directory).
    //!
    //! The premise is that a shipped React Native APK carries classes
    //! declaring `ACC_NATIVE` methods, so the helper must report at
    //! least one across the APK's DEX files. When the fixture is
    //! present, a zero count fails the test: either the helper is
    //! broken or the fixture was repacked without native methods. When
    //! the fixture is missing, or `Apk::parse` rejects it, the test
    //! returns early without asserting anything.
    use super::*;
    use droidsaw_apk::Apk;
    use std::path::Path;

    /// Load the probe APK, or return `None` when the file does not
    /// exist or cannot be parsed.
    fn vendored_probe_apk() -> Option<Apk> {
        // <manifest dir>/../droidsaw-bench/probe/crosstaint_rn.apk
        let mut fixture = Path::new(env!("CARGO_MANIFEST_DIR")).to_path_buf();
        for component in ["..", "droidsaw-bench", "probe", "crosstaint_rn.apk"] {
            fixture.push(component);
        }
        if fixture.exists() {
            Apk::parse(&fixture).ok()
        } else {
            None
        }
    }

    #[test]
    fn collect_native_methods_finds_framework_native_methods() {
        // No usable fixture: skip instead of failing, so the suite
        // still runs in workspaces without the probe directory.
        let Some(apk) = vendored_probe_apk() else {
            return;
        };

        let mut total_native = 0_usize;
        for (idx, dex_entry) in apk.dex.iter().enumerate() {
            // DEX entries that fail to parse are passed over; they
            // contribute nothing to the total.
            if let Ok(dex) = droidsaw_dex::parser::DexFile::parse(&dex_entry.data, None) {
                let native = collect_native_methods(&dex, &dex_entry.data);
                total_native = total_native.saturating_add(native.len());

                // Each reported index has to address an entry of this
                // DEX's method pool.
                let pool_len = dex.methods.len();
                for m_idx in &native {
                    assert!(
                        (m_idx.0 as usize) < pool_len,
                        "dex {idx}: native MethodIdx {m_idx:?} out of method-pool range ({} entries)",
                        pool_len,
                    );
                }
            }
        }

        // Zero native methods across the whole APK means the helper is
        // broken or the fixture changed; either is worth a look.
        assert!(
            total_native > 0,
            "expected vendored RN APK to contain at least one framework ACC_NATIVE method",
        );
    }
}
#[cfg(test)]
mod shadow_gate_tests {
    use super::*;
    use crate::analysis::dup_class_fixture;

    /// Duplicate-`class_idx` fixture in which the first (canonical)
    /// `class_def` row is non-native and the second (shadow) row
    /// declares a native method. `collect_native_methods` is expected
    /// to ignore the shadow row, so the shadow's method index must be
    /// absent and the resulting set must be empty.
    #[test]
    fn collect_native_methods_skips_shadow_row() {
        // Fixture arguments: canonical row -> method idx 0, non-native;
        // shadow row -> method idx 1, native.
        let fx = dup_class_fixture::with_native_method_rows(0, false, 1, true);
        let shadow_method = MethodIdx(1);

        let native = collect_native_methods(&fx.dex, &fx.raw);

        assert!(
            !native.contains(&shadow_method),
            "shadow row's native method must not be attributed to the canonical class"
        );
        assert!(
            native.is_empty(),
            "canonical (first-wins) row is non-native, so the native set is empty; \
             got {native:?}"
        );
    }

    /// Duplicate-`class_idx` fixture fed through
    /// `collect_unified_code_index`. The canonical row's method name
    /// must appear in `by_method_name`; the shadow row's method name
    /// must not, because the shadow `class_def` is expected to be
    /// dropped before registration.
    #[test]
    fn unified_code_index_skips_shadow_row() {
        // Neither row is native here. Per the fixture, both rows carry
        // code (non-zero code_off) and use distinct method names
        // ("canonMethod" / "shadowMethod"), so an ungated walk would
        // index the shadow's name as a separate entry.
        let fx = dup_class_fixture::with_native_method_rows(0, false, 1, false);
        let dex_files = vec![fx.dex];
        let dex_data: Vec<&[u8]> = vec![fx.raw.as_slice()];

        let analysis = DexTaintAnalysis::collect_unified_code_index(&dex_files, &dex_data);
        let indexed_names = &analysis.by_method_name;

        assert!(
            indexed_names.contains_key("canonMethod"),
            "canonical (first-wins) row's method must still be indexed"
        );
        assert!(
            !indexed_names.contains_key("shadowMethod"),
            "shadow row's method name must not be indexed; the gate drops the shadow class_def"
        );
    }
}