droidsaw 2.0.0

DROIDSAW — unified Android reverse engineering CLI. Hermes, DEX, APK signing. JSON output, MCP server. Bytecode is not a security layer.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
//! HBC (Hermes bytecode) intra-function taint analysis.
//!
//! Structurally parallel to `dex_taint`: seeds `param_vars` as sources,
//! propagates through SSA operands, detects JS-layer sinks. Now that
//! `SsaFunction` exposes `param_vars`, seeding is identical to DEX.
//!
//! Sinks detected:
//!   `DirectEval`        → `TaintSink::Eval`           (code injection)
//!   `Call*` with tainted args → `TaintSink::NativeModuleArg`  (bridge crossing)
//!
//! ## Two visitor entry points
//!
//! * [`HbcTaintAnalysis::run_eval_only`] takes `&SsaFunction<Raw>` and
//!   emits only the `DirectEval` sink. The bridge-crossing back-walk is
//!   skipped. Used for every non-bridge function (no optimize pipeline
//!   needs to run for these, eliminating the per-function `optimize::optimize`
//!   wall-time cost on bundles whose bridge-function set is small relative
//!   to total function count).
//! * [`HbcTaintAnalysis::run_full`] takes `&SsaFunction<Resolved>` and emits
//!   both `DirectEval` AND `NativeModuleArg`. The back-walk requires
//!   `ResolvedString` operands on `GetById*` ops, which are canonical
//!   only post-`optimize::optimize`.
//!
//! ## `NativeModuleArg` sink
//!
//! * `module` + `method` — the `(NativeModules.X.Y)` identity recovered
//!   by the back-walk against the `Call*` callee. The back-walk's
//!   **terminal-hop gauge** ([`extract_native_modules_chain`]) drops
//!   findings whose outer hop is itself another `GetById*` — unless that
//!   hop's resolved property name is literally `"NativeModules"` —
//!   defending against deeper-chain misattribution (e.g.
//!   `NativeModules.Sub.Mod.meth` would otherwise produce a
//!   confidently-wrong `("Mod", "meth")` tuple that both newtype
//!   constructors accept).
//! * `arg_positions` — the set of 0-indexed argument positions
//!   (operand index minus 3, matching the `[dst, callee, argc]`
//!   header) that carried taint at THIS Call site. Variant-local,
//!   so cross-function contamination via union at the HBC layer is
//!   structurally impossible.
//!
//! ## Backwalk-failure breadcrumbs
//!
//! When a `Call*` in a bridge function has tainted args but the back-walk
//! does not yield a `(module, method)` tuple, the visitor returns the
//! site in [`HbcTaintAnalysis::backwalk_failures`]. The orchestration
//! layer translates these into `Severity::Info` `HBC_BRIDGE_BACKWALK_FAILED`
//! findings so analysts see WHY findings are missing relative to the
//! number of tainted bridge calls observed. Downstream analysis will
//! supersede this with structured `BridgeResolutionAmbiguous`
//! finding emissions carrying `AmbiguousCause` variants.

use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::btree_map::Entry;

use droidsaw_hermes::decompile::ssa::{Phase, Raw, Resolved, SsaFunction, SsaOperand, VarId};
use droidsaw_hermes::opcodes::OpCode;
use droidsaw_common::analysis::{TaintFinding, TaintSink, TaintSource};
use droidsaw_common::cross_layer_taint::{NativeModuleMethodName, NativeModuleName};
use droidsaw_common::finding::Layer;

/// Why a bridge `Call*` site's back-walk produced no
/// `(NativeModuleName, NativeModuleMethodName)` tuple. Carried back to
/// the orchestration layer for breadcrumb-finding emission. Downstream
/// analysis will replace this with structured `AmbiguousCause` variants on
/// `BridgeResolutionAmbiguous` findings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BackwalkFailureReason {
    /// `Call*` operand[1] (callee) is missing or is not a `Var`
    /// (typically `CallDirect` resolving via func_id; or a
    /// non-back-walkable shape).
    CalleeNotVar,
    /// Either hop of the two-step `GetById` chain returned `None` —
    /// `Mov`-collapsed callee, phi-only def chain, computed access
    /// (`NativeModules['X']`), `Function.prototype.apply` redirect, etc.
    /// Also reported when both hops succeed but one of the recovered
    /// names is rejected by `NativeModuleName::try_new` /
    /// `NativeModuleMethodName::try_new`.
    ChainExtractionFailed,
    /// Both hops succeeded but the outer hop's def is itself a `GetById*`
    /// whose trailing operand is not the resolved string
    /// `"NativeModules"`, indicating a deeper chain or a non-bridge
    /// receiver. The two-hop visitor would otherwise misattribute the
    /// inner GetById pair as `(module, method)`. A `GetById*` outer def
    /// that does resolve to `"NativeModules"` is accepted and never
    /// produces this reason.
    TerminalHopIsGetById,
}

/// Locator for one backwalk-failure site. `op_index` is the linear
/// 0-indexed offset into the function's full op stream (sum of preceding
/// blocks' op counts plus the op's intra-block index) — lossy but
/// sufficient for the analyst to grep + diff against decompile output.
#[derive(Debug, Clone, Copy)]
pub struct BackwalkFailureSite {
    /// Id of the function containing the failing `Call*`, as passed to
    /// the visitor entry point.
    pub func_id: u32,
    /// Position of the `Call*` op counted across all blocks in block
    /// order; phis are not counted.
    pub op_index: usize,
    /// Why no `(module, method)` tuple was recovered at this site.
    pub reason: BackwalkFailureReason,
}

/// Output of one intra-function taint pass over a single HBC function,
/// produced by [`HbcTaintAnalysis::run_eval_only`] or
/// [`HbcTaintAnalysis::run_full`].
pub struct HbcTaintAnalysis {
    /// Sink findings for the function: `TaintSink::Eval` entries from
    /// either entry point, plus `TaintSink::NativeModuleArg` entries when
    /// `run_full` is invoked on a function listed in its bridge set.
    pub findings: Vec<TaintFinding>,
    /// Backwalk-failure sites that the orchestration layer turns into
    /// breadcrumb `Finding`s. Always empty for `run_eval_only`.
    pub backwalk_failures: Vec<BackwalkFailureSite>,
}

impl HbcTaintAnalysis {
    /// Eval-only pass over raw (un-optimized) SSA.
    ///
    /// Propagates `seeds` to a fixed point and reports every tainted
    /// `DirectEval` operand. The bridge `Call*` sink is never consulted,
    /// so `backwalk_failures` in the result is always empty. Because the
    /// pass only needs `<Raw>` SSA, callers can feed it `build_ssa`
    /// output without running `optimize::optimize` first.
    pub fn run_eval_only(
        ssa: &SsaFunction<Raw>,
        func_id: u32,
        layer: Layer,
        seeds: BTreeMap<VarId, TaintSource>,
    ) -> Self {
        let taint_map = propagate_to_fixed_point(ssa, seeds);
        Self {
            findings: emit_eval_sinks(ssa, &taint_map, layer, func_id),
            backwalk_failures: Vec::new(),
        }
    }

    /// Full pass over resolved SSA: `DirectEval` sinks plus, for bridge
    /// functions, `NativeModuleArg` sinks recovered by the back-walk.
    ///
    /// The `<Resolved>` phase is required because the back-walk reads
    /// `ResolvedString` operands on `GetById*` ops.
    ///
    /// `bridge_func_ids` decides whether THIS function is scanned for
    /// bridge calls: when `func_id` is absent from the set, the result
    /// carries eval findings only and no backwalk failures. Limiting the
    /// `optimize::optimize` cost to functions in the set is the caller's
    /// job; this method does not second-guess that gating.
    pub fn run_full(
        ssa: &SsaFunction<Resolved>,
        func_id: u32,
        layer: Layer,
        seeds: BTreeMap<VarId, TaintSource>,
        bridge_func_ids: &BTreeSet<u32>,
    ) -> Self {
        let taint_map = propagate_to_fixed_point(ssa, seeds);
        let mut result = Self {
            findings: emit_eval_sinks(ssa, &taint_map, layer, func_id),
            backwalk_failures: Vec::new(),
        };
        if !bridge_func_ids.contains(&func_id) {
            return result;
        }
        // Eval findings come first, bridge findings are appended after.
        let bridge = emit_bridge_sinks(
            ssa,
            &taint_map,
            layer,
            func_id,
            &mut result.backwalk_failures,
        );
        result.findings.extend(bridge);
        result
    }
}

/// Phase A — propagate taint through phi + op operands until no new
/// VarId enters the `taints` map. Sink emission happens in a separate
/// phase so it runs exactly once per sink site.
///
/// A destination inherits the `TaintSource` of its first tainted input
/// (phi args / op operands in declaration order) and is never
/// overwritten once present — seeds included.
///
/// Destinations that are already tainted are skipped up front, so a
/// `TaintSource` is cloned only when it is actually inserted. (Cloning
/// first and checking vacancy afterwards would allocate and drop a clone
/// for every tainted operand of every already-tainted destination on
/// every pass.)
///
/// Termination: every pass that sets `changed` strictly grows
/// `taints.len()`, because inserts only happen for keys that are absent.
/// The VarId set of a function is finite, so the loop runs at most
/// `|VarId set| + 1` passes.
///
/// Returns the seed map extended with every VarId reachable from a seed
/// through phi args or `Var` operands.
fn propagate_to_fixed_point<P: Phase>(
    ssa: &SsaFunction<P>,
    seeds: BTreeMap<VarId, TaintSource>,
) -> BTreeMap<VarId, TaintSource> {
    let mut taints = seeds;
    loop {
        let mut changed = false;
        for block in &ssa.blocks {
            for phi in &block.phis {
                if taints.contains_key(&phi.dst) {
                    continue;
                }
                // First tainted incoming value wins.
                let inherited = phi
                    .args
                    .iter()
                    .find_map(|(_pred, arg)| taints.get(arg).cloned());
                if let Some(src) = inherited {
                    taints.insert(phi.dst, src);
                    changed = true;
                }
            }
            for op in &block.ops {
                let Some(dst) = op.dst else { continue };
                if taints.contains_key(&dst) {
                    continue;
                }
                // Only `Var` operands can carry taint; constants and
                // other operand kinds are ignored.
                let inherited = op.operands.iter().find_map(|operand| match operand {
                    SsaOperand::Var(v) => taints.get(v).cloned(),
                    _ => None,
                });
                if let Some(src) = inherited {
                    taints.insert(dst, src);
                    changed = true;
                }
            }
        }
        if !changed {
            break;
        }
    }
    taints
}

/// Collect one `TaintSink::Eval` finding per tainted `Var` operand of
/// every `DirectEval` op, in block → op → operand order.
///
/// Works on any SSA phase: only `Var` operands and the taint map are
/// consulted, never `ResolvedString`. Offsets and DEX-specific fields of
/// the finding are left as `None`.
fn emit_eval_sinks<P: Phase>(
    ssa: &SsaFunction<P>,
    taints: &BTreeMap<VarId, TaintSource>,
    layer: Layer,
    func_id: u32,
) -> Vec<TaintFinding> {
    ssa.blocks
        .iter()
        .flat_map(|block| block.ops.iter())
        .filter(|op| op.op == OpCode::DirectEval)
        .flat_map(|eval_op| eval_op.operands.iter())
        .filter_map(|operand| match operand {
            SsaOperand::Var(v) => taints.get(v),
            _ => None,
        })
        .map(|tainted_by| TaintFinding {
            source: tainted_by.clone(),
            sink: TaintSink::Eval,
            layer,
            func_id,
            class_descriptor: None,
            method_signature: None,
            source_offset: None,
            sink_offset: None,
        })
        .collect()
}

/// Emit `TaintSink::NativeModuleArg` findings for every bridge `Call*`
/// whose arguments include at least one tainted Var AND whose callee
/// back-walks cleanly to a `(NativeModuleName, NativeModuleMethodName)`
/// tuple via [`extract_native_modules_chain`].
///
/// Back-walk failures push a `BackwalkFailureSite` into `failures` so
/// the orchestration layer can emit breadcrumb `Info` findings; without
/// them the dropped findings would be invisible to the analyst until
/// downstream analysis lands its structured `AmbiguousCause` emit.
fn emit_bridge_sinks(
    ssa: &SsaFunction<Resolved>,
    taints: &BTreeMap<VarId, TaintSource>,
    layer: Layer,
    func_id: u32,
    failures: &mut Vec<BackwalkFailureSite>,
) -> Vec<TaintFinding> {
    let def_map: BTreeMap<VarId, (usize, usize)> = ssa
        .blocks
        .iter()
        .enumerate()
        .flat_map(|(bi, b)| {
            b.ops.iter().enumerate().filter_map(move |(oi, op)| {
                op.dst.map(|d| (d, (bi, oi)))
            })
        })
        .collect();

    let mut findings = Vec::new();
    let mut linear_op_index: usize = 0;
    for block in &ssa.blocks {
        for op in &block.ops {
            let this_op_index = linear_op_index;
            linear_op_index = linear_op_index.saturating_add(1);

            if !is_call_op(op.op) {
                continue;
            }
            let mut arg_positions: BTreeSet<usize> = BTreeSet::new();
            let mut first_source: Option<TaintSource> = None;
            for (arg_pos, operand) in op.operands.iter().skip(3).enumerate() {
                if let SsaOperand::Var(v) = operand
                    && let Some(src) = taints.get(v)
                {
                    if first_source.is_none() {
                        first_source = Some(src.clone());
                    }
                    arg_positions.insert(arg_pos);
                }
            }
            let Some(source) = first_source else { continue };

            let callee_var = match op.operands.get(1) {
                Some(SsaOperand::Var(v)) => *v,
                _ => {
                    failures.push(BackwalkFailureSite {
                        func_id,
                        op_index: this_op_index,
                        reason: BackwalkFailureReason::CalleeNotVar,
                    });
                    continue;
                }
            };
            match extract_native_modules_chain(callee_var, &def_map, ssa) {
                ChainResult::Resolved(module, method) => {
                    findings.push(TaintFinding {
                        source,
                        sink: TaintSink::NativeModuleArg {
                            module,
                            method,
                            arg_positions,
                        },
                        layer,
                        func_id,
                        class_descriptor: None,
                        method_signature: None,
                        source_offset: None,
                        sink_offset: None,
                    });
                }
                ChainResult::Failed(reason) => {
                    failures.push(BackwalkFailureSite {
                        func_id,
                        op_index: this_op_index,
                        reason,
                    });
                }
            }
        }
    }
    findings
}

/// Outcome of [`extract_native_modules_chain`] — typed so the call site
/// can both emit the resolved finding AND record the failure reason in
/// a single `match`, instead of relying on `Option` plus a separate
/// failure-reason lookup.
enum ChainResult {
    /// The back-walk recovered a `(module, method)` pair and both names
    /// passed their newtype constructors.
    Resolved(NativeModuleName, NativeModuleMethodName),
    /// The back-walk was abandoned; the payload says at which stage.
    Failed(BackwalkFailureReason),
}

/// Resolve a single `<obj>.<name>` hop of a property chain.
///
/// Looks up the op that defines `var` and, if it is one of the
/// `GetById*` / `TryGetById*` opcodes, yields the receiver variable and
/// the property name it reads.
///
/// The op is expected to be laid out as `[dst, obj, ..., name]`: the
/// receiver is operand 1 and must be a `Var`; the property name is the
/// LAST operand and must be a `ResolvedString`.
///
/// Returns `None` when:
/// * `var` has no entry in `def_map`, or the recorded location is out of
///   range for `ssa`;
/// * the defining op is not a `GetById*` variant;
/// * operand 1 is absent or not a `Var`;
/// * the last operand is not a `ResolvedString` (kept as a defensive
///   fallthrough even though the `Resolved` phase is meant to rule it
///   out).
fn read_get_by_id_chain_step(
    var: VarId,
    def_map: &BTreeMap<VarId, (usize, usize)>,
    ssa: &SsaFunction<Resolved>,
) -> Option<(VarId, String)> {
    let &(block_idx, op_idx) = def_map.get(&var)?;
    let def_op = ssa.blocks.get(block_idx)?.ops.get(op_idx)?;
    if !is_get_by_id_op(def_op.op) {
        return None;
    }
    let SsaOperand::Var(receiver) = def_op.operands.get(1)? else {
        return None;
    };
    let SsaOperand::ResolvedString(property) = def_op.operands.last()? else {
        return None;
    };
    Some((*receiver, property.clone()))
}

/// Recover `(NativeModuleName, NativeModuleMethodName)` from the callee
/// of a `Call*`, for the `[wrapper.]NativeModules.<module>.<method>(...)`
/// pattern.
///
/// The walk is fixed-depth:
///
/// 1. `callee = GetById(module_var, "<method>")`
/// 2. `module_var = GetById(outer_var, "<module>")`
/// 3. Gauge on `outer_var`'s defining op:
///    * not a `GetById*` (or no recorded def) → accept. This is the
///      shape where `NativeModules` itself was loaded directly, e.g. via
///      `GetGlobalObject` / `LoadFromEnvironment` / `LoadParam`.
///    * a `GetById*` whose last operand is the resolved string
///      `"NativeModules"` → accept. This is the canonical Metro hoist,
///      `<env_slot>.NativeModules.<module>.<method>()`.
///    * a `GetById*` with any other (or unresolved) property name →
///      reject with `TerminalHopIsGetById`. This covers deeper chains
///      such as `NativeModules.Outer.Module.method` as well as
///      non-bridge receivers (`Object.defineProperty`,
///      `TurboModules.push`, ...).
///
/// Failure mapping:
/// * hop 1 or hop 2 not readable → `ChainExtractionFailed`;
/// * gauge rejection → `TerminalHopIsGetById`;
/// * either name rejected by its newtype constructor →
///   `ChainExtractionFailed`.
///
/// No recursion or iteration is involved: at most three `def_map`
/// lookups and one string comparison per call.
fn extract_native_modules_chain(
    callee_var: VarId,
    def_map: &BTreeMap<VarId, (usize, usize)>,
    ssa: &SsaFunction<Resolved>,
) -> ChainResult {
    // Hops 1 and 2: method name, then module name.
    let (module_var, method_str) = match read_get_by_id_chain_step(callee_var, def_map, ssa) {
        Some(hop) => hop,
        None => return ChainResult::Failed(BackwalkFailureReason::ChainExtractionFailed),
    };
    let (outer_var, module_str) = match read_get_by_id_chain_step(module_var, def_map, ssa) {
        Some(hop) => hop,
        None => return ChainResult::Failed(BackwalkFailureReason::ChainExtractionFailed),
    };

    // Hop 3 gauge: a further property read is only tolerated when it is
    // the `.NativeModules` access itself.
    let outer_get_by_id = def_map
        .get(&outer_var)
        .and_then(|&(block_idx, op_idx)| {
            ssa.blocks.get(block_idx).and_then(|block| block.ops.get(op_idx))
        })
        .filter(|def_op| is_get_by_id_op(def_op.op));
    if let Some(def_op) = outer_get_by_id {
        let reads_native_modules = matches!(
            def_op.operands.last(),
            Some(SsaOperand::ResolvedString(name)) if name.as_str() == "NativeModules"
        );
        if !reads_native_modules {
            return ChainResult::Failed(BackwalkFailureReason::TerminalHopIsGetById);
        }
    }

    // Module name is validated before the method name.
    let module = match NativeModuleName::try_new(module_str) {
        Some(name) => name,
        None => return ChainResult::Failed(BackwalkFailureReason::ChainExtractionFailed),
    };
    match NativeModuleMethodName::try_new(method_str) {
        Some(method) => ChainResult::Resolved(module, method),
        None => ChainResult::Failed(BackwalkFailureReason::ChainExtractionFailed),
    }
}

/// True for the by-id property-read opcodes the back-walk can follow:
/// the three `GetById*` encodings and the two `TryGetById*` encodings.
fn is_get_by_id_op(op: OpCode) -> bool {
    matches!(
        op,
        OpCode::GetByIdShort
            | OpCode::GetById
            | OpCode::GetByIdLong
            | OpCode::TryGetById
            | OpCode::TryGetByIdLong
    )
}

/// True for every opcode the bridge-sink scan treats as a call site:
/// the generic and fixed-arity `Call*` forms, the `CallDirect*` forms,
/// and the `CallWithNewTarget*` forms.
fn is_call_op(op: OpCode) -> bool {
    matches!(
        op,
        OpCode::Call
            | OpCode::CallLong
            | OpCode::Call1
            | OpCode::Call2
            | OpCode::Call3
            | OpCode::Call4
            | OpCode::CallDirect
            | OpCode::CallDirectLongIndex
            | OpCode::CallWithNewTarget
            | OpCode::CallWithNewTargetLong
    )
}