1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
use serde_json::{json, Value};
use crate::context::CrossLayerContext;
use super::{meta, progress};
/// Extracts decompiled source to disk so it can be scanned with semgrep, and
/// returns a JSON summary of what was written.
///
/// Output layout under the chosen directory (`output`, or
/// `./droidsaw-semgrep-<apk file stem>` when `output` is `None`):
///
/// * `hermes/fn_<fid>_<name>.js` — one file per Hermes function that decodes,
///   builds a CFG and builds SSA successfully (only when `ctx.hbc` is present).
/// * `dex<N>/<class>.java` — one file per class yielded by
///   `classes_to_decompile`, with `N` starting at 1.
///
/// The returned JSON object carries `output_dir`, `files_written`,
/// `hermes_functions`, `dex_classes`, `bytes_written`, a ready-to-run semgrep
/// `command` hint, the drained Hermes `findings`, and a `_meta` entry.
///
/// # Errors
///
/// Fails when `ctx.require_apk()` fails, when an output directory cannot be
/// created, when writing a Hermes `.js` file fails, or when
/// `compose_config_args` rejects `semgrep_args`. Failures writing DEX `.java`
/// files are deliberately ignored (see the comment at the write site).
#[allow(clippy::arithmetic_side_effects, clippy::as_conversions, reason = "HBC slice bounds same as `decompile()`; `i + 1` usize+1 bounded by ctx.dex.len(). Display-only counters (hermes_count/dex_count) are handled via `saturating_*` in the display-only commit.")]
pub fn semgrep(
ctx: &CrossLayerContext,
output: Option<&std::path::Path>,
semgrep_args: &crate::semgrep::SemgrepArgs,
) -> anyhow::Result<Value> {
// RAII drain guard: closes the I/O-`?`-before-explicit-drain gap.
// The function explicitly drains at end and embeds findings in the
// returned JSON, but several `?` ops (fs::create_dir_all, fs::write,
// require_apk, compose_config_args) sit between the per-function
// optimize() loop and the explicit drain. SIGPIPE / ENOSPC / any
// I/O failure strands findings unless this guard's Drop fires.
let _drain_guard = crate::context::HermesFindingDrainGuard::install_discard();
let path = std::path::PathBuf::from(&ctx.require_apk()?.path);
// Default output directory is named after the APK's file stem; a stem that
// is missing or not valid UTF-8 falls back to "unknown".
let default_out = std::path::PathBuf::from(format!(
"./droidsaw-semgrep-{}",
path.file_stem().and_then(|s| s.to_str()).unwrap_or("unknown")
));
let out_dir = output.unwrap_or(&default_out);
std::fs::create_dir_all(out_dir)?;
// Running totals reported in the returned JSON.
let mut hermes_count = 0usize;
let mut dex_count = 0usize;
let mut bytes_written: u64 = 0;
// ---- Hermes bytecode: one .js file per function (serial loop) ----
if let Some(hbc_owned) = ctx.hbc.as_ref() {
let hbc = hbc_owned.hbc();
let hbc_dir = out_dir.join("hermes");
std::fs::create_dir_all(&hbc_dir)?;
let hbc_data = hbc_owned.bytes();
for fid in 0..hbc.function_count {
let f = hbc.function_get(fid);
// Functions with no bytecode produce no output file.
if f.size == 0 {
continue;
}
// Out-of-range name ids are treated as an empty name rather than an error.
let fname = if f.name_id < hbc.string_count {
hbc.string_as_str_or_empty(f.name_id).into_owned()
} else {
String::new()
};
let safe_name = sanitize_filename(&fname);
// `end` is the function's end offset, computed in u64 so offset+size
// cannot wrap, then clamped to the buffer length.
#[allow(
clippy::cast_possible_truncation,
reason = "PROOF: bounded by hbc_data.len() via .min(); usize→u64→usize roundtrip lossless on every supported target."
)]
let end = (u64::from(f.offset) + u64::from(f.size)).min(hbc_data.len() as u64) as usize;
// `code` extends up to 256 bytes past the function body (clamped to the
// buffer) and is what the CFG builder receives; `decode_slice` is the
// exact body. NOTE(review): the 256-byte tail is presumably for data that
// trails the instructions (e.g. jump tables) — confirm against Cfg::build.
let code_end = (end + 256).min(hbc_data.len());
// `.get(..)` returns None for an out-of-range or inverted range, in which
// case the function is skipped instead of panicking.
let Some(code) = hbc_data.get(f.offset as usize..code_end) else {
continue;
};
let Some(decode_slice) = hbc_data.get(f.offset as usize..end) else {
continue;
};
// Any stage that fails (decode, CFG, SSA) silently skips this function;
// it is neither written nor counted.
let Ok(instructions) = droidsaw_hermes::decompile::decode::decode_function(
decode_slice,
hbc.opcode_version(),
) else {
continue;
};
// Copy this function's exception-handler ranges into the CFG's own type.
let mut exc_handlers = Vec::new();
for i in 0..hbc.function_exception_count(fid) {
let eh = hbc.function_exception_get(fid, i);
exc_handlers.push(droidsaw_hermes::decompile::cfg::ExcHandler {
start: eh.start,
end: eh.end,
target: eh.target,
});
}
let Ok(cfg) = droidsaw_hermes::decompile::cfg::Cfg::build(&instructions, &exc_handlers, code) else {
continue;
};
let Ok(ssa) = droidsaw_hermes::decompile::ssa::build_ssa(&cfg, f.frame_size) else {
continue;
};
// Lookup callbacks handed to the optimizer and emitter. Each one maps an
// out-of-range id to a harmless placeholder instead of failing.
let get_str = |id: u32| -> String {
if id < hbc.string_count {
hbc.string_as_str_or_empty(id).into_owned()
} else {
format!("<{id}>")
}
};
let get_literal = |a: u8, b: u32, c: u32, d: u32| -> (u8, u32, i32, f64) {
let v = hbc.literal_get(a, b, c, d);
(v.tag, v.str_id, v.ival, v.dval)
};
let get_shape = |i: u32| -> (u32, u32) {
match hbc.object_shape_get(i) {
Some(s) => (s.key_buffer_offset, s.num_props),
None => (0, 0),
}
};
let get_func_name = |fid2: u32| -> String {
if fid2 < hbc.function_count {
let fi = hbc.function_get(fid2);
if fi.name_id < hbc.string_count {
return hbc.string_as_str_or_empty(fi.name_id).into_owned();
}
}
String::new()
};
let get_bigint = |idx: u32| -> Option<String> { hbc.bigint_as_str(idx) };
let ssa = droidsaw_hermes::decompile::optimize::optimize(
ssa,
&get_str,
&get_literal,
&get_shape,
&get_func_name,
&get_bigint,
);
// Block id -> exception-handler id, for blocks that have a handler.
let exc_map: std::collections::BTreeMap<u32, u32> = cfg
.blocks
.values()
.filter_map(|b| b.exc_handler.map(|h| (b.id, h)))
.collect();
// Same lookup as `fname` above, recomputed as an owned value for the
// structuring pass (the unsanitized name is what gets emitted).
let fname_for_emit = if f.name_id < hbc.string_count {
hbc.string_as_str_or_empty(f.name_id).into_owned()
} else {
String::new()
};
let structured = droidsaw_hermes::decompile::structure::structure_function_with_exc(
&ssa,
fname_for_emit,
f.param_count,
f.flags,
&exc_map,
);
let js = droidsaw_hermes::decompile::emit::emit_js(&structured, &get_str);
// The zero-padded function id keeps file names unique even when two
// functions share a (sanitized) name.
let file_path = hbc_dir.join(format!("fn_{fid:06}_{safe_name}.js"));
// DISPLAY-ONLY: `files_written` / `hermes_functions` JSON fields.
bytes_written = bytes_written.saturating_add(js.len() as u64);
// Unlike the DEX path below, a failed write here aborts the whole call.
std::fs::write(file_path, js)?;
hermes_count = hermes_count.saturating_add(1);
}
}
// ---- DEX: one .java file per class, classes decompiled in parallel ----
let apk = ctx.require_apk()?;
use rayon::prelude::*;
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
// `zip` stops at the shorter of `ctx.dex` and `apk.dex`; directories are
// numbered from 1 (`dex1`, `dex2`, ...).
for ((i, dex), apk_dex) in ctx.dex.iter().enumerate().zip(apk.dex.iter()) {
let dex_dir = out_dir.join(format!("dex{}", i + 1));
std::fs::create_dir_all(&dex_dir)?;
let data = &apk_dex.data;
// Bulk-path: build TypeToClassDefMap + EnumInlineMap ONCE per DEX
// and reuse across every class. The single-class `decompile_class`
// entry point passes `None` for ttm, which falls through to an
// O(n_classes) linear scan inside `is_synthetic_bridge_ctor` —
// called per-method per-class → O(classes² × methods) aggregate.
// Measured as the dominant superlinear cost on large Play APKs
// (43k-class chatgpt: ~28 min extract under the scan vs expected
// few minutes with the O(1) ttm lookup).
let ttm = droidsaw_dex::classes::TypeToClassDefMap::build(dex);
let enum_inlines = droidsaw_dex::classes::EnumInlineMap::build(dex, data, &ttm);
// Per-class decompile is pure over shared-reference context
// (&dex, &data, &cd, &ttm, &enum_inlines) and writes to its own
// output file at a disjoint path — embarrassingly parallel.
// rayon's work-stealing thread pool handles scheduling; the
// .par_bridge() adapter converts the non-indexable iterator
// from classes_to_decompile into a parallel iterator.
// Counters are AtomicU64/AtomicUsize to keep the post-loop
// display-only JSON totals consistent with the serial path.
let dex_count_atomic = AtomicUsize::new(0);
let bytes_written_atomic = AtomicU64::new(0);
// Amortize r8_inversion::build_trampoline_census across the
// par_bridge per-class loop.
// Without the guard: census was being rebuilt per class through
// decompile_class_ext on every call. On the --mode=full hot path
// this dominated CPU. Built once per DEX + shared across rayon
// workers (TrampolineCensus is Sync).
let census = droidsaw_dex::r8_inversion::build_trampoline_census(dex);
droidsaw_dex::classes::classes_to_decompile(dex)
.par_bridge()
.for_each(|(_, cd)| {
// Classes whose type descriptor cannot be resolved are skipped.
let Ok(desc) = dex.get_type_descriptor(cd.class_idx) else {
return;
};
// Turn a descriptor such as `Lcom/foo/Bar;` into `com_foo_Bar`, then
// sanitize it for use as a file name.
// NOTE(review): `sanitize_filename` keeps at most 64 characters, so two
// classes whose flattened names share the first 64 characters resolve to
// the same `.java` path; the later write would replace the earlier one
// while both are still counted. Confirm whether that is acceptable.
let safe = desc
.trim_start_matches('L')
.trim_end_matches(';')
.replace('/', "_");
let safe = sanitize_filename(&safe);
let source = droidsaw_dex::classes::decompile_class_ext_with_census(
dex,
data,
cd,
Some(&enum_inlines),
Some(&ttm),
Some(&census),
);
let file_path = dex_dir.join(format!("{safe}.java"));
// DISPLAY-ONLY: `files_written` / `dex_classes` JSON fields.
bytes_written_atomic.fetch_add(source.len() as u64, Ordering::Relaxed);
// WHY: best-effort write for DISPLAY-ONLY filesystem mirror
// (counters are the load-bearing output); drop is explicit.
// NOTE(review): because the write result is dropped, both counters
// advance even when the file could not be written.
drop(std::fs::write(file_path, source));
dex_count_atomic.fetch_add(1, Ordering::Relaxed);
});
// Fold this DEX's parallel totals into the function-wide totals.
bytes_written = bytes_written.saturating_add(bytes_written_atomic.load(Ordering::Relaxed));
dex_count = dex_count.saturating_add(dex_count_atomic.load(Ordering::Relaxed));
}
// Drain decompile-time HermesFinding emissions accumulated across
// the per-function `optimize::optimize()` calls. The channel is
// thread-local; without this drain the findings would either leak
// into the next parse on the same blocking-pool worker (tokio
// `spawn_blocking` thread reuse) or, in single-shot CLI runs, never
// surface. Translate to common::Finding and embed in the returned
// JSON envelope so operator-facing tooling can consume them.
let hermes_findings = CrossLayerContext::drain_hermes_findings();
progress!(
"wrote {:?} hermes functions, {:?} dex classes to {:?}",
hermes_count,
dex_count,
out_dir
);
// Compose the JSON `command` hint from the user's --rules / --no-auto
// flags + DROIDSAW_SEMGREP_RULES env. Backward compat: callers passing
// SemgrepArgs::default() with no env produce `semgrep --config auto <dir>/`,
// identical to the prior hardcoded form.
let composed = crate::semgrep::compose_config_args(semgrep_args)
.map_err(|e| anyhow::anyhow!("semgrep arg composition: {e}"))?;
// The hint is a plain space-joined string; neither the arguments nor the
// directory are shell-quoted.
let cmd_hint = format!(
"semgrep {} {}/",
composed.join(" "),
out_dir.display()
);
let out = json!({
"output_dir": out_dir.display().to_string(),
// DISPLAY-ONLY: JSON sum of two file-counters (each bounded by
// actual writes; saturating defends against pathological inputs).
"files_written": hermes_count.saturating_add(dex_count),
"hermes_functions": hermes_count,
"dex_classes": dex_count,
"bytes_written": bytes_written,
"command": cmd_hint,
"findings": hermes_findings,
"_meta": meta(
1,
false,
"source extracted — run the returned `command` to scan with semgrep",
&["audit-full", "audit-light", "strings", "xrefs"],
),
});
Ok(out)
}
/// CLI `scan semgrep` entry point — wraps [`semgrep`] (extract) and
/// optionally chains the shared
/// [`crate::semgrep::run_and_persist`] helper when `persist` is set.
///
/// Default (`persist = false`): identical to calling [`semgrep`]
/// directly — extract source + return a `command` hint string. Backward
/// compatible with consumers that parse the hint and run semgrep
/// themselves.
///
/// `persist = true`: also invokes `semgrep` against the extracted
/// source and writes hits to a SQLite findings DB at `db_path` (or a
/// derived default path next to the input). Returns an extended JSON
/// envelope with the original extraction fields plus `db_path` and a
/// nested `semgrep_scan` object — the same shape the audit handler
/// produces when its mode runs semgrep.
pub fn scan_semgrep(
ctx: &CrossLayerContext,
output: Option<&std::path::Path>,
semgrep_args: &crate::semgrep::SemgrepArgs,
persist: bool,
db: Option<&std::path::Path>,
) -> anyhow::Result<Value> {
let extracted = semgrep(ctx, output, semgrep_args)?;
if !persist {
return Ok(extracted);
}
let output_dir = extracted
.get("output_dir")
.and_then(|v| v.as_str())
.ok_or_else(|| anyhow::anyhow!("scan_semgrep: extract step did not return output_dir"))?
.to_string();
// Default DB path mirrors `audit`'s convention of one DB per input
// basename. `--db <path>` overrides if the operator wants a shared
// DB across multiple semgrep runs (e.g. corpus aggregation).
let db_path: std::path::PathBuf = match db {
Some(p) => p.to_path_buf(),
None => {
let input = std::path::PathBuf::from(&ctx.require_apk()?.path);
let stem = input
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("unknown");
std::path::PathBuf::from(format!("./droidsaw-{stem}.db"))
}
};
let scan = crate::semgrep::run_and_persist(
std::path::Path::new(&output_dir),
semgrep_args,
&db_path,
None,
)?;
let mut merged = extracted.as_object().cloned().unwrap_or_default();
merged.insert("semgrep_scan".into(), scan);
merged.insert(
"db_path".into(),
serde_json::json!(db_path.display().to_string()),
);
Ok(serde_json::Value::Object(merged))
}
/// Turns an arbitrary identifier into a string that is safe to embed in a
/// file name.
///
/// Every character that is not alphanumeric (Unicode-aware), `_` or `-` is
/// replaced by `_`. The result is limited in two ways:
///
/// * at most 64 characters of the input are considered, and
/// * the result never exceeds 200 bytes of UTF-8.
///
/// The byte limit matters because file systems usually cap a path component
/// at 255 *bytes*, not characters: 64 four-byte code points would already be
/// 256 bytes before the caller adds its prefix and extension. Truncation
/// always happens on a character boundary. For ASCII input the byte limit is
/// never reached, so the output is simply the first 64 sanitized characters.
fn sanitize_filename(s: &str) -> String {
    const MAX_CHARS: usize = 64;
    // Leaves headroom below the common 255-byte component limit for the
    // `fn_<id>_` prefix and `.js` / `.java` suffix that callers add.
    const MAX_BYTES: usize = 200;

    let mut out = String::with_capacity(s.len().min(MAX_BYTES));
    for raw in s.chars().take(MAX_CHARS) {
        let c = if raw.is_alphanumeric() || raw == '_' || raw == '-' {
            raw
        } else {
            '_'
        };
        // Stop before the character that would push us over the byte budget.
        if out.len().saturating_add(c.len_utf8()) > MAX_BYTES {
            break;
        }
        out.push(c);
    }
    out
}