synth_backend/arm_backend.rs
1//! ARM Backend — wraps the instruction selector + optimizer + encoder as a Backend
2//!
3//! This is Synth's custom ARM compiler targeting Cortex-M (Thumb-2).
4//! It's the only backend that supports per-rule formal verification (ASIL D path).
5
6use crate::ArmEncoder;
7use synth_core::backend::{
8 Backend, BackendCapabilities, BackendError, CodeRelocation, CompilationResult, CompileConfig,
9 CompiledFunction, LineMap, SafetyBounds,
10};
11use synth_core::target::{IsaVariant, TargetSpec};
12use synth_core::wasm_decoder::DecodedModule;
13use synth_core::wasm_op::WasmOp;
14use synth_synthesis::{
15 ArmInstruction, ArmOp, BoundsCheckConfig, InstructionSelector, OptimizationConfig,
16 OptimizerBridge, RuleDatabase, validate_instructions,
17};
18
/// ARM Cortex-M backend using Synth's custom compiler pipeline.
///
/// This is a field-less unit struct: it carries no state of its own, so every
/// value of the type is interchangeable. All compilation inputs (the decoded
/// module, the WASM ops, the compile configuration) are passed in through the
/// method arguments of its `Backend` implementation.
pub struct ArmBackend;
21
22impl ArmBackend {
23 pub fn new() -> Self {
24 Self
25 }
26}
27
28impl Default for ArmBackend {
29 fn default() -> Self {
30 Self::new()
31 }
32}
33
34impl Backend for ArmBackend {
35 fn name(&self) -> &str {
36 "arm"
37 }
38
39 fn capabilities(&self) -> BackendCapabilities {
40 BackendCapabilities {
41 produces_elf: false,
42 supports_rule_verification: true,
43 supports_binary_verification: true,
44 is_external: false,
45 }
46 }
47
48 fn supported_targets(&self) -> Vec<TargetSpec> {
49 vec![
50 TargetSpec::cortex_m3(),
51 TargetSpec::cortex_m4(),
52 TargetSpec::cortex_m4f(),
53 TargetSpec::cortex_m7(),
54 TargetSpec::cortex_m7dp(),
55 ]
56 }
57
58 fn compile_module(
59 &self,
60 module: &DecodedModule,
61 config: &CompileConfig,
62 ) -> Result<CompilationResult, BackendError> {
63 let exports: Vec<_> = module
64 .functions
65 .iter()
66 .filter(|f| f.export_name.is_some())
67 .collect();
68
69 if exports.is_empty() {
70 return Err(BackendError::CompilationFailed(
71 "no exported functions found".into(),
72 ));
73 }
74
75 let mut functions = Vec::new();
76 for func in &exports {
77 let name = func.export_name.clone().unwrap();
78 // #359: copy THIS function's declared param widths into the config so
79 // `compile_function` (which carries no function index) can refuse a
80 // 64-bit param on the AAPCS stack-argument path. Cheap clone only when
81 // a signature table is present and this function has a width entry —
82 // otherwise reuse the shared config (every existing module unchanged).
83 let func_config = match config.func_params_i64.get(func.index as usize) {
84 Some(p) if !p.is_empty() => Some(CompileConfig {
85 current_func_params_i64: p.clone(),
86 ..config.clone()
87 }),
88 _ => None,
89 };
90 let cfg = func_config.as_ref().unwrap_or(config);
91 let compiled = self.compile_function(&name, &func.ops, cfg)?;
92 functions.push(compiled);
93 }
94
95 Ok(CompilationResult {
96 functions,
97 elf: None,
98 backend_name: self.name().to_string(),
99 })
100 }
101
102 fn compile_function(
103 &self,
104 name: &str,
105 ops: &[WasmOp],
106 config: &CompileConfig,
107 ) -> Result<CompiledFunction, BackendError> {
108 let (code, relocations, line_map) =
109 compile_wasm_to_arm(ops, config).map_err(BackendError::CompilationFailed)?;
110
111 Ok(CompiledFunction {
112 name: name.to_string(),
113 code,
114 wasm_ops: ops.to_vec(),
115 relocations,
116 line_map,
117 })
118 }
119
120 fn is_available(&self) -> bool {
121 true // Always available — it's a library backend
122 }
123}
124
125/// Count the number of function parameters by analyzing LocalGet patterns
126fn count_params(wasm_ops: &[WasmOp]) -> u32 {
127 let mut first_access: std::collections::HashMap<u32, bool> = std::collections::HashMap::new();
128 for op in wasm_ops {
129 match op {
130 WasmOp::LocalGet(idx) => {
131 first_access.entry(*idx).or_insert(true);
132 }
133 WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => {
134 first_access.entry(*idx).or_insert(false);
135 }
136 _ => {}
137 }
138 }
139
140 first_access
141 .iter()
142 .filter_map(
143 |(&idx, &is_read_first)| {
144 if is_read_first { Some(idx + 1) } else { None }
145 },
146 )
147 .max()
148 .unwrap_or(0)
149}
150
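/// Core compilation: WASM ops → ARM machine code bytes + relocations + line map
///
/// Returns `(code_bytes, relocations, line_map)`. Relocations are recorded for
/// every `BL` — import dispatch stubs (e.g., `__meld_dispatch_import`) as well as
/// internal `func_N` calls — for symbol-relative `MOVW`/`MOVT`, and for each
/// `LdrSym` literal-pool word. `line_map` pairs each instruction's byte offset
/// within the function with its `source_line`.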
155fn compile_wasm_to_arm(
156 wasm_ops: &[WasmOp],
157 config: &CompileConfig,
158) -> Result<(Vec<u8>, Vec<CodeRelocation>, LineMap), String> {
159 let num_params = count_params(wasm_ops);
160
161 let bounds_config = match config.effective_safety_bounds() {
162 SafetyBounds::None => BoundsCheckConfig::None,
163 SafetyBounds::Mpu => BoundsCheckConfig::Mpu,
164 SafetyBounds::Software => BoundsCheckConfig::Software,
165 SafetyBounds::Mask => BoundsCheckConfig::Masking,
166 };
167
168 // The non-optimized (direct) instruction-selection path. Handles f32 via
169 // VFP/FPU. Used directly when `--no-optimize` is set, and as the fallback
170 // when the optimized path declines a module (see issue #120 below).
171 //
172 // VCR-RA-001 step 3b-lite (#242): a FRESH selector per attempt, with
173 // `spill_on_exhaustion` set only on the retry — the first pass is the
174 // unmodified default, so every function that compiles today is selected by
175 // exactly the code that compiled it yesterday (bit-identity is structural,
176 // not behavioural).
177 let select_direct_attempt = |spill_on_exhaustion: bool,
178 param_backing_on_exhaustion: bool,
179 local_promote: bool|
180 -> Result<Vec<ArmInstruction>, synth_core::Error> {
181 let db = RuleDatabase::with_standard_rules();
182 let mut selector =
183 InstructionSelector::with_bounds_check(db.rules().to_vec(), bounds_config);
184 selector.set_target(config.target.fpu, &config.target.triple);
185 if config.num_imports > 0 {
186 selector.set_num_imports(config.num_imports);
187 }
188 // #195: plumb the callee argument-count tables so the direct selector can
189 // marshal call arguments into R0–R3 per AAPCS.
190 selector.set_func_arg_counts(
191 config.func_arg_counts.clone(),
192 config.type_arg_counts.clone(),
193 );
194 // #197: in relocatable host-link mode, emit direct `func_N` BLs for
195 // imports (rewritten to the wasm field name by build_relocatable_elf)
196 // instead of `__meld_dispatch_import`.
197 selector.set_relocatable(config.relocatable);
198 // #237: native-pointer ABI — wasm statics become __synth_wasm_data-relative.
199 selector.set_native_pointer_abi(config.native_pointer_abi, config.linear_memory_bytes);
200 // #311: i64 call results are register PAIRS — tag them.
201 selector.set_result_types(config.func_ret_i64.clone(), config.type_ret_i64.clone());
202 // #359: declared param widths of THIS function, so the AAPCS stack-arg
203 // path can refuse 64-bit params (Ok-or-Err). Empty ⇒ assume i32.
204 selector.set_params_i64(config.current_func_params_i64.clone());
205 // Stack-pointer promotion is meaningful only under the native-pointer ABI;
206 // gating here keeps every non-native compile (all frozen fixtures) on the
207 // legacy R9 globals-table path, bit-identical.
208 if config.native_pointer_abi
209 && let Some((sp_idx, sp_init)) = config.stack_pointer_global
210 {
211 selector.set_native_pointer_stack(sp_idx, sp_init);
212 }
213 selector.set_spill_on_exhaustion(spill_on_exhaustion);
214 selector.set_param_backing_on_exhaustion(param_backing_on_exhaustion);
215 // VCR-RA local promotion (#390, #242): keep eligible non-param i32 locals
216 // in callee-saved registers instead of frame slots — the structural lever
217 // toward native parity. DEFAULT-ON as of v0.14.0: gale's G474RE DWT gate
218 // cleared it as a net win (gust_mix dissolved 58→50 cyc/call −14%, all 5
219 // stack spill/reloads eliminated, correctness bit-identical over [0,2047],
220 // 2.00×→1.72× vs LLVM). Escape hatch: `SYNTH_NO_LOCAL_PROMOTE=1` restores
221 // the frame-slot path. Leaf-only / i32-only / ARM-only (see
222 // compute_local_promotion); the leaf-only lift + i64 locals are follow-ons.
223 // #474: `local_promote` is now a per-attempt parameter so the retry ladder
224 // can drop promotion as an exhaustion-recovery rung (promotion pins r4-r8,
225 // which on a dense function leaves the spill allocator with nothing to
226 // free → the frame-slot path is the escape that restores compilability).
227 selector.set_local_promote(local_promote);
228 selector.select_with_stack(wasm_ops, num_params)
229 };
230 let select_direct = || -> Result<Vec<ArmInstruction>, String> {
231 const SINGLE_EXHAUSTION: &str = "all allocatable registers are live on the stack";
232 const PAIR_EXHAUSTION: &str = "no consecutive pair of free registers for i64";
233 // The full exhaustion-recovery ladder, parameterized on whether local
234 // promotion is enabled. Each rung is reached only when the previous one
235 // returned a recoverable register-exhaustion Err, so a function that
236 // compiles on the first attempt is untouched by the later rungs. Returns
237 // the result AND which rung produced it (for the #242 measurement below).
238 let recovery_ladder =
239 |promote: bool| -> (Result<Vec<ArmInstruction>, synth_core::Error>, &'static str) {
240 let mut attempt = select_direct_attempt(false, false, promote);
241 let mut rung = "base";
242 // VCR-RA-001 step 3b-lite (#242): the i32 register-exhaustion
243 // hard-fail is recoverable — retry with spill-on-exhaustion, which
244 // reserves the spill area and spills the deepest stack value when
245 // the pool is full.
246 if let Err(e) = &attempt
247 && e.to_string().contains(SINGLE_EXHAUSTION)
248 {
249 attempt = select_direct_attempt(true, false, promote);
250 rung = "spill";
251 }
252 // VCR-RA-001 acceptance increment (#242): the i64 consecutive-PAIR
253 // exhaustion is recoverable too — not by stack spilling (the pair
254 // allocator already spills stack values, #171) but by frame-backing
255 // the params (#204) so they stop pinning R0-R3, with spill kept on.
256 if let Err(e) = &attempt
257 && e.to_string().contains(PAIR_EXHAUSTION)
258 {
259 attempt = select_direct_attempt(true, true, promote);
260 rung = "param-backing";
261 }
262 (attempt, rung)
263 };
264 // #474: local promotion (default-on since v0.14.0) is an OPTIMIZATION — it
265 // must never be the reason a function fails to compile. Run the full ladder
266 // with promotion first (so every function that compiles today is
267 // bit-identical), and if it still ends in register exhaustion, fall back to
268 // the promotion-off ladder (the v0.12.0 frame-slot lowering — exactly what
269 // the `SYNTH_NO_LOCAL_PROMOTE=1` workaround does, now automatic). Promotion
270 // pins r4-r8 for the locals; on a dense function that leaves the allocator
271 // with nothing to free, so dropping it restores compilability. The fallback
272 // is reached ONLY by functions that exhaust WITH promotion, so promotion-on
273 // output is untouched by construction (frozen byte gate stays green).
274 let promote = std::env::var("SYNTH_NO_LOCAL_PROMOTE").is_err();
275 let (mut attempt, mut rung) = recovery_ladder(promote);
276 let mut promotion_dropped = false;
277 if promote
278 && attempt
279 .as_ref()
280 .err()
281 .is_some_and(|e| e.to_string().contains("register exhaustion"))
282 {
283 let (rescued, off_rung) = recovery_ladder(false);
284 if rescued.is_ok() {
285 attempt = rescued;
286 rung = off_rung;
287 promotion_dropped = true;
288 }
289 }
290 // VCR-RA measurement (#242): log which recovery rung produced the result,
291 // so the per-rung distribution across a corpus can be measured — the size
292 // of the failure surface a verified allocator must subsume (see
293 // scripts/repro/register_exhaustion_recovery_ladder.md). Logging only:
294 // emitted bytes are unchanged, so the frozen byte gate is unaffected.
295 if std::env::var("SYNTH_RECOVERY_STATS").is_ok() {
296 eprintln!(
297 "[recovery-stats] rung={rung}{} result={}",
298 if promotion_dropped {
299 " promotion-off"
300 } else {
301 ""
302 },
303 if attempt.is_ok() { "ok" } else { "exhausted" },
304 );
305 }
306 attempt.map_err(|e| format!("instruction selection failed: {}", e))
307 };
308
309 // Instruction selection: optimized or direct.
310 //
311 // #197: `--relocatable` (host-link ET_REL) forces the direct selector. The
312 // optimized path materializes an absolute linmem base (0x20000100) and does
313 // not preserve caller-saved registers across calls — both wrong for a
314 // host-linked object, where the linmem base arrives via `fp` at runtime and
315 // callees follow AAPCS. `select_with_stack` (now i64-spill capable after
316 // #171) handles fp-relative memory + caller-saved preservation correctly.
317 let arm_instrs = if config.no_optimize || config.relocatable {
318 select_direct()?
319 } else {
320 let opt_config = if config.loom_compat {
321 OptimizationConfig::loom_compat()
322 } else {
323 OptimizationConfig::all()
324 };
325
326 let mut bridge = OptimizerBridge::with_config(opt_config);
327 // #188: tell the bridge how many imports there are so it declines only
328 // LOCAL calls (and leaves import calls on the optimized path, keeping
329 // the #173 field-name relocation rewrite intact).
330 bridge.set_num_imports(config.num_imports);
331 // `ir_to_arm` now returns `Result` — an `Err` means the optimized path
332 // hit an unmapped vreg (issue-#93-class). Treat it identically to an
333 // `optimize_full` failure: fall back to the direct selector rather
334 // than propagating, so the function still compiles correctly.
335 match bridge
336 .optimize_full(wasm_ops)
337 .and_then(|(opt_ir, _cfg, _stats)| bridge.ir_to_arm(&opt_ir, num_params as usize))
338 {
339 Ok(arm_ops) => arm_ops
340 .into_iter()
341 .map(|op| ArmInstruction {
342 op,
343 source_line: None,
344 })
345 .collect(),
346 // Issue #120: the optimized path declines modules it cannot lower
347 // (notably scalar f32/f64 ops — the IR has no float opcodes). Fall
348 // back to the direct instruction selector, which handles f32 via
349 // VFP/FPU. This is honest degradation: the function still compiles
350 // correctly, just without IR-level optimization.
351 Err(_) => select_direct()?,
352 }
353 };
354
355 // #257/#277: `mul`+`add`→`mla` fusion is intentionally NOT wired here.
356 // The transform is correct and ready (`synth_synthesis::liveness::fuse_mul_add`,
357 // fully tested), but it is **register-allocation-coupled**: over the current
358 // greedy single-pass selector, folding `mul rM,..; add rD,rM,rX` → `mla`
359 // extends the live ranges of the mul inputs to the mla point, and the added
360 // pressure (extra moves/spills) costs more than the single-cycle MLA saves —
361 // gale measured a +2 cyc on-target REGRESSION (flat_flight 255→257, G474RE)
362 // even though it removes 2 instructions and the seam stays 0x07FDF307. So the
363 // fusion stays unwired until the spill-aware allocator (VCR-RA-001) chooses
364 // registers, at which point it becomes net-positive (per #272's plan and the
365 // wiring design note). Lesson (#277): a register-pressure-affecting transform
366 // needs an on-target/allocator-aware gate, not a byte-count gate, before it
367 // can default on.
368
369 // VCR-RA-001 const-CSE / rematerialization-avoidance (#209), the first
370 // allocator-analysis-driven CODEGEN change. Drops `movw` re-materializations
371 // of a constant already resident in another register and retargets the reads
372 // — every rewrite proven by the liveness analysis, and it ONLY removes
373 // materializations (pressure never rises), so unlike the mla fusion (#277) it
374 // cannot regress on-target. Runs on the selected stream before branch
375 // resolution (it removes instructions, shifting byte offsets). Behind
376 // `SYNTH_CONST_CSE=1` while it is validated against the differential oracle +
377 // gale's five on-target baselines; off by default keeps every fixture
378 // bit-identical.
379 let arm_instrs = if std::env::var("SYNTH_CONST_CSE").is_ok() {
380 synth_synthesis::liveness::apply_const_cse(&arm_instrs).0
381 } else {
382 arm_instrs
383 };
384
385 // VCR-RA-001 RANGE RE-ALLOCATION (#209/#242, wiring step 3a) — the first
386 // CONSEQUENTIAL allocator pass: re-colour each maximal straight-line
387 // segment over the R0-R8 pool with value ranges as the allocation unit
388 // (segment inputs + per-register live-outs pinned to their original
389 // registers, reserved R9-R12/SP identity-assigned — each segment is
390 // independently sound, no cross-segment liveness assumed). Renames
391 // registers only: never adds, removes, or reorders instructions, so
392 // labels/branch offsets are unaffected.
393 //
394 // DEFAULT-ON since v0.11.36: gale cleared the gate on-target (G474RE,
395 // #209 2026-06-10) — flag-on output byte-identical to flag-off on
396 // flat_flight/controller/control_step, fires on the filter family with
397 // zero cycle delta and a small size win, all selfchecks green on silicon.
398 // Opt out with `SYNTH_RANGE_REALLOC=0`; per-function stats with
399 // `SYNTH_REALLOC_STATS=1`.
400 //
401 // The companion dead callee-saved-save elimination (gale's "next
402 // consequential lever", same issue comment) then shrinks the prologue
403 // `push {r4-r8,lr}` / epilogue `pop {r4-r8,pc}` to the callee-saved
404 // registers the re-allocated body still touches (leaf-only,
405 // SP-untouched, even-count-padded — see shrink_callee_saved_saves):
406 // ~12 cycles of pure save/restore overhead removed on small leaves.
407 let realloc_on = std::env::var("SYNTH_RANGE_REALLOC").map_or(true, |v| v != "0");
408 let arm_instrs = if realloc_on {
409 use synth_synthesis::rules::Reg;
410 const POOL: [Reg; 9] = [
411 Reg::R0,
412 Reg::R1,
413 Reg::R2,
414 Reg::R3,
415 Reg::R4,
416 Reg::R5,
417 Reg::R6,
418 Reg::R7,
419 Reg::R8,
420 ];
421 let (out, stats) = synth_synthesis::liveness::reallocate_function(&arm_instrs, &POOL);
422 if std::env::var("SYNTH_REALLOC_STATS").is_ok() {
423 eprintln!(
424 "[range-realloc] {} segments: {} reallocated, {} declined ({} validator-rejected), {} need spill (step 4)",
425 stats.segments,
426 stats.reallocated,
427 stats.declined,
428 stats.validator_rejects,
429 stats.needs_spill
430 );
431 }
432 // VCR-RA-002 (#390, epic #242): eliminate a provably-dead stack frame
433 // (`sub sp,#N`/`add sp,#N` reserved by `compute_local_layout` for locals
434 // that promotion homed in registers, never accessed). Removing it saves
435 // the two instructions AND restores the SP-untouched precondition that
436 // `shrink_callee_saved_saves` requires — so it must run FIRST. Flag-off
437 // (opt-in `SYNTH_DEAD_FRAME_ELIM=1`); off ⇒ byte-identical. Default-on
438 // flip held for on-silicon validation, like the realloc/shrink levers.
439 let out = if std::env::var("SYNTH_DEAD_FRAME_ELIM").is_ok() {
440 synth_synthesis::liveness::elide_dead_frame(&out).unwrap_or(out)
441 } else {
442 out
443 };
444 // #490 (epic #242): the optimized selector uses r4-r8 as scratch /
445 // promoted locals but emits no prologue, silently clobbering a caller's
446 // callee-saved registers. Add the missing `push {r4-r8,lr}` /
447 // `pop {r4-r8,pc}` HERE — on the post-realloc body, where realloc has
448 // lowered low-pressure r4-r8 scratch back to r0-r3, so a save is added
449 // only for registers genuinely clobbered. `shrink_callee_saved_saves`
450 // (next) then trims it to the used set. No-op on the direct path (it
451 // already has its own prologue) and on callee-saved-free leaves.
452 let out = synth_synthesis::liveness::ensure_callee_saved_prologue(&out);
453 synth_synthesis::liveness::shrink_callee_saved_saves(&out).unwrap_or(out)
454 } else {
455 // Range-realloc off (`SYNTH_RANGE_REALLOC=0`): the optimized path still
456 // must preserve the callee-saved registers it clobbers (#490). No shrink
457 // (it is coupled to the realloc lever), so the conservative full save
458 // stays — correct, just not minimised in this debug configuration.
459 synth_synthesis::liveness::ensure_callee_saved_prologue(&arm_instrs)
460 };
461
462 // VCR-RA-001 SHADOW ALLOCATION (#209/#242): run the register allocator on
463 // the selected stream and LOG what it finds — without changing a single
464 // emitted byte. This is the measure-only bridge between the built analysis
465 // layer and the eventual virtual-register wiring: it shows, per real
466 // function, whether the allocator can colour it within the R0–R8 pool and
467 // how much const-CSE / rematerialization headroom exists (#209). Enable with
468 // `SYNTH_SHADOW_ALLOC=1`; off by default and side-effect-free either way.
469 if std::env::var("SYNTH_SHADOW_ALLOC").is_ok() {
470 use synth_synthesis::liveness::{
471 AllocationOutcome, allocate_function, function_peak_pressure,
472 };
473 // R9 globals / R10 mem-size / R11 mem-base / R12 IP-scratch are reserved;
474 // pin them above the 0..9 allocatable pool so the colourer keeps R0–R8.
475 let precolored = std::collections::BTreeMap::from([
476 (synth_synthesis::rules::Reg::R9, 9usize),
477 (synth_synthesis::rules::Reg::R10, 10),
478 (synth_synthesis::rules::Reg::R11, 11),
479 (synth_synthesis::rules::Reg::R12, 12),
480 ]);
481 // True VALUE pressure (one node per value, not per reused physical reg):
482 // a NeedsSpill with peak ≤ 9 is a SPURIOUS physical-register spill — the
483 // function fits once virtually allocated.
484 let peak = function_peak_pressure(&arm_instrs);
485 match allocate_function(&arm_instrs, 9, &precolored) {
486 AllocationOutcome::Allocated {
487 remat_opportunities,
488 coloring,
489 } => eprintln!(
490 "[shadow-alloc] OK: {} pregs coloured within R0-R8 pool, peak value-pressure {}, {} const-CSE/remat opportunities",
491 coloring.len(),
492 peak,
493 remat_opportunities
494 ),
495 AllocationOutcome::NeedsSpill(s) => eprintln!(
496 "[shadow-alloc] physical-graph would spill {:?}, but peak value-pressure is {} (≤9 ⇒ spurious; fits once virtually allocated)",
497 s, peak
498 ),
499 AllocationOutcome::Declined => {
500 eprintln!(
501 "[shadow-alloc] declined (unmodeled construct — calls/i64/fp/offset-branch)"
502 )
503 }
504 }
505 }
506
507 // VCR-SEL-004 cmp→select → IT-block predication fusion (#242). The selector
508 // lowers a `select` whose condition is a comparison to a *materialize then
509 // re-test* sequence (`cmp a,b; SetCond D,c; cmp D,#0; movne dst,v1; moveq
510 // dst,v2`); this collapses it onto the comparison's own flags — deleting the
511 // `SetCond` and the `cmp D,#0` and retargeting the predicated moves to `c` /
512 // `invert(c)` — yielding the textbook predicated clamp (`cmp a,b; movc dst,v1;
513 // mov{!c} dst,v2`). −2 instructions per fused select. gale #428 measured this
514 // as the #1 hot-path size/cycle lever on the gust_mix clamp chain.
515 //
516 // Run LATE: after range re-allocation (so the dead-D proof sees final register
517 // identities) and before encode. Removal-only + rename-only ⇒ no spill
518 // regression and labels/branch offsets are unaffected. Each fusion is proven
519 // sound (flags reused only when nothing clobbers them in the window; the
520 // boolean deleted only when provably dead) — see `fuse_cmp_select`.
521 //
522 // DEFAULT-ON as of v0.13.0 (#428): cmp→select fusion ships by default. The
523 // byte-changing flip is validated by (a) the unicorn execution oracle that runs
524 // the two-move `mov{invert(c)}` arm (cmp_select_two_move_differential.py), (b)
525 // gale's gale_decider_diff 10,596-case sweep across all 8 verified primitives
526 // (native ≡ flag-off ≡ flag-on = 0x88e73178d232bcf5), and (c) the named-anchor
527 // differentials re-run with fusion ON — control_step still 0x00210A55, flat+
528 // inlined flight_algo still 0x07FDF307 (results preserved; bytes deliberately
529 // changed, re-frozen on this commit). Escape hatch: `SYNTH_NO_CMP_SELECT_FUSE=1`
530 // reverts to the pre-fusion lowering. The on-silicon G474RE DWT no-regression
531 // check is a tracked post-ship follow-up (gale owns it).
532 let arm_instrs = if std::env::var("SYNTH_NO_CMP_SELECT_FUSE").is_err() {
533 // The rewritten stream is identical to `fuse_cmp_select`'s 2-tuple form;
534 // the extra `two_move` count is diagnostic only (the fusion census /
535 // blast-radius datum — #7 made that arm reachable).
536 let (out, fused, two_move) =
537 synth_synthesis::liveness::fuse_cmp_select_with_stats(&arm_instrs);
538 if std::env::var("SYNTH_FUSE_STATS").is_ok() {
539 let in_place = fused - two_move;
540 eprintln!(
541 "[cmp-select-fuse] {fused} select(s) fused to predicated moves \
542 ({two_move} two-move, {in_place} in-place)"
543 );
544 }
545 out
546 } else {
547 arm_instrs
548 };
549
550 // Perf lever 1 toward native parity (#390): redundant stack-reload elimination.
551 // synth lowers every wasm local to a frame slot, so `local.set; local.get` emits
552 // `str rX,[sp,#N]; … ; ldr rY,[sp,#N]`; when rX still holds the value the reload
553 // (a ~2-cycle M4 load) becomes `mov rY,rX`. Removal-of-a-load + rename only ⇒ no
554 // new instruction form and no label/offset change. BEHIND `SYNTH_STACK_FWD=1`
555 // (opt-in, off by default ⇒ bit-identical) while it is validated against the
556 // execution differential + gale's G474RE bench — the same gated path the
557 // cmp→select flip took before shipping default-on in v0.13.0.
558 let arm_instrs = if std::env::var("SYNTH_STACK_FWD").is_ok() {
559 let (out, fwd) = synth_synthesis::liveness::forward_stack_reloads(&arm_instrs);
560 if std::env::var("SYNTH_FUSE_STATS").is_ok() {
561 eprintln!("[stack-fwd] {fwd} stack reload(s) forwarded to register moves");
562 }
563 out
564 } else {
565 arm_instrs
566 };
567
568 // VCR-RA immediate-shift folding (#390, #242): a constant shift amount the
569 // stack selector materialized into a scratch register (`movw rM,#C; lsl rD,rN,rM`)
570 // folds to the immediate form (`lsl rD,rN,#C`), removing the dead `movw` — −1
571 // instruction, −1 live register. Removal-only (offset-neutral before branch
572 // resolution, like the dead-store pass). DEFAULT-ON as of v0.15.0: validated
573 // bit-identical results + a net cycle win on the dissolved hot path (−2
574 // cyc/call, .text 100→90 B on gust_mix). Escape hatch: `SYNTH_NO_IMM_SHIFT_FOLD=1`.
575 let arm_instrs = if std::env::var("SYNTH_NO_IMM_SHIFT_FOLD").is_err() {
576 let (out, folds) = synth_synthesis::liveness::fold_immediate_shifts(&arm_instrs);
577 if std::env::var("SYNTH_FUSE_STATS").is_ok() {
578 eprintln!(
579 "[imm-shift-fold] {folds} register shift(s) folded to immediate, movw dropped"
580 );
581 }
582 out
583 } else {
584 arm_instrs
585 };
586
587 // VCR-RA uxth/uxtb fold (#428, #242): `movw rM,#0xffff; and rD,rN,rM` →
588 // `uxth rD,rN` (and the 0xff/uxtb form), removing the dead `movw` — −1
589 // instruction, −1 live register per 16/8-bit mask. 0xffff/0xff are not Thumb-2
590 // modified immediates so the selector materializes them into a register; the
591 // dedicated zero-extend expresses the same masking inline. Removal-only +
592 // rewrite-in-place (offset-neutral). FLAG-OFF by default (opt-in
593 // `SYNTH_UXTH_FOLD=1`) ⇒ bit-identical (frozen gate green); the byte-changing
594 // default-on flip is the separate on-target-gated step, like the prior levers.
595 let arm_instrs = if std::env::var("SYNTH_UXTH_FOLD").is_ok() {
596 let (out, folds) = synth_synthesis::liveness::fold_uxth(&arm_instrs);
597 if std::env::var("SYNTH_FUSE_STATS").is_ok() {
598 eprintln!("[uxth-fold] {folds} mask-and folded to uxth/uxtb, movw dropped");
599 }
600 out
601 } else {
602 arm_instrs
603 };
604
605 // ISA feature gate: validate that all generated instructions are supported
606 // by the target. This catches FPU instructions on no-FPU targets, double-precision
607 // instructions on single-precision targets, etc.
608 validate_instructions(&arm_instrs, config.target.fpu, &config.target.triple)
609 .map_err(|e| format!("ISA validation failed: {}", e))?;
610
611 // Encode to binary — use Thumb-2 for Cortex-M targets
612 let use_thumb2 = matches!(config.target.isa, IsaVariant::Thumb2 | IsaVariant::Thumb);
613
614 let encoder = if use_thumb2 {
615 ArmEncoder::new_thumb2_with_fpu(config.target.fpu)
616 } else {
617 ArmEncoder::new_arm32()
618 };
619
620 // #202: resolve local label branches (Bcc/B/Bhs/Blo) to byte-accurate
621 // offsets before encoding. `select_with_stack` emits them as label
622 // placeholders and never resolves them — without this they encode as
623 // `bne.n #0` and land mid-instruction whenever a 32-bit Thumb-2 instruction
624 // sits between the branch and its target (UsageFault on real hardware).
625 // Only meaningful for Thumb-2 (the offset units are halfword/PC+4).
626 let arm_instrs = if use_thumb2 {
627 resolve_label_branches(arm_instrs, &encoder)?
628 } else {
629 arm_instrs
630 };
631
632 let mut code = Vec::new();
633 let mut relocations = Vec::new();
634
635 // #345: literal-pool address loads. Each `LdrSym` was encoded as a placeholder
636 // `LDR.W rd,[pc,#0]`; record where its instruction sits and what it loads so
637 // we can append a pooled word (carrying the symbol address via R_ARM_ABS32)
638 // and patch the PC-relative offset once the pool position is known.
639 struct PendingLiteral {
640 ldr_offset: u32,
641 symbol: String,
642 addend: i32,
643 }
644 let mut pending_literals: Vec<PendingLiteral> = Vec::new();
645
646 // VCR-DBG-001: per-instruction source map for DWARF `.debug_line`. Captured
647 // here because `code.len()` immediately before `encode()` is the final
648 // machine offset of the instruction within this function's `.text` — nothing
649 // after the loop shifts earlier instructions (the literal pool is appended at
650 // the end; the LDR patch below is in-place/length-preserving). Purely
651 // additive: it does not touch `code`, so `.text` is byte-identical.
652 let mut line_map: LineMap = Vec::new();
653
654 for instr in &arm_instrs {
655 // Record a relocation for every BL: the encoder emits `bl #0` and
656 // relies on a relocation to patch the target. This covers BOTH import
657 // dispatch stubs (`__meld_*`, undefined externals) AND internal calls
658 // (`func_N`, defined in this object). Previously only `__meld_*` was
659 // recorded, so internal `BL func_N` calls were left as unpatched
660 // `bl #0` placeholders branching to a garbage address (#167).
661 if let ArmOp::Bl { label } = &instr.op {
662 relocations.push(CodeRelocation {
663 offset: code.len() as u32,
664 symbol: label.clone(),
665 kind: synth_core::backend::RelocKind::ThmCall,
666 });
667 }
668 // #237: symbol-relative MOVW/MOVT (the `--native-pointer-abi` static-data
669 // addressing). The encoder writes the addend in place; record the matching
670 // R_ARM_MOVW_ABS_NC / R_ARM_MOVT_ABS so the linker adds the symbol address.
671 if let ArmOp::MovwSym { symbol, .. } = &instr.op {
672 relocations.push(CodeRelocation {
673 offset: code.len() as u32,
674 symbol: symbol.clone(),
675 kind: synth_core::backend::RelocKind::MovwAbs,
676 });
677 }
678 if let ArmOp::MovtSym { symbol, .. } = &instr.op {
679 relocations.push(CodeRelocation {
680 offset: code.len() as u32,
681 symbol: symbol.clone(),
682 kind: synth_core::backend::RelocKind::MovtAbs,
683 });
684 }
685 // #345: defer the literal-pool word + reloc + offset patch to the
686 // post-loop pass (the pool address is not yet known).
687 if let ArmOp::LdrSym { symbol, addend, .. } = &instr.op {
688 pending_literals.push(PendingLiteral {
689 ldr_offset: code.len() as u32,
690 symbol: symbol.clone(),
691 addend: *addend,
692 });
693 }
694
695 // The machine offset of this instruction is the current code length,
696 // captured before the bytes are appended.
697 line_map.push((code.len() as u32, instr.source_line));
698
699 let encoded = encoder
700 .encode(&instr.op)
701 .map_err(|e| format!("ARM encoding failed: {}", e))?;
702 code.extend_from_slice(&encoded);
703 }
704
705 // #345: place the literal pool at the end of this function's `.text`. Gated on
706 // there being at least one `LdrSym` — functions without one are byte-identical
707 // to before (no trailing padding, so downstream `func_offsets` are unchanged
708 // and the frozen differential fixtures stay bit-for-bit equal).
709 if !pending_literals.is_empty() {
710 if !use_thumb2 {
711 return Err("LdrSym literal-pool addressing requires Thumb-2".to_string());
712 }
713 // 4-byte align the pool start (Thumb-2 word loads require it, and
714 // `Align(PC,4)` in the LDR-literal semantics assumes a word-aligned pool).
715 while code.len() % 4 != 0 {
716 code.push(0x00);
717 }
718 // One distinct pooled word per LdrSym (no dedup: different sites carry
719 // different addends, and the REL addend lives in the word).
720 for lit in &pending_literals {
721 let word_offset = code.len() as u32;
722
723 // REL semantics: the linker computes `S + A`, where A is the in-place
724 // value of the relocated word. Initialize the word to the addend so
725 // the final loaded address is `symbol + addend`.
726 code.extend_from_slice(&(lit.addend as u32).to_le_bytes());
727 relocations.push(CodeRelocation {
728 offset: word_offset,
729 symbol: lit.symbol.clone(),
730 kind: synth_core::backend::RelocKind::Abs32,
731 });
732
733 // Patch the placeholder `LDR.W rd,[pc,#imm12]`. Thumb-2 LDR (literal):
734 // address = Align(PC,4) + imm12, with PC = ldr_offset + 4. The pool is
735 // always after the LDR, so U=1 (already set in hw1 = 0xF8DF).
736 let pc = lit.ldr_offset + 4;
737 let aligned_pc = pc & !3u32;
738 let imm12 = word_offset - aligned_pc;
739 if imm12 > 0xFFF {
740 // Wide LDR-literal range is ±4 KB; these function bodies are far
741 // smaller, but fail cleanly rather than miscompile if exceeded.
742 return Err(format!(
743 "LdrSym literal pool out of range (#345): imm12={} > 4095 \
744 for symbol {}",
745 imm12, lit.symbol
746 ));
747 }
748 let hw2_off = (lit.ldr_offset + 2) as usize;
749 let mut hw2 = u16::from_le_bytes([code[hw2_off], code[hw2_off + 1]]);
750 hw2 = (hw2 & 0xF000) | (imm12 as u16); // keep Rt, set imm12
751 let hw2_bytes = hw2.to_le_bytes();
752 code[hw2_off] = hw2_bytes[0];
753 code[hw2_off + 1] = hw2_bytes[1];
754 }
755 }
756
757 Ok((code, relocations, line_map))
758}
759
760/// Resolve local label branches to byte-accurate offsets (#202).
761///
762/// `select_with_stack` emits conditional/unconditional branches as label
763/// placeholders (`Bcc`/`B`/`Bhs`/`Blo` + `Label`) and never resolves them; the
764/// encoder then emits a `0xD000`/`0xE000` placeholder with offset 0. Before #197
765/// this path only ran for `--no-optimize`/declined functions, so the latent bug
766/// stayed hidden — routing relocatable code through it surfaced branches that
767/// land mid-instruction (a Cortex-M UsageFault) whenever a 32-bit Thumb-2
768/// instruction sits between the branch and its target.
769///
770/// This pass encodes each instruction to learn its real byte length (so 16- vs
771/// 32-bit forms and multi-instruction expansions are exact), maps each `Label`
772/// to its byte position, and rewrites every label branch to the displacement
773/// the encoder consumes: `(target - branch - 4) / 2` halfwords. A bounded
774/// fixed-point handles an offset growing a branch from 16- to 32-bit (which
775/// shifts later positions). `BCondOffset`/`BOffset` already produced inline by
776/// the optimized path carry no label and are left untouched.
777fn resolve_label_branches(
778 arm_instrs: Vec<ArmInstruction>,
779 encoder: &ArmEncoder,
780) -> Result<Vec<ArmInstruction>, String> {
781 use std::collections::HashMap;
782 use synth_synthesis::Condition;
783
784 enum BKind {
785 Cond(Condition),
786 Uncond,
787 }
788 // Record each label branch ONCE — indices are stable across iterations.
789 let mut branches: Vec<(usize, BKind, String)> = Vec::new();
790 for (i, instr) in arm_instrs.iter().enumerate() {
791 match &instr.op {
792 ArmOp::Bcc { cond, label } => branches.push((i, BKind::Cond(*cond), label.clone())),
793 ArmOp::Bhs { label } => branches.push((i, BKind::Cond(Condition::HS), label.clone())),
794 ArmOp::Blo { label } => branches.push((i, BKind::Cond(Condition::LO), label.clone())),
795 ArmOp::B { label } => branches.push((i, BKind::Uncond, label.clone())),
796 _ => {}
797 }
798 }
799 if branches.is_empty() {
800 return Ok(arm_instrs);
801 }
802
803 let mut resolved = arm_instrs;
804 // Sizes only grow (16→32-bit), so this converges quickly; cap for safety.
805 for _ in 0..16 {
806 // 1. Byte position of each instruction (Label encodes to 0 bytes).
807 let mut positions = Vec::with_capacity(resolved.len());
808 let mut pos: i64 = 0;
809 for instr in &resolved {
810 positions.push(pos);
811 pos += encoder
812 .encode(&instr.op)
813 .map_err(|e| format!("branch-resolve size probe failed: {}", e))?
814 .len() as i64;
815 }
816 // 2. Label name -> byte position (owned keys so the borrow ends here).
817 let mut labels: HashMap<String, i64> = HashMap::new();
818 for (i, instr) in resolved.iter().enumerate() {
819 if let ArmOp::Label { name } = &instr.op {
820 labels.insert(name.clone(), positions[i]);
821 }
822 }
823 // 3. Rewrite each branch to its byte-accurate offset.
824 let mut changed = false;
825 for (idx, kind, label) in &branches {
826 // A label not defined locally is an EXTERNAL target (e.g.
827 // `Trap_Handler` resolved by a relocation / the vector table). Leave
828 // such branches as their placeholder for the existing relocation
829 // path — only local control-flow labels are byte-resolved here.
830 let Some(&target) = labels.get(label) else {
831 continue;
832 };
833 // Encoder consumes the field as (target - branch - 4) / 2 halfwords.
834 // Positions are always even, so this division is exact.
835 let halfword_offset = ((target - positions[*idx] - 4) / 2) as i32;
836 let new_op = match kind {
837 BKind::Cond(c) => ArmOp::BCondOffset {
838 cond: *c,
839 offset: halfword_offset,
840 },
841 BKind::Uncond => ArmOp::BOffset {
842 offset: halfword_offset,
843 },
844 };
845 if resolved[*idx].op != new_op {
846 resolved[*idx].op = new_op;
847 changed = true;
848 }
849 }
850 if !changed {
851 break;
852 }
853 }
854 Ok(resolved)
855}
856
#[cfg(test)]
mod tests {
    use super::*;

    // ─── Shared fixtures ─────────────────────────────────────────────────

    /// `local.get 0; local.get 1; <op>` — a two-operand body ending in `op`.
    fn binary_op_body(op: WasmOp) -> Vec<WasmOp> {
        vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), op]
    }

    /// `local.get 0; i32.load offset=0 align=2` — a single memory read.
    fn i32_load_body() -> Vec<WasmOp> {
        vec![
            WasmOp::LocalGet(0),
            WasmOp::I32Load {
                offset: 0,
                align: 2,
            },
        ]
    }

    /// Config for `target` that bypasses the optimizer (`no_optimize`).
    fn direct_config(target: TargetSpec) -> CompileConfig {
        CompileConfig {
            target,
            no_optimize: true,
            ..CompileConfig::default()
        }
    }

    /// Config for `target` with every other field left at its default, i.e.
    /// the optimized path (`no_optimize` is NOT set).
    fn optimized_config(target: TargetSpec) -> CompileConfig {
        CompileConfig {
            target,
            ..CompileConfig::default()
        }
    }

    // ─── Backend identity / registration ─────────────────────────────────

    #[test]
    fn test_arm_backend_name() {
        let backend = ArmBackend::new();
        assert_eq!(backend.name(), "arm");
        assert!(backend.is_available());
    }

    /// Pins every capability flag reported by `ArmBackend::capabilities`.
    #[test]
    fn test_arm_backend_capabilities() {
        let caps = ArmBackend::new().capabilities();
        assert!(!caps.produces_elf);
        assert!(caps.supports_rule_verification);
        assert!(caps.supports_binary_verification);
        assert!(!caps.is_external);
    }

    #[test]
    fn test_arm_backend_register() {
        let mut registry = synth_core::BackendRegistry::new();
        registry.register(Box::new(ArmBackend::new()));
        assert!(registry.get("arm").is_some());
        assert_eq!(registry.available().len(), 1);
    }

    // ─── Basic compilation ───────────────────────────────────────────────

    #[test]
    fn test_compile_add_function() {
        let backend = ArmBackend::new();
        let ops = binary_op_body(WasmOp::I32Add);
        let config = CompileConfig::default();

        let result = backend.compile_function("add", &ops, &config);
        assert!(result.is_ok());

        let func = result.unwrap();
        assert_eq!(func.name, "add");
        assert!(!func.code.is_empty());
        assert_eq!(func.wasm_ops, ops);
    }

    /// VCR-DBG-001: the per-instruction source map must cover the function with
    /// monotonic, in-bounds machine offsets, and must not perturb the emitted
    /// code (it is captured at encode time, never serialized here).
    #[test]
    fn test_line_map_is_wellformed_dbg001() {
        let backend = ArmBackend::new();
        let ops = vec![
            WasmOp::LocalGet(0),
            WasmOp::LocalGet(1),
            WasmOp::I32Add,
            WasmOp::End,
        ];
        let config = CompileConfig::default();
        let func = backend.compile_function("add", &ops, &config).unwrap();

        // Non-empty, and the first instruction starts at machine offset 0.
        assert!(
            !func.line_map.is_empty(),
            "a non-trivial function captures a source map"
        );
        assert_eq!(func.line_map[0].0, 0, "first instruction at offset 0");

        // Offsets strictly increase by at least one ARM/Thumb instruction (>= 2
        // bytes) and every mapped offset lies inside the emitted `.text`.
        for pair in func.line_map.windows(2) {
            let (prev, next) = (pair[0].0, pair[1].0);
            assert!(next > prev, "instruction offsets strictly increase");
            assert!(next - prev >= 2, "each ARM/Thumb instruction is >= 2 bytes");
        }
        let last = func.line_map.last().unwrap().0 as usize;
        assert!(
            last < func.code.len(),
            "every mapped offset lies inside .text"
        );

        // The side-table is additive: recompiling is deterministic and the map is
        // consistent with that exact code (capturing it does not alter output).
        let again = backend.compile_function("add", &ops, &config).unwrap();
        assert_eq!(
            again.code, func.code,
            "compilation deterministic; map is additive"
        );
        assert_eq!(again.line_map, func.line_map);
    }

    #[test]
    fn test_count_params() {
        let ops = binary_op_body(WasmOp::I32Add);
        assert_eq!(count_params(&ops), 2);

        let no_params = vec![WasmOp::I32Const(5), WasmOp::I32Const(3), WasmOp::I32Add];
        assert_eq!(count_params(&no_params), 0);
    }

    // ─── Call relocations ────────────────────────────────────────────────

    #[test]
    fn test_compile_import_call_produces_relocations() {
        let backend = ArmBackend::new();
        // Simulate a WASM module where func index 0 is an import.
        // Call(0) should generate MOV R0, #0; BL __meld_dispatch_import
        let ops = vec![WasmOp::Call(0)];
        let config = CompileConfig {
            num_imports: 1,
            no_optimize: true, // Direct instruction selection to preserve Call semantics
            ..CompileConfig::default()
        };

        let result = backend.compile_function("caller", &ops, &config);
        assert!(result.is_ok());

        let func = result.unwrap();
        assert!(!func.code.is_empty());
        assert_eq!(func.relocations.len(), 1);
        assert_eq!(func.relocations[0].symbol, "__meld_dispatch_import");
        // The BL is the second instruction (after MOV R0, #0), so offset should be > 0
        assert!(func.relocations[0].offset > 0);
    }

    /// Regression test for #197: in `relocatable` mode, an import call must
    /// relocate against the direct `func_N` symbol (rewritten to the wasm field
    /// name by `build_relocatable_elf`), NOT `__meld_dispatch_import`. This is
    /// the ABI half of the #197 fix — without it, a host linker cannot resolve
    /// the call to the real kernel symbol (e.g. `k_spin_lock`).
    #[test]
    fn test_compile_relocatable_import_uses_direct_func_symbol_197() {
        let backend = ArmBackend::new();
        let ops = vec![WasmOp::Call(0)]; // func 0 is an import
        let config = CompileConfig {
            num_imports: 1,
            relocatable: true,
            ..CompileConfig::default()
        };

        let func = backend
            .compile_function("caller", &ops, &config)
            .expect("relocatable import call compiles");

        assert_eq!(func.relocations.len(), 1);
        assert_eq!(
            func.relocations[0].symbol, "func_0",
            "#197: relocatable import must relocate against func_0 (→ field name), not Meld dispatch"
        );
    }

    #[test]
    fn test_compile_no_imports_no_relocations() {
        let backend = ArmBackend::new();
        let ops = binary_op_body(WasmOp::I32Add);
        let config = CompileConfig::default();

        let func = backend.compile_function("add", &ops, &config).unwrap();
        assert!(func.relocations.is_empty());
    }

    /// Regression test for #167: a call to an INTERNAL function
    /// (index `>= num_imports`) must record a relocation against `func_{index}`.
    /// Before the fix, only `__meld_*` (import) BLs were relocated, so
    /// internal `BL func_N` was emitted as an unpatched `bl #0` branching
    /// to a garbage address — making the object non-linkable. This test
    /// would have caught that regression.
    #[test]
    fn test_compile_internal_call_produces_relocation_167() {
        let backend = ArmBackend::new();
        // num_imports = 1, so Call(2) is an INTERNAL call → `BL func_2`.
        let ops = vec![WasmOp::Call(2)];
        let config = CompileConfig {
            num_imports: 1,
            no_optimize: true,
            ..CompileConfig::default()
        };

        let func = backend
            .compile_function("caller", &ops, &config)
            .expect("internal call compiles");

        assert_eq!(
            func.relocations.len(),
            1,
            "an internal call must emit exactly one relocation (#167)"
        );
        assert_eq!(
            func.relocations[0].symbol, "func_2",
            "internal call must relocate against the callee's func_{{index}} symbol (#167)"
        );
    }

    // ─── Phase 1 safety-bounds plumbing for ARM ──────────────────────────

    #[test]
    fn arm_safety_bounds_mpu_emits_same_code_as_none() {
        // Mpu mode must not introduce any inline check on ARM — the MPU
        // handles faults via hardware. The encoded bytes for an i32.load
        // should be identical between None and Mpu.
        let backend = ArmBackend::new();
        let ops = i32_load_body();
        let cfg_none = CompileConfig {
            no_optimize: true,
            ..Default::default()
        };
        let cfg_mpu = CompileConfig {
            no_optimize: true,
            safety_bounds: SafetyBounds::Mpu,
            ..Default::default()
        };
        let n = backend.compile_function("ld", &ops, &cfg_none).unwrap();
        let m = backend.compile_function("ld", &ops, &cfg_mpu).unwrap();
        assert_eq!(
            n.code, m.code,
            "Mpu and None should produce identical ARM bytes (Mpu relies on hardware)"
        );
    }

    #[test]
    fn arm_legacy_bounds_check_still_emits_software_check() {
        // Legacy CLI users with `--bounds-check` should keep getting the
        // software path even though the new SafetyBounds field defaults to None.
        let backend = ArmBackend::new();
        let ops = i32_load_body();
        let cfg_legacy = CompileConfig {
            no_optimize: true,
            bounds_check: true,
            ..Default::default()
        };
        let cfg_software = CompileConfig {
            no_optimize: true,
            safety_bounds: SafetyBounds::Software,
            ..Default::default()
        };
        let l = backend.compile_function("ld", &ops, &cfg_legacy).unwrap();
        let s = backend.compile_function("ld", &ops, &cfg_software).unwrap();
        assert_eq!(
            l.code, s.code,
            "--bounds-check should produce the same bytes as --safety-bounds=software"
        );
    }

    // ========================================================================
    // ISA feature gate tests — ensure the compiler never emits unsupported
    // instructions for a given target
    // ========================================================================

    #[test]
    fn test_f32_rejected_on_cortex_m3_no_fpu() {
        let backend = ArmBackend::new();
        let ops = vec![WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
        let config = direct_config(TargetSpec::cortex_m3());

        let result = backend.compile_function("fadd", &ops, &config);
        assert!(
            result.is_err(),
            "f32 operations should fail on Cortex-M3 (no FPU)"
        );
    }

    #[test]
    fn test_f32_accepted_on_cortex_m4f() {
        let backend = ArmBackend::new();
        let ops = vec![WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
        let config = direct_config(TargetSpec::cortex_m4f());

        let result = backend.compile_function("fadd", &ops, &config);
        // Borrow the error for the message (as the #120 tests do) instead of
        // consuming `result` with `unwrap_err()`.
        assert!(
            result.is_ok(),
            "f32 operations should succeed on Cortex-M4F, got: {:?}",
            result.as_ref().err()
        );
    }

    /// i32 arithmetic needs no optional ISA feature, so it must compile on
    /// every target the backend lists in `supported_targets`.
    #[test]
    fn test_i32_works_on_all_targets() {
        let backend = ArmBackend::new();
        let ops = binary_op_body(WasmOp::I32Add);

        let targets = vec![
            ("Cortex-M3", TargetSpec::cortex_m3()),       // no FPU
            ("Cortex-M4", TargetSpec::cortex_m4()),       // no FPU
            ("Cortex-M4F", TargetSpec::cortex_m4f()),     // single FPU
            ("Cortex-M7", TargetSpec::cortex_m7()),
            ("Cortex-M7DP", TargetSpec::cortex_m7dp()),   // double FPU
        ];

        for (label, target) in targets {
            let config = direct_config(target);
            assert!(
                backend.compile_function("add", &ops, &config).is_ok(),
                "i32 ops should work on {label}"
            );
        }
    }

    #[test]
    fn test_f32_rejected_on_cortex_m4_no_fpu() {
        // Cortex-M4 (without F suffix) has no FPU
        let backend = ArmBackend::new();
        let ops = vec![WasmOp::F32Const(1.5), WasmOp::F32Const(2.5), WasmOp::F32Mul];
        let config = direct_config(TargetSpec::cortex_m4());

        let result = backend.compile_function("fmul", &ops, &config);
        assert!(
            result.is_err(),
            "f32 operations should fail on Cortex-M4 (no FPU)"
        );
    }

    // ========================================================================
    // Issue #120 — f32 ops in the optimized lowering path
    //
    // `OptimizerBridge::wasm_to_ir` has no handlers for f32/f64 ops, so a
    // value-producing float op fell through to `Opcode::Nop`, leaving a
    // downstream consumer with an unmapped vreg and tripping the PR #101
    // defensive panic in `ir_to_arm`. Customer reproducer: `compiler_builtins
    // float::div` and `gale_compute_ipi_mask` in the `falcon-rate-component`
    // module.
    //
    // Fix: `optimize_full` declines float modules with a typed `Err`;
    // `compile_wasm_to_arm` falls back to the non-optimized `select_with_stack`
    // path, which handles f32 via VFP/FPU. These tests use the *default*
    // (optimized) config — `no_optimize` is NOT set — which is the exact
    // configuration that panicked pre-fix.
    // ========================================================================

    /// Pre-fix: this panicked with "vreg vN has no assigned ARM register and
    /// no spill slot" inside `ir_to_arm`. Post-fix: the optimized path declines
    /// the module and the backend falls back to direct selection, producing a
    /// non-empty f32.div lowering on a Cortex-M4F.
    #[test]
    fn test_issue120_f32_div_compiles_via_optimized_default() {
        let backend = ArmBackend::new();
        let ops = binary_op_body(WasmOp::F32Div);
        // no_optimize NOT set — this exercises the optimized path that
        // panicked in issue #120, then the fallback to direct selection.
        let config = optimized_config(TargetSpec::cortex_m4f());

        let result = backend.compile_function("fdiv", &ops, &config);
        assert!(
            result.is_ok(),
            "f32.div must compile on Cortex-M4F via the optimized->direct \
             fallback (issue #120), got: {:?}",
            result.as_ref().err()
        );
        assert!(
            !result.unwrap().code.is_empty(),
            "f32.div must produce non-empty machine code"
        );
    }

    /// A spread of f32 ops, all through the optimized (default) config, must
    /// compile via the fallback on an FPU target without panicking.
    #[test]
    fn test_issue120_assorted_f32_ops_compile_via_optimized_default() {
        let backend = ArmBackend::new();
        let config = optimized_config(TargetSpec::cortex_m4f());

        let cases: Vec<(&str, Vec<WasmOp>)> = vec![
            ("fadd", binary_op_body(WasmOp::F32Add)),
            ("fmul", binary_op_body(WasmOp::F32Mul)),
            ("fsub", binary_op_body(WasmOp::F32Sub)),
        ];

        for (name, ops) in cases {
            let result = backend.compile_function(name, &ops, &config);
            assert!(
                result.is_ok(),
                "{name} must compile via the optimized->direct fallback \
                 (issue #120), got: {:?}",
                result.as_ref().err()
            );
            assert!(
                !result.unwrap().code.is_empty(),
                "{name} must produce non-empty machine code"
            );
        }
    }

    /// The fallback must still honor the ISA feature gate: f32 on a no-FPU
    /// target must fail cleanly (not panic) even on the optimized path.
    #[test]
    fn test_issue120_f32_div_rejected_on_no_fpu_via_optimized() {
        let backend = ArmBackend::new();
        let ops = binary_op_body(WasmOp::F32Div);
        let config = optimized_config(TargetSpec::cortex_m3());

        let result = backend.compile_function("fdiv", &ops, &config);
        assert!(
            result.is_err(),
            "f32.div must be rejected on Cortex-M3 (no FPU), not panic"
        );
    }

    /// Issue #94: end-to-end byte-size check for the canonical u64-packed
    /// FFI-return hi32 extract pattern. Compiles two near-identical
    /// functions — one with the optimized shift-by-32, one with a generic
    /// shift-by-7 — and asserts the optimized form is meaningfully smaller.
    #[test]
    fn test_issue94_hi32_extract_is_smaller_than_generic_shift() {
        let backend = ArmBackend::new();
        let config = optimized_config(TargetSpec::cortex_m4f());

        // Optimized path: `(local.get 0) >>> 32; wrap_i64`
        let ops_hi32 = vec![
            WasmOp::LocalGet(0), // i64 param in R0:R1
            WasmOp::I64Const(32),
            WasmOp::I64ShrU,
            WasmOp::I32WrapI64,
        ];
        let func_hi32 = backend
            .compile_function("hi32_extract", &ops_hi32, &config)
            .unwrap();

        // Generic path: `(local.get 0) >>> 7; wrap_i64` — same shape, but the
        // shift amount is not a multiple of 32, so it falls through to the
        // 38-byte runtime shift.
        let ops_generic = vec![
            WasmOp::LocalGet(0),
            WasmOp::I64Const(7),
            WasmOp::I64ShrU,
            WasmOp::I32WrapI64,
        ];
        let func_generic = backend
            .compile_function("generic_shr", &ops_generic, &config)
            .unwrap();

        let bytes_hi32 = func_hi32.code.len();
        let bytes_generic = func_generic.code.len();
        println!(
            "\n[issue #94] hi32 extract: {} bytes (vs generic shift: {} bytes; saved {})",
            bytes_hi32,
            bytes_generic,
            bytes_generic.saturating_sub(bytes_hi32)
        );
        let hex: String = func_hi32
            .code
            .iter()
            .map(|b| format!("{:02x}", b))
            .collect::<Vec<_>>()
            .join(" ");
        println!("[issue #94] hi32 bytes: {}", hex);
        // We expect the optimized form to be at least 30 bytes smaller than
        // the generic 64-bit shift sequence. (Empirically: 14 vs 50 bytes.)
        assert!(
            bytes_hi32 + 30 <= bytes_generic,
            "issue #94: hi32 extract = {} bytes, generic shift = {} bytes; \
             expected optimized form to be at least 30 bytes smaller",
            bytes_hi32,
            bytes_generic,
        );
    }
}