Skip to main content

synth_backend/
arm_backend.rs

1//! ARM Backend — wraps the instruction selector + optimizer + encoder as a Backend
2//!
3//! This is Synth's custom ARM compiler targeting Cortex-M (Thumb-2).
4//! It's the only backend that supports per-rule formal verification (ASIL D path).
5
6use crate::ArmEncoder;
7use synth_core::backend::{
8    Backend, BackendCapabilities, BackendError, CodeRelocation, CompilationResult, CompileConfig,
9    CompiledFunction, LineMap, SafetyBounds,
10};
11use synth_core::target::{IsaVariant, TargetSpec};
12use synth_core::wasm_decoder::DecodedModule;
13use synth_core::wasm_op::WasmOp;
14use synth_synthesis::{
15    ArmInstruction, ArmOp, BoundsCheckConfig, InstructionSelector, OptimizationConfig,
16    OptimizerBridge, RuleDatabase, validate_instructions,
17};
18
/// Synth's ARM Cortex-M backend, built on the custom compiler pipeline.
///
/// A unit struct: it carries no state, so every instance is interchangeable.
pub struct ArmBackend;

impl ArmBackend {
    /// Creates the backend handle.
    pub fn new() -> Self {
        ArmBackend
    }
}

impl Default for ArmBackend {
    /// Same as [`ArmBackend::new`].
    fn default() -> Self {
        ArmBackend::new()
    }
}
33
34impl Backend for ArmBackend {
35    fn name(&self) -> &str {
36        "arm"
37    }
38
39    fn capabilities(&self) -> BackendCapabilities {
40        BackendCapabilities {
41            produces_elf: false,
42            supports_rule_verification: true,
43            supports_binary_verification: true,
44            is_external: false,
45        }
46    }
47
48    fn supported_targets(&self) -> Vec<TargetSpec> {
49        vec![
50            TargetSpec::cortex_m3(),
51            TargetSpec::cortex_m4(),
52            TargetSpec::cortex_m4f(),
53            TargetSpec::cortex_m7(),
54            TargetSpec::cortex_m7dp(),
55        ]
56    }
57
58    fn compile_module(
59        &self,
60        module: &DecodedModule,
61        config: &CompileConfig,
62    ) -> Result<CompilationResult, BackendError> {
63        let exports: Vec<_> = module
64            .functions
65            .iter()
66            .filter(|f| f.export_name.is_some())
67            .collect();
68
69        if exports.is_empty() {
70            return Err(BackendError::CompilationFailed(
71                "no exported functions found".into(),
72            ));
73        }
74
75        let mut functions = Vec::new();
76        for func in &exports {
77            let name = func.export_name.clone().unwrap();
78            // #359: copy THIS function's declared param widths into the config so
79            // `compile_function` (which carries no function index) can refuse a
80            // 64-bit param on the AAPCS stack-argument path. Cheap clone only when
81            // a signature table is present and this function has a width entry —
82            // otherwise reuse the shared config (every existing module unchanged).
83            let func_config = match config.func_params_i64.get(func.index as usize) {
84                Some(p) if !p.is_empty() => Some(CompileConfig {
85                    current_func_params_i64: p.clone(),
86                    ..config.clone()
87                }),
88                _ => None,
89            };
90            let cfg = func_config.as_ref().unwrap_or(config);
91            let compiled = self.compile_function(&name, &func.ops, cfg)?;
92            functions.push(compiled);
93        }
94
95        Ok(CompilationResult {
96            functions,
97            elf: None,
98            backend_name: self.name().to_string(),
99        })
100    }
101
102    fn compile_function(
103        &self,
104        name: &str,
105        ops: &[WasmOp],
106        config: &CompileConfig,
107    ) -> Result<CompiledFunction, BackendError> {
108        let (code, relocations, line_map) =
109            compile_wasm_to_arm(ops, config).map_err(BackendError::CompilationFailed)?;
110
111        Ok(CompiledFunction {
112            name: name.to_string(),
113            code,
114            wasm_ops: ops.to_vec(),
115            relocations,
116            line_map,
117        })
118    }
119
120    fn is_available(&self) -> bool {
121        true // Always available — it's a library backend
122    }
123}
124
125/// Count the number of function parameters by analyzing LocalGet patterns
126fn count_params(wasm_ops: &[WasmOp]) -> u32 {
127    let mut first_access: std::collections::HashMap<u32, bool> = std::collections::HashMap::new();
128    for op in wasm_ops {
129        match op {
130            WasmOp::LocalGet(idx) => {
131                first_access.entry(*idx).or_insert(true);
132            }
133            WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => {
134                first_access.entry(*idx).or_insert(false);
135            }
136            _ => {}
137        }
138    }
139
140    first_access
141        .iter()
142        .filter_map(
143            |(&idx, &is_read_first)| {
144                if is_read_first { Some(idx + 1) } else { None }
145            },
146        )
147        .max()
148        .unwrap_or(0)
149}
150
/// Core compilation: WASM ops → ARM machine code bytes + relocations + line map
///
/// Returns `(code_bytes, relocations, line_map)`, where the relocations record BL
/// instructions that target external symbols (e.g., `__meld_dispatch_import` for
/// import calls).
155fn compile_wasm_to_arm(
156    wasm_ops: &[WasmOp],
157    config: &CompileConfig,
158) -> Result<(Vec<u8>, Vec<CodeRelocation>, LineMap), String> {
159    let num_params = count_params(wasm_ops);
160
161    let bounds_config = match config.effective_safety_bounds() {
162        SafetyBounds::None => BoundsCheckConfig::None,
163        SafetyBounds::Mpu => BoundsCheckConfig::Mpu,
164        SafetyBounds::Software => BoundsCheckConfig::Software,
165        SafetyBounds::Mask => BoundsCheckConfig::Masking,
166    };
167
168    // The non-optimized (direct) instruction-selection path. Handles f32 via
169    // VFP/FPU. Used directly when `--no-optimize` is set, and as the fallback
170    // when the optimized path declines a module (see issue #120 below).
171    //
172    // VCR-RA-001 step 3b-lite (#242): a FRESH selector per attempt, with
173    // `spill_on_exhaustion` set only on the retry — the first pass is the
174    // unmodified default, so every function that compiles today is selected by
175    // exactly the code that compiled it yesterday (bit-identity is structural,
176    // not behavioural).
177    let select_direct_attempt = |spill_on_exhaustion: bool,
178                                 param_backing_on_exhaustion: bool,
179                                 local_promote: bool|
180     -> Result<Vec<ArmInstruction>, synth_core::Error> {
181        let db = RuleDatabase::with_standard_rules();
182        let mut selector =
183            InstructionSelector::with_bounds_check(db.rules().to_vec(), bounds_config);
184        selector.set_target(config.target.fpu, &config.target.triple);
185        if config.num_imports > 0 {
186            selector.set_num_imports(config.num_imports);
187        }
188        // #195: plumb the callee argument-count tables so the direct selector can
189        // marshal call arguments into R0–R3 per AAPCS.
190        selector.set_func_arg_counts(
191            config.func_arg_counts.clone(),
192            config.type_arg_counts.clone(),
193        );
194        // #197: in relocatable host-link mode, emit direct `func_N` BLs for
195        // imports (rewritten to the wasm field name by build_relocatable_elf)
196        // instead of `__meld_dispatch_import`.
197        selector.set_relocatable(config.relocatable);
198        // #237: native-pointer ABI — wasm statics become __synth_wasm_data-relative.
199        selector.set_native_pointer_abi(config.native_pointer_abi, config.linear_memory_bytes);
200        // #311: i64 call results are register PAIRS — tag them.
201        selector.set_result_types(config.func_ret_i64.clone(), config.type_ret_i64.clone());
202        // #359: declared param widths of THIS function, so the AAPCS stack-arg
203        // path can refuse 64-bit params (Ok-or-Err). Empty ⇒ assume i32.
204        selector.set_params_i64(config.current_func_params_i64.clone());
205        // Stack-pointer promotion is meaningful only under the native-pointer ABI;
206        // gating here keeps every non-native compile (all frozen fixtures) on the
207        // legacy R9 globals-table path, bit-identical.
208        if config.native_pointer_abi
209            && let Some((sp_idx, sp_init)) = config.stack_pointer_global
210        {
211            selector.set_native_pointer_stack(sp_idx, sp_init);
212        }
213        selector.set_spill_on_exhaustion(spill_on_exhaustion);
214        selector.set_param_backing_on_exhaustion(param_backing_on_exhaustion);
215        // VCR-RA local promotion (#390, #242): keep eligible non-param i32 locals
216        // in callee-saved registers instead of frame slots — the structural lever
217        // toward native parity. DEFAULT-ON as of v0.14.0: gale's G474RE DWT gate
218        // cleared it as a net win (gust_mix dissolved 58→50 cyc/call −14%, all 5
219        // stack spill/reloads eliminated, correctness bit-identical over [0,2047],
220        // 2.00×→1.72× vs LLVM). Escape hatch: `SYNTH_NO_LOCAL_PROMOTE=1` restores
221        // the frame-slot path. Leaf-only / i32-only / ARM-only (see
222        // compute_local_promotion); the leaf-only lift + i64 locals are follow-ons.
223        // #474: `local_promote` is now a per-attempt parameter so the retry ladder
224        // can drop promotion as an exhaustion-recovery rung (promotion pins r4-r8,
225        // which on a dense function leaves the spill allocator with nothing to
226        // free → the frame-slot path is the escape that restores compilability).
227        selector.set_local_promote(local_promote);
228        selector.select_with_stack(wasm_ops, num_params)
229    };
230    let select_direct = || -> Result<Vec<ArmInstruction>, String> {
231        const SINGLE_EXHAUSTION: &str = "all allocatable registers are live on the stack";
232        const PAIR_EXHAUSTION: &str = "no consecutive pair of free registers for i64";
233        // The full exhaustion-recovery ladder, parameterized on whether local
234        // promotion is enabled. Each rung is reached only when the previous one
235        // returned a recoverable register-exhaustion Err, so a function that
236        // compiles on the first attempt is untouched by the later rungs. Returns
237        // the result AND which rung produced it (for the #242 measurement below).
238        let recovery_ladder =
239            |promote: bool| -> (Result<Vec<ArmInstruction>, synth_core::Error>, &'static str) {
240                let mut attempt = select_direct_attempt(false, false, promote);
241                let mut rung = "base";
242                // VCR-RA-001 step 3b-lite (#242): the i32 register-exhaustion
243                // hard-fail is recoverable — retry with spill-on-exhaustion, which
244                // reserves the spill area and spills the deepest stack value when
245                // the pool is full.
246                if let Err(e) = &attempt
247                    && e.to_string().contains(SINGLE_EXHAUSTION)
248                {
249                    attempt = select_direct_attempt(true, false, promote);
250                    rung = "spill";
251                }
252                // VCR-RA-001 acceptance increment (#242): the i64 consecutive-PAIR
253                // exhaustion is recoverable too — not by stack spilling (the pair
254                // allocator already spills stack values, #171) but by frame-backing
255                // the params (#204) so they stop pinning R0-R3, with spill kept on.
256                if let Err(e) = &attempt
257                    && e.to_string().contains(PAIR_EXHAUSTION)
258                {
259                    attempt = select_direct_attempt(true, true, promote);
260                    rung = "param-backing";
261                }
262                (attempt, rung)
263            };
264        // #474: local promotion (default-on since v0.14.0) is an OPTIMIZATION — it
265        // must never be the reason a function fails to compile. Run the full ladder
266        // with promotion first (so every function that compiles today is
267        // bit-identical), and if it still ends in register exhaustion, fall back to
268        // the promotion-off ladder (the v0.12.0 frame-slot lowering — exactly what
269        // the `SYNTH_NO_LOCAL_PROMOTE=1` workaround does, now automatic). Promotion
270        // pins r4-r8 for the locals; on a dense function that leaves the allocator
271        // with nothing to free, so dropping it restores compilability. The fallback
272        // is reached ONLY by functions that exhaust WITH promotion, so promotion-on
273        // output is untouched by construction (frozen byte gate stays green).
274        let promote = std::env::var("SYNTH_NO_LOCAL_PROMOTE").is_err();
275        let (mut attempt, mut rung) = recovery_ladder(promote);
276        let mut promotion_dropped = false;
277        if promote
278            && attempt
279                .as_ref()
280                .err()
281                .is_some_and(|e| e.to_string().contains("register exhaustion"))
282        {
283            let (rescued, off_rung) = recovery_ladder(false);
284            if rescued.is_ok() {
285                attempt = rescued;
286                rung = off_rung;
287                promotion_dropped = true;
288            }
289        }
290        // VCR-RA measurement (#242): log which recovery rung produced the result,
291        // so the per-rung distribution across a corpus can be measured — the size
292        // of the failure surface a verified allocator must subsume (see
293        // scripts/repro/register_exhaustion_recovery_ladder.md). Logging only:
294        // emitted bytes are unchanged, so the frozen byte gate is unaffected.
295        if std::env::var("SYNTH_RECOVERY_STATS").is_ok() {
296            eprintln!(
297                "[recovery-stats] rung={rung}{} result={}",
298                if promotion_dropped {
299                    " promotion-off"
300                } else {
301                    ""
302                },
303                if attempt.is_ok() { "ok" } else { "exhausted" },
304            );
305        }
306        attempt.map_err(|e| format!("instruction selection failed: {}", e))
307    };
308
309    // Instruction selection: optimized or direct.
310    //
311    // #197: `--relocatable` (host-link ET_REL) forces the direct selector. The
312    // optimized path materializes an absolute linmem base (0x20000100) and does
313    // not preserve caller-saved registers across calls — both wrong for a
314    // host-linked object, where the linmem base arrives via `fp` at runtime and
315    // callees follow AAPCS. `select_with_stack` (now i64-spill capable after
316    // #171) handles fp-relative memory + caller-saved preservation correctly.
317    //
318    // #507: `br_table` is DROPPED during the optimized path's wasm→IR lowering
319    // (`optimize_full`), so `ir_to_arm` never sees the dispatch — it emits the
320    // arm bodies in fall-through sequence with no `cmp`/branch on the selector, a
321    // SILENT miscompile (every input hits the last arm). The selector value isn't
322    // even loaded. Because the drop happens before `ir_to_arm`, there's no `Err`
323    // to fall back on; detect it on the raw wasm op stream here and force the
324    // direct selector (`select_with_stack` lowers `br_table` correctly as a
325    // cmp-chain — confirmed on the `--relocatable` path). Same honest-degradation
326    // contract as the issue-#120 f32 decline: the function still compiles
327    // correctly, just without IR-level optimization. Frozen-safe: the frozen
328    // fixtures compile `--relocatable` (already direct), and no optimized-path
329    // fixture (control_step, flight_algo) contains `br_table`.
330    let has_br_table = wasm_ops
331        .iter()
332        .any(|op| matches!(op, WasmOp::BrTable { .. }));
333    let arm_instrs = if config.no_optimize || config.relocatable || has_br_table {
334        select_direct()?
335    } else {
336        let opt_config = if config.loom_compat {
337            OptimizationConfig::loom_compat()
338        } else {
339            OptimizationConfig::all()
340        };
341
342        let mut bridge = OptimizerBridge::with_config(opt_config);
343        // #188: tell the bridge how many imports there are so it declines only
344        // LOCAL calls (and leaves import calls on the optimized path, keeping
345        // the #173 field-name relocation rewrite intact).
346        bridge.set_num_imports(config.num_imports);
347        // `ir_to_arm` now returns `Result` — an `Err` means the optimized path
348        // hit an unmapped vreg (issue-#93-class). Treat it identically to an
349        // `optimize_full` failure: fall back to the direct selector rather
350        // than propagating, so the function still compiles correctly.
351        match bridge
352            .optimize_full(wasm_ops)
353            .and_then(|(opt_ir, _cfg, _stats)| bridge.ir_to_arm(&opt_ir, num_params as usize))
354        {
355            Ok(arm_ops) => arm_ops
356                .into_iter()
357                .map(|op| ArmInstruction {
358                    op,
359                    source_line: None,
360                })
361                .collect(),
362            // Issue #120: the optimized path declines modules it cannot lower
363            // (notably scalar f32/f64 ops — the IR has no float opcodes). Fall
364            // back to the direct instruction selector, which handles f32 via
365            // VFP/FPU. This is honest degradation: the function still compiles
366            // correctly, just without IR-level optimization.
367            Err(_) => select_direct()?,
368        }
369    };
370
371    // #257/#277: `mul`+`add`→`mla` fusion is intentionally NOT wired here.
372    // The transform is correct and ready (`synth_synthesis::liveness::fuse_mul_add`,
373    // fully tested), but it is **register-allocation-coupled**: over the current
374    // greedy single-pass selector, folding `mul rM,..; add rD,rM,rX` → `mla`
375    // extends the live ranges of the mul inputs to the mla point, and the added
376    // pressure (extra moves/spills) costs more than the single-cycle MLA saves —
377    // gale measured a +2 cyc on-target REGRESSION (flat_flight 255→257, G474RE)
378    // even though it removes 2 instructions and the seam stays 0x07FDF307. So the
379    // fusion stays unwired until the spill-aware allocator (VCR-RA-001) chooses
380    // registers, at which point it becomes net-positive (per #272's plan and the
381    // wiring design note). Lesson (#277): a register-pressure-affecting transform
382    // needs an on-target/allocator-aware gate, not a byte-count gate, before it
383    // can default on.
384
385    // VCR-RA-001 const-CSE / rematerialization-avoidance (#209), the first
386    // allocator-analysis-driven CODEGEN change. Drops `movw` re-materializations
387    // of a constant already resident in another register and retargets the reads
388    // — every rewrite proven by the liveness analysis, and it ONLY removes
389    // materializations (pressure never rises), so unlike the mla fusion (#277) it
390    // cannot regress on-target. Runs on the selected stream before branch
391    // resolution (it removes instructions, shifting byte offsets). Behind
392    // `SYNTH_CONST_CSE=1` while it is validated against the differential oracle +
393    // gale's five on-target baselines; off by default keeps every fixture
394    // bit-identical.
395    let arm_instrs = if std::env::var("SYNTH_CONST_CSE").is_ok() {
396        synth_synthesis::liveness::apply_const_cse(&arm_instrs).0
397    } else {
398        arm_instrs
399    };
400
401    // VCR-RA-001 RANGE RE-ALLOCATION (#209/#242, wiring step 3a) — the first
402    // CONSEQUENTIAL allocator pass: re-colour each maximal straight-line
403    // segment over the R0-R8 pool with value ranges as the allocation unit
404    // (segment inputs + per-register live-outs pinned to their original
405    // registers, reserved R9-R12/SP identity-assigned — each segment is
406    // independently sound, no cross-segment liveness assumed). Renames
407    // registers only: never adds, removes, or reorders instructions, so
408    // labels/branch offsets are unaffected.
409    //
410    // DEFAULT-ON since v0.11.36: gale cleared the gate on-target (G474RE,
411    // #209 2026-06-10) — flag-on output byte-identical to flag-off on
412    // flat_flight/controller/control_step, fires on the filter family with
413    // zero cycle delta and a small size win, all selfchecks green on silicon.
414    // Opt out with `SYNTH_RANGE_REALLOC=0`; per-function stats with
415    // `SYNTH_REALLOC_STATS=1`.
416    //
417    // The companion dead callee-saved-save elimination (gale's "next
418    // consequential lever", same issue comment) then shrinks the prologue
419    // `push {r4-r8,lr}` / epilogue `pop {r4-r8,pc}` to the callee-saved
420    // registers the re-allocated body still touches (leaf-only,
421    // SP-untouched, even-count-padded — see shrink_callee_saved_saves):
422    // ~12 cycles of pure save/restore overhead removed on small leaves.
423    let realloc_on = std::env::var("SYNTH_RANGE_REALLOC").map_or(true, |v| v != "0");
424    let arm_instrs = if realloc_on {
425        use synth_synthesis::rules::Reg;
426        const POOL: [Reg; 9] = [
427            Reg::R0,
428            Reg::R1,
429            Reg::R2,
430            Reg::R3,
431            Reg::R4,
432            Reg::R5,
433            Reg::R6,
434            Reg::R7,
435            Reg::R8,
436        ];
437        let (out, stats) = synth_synthesis::liveness::reallocate_function(&arm_instrs, &POOL);
438        if std::env::var("SYNTH_REALLOC_STATS").is_ok() {
439            eprintln!(
440                "[range-realloc] {} segments: {} reallocated, {} declined ({} validator-rejected), {} need spill (step 4)",
441                stats.segments,
442                stats.reallocated,
443                stats.declined,
444                stats.validator_rejects,
445                stats.needs_spill
446            );
447        }
448        // VCR-RA-002 (#390, epic #242): eliminate a provably-dead stack frame
449        // (`sub sp,#N`/`add sp,#N` reserved by `compute_local_layout` for locals
450        // that promotion homed in registers, never accessed). Removing it saves
451        // the two instructions AND restores the SP-untouched precondition that
452        // `shrink_callee_saved_saves` requires — so it must run FIRST. Flag-off
453        // (opt-in `SYNTH_DEAD_FRAME_ELIM=1`); off ⇒ byte-identical. Default-on
454        // flip held for on-silicon validation, like the realloc/shrink levers.
455        let out = if std::env::var("SYNTH_DEAD_FRAME_ELIM").is_ok() {
456            synth_synthesis::liveness::elide_dead_frame(&out).unwrap_or(out)
457        } else {
458            out
459        };
460        // #490 (epic #242): the optimized selector uses r4-r8 as scratch /
461        // promoted locals but emits no prologue, silently clobbering a caller's
462        // callee-saved registers. Add the missing `push {r4-r8,lr}` /
463        // `pop {r4-r8,pc}` HERE — on the post-realloc body, where realloc has
464        // lowered low-pressure r4-r8 scratch back to r0-r3, so a save is added
465        // only for registers genuinely clobbered. `shrink_callee_saved_saves`
466        // (next) then trims it to the used set. No-op on the direct path (it
467        // already has its own prologue) and on callee-saved-free leaves.
468        let out = synth_synthesis::liveness::ensure_callee_saved_prologue(&out);
469        synth_synthesis::liveness::shrink_callee_saved_saves(&out).unwrap_or(out)
470    } else {
471        // Range-realloc off (`SYNTH_RANGE_REALLOC=0`): the optimized path still
472        // must preserve the callee-saved registers it clobbers (#490). No shrink
473        // (it is coupled to the realloc lever), so the conservative full save
474        // stays — correct, just not minimised in this debug configuration.
475        synth_synthesis::liveness::ensure_callee_saved_prologue(&arm_instrs)
476    };
477
478    // VCR-RA-001 SHADOW ALLOCATION (#209/#242): run the register allocator on
479    // the selected stream and LOG what it finds — without changing a single
480    // emitted byte. This is the measure-only bridge between the built analysis
481    // layer and the eventual virtual-register wiring: it shows, per real
482    // function, whether the allocator can colour it within the R0–R8 pool and
483    // how much const-CSE / rematerialization headroom exists (#209). Enable with
484    // `SYNTH_SHADOW_ALLOC=1`; off by default and side-effect-free either way.
485    if std::env::var("SYNTH_SHADOW_ALLOC").is_ok() {
486        use synth_synthesis::liveness::{
487            AllocationOutcome, allocate_function, function_peak_pressure,
488        };
489        // R9 globals / R10 mem-size / R11 mem-base / R12 IP-scratch are reserved;
490        // pin them above the 0..9 allocatable pool so the colourer keeps R0–R8.
491        let precolored = std::collections::BTreeMap::from([
492            (synth_synthesis::rules::Reg::R9, 9usize),
493            (synth_synthesis::rules::Reg::R10, 10),
494            (synth_synthesis::rules::Reg::R11, 11),
495            (synth_synthesis::rules::Reg::R12, 12),
496        ]);
497        // True VALUE pressure (one node per value, not per reused physical reg):
498        // a NeedsSpill with peak ≤ 9 is a SPURIOUS physical-register spill — the
499        // function fits once virtually allocated.
500        let peak = function_peak_pressure(&arm_instrs);
501        match allocate_function(&arm_instrs, 9, &precolored) {
502            AllocationOutcome::Allocated {
503                remat_opportunities,
504                coloring,
505            } => eprintln!(
506                "[shadow-alloc] OK: {} pregs coloured within R0-R8 pool, peak value-pressure {}, {} const-CSE/remat opportunities",
507                coloring.len(),
508                peak,
509                remat_opportunities
510            ),
511            AllocationOutcome::NeedsSpill(s) => eprintln!(
512                "[shadow-alloc] physical-graph would spill {:?}, but peak value-pressure is {} (≤9 ⇒ spurious; fits once virtually allocated)",
513                s, peak
514            ),
515            AllocationOutcome::Declined => {
516                eprintln!(
517                    "[shadow-alloc] declined (unmodeled construct — calls/i64/fp/offset-branch)"
518                )
519            }
520        }
521    }
522
523    // VCR-SEL-004 cmp→select → IT-block predication fusion (#242). The selector
524    // lowers a `select` whose condition is a comparison to a *materialize then
525    // re-test* sequence (`cmp a,b; SetCond D,c; cmp D,#0; movne dst,v1; moveq
526    // dst,v2`); this collapses it onto the comparison's own flags — deleting the
527    // `SetCond` and the `cmp D,#0` and retargeting the predicated moves to `c` /
528    // `invert(c)` — yielding the textbook predicated clamp (`cmp a,b; movc dst,v1;
529    // mov{!c} dst,v2`). −2 instructions per fused select. gale #428 measured this
530    // as the #1 hot-path size/cycle lever on the gust_mix clamp chain.
531    //
532    // Run LATE: after range re-allocation (so the dead-D proof sees final register
533    // identities) and before encode. Removal-only + rename-only ⇒ no spill
534    // regression and labels/branch offsets are unaffected. Each fusion is proven
535    // sound (flags reused only when nothing clobbers them in the window; the
536    // boolean deleted only when provably dead) — see `fuse_cmp_select`.
537    //
538    // DEFAULT-ON as of v0.13.0 (#428): cmp→select fusion ships by default. The
539    // byte-changing flip is validated by (a) the unicorn execution oracle that runs
540    // the two-move `mov{invert(c)}` arm (cmp_select_two_move_differential.py), (b)
541    // gale's gale_decider_diff 10,596-case sweep across all 8 verified primitives
542    // (native ≡ flag-off ≡ flag-on = 0x88e73178d232bcf5), and (c) the named-anchor
543    // differentials re-run with fusion ON — control_step still 0x00210A55, flat+
544    // inlined flight_algo still 0x07FDF307 (results preserved; bytes deliberately
545    // changed, re-frozen on this commit). Escape hatch: `SYNTH_NO_CMP_SELECT_FUSE=1`
546    // reverts to the pre-fusion lowering. The on-silicon G474RE DWT no-regression
547    // check is a tracked post-ship follow-up (gale owns it).
548    let arm_instrs = if std::env::var("SYNTH_NO_CMP_SELECT_FUSE").is_err() {
549        // The rewritten stream is identical to `fuse_cmp_select`'s 2-tuple form;
550        // the extra `two_move` count is diagnostic only (the fusion census /
551        // blast-radius datum — #7 made that arm reachable).
552        let (out, fused, two_move) =
553            synth_synthesis::liveness::fuse_cmp_select_with_stats(&arm_instrs);
554        if std::env::var("SYNTH_FUSE_STATS").is_ok() {
555            let in_place = fused - two_move;
556            eprintln!(
557                "[cmp-select-fuse] {fused} select(s) fused to predicated moves \
558                 ({two_move} two-move, {in_place} in-place)"
559            );
560        }
561        out
562    } else {
563        arm_instrs
564    };
565
566    // Perf lever 1 toward native parity (#390): redundant stack-reload elimination.
567    // synth lowers every wasm local to a frame slot, so `local.set; local.get` emits
568    // `str rX,[sp,#N]; … ; ldr rY,[sp,#N]`; when rX still holds the value the reload
569    // (a ~2-cycle M4 load) becomes `mov rY,rX`. Removal-of-a-load + rename only ⇒ no
570    // new instruction form and no label/offset change. DEFAULT-ON (#242 feature
571    // loop): validated bit-identical RESULTS on every frozen anchor (control_step
572    // 0x00210A55 13/13, flat+inlined flight_algo 0x07FDF307) with .text reduced on
573    // the shipped --relocatable path, plus 8 unit tests + the frame_slot_dce
574    // execution differential — the same gated path cmp→select took to default-on in
575    // v0.13.0 (G474RE silicon confirms perf post-ship). Escape hatch:
576    // `SYNTH_NO_STACK_FWD=1` restores the frame-resident bytes (frozen-old goldens).
577    let stack_fwd = std::env::var("SYNTH_NO_STACK_FWD").is_err();
578    let arm_instrs = if stack_fwd {
579        let (out, fwd) = synth_synthesis::liveness::forward_stack_reloads(&arm_instrs);
580        if std::env::var("SYNTH_FUSE_STATS").is_ok() {
581            eprintln!("[stack-fwd] {fwd} stack reload(s) forwarded to register moves");
582        }
583        out
584    } else {
585        arm_instrs
586    };
587
588    // VCR-RA frame-slot DCE (#242): once `forward_stack_reloads` has turned the
589    // reloads of a spill slot into register moves, the `str rX,[sp,#N]` that fed
590    // them is a dead store — its slot is never loaded again. Remove it. Pairs
591    // with (and only pays after) stack-reload forwarding, so it shares the flag.
592    let arm_instrs = if stack_fwd {
593        let (out, n) = synth_synthesis::liveness::eliminate_dead_frame_stores(&arm_instrs);
594        if std::env::var("SYNTH_FUSE_STATS").is_ok() {
595            eprintln!("[frame-slot-dce] {n} dead frame store(s) removed");
596        }
597        out
598    } else {
599        arm_instrs
600    };
601
602    // VCR-RA immediate-shift folding (#390, #242): a constant shift amount the
603    // stack selector materialized into a scratch register (`movw rM,#C; lsl rD,rN,rM`)
604    // folds to the immediate form (`lsl rD,rN,#C`), removing the dead `movw` — −1
605    // instruction, −1 live register. Removal-only (offset-neutral before branch
606    // resolution, like the dead-store pass). DEFAULT-ON as of v0.15.0: validated
607    // bit-identical results + a net cycle win on the dissolved hot path (−2
608    // cyc/call, .text 100→90 B on gust_mix). Escape hatch: `SYNTH_NO_IMM_SHIFT_FOLD=1`.
609    let arm_instrs = if std::env::var("SYNTH_NO_IMM_SHIFT_FOLD").is_err() {
610        let (out, folds) = synth_synthesis::liveness::fold_immediate_shifts(&arm_instrs);
611        if std::env::var("SYNTH_FUSE_STATS").is_ok() {
612            eprintln!(
613                "[imm-shift-fold] {folds} register shift(s) folded to immediate, movw dropped"
614            );
615        }
616        out
617    } else {
618        arm_instrs
619    };
620
621    // VCR-RA uxth/uxtb fold (#428, #242): `movw rM,#0xffff; and rD,rN,rM` →
622    // `uxth rD,rN` (and the 0xff/uxtb form), removing the dead `movw` — −1
623    // instruction, −1 live register per 16/8-bit mask. 0xffff/0xff are not Thumb-2
624    // modified immediates so the selector materializes them into a register; the
625    // dedicated zero-extend expresses the same masking inline. Removal-only +
626    // rewrite-in-place (offset-neutral). FLAG-OFF by default (opt-in
627    // `SYNTH_UXTH_FOLD=1`) ⇒ bit-identical (frozen gate green); the byte-changing
628    // default-on flip is the separate on-target-gated step, like the prior levers.
629    let arm_instrs = if std::env::var("SYNTH_UXTH_FOLD").is_ok() {
630        let (out, folds) = synth_synthesis::liveness::fold_uxth(&arm_instrs);
631        if std::env::var("SYNTH_FUSE_STATS").is_ok() {
632            eprintln!("[uxth-fold] {folds} mask-and folded to uxth/uxtb, movw dropped");
633        }
634        out
635    } else {
636        arm_instrs
637    };
638
639    // ISA feature gate: validate that all generated instructions are supported
640    // by the target. This catches FPU instructions on no-FPU targets, double-precision
641    // instructions on single-precision targets, etc.
642    validate_instructions(&arm_instrs, config.target.fpu, &config.target.triple)
643        .map_err(|e| format!("ISA validation failed: {}", e))?;
644
645    // Encode to binary — use Thumb-2 for Cortex-M targets
646    let use_thumb2 = matches!(config.target.isa, IsaVariant::Thumb2 | IsaVariant::Thumb);
647
648    let encoder = if use_thumb2 {
649        ArmEncoder::new_thumb2_with_fpu(config.target.fpu)
650    } else {
651        ArmEncoder::new_arm32()
652    };
653
654    // #202: resolve local label branches (Bcc/B/Bhs/Blo) to byte-accurate
655    // offsets before encoding. `select_with_stack` emits them as label
656    // placeholders and never resolves them — without this they encode as
657    // `bne.n #0` and land mid-instruction whenever a 32-bit Thumb-2 instruction
658    // sits between the branch and its target (UsageFault on real hardware).
659    // Only meaningful for Thumb-2 (the offset units are halfword/PC+4).
660    let arm_instrs = if use_thumb2 {
661        resolve_label_branches(arm_instrs, &encoder)?
662    } else {
663        arm_instrs
664    };
665
666    let mut code = Vec::new();
667    let mut relocations = Vec::new();
668
669    // #345: literal-pool address loads. Each `LdrSym` was encoded as a placeholder
670    // `LDR.W rd,[pc,#0]`; record where its instruction sits and what it loads so
671    // we can append a pooled word (carrying the symbol address via R_ARM_ABS32)
672    // and patch the PC-relative offset once the pool position is known.
673    struct PendingLiteral {
674        ldr_offset: u32,
675        symbol: String,
676        addend: i32,
677    }
678    let mut pending_literals: Vec<PendingLiteral> = Vec::new();
679
680    // VCR-DBG-001: per-instruction source map for DWARF `.debug_line`. Captured
681    // here because `code.len()` immediately before `encode()` is the final
682    // machine offset of the instruction within this function's `.text` — nothing
683    // after the loop shifts earlier instructions (the literal pool is appended at
684    // the end; the LDR patch below is in-place/length-preserving). Purely
685    // additive: it does not touch `code`, so `.text` is byte-identical.
686    let mut line_map: LineMap = Vec::new();
687
688    for instr in &arm_instrs {
689        // Record a relocation for every BL: the encoder emits `bl #0` and
690        // relies on a relocation to patch the target. This covers BOTH import
691        // dispatch stubs (`__meld_*`, undefined externals) AND internal calls
692        // (`func_N`, defined in this object). Previously only `__meld_*` was
693        // recorded, so internal `BL func_N` calls were left as unpatched
694        // `bl #0` placeholders branching to a garbage address (#167).
695        if let ArmOp::Bl { label } = &instr.op {
696            relocations.push(CodeRelocation {
697                offset: code.len() as u32,
698                symbol: label.clone(),
699                kind: synth_core::backend::RelocKind::ThmCall,
700            });
701        }
702        // #237: symbol-relative MOVW/MOVT (the `--native-pointer-abi` static-data
703        // addressing). The encoder writes the addend in place; record the matching
704        // R_ARM_MOVW_ABS_NC / R_ARM_MOVT_ABS so the linker adds the symbol address.
705        if let ArmOp::MovwSym { symbol, .. } = &instr.op {
706            relocations.push(CodeRelocation {
707                offset: code.len() as u32,
708                symbol: symbol.clone(),
709                kind: synth_core::backend::RelocKind::MovwAbs,
710            });
711        }
712        if let ArmOp::MovtSym { symbol, .. } = &instr.op {
713            relocations.push(CodeRelocation {
714                offset: code.len() as u32,
715                symbol: symbol.clone(),
716                kind: synth_core::backend::RelocKind::MovtAbs,
717            });
718        }
719        // #345: defer the literal-pool word + reloc + offset patch to the
720        // post-loop pass (the pool address is not yet known).
721        if let ArmOp::LdrSym { symbol, addend, .. } = &instr.op {
722            pending_literals.push(PendingLiteral {
723                ldr_offset: code.len() as u32,
724                symbol: symbol.clone(),
725                addend: *addend,
726            });
727        }
728
729        // The machine offset of this instruction is the current code length,
730        // captured before the bytes are appended.
731        line_map.push((code.len() as u32, instr.source_line));
732
733        let encoded = encoder
734            .encode(&instr.op)
735            .map_err(|e| format!("ARM encoding failed: {}", e))?;
736        code.extend_from_slice(&encoded);
737    }
738
739    // #345: place the literal pool at the end of this function's `.text`. Gated on
740    // there being at least one `LdrSym` — functions without one are byte-identical
741    // to before (no trailing padding, so downstream `func_offsets` are unchanged
742    // and the frozen differential fixtures stay bit-for-bit equal).
743    if !pending_literals.is_empty() {
744        if !use_thumb2 {
745            return Err("LdrSym literal-pool addressing requires Thumb-2".to_string());
746        }
747        // 4-byte align the pool start (Thumb-2 word loads require it, and
748        // `Align(PC,4)` in the LDR-literal semantics assumes a word-aligned pool).
749        while code.len() % 4 != 0 {
750            code.push(0x00);
751        }
752        // One distinct pooled word per LdrSym (no dedup: different sites carry
753        // different addends, and the REL addend lives in the word).
754        for lit in &pending_literals {
755            let word_offset = code.len() as u32;
756
757            // REL semantics: the linker computes `S + A`, where A is the in-place
758            // value of the relocated word. Initialize the word to the addend so
759            // the final loaded address is `symbol + addend`.
760            code.extend_from_slice(&(lit.addend as u32).to_le_bytes());
761            relocations.push(CodeRelocation {
762                offset: word_offset,
763                symbol: lit.symbol.clone(),
764                kind: synth_core::backend::RelocKind::Abs32,
765            });
766
767            // Patch the placeholder `LDR.W rd,[pc,#imm12]`. Thumb-2 LDR (literal):
768            // address = Align(PC,4) + imm12, with PC = ldr_offset + 4. The pool is
769            // always after the LDR, so U=1 (already set in hw1 = 0xF8DF).
770            let pc = lit.ldr_offset + 4;
771            let aligned_pc = pc & !3u32;
772            let imm12 = word_offset - aligned_pc;
773            if imm12 > 0xFFF {
774                // Wide LDR-literal range is ±4 KB; these function bodies are far
775                // smaller, but fail cleanly rather than miscompile if exceeded.
776                return Err(format!(
777                    "LdrSym literal pool out of range (#345): imm12={} > 4095 \
778                     for symbol {}",
779                    imm12, lit.symbol
780                ));
781            }
782            let hw2_off = (lit.ldr_offset + 2) as usize;
783            let mut hw2 = u16::from_le_bytes([code[hw2_off], code[hw2_off + 1]]);
784            hw2 = (hw2 & 0xF000) | (imm12 as u16); // keep Rt, set imm12
785            let hw2_bytes = hw2.to_le_bytes();
786            code[hw2_off] = hw2_bytes[0];
787            code[hw2_off + 1] = hw2_bytes[1];
788        }
789    }
790
791    Ok((code, relocations, line_map))
792}
793
794/// Resolve local label branches to byte-accurate offsets (#202).
795///
796/// `select_with_stack` emits conditional/unconditional branches as label
797/// placeholders (`Bcc`/`B`/`Bhs`/`Blo` + `Label`) and never resolves them; the
798/// encoder then emits a `0xD000`/`0xE000` placeholder with offset 0. Before #197
799/// this path only ran for `--no-optimize`/declined functions, so the latent bug
800/// stayed hidden — routing relocatable code through it surfaced branches that
801/// land mid-instruction (a Cortex-M UsageFault) whenever a 32-bit Thumb-2
802/// instruction sits between the branch and its target.
803///
804/// This pass encodes each instruction to learn its real byte length (so 16- vs
805/// 32-bit forms and multi-instruction expansions are exact), maps each `Label`
806/// to its byte position, and rewrites every label branch to the displacement
807/// the encoder consumes: `(target - branch - 4) / 2` halfwords. A bounded
808/// fixed-point handles an offset growing a branch from 16- to 32-bit (which
809/// shifts later positions). `BCondOffset`/`BOffset` already produced inline by
810/// the optimized path carry no label and are left untouched.
811fn resolve_label_branches(
812    arm_instrs: Vec<ArmInstruction>,
813    encoder: &ArmEncoder,
814) -> Result<Vec<ArmInstruction>, String> {
815    use std::collections::HashMap;
816    use synth_synthesis::Condition;
817
818    enum BKind {
819        Cond(Condition),
820        Uncond,
821    }
822    // Record each label branch ONCE — indices are stable across iterations.
823    let mut branches: Vec<(usize, BKind, String)> = Vec::new();
824    for (i, instr) in arm_instrs.iter().enumerate() {
825        match &instr.op {
826            ArmOp::Bcc { cond, label } => branches.push((i, BKind::Cond(*cond), label.clone())),
827            ArmOp::Bhs { label } => branches.push((i, BKind::Cond(Condition::HS), label.clone())),
828            ArmOp::Blo { label } => branches.push((i, BKind::Cond(Condition::LO), label.clone())),
829            ArmOp::B { label } => branches.push((i, BKind::Uncond, label.clone())),
830            _ => {}
831        }
832    }
833    if branches.is_empty() {
834        return Ok(arm_instrs);
835    }
836
837    let mut resolved = arm_instrs;
838    // Sizes only grow (16→32-bit), so this converges quickly; cap for safety.
839    for _ in 0..16 {
840        // 1. Byte position of each instruction (Label encodes to 0 bytes).
841        let mut positions = Vec::with_capacity(resolved.len());
842        let mut pos: i64 = 0;
843        for instr in &resolved {
844            positions.push(pos);
845            pos += encoder
846                .encode(&instr.op)
847                .map_err(|e| format!("branch-resolve size probe failed: {}", e))?
848                .len() as i64;
849        }
850        // 2. Label name -> byte position (owned keys so the borrow ends here).
851        let mut labels: HashMap<String, i64> = HashMap::new();
852        for (i, instr) in resolved.iter().enumerate() {
853            if let ArmOp::Label { name } = &instr.op {
854                labels.insert(name.clone(), positions[i]);
855            }
856        }
857        // 3. Rewrite each branch to its byte-accurate offset.
858        let mut changed = false;
859        for (idx, kind, label) in &branches {
860            // A label not defined locally is an EXTERNAL target (e.g.
861            // `Trap_Handler` resolved by a relocation / the vector table). Leave
862            // such branches as their placeholder for the existing relocation
863            // path — only local control-flow labels are byte-resolved here.
864            let Some(&target) = labels.get(label) else {
865                continue;
866            };
867            // Encoder consumes the field as (target - branch - 4) / 2 halfwords.
868            // Positions are always even, so this division is exact.
869            let halfword_offset = ((target - positions[*idx] - 4) / 2) as i32;
870            let new_op = match kind {
871                BKind::Cond(c) => ArmOp::BCondOffset {
872                    cond: *c,
873                    offset: halfword_offset,
874                },
875                BKind::Uncond => ArmOp::BOffset {
876                    offset: halfword_offset,
877                },
878            };
879            if resolved[*idx].op != new_op {
880                resolved[*idx].op = new_op;
881                changed = true;
882            }
883        }
884        if !changed {
885            break;
886        }
887    }
888    Ok(resolved)
889}
890
891#[cfg(test)]
892mod tests {
893    use super::*;
894
895    #[test]
896    fn test_arm_backend_name() {
897        let backend = ArmBackend::new();
898        assert_eq!(backend.name(), "arm");
899        assert!(backend.is_available());
900    }
901
902    #[test]
903    fn test_arm_backend_capabilities() {
904        let backend = ArmBackend::new();
905        let caps = backend.capabilities();
906        assert!(!caps.produces_elf);
907        assert!(caps.supports_rule_verification);
908        assert!(!caps.is_external);
909    }
910
911    #[test]
912    fn test_compile_add_function() {
913        let backend = ArmBackend::new();
914        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
915        let config = CompileConfig::default();
916
917        let result = backend.compile_function("add", &ops, &config);
918        assert!(result.is_ok());
919
920        let func = result.unwrap();
921        assert_eq!(func.name, "add");
922        assert!(!func.code.is_empty());
923        assert_eq!(func.wasm_ops, ops);
924    }
925
926    /// VCR-DBG-001: the per-instruction source map must cover the function with
927    /// monotonic, in-bounds machine offsets, and must not perturb the emitted
928    /// code (it is captured at encode time, never serialized here).
929    #[test]
930    fn test_line_map_is_wellformed_dbg001() {
931        let backend = ArmBackend::new();
932        let ops = vec![
933            WasmOp::LocalGet(0),
934            WasmOp::LocalGet(1),
935            WasmOp::I32Add,
936            WasmOp::End,
937        ];
938        let config = CompileConfig::default();
939        let func = backend.compile_function("add", &ops, &config).unwrap();
940
941        // Non-empty, and the first instruction starts at machine offset 0.
942        assert!(
943            !func.line_map.is_empty(),
944            "a non-trivial function captures a source map"
945        );
946        assert_eq!(func.line_map[0].0, 0, "first instruction at offset 0");
947
948        // Offsets strictly increase by at least one ARM/Thumb instruction (>= 2
949        // bytes) and every mapped offset lies inside the emitted `.text`.
950        for w in func.line_map.windows(2) {
951            assert!(w[1].0 > w[0].0, "instruction offsets strictly increase");
952            assert!(
953                w[1].0 - w[0].0 >= 2,
954                "each ARM/Thumb instruction is >= 2 bytes"
955            );
956        }
957        let last = func.line_map.last().unwrap().0 as usize;
958        assert!(
959            last < func.code.len(),
960            "every mapped offset lies inside .text"
961        );
962
963        // The side-table is additive: recompiling is deterministic and the map is
964        // consistent with that exact code (capturing it does not alter output).
965        let again = backend.compile_function("add", &ops, &config).unwrap();
966        assert_eq!(
967            again.code, func.code,
968            "compilation deterministic; map is additive"
969        );
970        assert_eq!(again.line_map, func.line_map);
971    }
972
973    #[test]
974    fn test_count_params() {
975        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
976        assert_eq!(count_params(&ops), 2);
977
978        let no_params = vec![WasmOp::I32Const(5), WasmOp::I32Const(3), WasmOp::I32Add];
979        assert_eq!(count_params(&no_params), 0);
980    }
981
982    #[test]
983    fn test_arm_backend_register() {
984        let mut registry = synth_core::BackendRegistry::new();
985        registry.register(Box::new(ArmBackend::new()));
986        assert!(registry.get("arm").is_some());
987        assert_eq!(registry.available().len(), 1);
988    }
989
990    #[test]
991    fn test_compile_import_call_produces_relocations() {
992        let backend = ArmBackend::new();
993        // Simulate a WASM module where func index 0 is an import.
994        // Call(0) should generate MOV R0, #0; BL __meld_dispatch_import
995        let ops = vec![WasmOp::Call(0)];
996        let config = CompileConfig {
997            num_imports: 1,
998            no_optimize: true, // Direct instruction selection to preserve Call semantics
999            ..CompileConfig::default()
1000        };
1001
1002        let result = backend.compile_function("caller", &ops, &config);
1003        assert!(result.is_ok());
1004
1005        let func = result.unwrap();
1006        assert!(!func.code.is_empty());
1007        assert_eq!(func.relocations.len(), 1);
1008        assert_eq!(func.relocations[0].symbol, "__meld_dispatch_import");
1009        // The BL is the second instruction (after MOV R0, #0), so offset should be > 0
1010        assert!(func.relocations[0].offset > 0);
1011    }
1012
1013    /// Regression test for #197: in `relocatable` mode, an import call must
1014    /// relocate against the direct `func_N` symbol (rewritten to the wasm field
1015    /// name by `build_relocatable_elf`), NOT `__meld_dispatch_import`. This is
1016    /// the ABI half of the #197 fix — without it, a host linker cannot resolve
1017    /// the call to the real kernel symbol (e.g. `k_spin_lock`).
1018    #[test]
1019    fn test_compile_relocatable_import_uses_direct_func_symbol_197() {
1020        let backend = ArmBackend::new();
1021        let ops = vec![WasmOp::Call(0)]; // func 0 is an import
1022        let config = CompileConfig {
1023            num_imports: 1,
1024            relocatable: true,
1025            ..CompileConfig::default()
1026        };
1027
1028        let func = backend
1029            .compile_function("caller", &ops, &config)
1030            .expect("relocatable import call compiles");
1031
1032        assert_eq!(func.relocations.len(), 1);
1033        assert_eq!(
1034            func.relocations[0].symbol, "func_0",
1035            "#197: relocatable import must relocate against func_0 (→ field name), not Meld dispatch"
1036        );
1037    }
1038
1039    #[test]
1040    fn test_compile_no_imports_no_relocations() {
1041        let backend = ArmBackend::new();
1042        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
1043        let config = CompileConfig::default();
1044
1045        let func = backend.compile_function("add", &ops, &config).unwrap();
1046        assert!(func.relocations.is_empty());
1047    }
1048
1049    /// Regression test for #167: a call to an INTERNAL function
1050    /// (index `>= num_imports`) must record a relocation against `func_{index}`.
1051    /// Before the fix, only `__meld_*` (import) BLs were relocated, so
1052    /// internal `BL func_N` was emitted as an unpatched `bl #0` branching
1053    /// to a garbage address — making the object non-linkable. This test
1054    /// would have caught that regression.
1055    #[test]
1056    fn test_compile_internal_call_produces_relocation_167() {
1057        let backend = ArmBackend::new();
1058        // num_imports = 1, so Call(2) is an INTERNAL call → `BL func_2`.
1059        let ops = vec![WasmOp::Call(2)];
1060        let config = CompileConfig {
1061            num_imports: 1,
1062            no_optimize: true,
1063            ..CompileConfig::default()
1064        };
1065
1066        let func = backend
1067            .compile_function("caller", &ops, &config)
1068            .expect("internal call compiles");
1069
1070        assert_eq!(
1071            func.relocations.len(),
1072            1,
1073            "an internal call must emit exactly one relocation (#167)"
1074        );
1075        assert_eq!(
1076            func.relocations[0].symbol, "func_2",
1077            "internal call must relocate against the callee's func_{{index}} symbol (#167)"
1078        );
1079    }
1080
1081    // ─── Phase 1 safety-bounds plumbing for ARM ──────────────────────────
1082
1083    #[test]
1084    fn arm_safety_bounds_mpu_emits_same_code_as_none() {
1085        // Mpu mode must not introduce any inline check on ARM — the MPU
1086        // handles faults via hardware. The encoded bytes for an i32.load
1087        // should be identical between None and Mpu.
1088        let backend = ArmBackend::new();
1089        let ops = vec![
1090            WasmOp::LocalGet(0),
1091            WasmOp::I32Load {
1092                offset: 0,
1093                align: 2,
1094            },
1095        ];
1096        let cfg_none = CompileConfig {
1097            no_optimize: true,
1098            ..Default::default()
1099        };
1100        let cfg_mpu = CompileConfig {
1101            no_optimize: true,
1102            safety_bounds: SafetyBounds::Mpu,
1103            ..Default::default()
1104        };
1105        let n = backend.compile_function("ld", &ops, &cfg_none).unwrap();
1106        let m = backend.compile_function("ld", &ops, &cfg_mpu).unwrap();
1107        assert_eq!(
1108            n.code, m.code,
1109            "Mpu and None should produce identical ARM bytes (Mpu relies on hardware)"
1110        );
1111    }
1112
1113    #[test]
1114    fn arm_legacy_bounds_check_still_emits_software_check() {
1115        // Legacy CLI users with `--bounds-check` should keep getting the
1116        // software path even though the new SafetyBounds field defaults to None.
1117        let backend = ArmBackend::new();
1118        let ops = vec![
1119            WasmOp::LocalGet(0),
1120            WasmOp::I32Load {
1121                offset: 0,
1122                align: 2,
1123            },
1124        ];
1125        let cfg_legacy = CompileConfig {
1126            no_optimize: true,
1127            bounds_check: true,
1128            ..Default::default()
1129        };
1130        let cfg_software = CompileConfig {
1131            no_optimize: true,
1132            safety_bounds: SafetyBounds::Software,
1133            ..Default::default()
1134        };
1135        let l = backend.compile_function("ld", &ops, &cfg_legacy).unwrap();
1136        let s = backend.compile_function("ld", &ops, &cfg_software).unwrap();
1137        assert_eq!(
1138            l.code, s.code,
1139            "--bounds-check should produce the same bytes as --safety-bounds=software"
1140        );
1141    }
1142
1143    // ========================================================================
1144    // ISA feature gate tests — ensure the compiler never emits unsupported
1145    // instructions for a given target
1146    // ========================================================================
1147
1148    #[test]
1149    fn test_f32_rejected_on_cortex_m3_no_fpu() {
1150        let backend = ArmBackend::new();
1151        let ops = vec![WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
1152        let config = CompileConfig {
1153            target: TargetSpec::cortex_m3(),
1154            no_optimize: true,
1155            ..CompileConfig::default()
1156        };
1157
1158        let result = backend.compile_function("fadd", &ops, &config);
1159        assert!(
1160            result.is_err(),
1161            "f32 operations should fail on Cortex-M3 (no FPU)"
1162        );
1163    }
1164
1165    #[test]
1166    fn test_f32_accepted_on_cortex_m4f() {
1167        let backend = ArmBackend::new();
1168        let ops = vec![WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
1169        let config = CompileConfig {
1170            target: TargetSpec::cortex_m4f(),
1171            no_optimize: true,
1172            ..CompileConfig::default()
1173        };
1174
1175        let result = backend.compile_function("fadd", &ops, &config);
1176        assert!(
1177            result.is_ok(),
1178            "f32 operations should succeed on Cortex-M4F, got: {:?}",
1179            result.unwrap_err()
1180        );
1181    }
1182
1183    #[test]
1184    fn test_i32_works_on_all_targets() {
1185        let backend = ArmBackend::new();
1186        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
1187
1188        // Cortex-M3 (no FPU)
1189        let config_m3 = CompileConfig {
1190            target: TargetSpec::cortex_m3(),
1191            no_optimize: true,
1192            ..CompileConfig::default()
1193        };
1194        assert!(
1195            backend.compile_function("add", &ops, &config_m3).is_ok(),
1196            "i32 ops should work on Cortex-M3"
1197        );
1198
1199        // Cortex-M4F (single FPU)
1200        let config_m4f = CompileConfig {
1201            target: TargetSpec::cortex_m4f(),
1202            no_optimize: true,
1203            ..CompileConfig::default()
1204        };
1205        assert!(
1206            backend.compile_function("add", &ops, &config_m4f).is_ok(),
1207            "i32 ops should work on Cortex-M4F"
1208        );
1209
1210        // Cortex-M7DP (double FPU)
1211        let config_m7dp = CompileConfig {
1212            target: TargetSpec::cortex_m7dp(),
1213            no_optimize: true,
1214            ..CompileConfig::default()
1215        };
1216        assert!(
1217            backend.compile_function("add", &ops, &config_m7dp).is_ok(),
1218            "i32 ops should work on Cortex-M7DP"
1219        );
1220    }
1221
1222    #[test]
1223    fn test_f32_rejected_on_cortex_m4_no_fpu() {
1224        // Cortex-M4 (without F suffix) has no FPU
1225        let backend = ArmBackend::new();
1226        let ops = vec![WasmOp::F32Const(1.5), WasmOp::F32Const(2.5), WasmOp::F32Mul];
1227        let config = CompileConfig {
1228            target: TargetSpec::cortex_m4(),
1229            no_optimize: true,
1230            ..CompileConfig::default()
1231        };
1232
1233        let result = backend.compile_function("fmul", &ops, &config);
1234        assert!(
1235            result.is_err(),
1236            "f32 operations should fail on Cortex-M4 (no FPU)"
1237        );
1238    }
1239
1240    // ========================================================================
1241    // Issue #120 — f32 ops in the optimized lowering path
1242    //
1243    // `OptimizerBridge::wasm_to_ir` has no handlers for f32/f64 ops, so a
1244    // value-producing float op fell through to `Opcode::Nop`, leaving a
1245    // downstream consumer with an unmapped vreg and tripping the PR #101
1246    // defensive panic in `ir_to_arm`. Customer reproducer: `compiler_builtins
1247    // float::div` and `gale_compute_ipi_mask` in the `falcon-rate-component`
1248    // module.
1249    //
1250    // Fix: `optimize_full` declines float modules with a typed `Err`;
1251    // `compile_wasm_to_arm` falls back to the non-optimized `select_with_stack`
1252    // path, which handles f32 via VFP/FPU. These tests use the *default*
1253    // (optimized) config — `no_optimize` is NOT set — which is the exact
1254    // configuration that panicked pre-fix.
1255    // ========================================================================
1256
1257    /// Pre-fix: this panicked with "vreg vN has no assigned ARM register and
1258    /// no spill slot" inside `ir_to_arm`. Post-fix: the optimized path declines
1259    /// the module and the backend falls back to direct selection, producing a
1260    /// non-empty f32.div lowering on a Cortex-M4F.
1261    #[test]
1262    fn test_issue120_f32_div_compiles_via_optimized_default() {
1263        let backend = ArmBackend::new();
1264        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Div];
1265        let config = CompileConfig {
1266            target: TargetSpec::cortex_m4f(),
1267            // no_optimize NOT set — this exercises the optimized path that
1268            // panicked in issue #120, then the fallback to direct selection.
1269            ..CompileConfig::default()
1270        };
1271
1272        let result = backend.compile_function("fdiv", &ops, &config);
1273        assert!(
1274            result.is_ok(),
1275            "f32.div must compile on Cortex-M4F via the optimized->direct \
1276             fallback (issue #120), got: {:?}",
1277            result.as_ref().err()
1278        );
1279        assert!(
1280            !result.unwrap().code.is_empty(),
1281            "f32.div must produce non-empty machine code"
1282        );
1283    }
1284
1285    /// A spread of f32 ops, all through the optimized (default) config, must
1286    /// compile via the fallback on an FPU target without panicking.
1287    #[test]
1288    fn test_issue120_assorted_f32_ops_compile_via_optimized_default() {
1289        let backend = ArmBackend::new();
1290        let config = CompileConfig {
1291            target: TargetSpec::cortex_m4f(),
1292            ..CompileConfig::default()
1293        };
1294
1295        let cases: Vec<(&str, Vec<WasmOp>)> = vec![
1296            (
1297                "fadd",
1298                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Add],
1299            ),
1300            (
1301                "fmul",
1302                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Mul],
1303            ),
1304            (
1305                "fsub",
1306                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Sub],
1307            ),
1308        ];
1309
1310        for (name, ops) in cases {
1311            let result = backend.compile_function(name, &ops, &config);
1312            assert!(
1313                result.is_ok(),
1314                "{name} must compile via the optimized->direct fallback \
1315                 (issue #120), got: {:?}",
1316                result.as_ref().err()
1317            );
1318            assert!(
1319                !result.unwrap().code.is_empty(),
1320                "{name} must produce non-empty machine code"
1321            );
1322        }
1323    }
1324
1325    /// The fallback must still honor the ISA feature gate: f32 on a no-FPU
1326    /// target must fail cleanly (not panic) even on the optimized path.
1327    #[test]
1328    fn test_issue120_f32_div_rejected_on_no_fpu_via_optimized() {
1329        let backend = ArmBackend::new();
1330        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Div];
1331        let config = CompileConfig {
1332            target: TargetSpec::cortex_m3(),
1333            ..CompileConfig::default()
1334        };
1335
1336        let result = backend.compile_function("fdiv", &ops, &config);
1337        assert!(
1338            result.is_err(),
1339            "f32.div must be rejected on Cortex-M3 (no FPU), not panic"
1340        );
1341    }
1342
1343    /// #507: a `br_table` function compiled via the DEFAULT (optimized) config
1344    /// must produce the SAME bytes as the direct (`no_optimize`) selector —
1345    /// i.e. the optimized path declined it to direct, lowering the dispatch as a
1346    /// real cmp-chain instead of silently dropping it (which left all arms in
1347    /// fall-through). Pre-fix the two outputs differed (the optimized one had no
1348    /// selector compare). Execution correctness is gated by
1349    /// `scripts/repro/br_table_507_differential.py`.
1350    #[test]
1351    fn test_507_br_table_declines_to_direct() {
1352        let backend = ArmBackend::new();
1353        // dispatch(sel): br_table over 3 blocks, each storing a marker to mem[0].
1354        let ops = vec![
1355            WasmOp::Block,
1356            WasmOp::Block,
1357            WasmOp::Block,
1358            WasmOp::LocalGet(0),
1359            WasmOp::BrTable {
1360                targets: vec![0, 1, 2],
1361                default: 2,
1362            },
1363            WasmOp::End,
1364            WasmOp::I32Const(0),
1365            WasmOp::I32Const(10),
1366            WasmOp::I32Store {
1367                offset: 0,
1368                align: 2,
1369            },
1370            WasmOp::Return,
1371            WasmOp::End,
1372            WasmOp::I32Const(0),
1373            WasmOp::I32Const(20),
1374            WasmOp::I32Store {
1375                offset: 0,
1376                align: 2,
1377            },
1378            WasmOp::Return,
1379            WasmOp::End,
1380            WasmOp::I32Const(0),
1381            WasmOp::I32Const(30),
1382            WasmOp::I32Store {
1383                offset: 0,
1384                align: 2,
1385            },
1386        ];
1387        let opt = CompileConfig {
1388            target: TargetSpec::cortex_m4(),
1389            ..CompileConfig::default()
1390        };
1391        let direct = CompileConfig {
1392            target: TargetSpec::cortex_m4(),
1393            no_optimize: true,
1394            ..CompileConfig::default()
1395        };
1396        let a = backend
1397            .compile_function("dispatch", &ops, &opt)
1398            .expect("optimized-default must compile br_table (via decline)");
1399        let b = backend
1400            .compile_function("dispatch", &ops, &direct)
1401            .expect("direct must compile br_table");
1402        assert_eq!(
1403            a.code, b.code,
1404            "#507: optimized-default br_table output must be byte-identical to the \
1405             direct selector (i.e. declined to direct), not a dropped dispatch"
1406        );
1407    }
1408
1409    /// Issue #94: end-to-end byte-size check for the canonical u64-packed
1410    /// FFI-return hi32 extract pattern. Compiles two near-identical
1411    /// functions — one with the optimized shift-by-32, one with a generic
1412    /// shift-by-7 — and asserts the optimized form is meaningfully smaller.
1413    #[test]
1414    fn test_issue94_hi32_extract_is_smaller_than_generic_shift() {
1415        let backend = ArmBackend::new();
1416        let config = CompileConfig {
1417            target: TargetSpec::cortex_m4f(),
1418            ..CompileConfig::default()
1419        };
1420
1421        // Optimized path: `(local.get 0) >>> 32; wrap_i64`
1422        let ops_hi32 = vec![
1423            WasmOp::LocalGet(0), // i64 param in R0:R1
1424            WasmOp::I64Const(32),
1425            WasmOp::I64ShrU,
1426            WasmOp::I32WrapI64,
1427        ];
1428        let func_hi32 = backend
1429            .compile_function("hi32_extract", &ops_hi32, &config)
1430            .unwrap();
1431
1432        // Generic path: `(local.get 0) >>> 7; wrap_i64` — same shape, but the
1433        // shift amount is not a multiple of 32, so it falls through to the
1434        // 38-byte runtime shift.
1435        let ops_generic = vec![
1436            WasmOp::LocalGet(0),
1437            WasmOp::I64Const(7),
1438            WasmOp::I64ShrU,
1439            WasmOp::I32WrapI64,
1440        ];
1441        let func_generic = backend
1442            .compile_function("generic_shr", &ops_generic, &config)
1443            .unwrap();
1444
1445        let bytes_hi32 = func_hi32.code.len();
1446        let bytes_generic = func_generic.code.len();
1447        println!(
1448            "\n[issue #94] hi32 extract: {} bytes (vs generic shift: {} bytes; saved {})",
1449            bytes_hi32,
1450            bytes_generic,
1451            bytes_generic.saturating_sub(bytes_hi32)
1452        );
1453        let hex: String = func_hi32
1454            .code
1455            .iter()
1456            .map(|b| format!("{:02x}", b))
1457            .collect::<Vec<_>>()
1458            .join(" ");
1459        println!("[issue #94] hi32 bytes: {}", hex);
1460        // We expect the optimized form to be at least 30 bytes smaller than
1461        // the generic 64-bit shift sequence. (Empirically: 14 vs 50 bytes.)
1462        assert!(
1463            bytes_hi32 + 30 <= bytes_generic,
1464            "issue #94: hi32 extract = {} bytes, generic shift = {} bytes; \
1465             expected optimized form to be at least 30 bytes smaller",
1466            bytes_hi32,
1467            bytes_generic,
1468        );
1469    }
1470}