Skip to main content

synth_backend/
arm_backend.rs

1//! ARM Backend — wraps the instruction selector + optimizer + encoder as a Backend
2//!
3//! This is Synth's custom ARM compiler targeting Cortex-M (Thumb-2).
4//! It's the only backend that supports per-rule formal verification (ASIL D path).
5
6use crate::ArmEncoder;
7use synth_core::backend::{
8    Backend, BackendCapabilities, BackendError, CodeRelocation, CompilationResult, CompileConfig,
9    CompiledFunction, LineMap, SafetyBounds,
10};
11use synth_core::target::{IsaVariant, TargetSpec};
12use synth_core::wasm_decoder::DecodedModule;
13use synth_core::wasm_op::WasmOp;
14use synth_synthesis::{
15    ArmInstruction, ArmOp, BoundsCheckConfig, InstructionSelector, OptimizationConfig,
16    OptimizerBridge, RuleDatabase, validate_instructions,
17};
18
/// ARM Cortex-M backend using Synth's custom compiler pipeline.
///
/// A stateless unit struct: all per-compilation state arrives through the
/// arguments of the `Backend` trait methods, so the value is trivially
/// copyable, and `new()` and `default()` yield the same value.
#[derive(Debug, Clone, Copy, Default)]
pub struct ArmBackend;

impl ArmBackend {
    /// Creates the backend handle. There is nothing to configure, so this is
    /// usable in `const` contexts and never fails.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
33
34impl Backend for ArmBackend {
35    fn name(&self) -> &str {
36        "arm"
37    }
38
39    fn capabilities(&self) -> BackendCapabilities {
40        BackendCapabilities {
41            produces_elf: false,
42            supports_rule_verification: true,
43            supports_binary_verification: true,
44            is_external: false,
45        }
46    }
47
48    fn supported_targets(&self) -> Vec<TargetSpec> {
49        vec![
50            TargetSpec::cortex_m3(),
51            TargetSpec::cortex_m4(),
52            TargetSpec::cortex_m4f(),
53            TargetSpec::cortex_m7(),
54            TargetSpec::cortex_m7dp(),
55        ]
56    }
57
58    fn compile_module(
59        &self,
60        module: &DecodedModule,
61        config: &CompileConfig,
62    ) -> Result<CompilationResult, BackendError> {
63        let exports: Vec<_> = module
64            .functions
65            .iter()
66            .filter(|f| f.export_name.is_some())
67            .collect();
68
69        if exports.is_empty() {
70            return Err(BackendError::CompilationFailed(
71                "no exported functions found".into(),
72            ));
73        }
74
75        let mut functions = Vec::new();
76        for func in &exports {
77            let name = func.export_name.clone().unwrap();
78            // #359: copy THIS function's declared param widths into the config so
79            // `compile_function` (which carries no function index) can refuse a
80            // 64-bit param on the AAPCS stack-argument path. Cheap clone only when
81            // a signature table is present and this function has a width entry —
82            // otherwise reuse the shared config (every existing module unchanged).
83            let func_config = match config.func_params_i64.get(func.index as usize) {
84                Some(p) if !p.is_empty() => Some(CompileConfig {
85                    current_func_params_i64: p.clone(),
86                    ..config.clone()
87                }),
88                _ => None,
89            };
90            let cfg = func_config.as_ref().unwrap_or(config);
91            let compiled = self.compile_function(&name, &func.ops, cfg)?;
92            functions.push(compiled);
93        }
94
95        Ok(CompilationResult {
96            functions,
97            elf: None,
98            backend_name: self.name().to_string(),
99        })
100    }
101
102    fn compile_function(
103        &self,
104        name: &str,
105        ops: &[WasmOp],
106        config: &CompileConfig,
107    ) -> Result<CompiledFunction, BackendError> {
108        let (code, relocations, line_map) =
109            compile_wasm_to_arm(ops, config).map_err(BackendError::CompilationFailed)?;
110
111        Ok(CompiledFunction {
112            name: name.to_string(),
113            code,
114            wasm_ops: ops.to_vec(),
115            relocations,
116            line_map,
117        })
118    }
119
120    fn is_available(&self) -> bool {
121        true // Always available — it's a library backend
122    }
123}
124
125/// Count the number of function parameters by analyzing LocalGet patterns
126fn count_params(wasm_ops: &[WasmOp]) -> u32 {
127    let mut first_access: std::collections::HashMap<u32, bool> = std::collections::HashMap::new();
128    for op in wasm_ops {
129        match op {
130            WasmOp::LocalGet(idx) => {
131                first_access.entry(*idx).or_insert(true);
132            }
133            WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => {
134                first_access.entry(*idx).or_insert(false);
135            }
136            _ => {}
137        }
138    }
139
140    first_access
141        .iter()
142        .filter_map(
143            |(&idx, &is_read_first)| {
144                if is_read_first { Some(idx + 1) } else { None }
145            },
146        )
147        .max()
148        .unwrap_or(0)
149}
150
/// Core compilation: WASM ops → ARM machine code bytes + relocations + line map
///
/// Pipeline, in order: instruction selection (optimized bridge, or the direct
/// selector with its exhaustion retries) → optional const-CSE → range
/// re-allocation + callee-saved-save shrinking → optional shadow-allocation
/// logging → cmp→select fusion → optional stack-reload forwarding → ISA
/// validation → label-branch resolution (Thumb-2 only) → encoding → literal
/// pool emission.
///
/// Returns `(code_bytes, relocations, line_map)`:
/// - `relocations` holds one entry for every `BL` (imports AND internal
///   `func_N` calls), for each symbol-relative `MovwSym`/`MovtSym`, and for
///   each literal-pool word appended for an `LdrSym`.
/// - `line_map` pairs the byte offset of each encoded instruction with that
///   instruction's `source_line`.
///
/// # Errors
/// Returns a `String` message when instruction selection fails (after the
/// retries below), when ISA validation rejects an instruction, when branch
/// resolution or encoding fails, when an `LdrSym` appears on a non-Thumb-2
/// target, or when a literal-pool word is out of `LDR` (literal) range.
fn compile_wasm_to_arm(
    wasm_ops: &[WasmOp],
    config: &CompileConfig,
) -> Result<(Vec<u8>, Vec<CodeRelocation>, LineMap), String> {
    // The parameter count is inferred from the op stream (see `count_params`),
    // not taken from a declared signature.
    let num_params = count_params(wasm_ops);

    let bounds_config = match config.effective_safety_bounds() {
        SafetyBounds::None => BoundsCheckConfig::None,
        SafetyBounds::Mpu => BoundsCheckConfig::Mpu,
        SafetyBounds::Software => BoundsCheckConfig::Software,
        SafetyBounds::Mask => BoundsCheckConfig::Masking,
    };

    // The non-optimized (direct) instruction-selection path. Handles f32 via
    // VFP/FPU. Used directly when `--no-optimize` is set, and as the fallback
    // when the optimized path declines a module (see issue #120 below).
    //
    // VCR-RA-001 step 3b-lite (#242): a FRESH selector per attempt, with
    // `spill_on_exhaustion` set only on the retry — the first pass is the
    // unmodified default, so every function that compiles today is selected by
    // exactly the code that compiled it yesterday (bit-identity is structural,
    // not behavioural).
    let select_direct_attempt = |spill_on_exhaustion: bool,
                                 param_backing_on_exhaustion: bool|
     -> Result<Vec<ArmInstruction>, synth_core::Error> {
        let db = RuleDatabase::with_standard_rules();
        let mut selector =
            InstructionSelector::with_bounds_check(db.rules().to_vec(), bounds_config);
        selector.set_target(config.target.fpu, &config.target.triple);
        if config.num_imports > 0 {
            selector.set_num_imports(config.num_imports);
        }
        // #195: plumb the callee argument-count tables so the direct selector can
        // marshal call arguments into R0–R3 per AAPCS.
        selector.set_func_arg_counts(
            config.func_arg_counts.clone(),
            config.type_arg_counts.clone(),
        );
        // #197: in relocatable host-link mode, emit direct `func_N` BLs for
        // imports (rewritten to the wasm field name by build_relocatable_elf)
        // instead of `__meld_dispatch_import`.
        selector.set_relocatable(config.relocatable);
        // #237: native-pointer ABI — wasm statics become __synth_wasm_data-relative.
        selector.set_native_pointer_abi(config.native_pointer_abi, config.linear_memory_bytes);
        // #311: i64 call results are register PAIRS — tag them.
        selector.set_result_types(config.func_ret_i64.clone(), config.type_ret_i64.clone());
        // #359: declared param widths of THIS function, so the AAPCS stack-arg
        // path can refuse 64-bit params (Ok-or-Err). Empty ⇒ assume i32.
        selector.set_params_i64(config.current_func_params_i64.clone());
        // Stack-pointer promotion is meaningful only under the native-pointer ABI;
        // gating here keeps every non-native compile (all frozen fixtures) on the
        // legacy R9 globals-table path, bit-identical.
        if config.native_pointer_abi
            && let Some((sp_idx, sp_init)) = config.stack_pointer_global
        {
            selector.set_native_pointer_stack(sp_idx, sp_init);
        }
        selector.set_spill_on_exhaustion(spill_on_exhaustion);
        selector.set_param_backing_on_exhaustion(param_backing_on_exhaustion);
        // VCR-RA local promotion (#390, #242): keep eligible non-param i32 locals
        // in callee-saved registers instead of frame slots — the structural lever
        // toward native parity. DEFAULT-ON as of v0.14.0: gale's G474RE DWT gate
        // cleared it as a net win (gust_mix dissolved 58→50 cyc/call −14%, all 5
        // stack spill/reloads eliminated, correctness bit-identical over [0,2047],
        // 2.00×→1.72× vs LLVM). Escape hatch: `SYNTH_NO_LOCAL_PROMOTE=1` restores
        // the frame-slot path. Leaf-only / i32-only / ARM-only (see
        // compute_local_promotion); the leaf-only lift + i64 locals are follow-ons.
        //
        // NOTE: the check is presence-based — ANY value of the variable
        // (including `0` or the empty string) disables promotion.
        selector.set_local_promote(std::env::var("SYNTH_NO_LOCAL_PROMOTE").is_err());
        selector.select_with_stack(wasm_ops, num_params)
    };
    let select_direct = || -> Result<Vec<ArmInstruction>, String> {
        // The two recoverable exhaustion classes. NOT retried: the i64
        // spill-slot-pool Err ("spill-slot pool exhausted") — the honest
        // remaining bound of the 3b-lite allocator.
        //
        // Both classes are recognized by substring match on the error's
        // `Display` text, so these constants must stay in sync with the
        // selector's messages.
        const SINGLE_EXHAUSTION: &str = "all allocatable registers are live on the stack";
        const PAIR_EXHAUSTION: &str = "no consecutive pair of free registers for i64";
        let mut attempt = select_direct_attempt(false, false);
        // VCR-RA-001 step 3b-lite (#242): the i32 register-exhaustion
        // hard-fail is recoverable — retry with spill-on-exhaustion, which
        // reserves the spill area and spills the deepest stack value when the
        // pool is full. Only functions that FAILED the first pass ever reach
        // this, so existing output is untouched by construction.
        if let Err(e) = &attempt
            && e.to_string().contains(SINGLE_EXHAUSTION)
        {
            attempt = select_direct_attempt(true, false);
        }
        // VCR-RA-001 acceptance increment (#242): the i64 consecutive-PAIR
        // exhaustion is recoverable too — but not by stack spilling (the pair
        // allocator already spills stack values, #171): the blockers are the
        // pinned param home registers. The final retry frame-backs the params
        // (#204 machinery) so they stop pinning R0-R3, with spill-on-exhaustion
        // kept on for the single-register pressure the reloads add. Reached
        // only by functions that failed every earlier pass.
        //
        // This check inspects whatever `attempt` currently holds, so it fires
        // both when the first pass and when the spill retry above ends in a
        // pair-exhaustion error.
        if let Err(e) = &attempt
            && e.to_string().contains(PAIR_EXHAUSTION)
        {
            attempt = select_direct_attempt(true, true);
        }
        attempt.map_err(|e| format!("instruction selection failed: {}", e))
    };

    // Instruction selection: optimized or direct.
    //
    // #197: `--relocatable` (host-link ET_REL) forces the direct selector. The
    // optimized path materializes an absolute linmem base (0x20000100) and does
    // not preserve caller-saved registers across calls — both wrong for a
    // host-linked object, where the linmem base arrives via `fp` at runtime and
    // callees follow AAPCS. `select_with_stack` (now i64-spill capable after
    // #171) handles fp-relative memory + caller-saved preservation correctly.
    let arm_instrs = if config.no_optimize || config.relocatable {
        select_direct()?
    } else {
        let opt_config = if config.loom_compat {
            OptimizationConfig::loom_compat()
        } else {
            OptimizationConfig::all()
        };

        let mut bridge = OptimizerBridge::with_config(opt_config);
        // #188: tell the bridge how many imports there are so it declines only
        // LOCAL calls (and leaves import calls on the optimized path, keeping
        // the #173 field-name relocation rewrite intact).
        bridge.set_num_imports(config.num_imports);
        // `ir_to_arm` now returns `Result` — an `Err` means the optimized path
        // hit an unmapped vreg (issue-#93-class). Treat it identically to an
        // `optimize_full` failure: fall back to the direct selector rather
        // than propagating, so the function still compiles correctly.
        match bridge
            .optimize_full(wasm_ops)
            .and_then(|(opt_ir, _cfg, _stats)| bridge.ir_to_arm(&opt_ir, num_params as usize))
        {
            // Instructions from the optimized path carry no source line, so
            // their `line_map` entries are `None`.
            Ok(arm_ops) => arm_ops
                .into_iter()
                .map(|op| ArmInstruction {
                    op,
                    source_line: None,
                })
                .collect(),
            // Issue #120: the optimized path declines modules it cannot lower
            // (notably scalar f32/f64 ops — the IR has no float opcodes). Fall
            // back to the direct instruction selector, which handles f32 via
            // VFP/FPU. This is honest degradation: the function still compiles
            // correctly, just without IR-level optimization.
            Err(_) => select_direct()?,
        }
    };

    // #257/#277: `mul`+`add`→`mla` fusion is intentionally NOT wired here.
    // The transform is correct and ready (`synth_synthesis::liveness::fuse_mul_add`,
    // fully tested), but it is **register-allocation-coupled**: over the current
    // greedy single-pass selector, folding `mul rM,..; add rD,rM,rX` → `mla`
    // extends the live ranges of the mul inputs to the mla point, and the added
    // pressure (extra moves/spills) costs more than the single-cycle MLA saves —
    // gale measured a +2 cyc on-target REGRESSION (flat_flight 255→257, G474RE)
    // even though it removes 2 instructions and the seam stays 0x07FDF307. So the
    // fusion stays unwired until the spill-aware allocator (VCR-RA-001) chooses
    // registers, at which point it becomes net-positive (per #272's plan and the
    // wiring design note). Lesson (#277): a register-pressure-affecting transform
    // needs an on-target/allocator-aware gate, not a byte-count gate, before it
    // can default on.

    // VCR-RA-001 const-CSE / rematerialization-avoidance (#209), the first
    // allocator-analysis-driven CODEGEN change. Drops `movw` re-materializations
    // of a constant already resident in another register and retargets the reads
    // — every rewrite proven by the liveness analysis, and it ONLY removes
    // materializations (pressure never rises), so unlike the mla fusion (#277) it
    // cannot regress on-target. Runs on the selected stream before branch
    // resolution (it removes instructions, shifting byte offsets). Behind
    // `SYNTH_CONST_CSE=1` while it is validated against the differential oracle +
    // gale's five on-target baselines; off by default keeps every fixture
    // bit-identical.
    //
    // NOTE: presence-based — any value of `SYNTH_CONST_CSE` enables the pass.
    let arm_instrs = if std::env::var("SYNTH_CONST_CSE").is_ok() {
        synth_synthesis::liveness::apply_const_cse(&arm_instrs).0
    } else {
        arm_instrs
    };

    // VCR-RA-001 RANGE RE-ALLOCATION (#209/#242, wiring step 3a) — the first
    // CONSEQUENTIAL allocator pass: re-colour each maximal straight-line
    // segment over the R0-R8 pool with value ranges as the allocation unit
    // (segment inputs + per-register live-outs pinned to their original
    // registers, reserved R9-R12/SP identity-assigned — each segment is
    // independently sound, no cross-segment liveness assumed). Renames
    // registers only: never adds, removes, or reorders instructions, so
    // labels/branch offsets are unaffected.
    //
    // DEFAULT-ON since v0.11.36: gale cleared the gate on-target (G474RE,
    // #209 2026-06-10) — flag-on output byte-identical to flag-off on
    // flat_flight/controller/control_step, fires on the filter family with
    // zero cycle delta and a small size win, all selfchecks green on silicon.
    // Opt out with `SYNTH_RANGE_REALLOC=0`; per-function stats with
    // `SYNTH_REALLOC_STATS=1`.
    //
    // The companion dead callee-saved-save elimination (gale's "next
    // consequential lever", same issue comment) then shrinks the prologue
    // `push {r4-r8,lr}` / epilogue `pop {r4-r8,pc}` to the callee-saved
    // registers the re-allocated body still touches (leaf-only,
    // SP-untouched, even-count-padded — see shrink_callee_saved_saves):
    // ~12 cycles of pure save/restore overhead removed on small leaves.
    //
    // Unlike the presence-based switches in this function, this one is
    // value-based: only the exact string "0" disables the pass; an unset or
    // unreadable variable, or any other value, leaves it on.
    let realloc_on = std::env::var("SYNTH_RANGE_REALLOC").map_or(true, |v| v != "0");
    let arm_instrs = if realloc_on {
        use synth_synthesis::rules::Reg;
        const POOL: [Reg; 9] = [
            Reg::R0,
            Reg::R1,
            Reg::R2,
            Reg::R3,
            Reg::R4,
            Reg::R5,
            Reg::R6,
            Reg::R7,
            Reg::R8,
        ];
        let (out, stats) = synth_synthesis::liveness::reallocate_function(&arm_instrs, &POOL);
        if std::env::var("SYNTH_REALLOC_STATS").is_ok() {
            eprintln!(
                "[range-realloc] {} segments: {} reallocated, {} declined ({} validator-rejected), {} need spill (step 4)",
                stats.segments,
                stats.reallocated,
                stats.declined,
                stats.validator_rejects,
                stats.needs_spill
            );
        }
        // When the shrink pass yields nothing, keep the re-allocated stream.
        synth_synthesis::liveness::shrink_callee_saved_saves(&out).unwrap_or(out)
    } else {
        arm_instrs
    };

    // VCR-RA-001 SHADOW ALLOCATION (#209/#242): run the register allocator on
    // the selected stream and LOG what it finds — without changing a single
    // emitted byte. This is the measure-only bridge between the built analysis
    // layer and the eventual virtual-register wiring: it shows, per real
    // function, whether the allocator can colour it within the R0–R8 pool and
    // how much const-CSE / rematerialization headroom exists (#209). Enable with
    // `SYNTH_SHADOW_ALLOC=1`; off by default and side-effect-free either way.
    if std::env::var("SYNTH_SHADOW_ALLOC").is_ok() {
        use synth_synthesis::liveness::{
            AllocationOutcome, allocate_function, function_peak_pressure,
        };
        // R9 globals / R10 mem-size / R11 mem-base / R12 IP-scratch are reserved;
        // pin them above the 0..9 allocatable pool so the colourer keeps R0–R8.
        let precolored = std::collections::BTreeMap::from([
            (synth_synthesis::rules::Reg::R9, 9usize),
            (synth_synthesis::rules::Reg::R10, 10),
            (synth_synthesis::rules::Reg::R11, 11),
            (synth_synthesis::rules::Reg::R12, 12),
        ]);
        // True VALUE pressure (one node per value, not per reused physical reg):
        // a NeedsSpill with peak ≤ 9 is a SPURIOUS physical-register spill — the
        // function fits once virtually allocated.
        let peak = function_peak_pressure(&arm_instrs);
        match allocate_function(&arm_instrs, 9, &precolored) {
            AllocationOutcome::Allocated {
                remat_opportunities,
                coloring,
            } => eprintln!(
                "[shadow-alloc] OK: {} pregs coloured within R0-R8 pool, peak value-pressure {}, {} const-CSE/remat opportunities",
                coloring.len(),
                peak,
                remat_opportunities
            ),
            // NOTE(review): the message prints the "≤9 ⇒ spurious" hint
            // unconditionally; compare the printed peak against 9 when reading it.
            AllocationOutcome::NeedsSpill(s) => eprintln!(
                "[shadow-alloc] physical-graph would spill {:?}, but peak value-pressure is {} (≤9 ⇒ spurious; fits once virtually allocated)",
                s, peak
            ),
            AllocationOutcome::Declined => {
                eprintln!(
                    "[shadow-alloc] declined (unmodeled construct — calls/i64/fp/offset-branch)"
                )
            }
        }
    }

    // VCR-SEL-004 cmp→select → IT-block predication fusion (#242). The selector
    // lowers a `select` whose condition is a comparison to a *materialize then
    // re-test* sequence (`cmp a,b; SetCond D,c; cmp D,#0; movne dst,v1; moveq
    // dst,v2`); this collapses it onto the comparison's own flags — deleting the
    // `SetCond` and the `cmp D,#0` and retargeting the predicated moves to `c` /
    // `invert(c)` — yielding the textbook predicated clamp (`cmp a,b; movc dst,v1;
    // mov{!c} dst,v2`). −2 instructions per fused select. gale #428 measured this
    // as the #1 hot-path size/cycle lever on the gust_mix clamp chain.
    //
    // Run LATE: after range re-allocation (so the dead-D proof sees final register
    // identities) and before encode. Removal-only + rename-only ⇒ no spill
    // regression and labels/branch offsets are unaffected. Each fusion is proven
    // sound (flags reused only when nothing clobbers them in the window; the
    // boolean deleted only when provably dead) — see `fuse_cmp_select`.
    //
    // DEFAULT-ON as of v0.13.0 (#428): cmp→select fusion ships by default. The
    // byte-changing flip is validated by (a) the unicorn execution oracle that runs
    // the two-move `mov{invert(c)}` arm (cmp_select_two_move_differential.py), (b)
    // gale's gale_decider_diff 10,596-case sweep across all 8 verified primitives
    // (native ≡ flag-off ≡ flag-on = 0x88e73178d232bcf5), and (c) the named-anchor
    // differentials re-run with fusion ON — control_step still 0x00210A55, flat+
    // inlined flight_algo still 0x07FDF307 (results preserved; bytes deliberately
    // changed, re-frozen on this commit). Escape hatch: `SYNTH_NO_CMP_SELECT_FUSE=1`
    // reverts to the pre-fusion lowering. The on-silicon G474RE DWT no-regression
    // check is a tracked post-ship follow-up (gale owns it).
    let arm_instrs = if std::env::var("SYNTH_NO_CMP_SELECT_FUSE").is_err() {
        // The rewritten stream is identical to `fuse_cmp_select`'s 2-tuple form;
        // the extra `two_move` count is diagnostic only (the fusion census /
        // blast-radius datum — #7 made that arm reachable).
        let (out, fused, two_move) =
            synth_synthesis::liveness::fuse_cmp_select_with_stats(&arm_instrs);
        if std::env::var("SYNTH_FUSE_STATS").is_ok() {
            let in_place = fused - two_move;
            eprintln!(
                "[cmp-select-fuse] {fused} select(s) fused to predicated moves \
                 ({two_move} two-move, {in_place} in-place)"
            );
        }
        out
    } else {
        arm_instrs
    };

    // Perf lever 1 toward native parity (#390): redundant stack-reload elimination.
    // synth lowers every wasm local to a frame slot, so `local.set; local.get` emits
    // `str rX,[sp,#N]; … ; ldr rY,[sp,#N]`; when rX still holds the value the reload
    // (a ~2-cycle M4 load) becomes `mov rY,rX`. Removal-of-a-load + rename only ⇒ no
    // new instruction form and no label/offset change. BEHIND `SYNTH_STACK_FWD=1`
    // (opt-in, off by default ⇒ bit-identical) while it is validated against the
    // execution differential + gale's G474RE bench — the same gated path the
    // cmp→select flip took before shipping default-on in v0.13.0.
    let arm_instrs = if std::env::var("SYNTH_STACK_FWD").is_ok() {
        let (out, fwd) = synth_synthesis::liveness::forward_stack_reloads(&arm_instrs);
        // Shares the `SYNTH_FUSE_STATS` switch with the cmp→select stats above.
        if std::env::var("SYNTH_FUSE_STATS").is_ok() {
            eprintln!("[stack-fwd] {fwd} stack reload(s) forwarded to register moves");
        }
        out
    } else {
        arm_instrs
    };

    // ISA feature gate: validate that all generated instructions are supported
    // by the target. This catches FPU instructions on no-FPU targets, double-precision
    // instructions on single-precision targets, etc.
    validate_instructions(&arm_instrs, config.target.fpu, &config.target.triple)
        .map_err(|e| format!("ISA validation failed: {}", e))?;

    // Encode to binary — use Thumb-2 for Cortex-M targets
    let use_thumb2 = matches!(config.target.isa, IsaVariant::Thumb2 | IsaVariant::Thumb);

    let encoder = if use_thumb2 {
        ArmEncoder::new_thumb2_with_fpu(config.target.fpu)
    } else {
        ArmEncoder::new_arm32()
    };

    // #202: resolve local label branches (Bcc/B/Bhs/Blo) to byte-accurate
    // offsets before encoding. `select_with_stack` emits them as label
    // placeholders and never resolves them — without this they encode as
    // `bne.n #0` and land mid-instruction whenever a 32-bit Thumb-2 instruction
    // sits between the branch and its target (UsageFault on real hardware).
    // Only meaningful for Thumb-2 (the offset units are halfword/PC+4).
    let arm_instrs = if use_thumb2 {
        resolve_label_branches(arm_instrs, &encoder)?
    } else {
        arm_instrs
    };

    let mut code = Vec::new();
    let mut relocations = Vec::new();

    // #345: literal-pool address loads. Each `LdrSym` was encoded as a placeholder
    // `LDR.W rd,[pc,#0]`; record where its instruction sits and what it loads so
    // we can append a pooled word (carrying the symbol address via R_ARM_ABS32)
    // and patch the PC-relative offset once the pool position is known.
    struct PendingLiteral {
        ldr_offset: u32,
        symbol: String,
        addend: i32,
    }
    let mut pending_literals: Vec<PendingLiteral> = Vec::new();

    // VCR-DBG-001: per-instruction source map for DWARF `.debug_line`. Captured
    // here because `code.len()` immediately before `encode()` is the final
    // machine offset of the instruction within this function's `.text` — nothing
    // after the loop shifts earlier instructions (the literal pool is appended at
    // the end; the LDR patch below is in-place/length-preserving). Purely
    // additive: it does not touch `code`, so `.text` is byte-identical.
    let mut line_map: LineMap = Vec::new();

    for instr in &arm_instrs {
        // Record a relocation for every BL: the encoder emits `bl #0` and
        // relies on a relocation to patch the target. This covers BOTH import
        // dispatch stubs (`__meld_*`, undefined externals) AND internal calls
        // (`func_N`, defined in this object). Previously only `__meld_*` was
        // recorded, so internal `BL func_N` calls were left as unpatched
        // `bl #0` placeholders branching to a garbage address (#167).
        if let ArmOp::Bl { label } = &instr.op {
            relocations.push(CodeRelocation {
                offset: code.len() as u32,
                symbol: label.clone(),
                kind: synth_core::backend::RelocKind::ThmCall,
            });
        }
        // #237: symbol-relative MOVW/MOVT (the `--native-pointer-abi` static-data
        // addressing). The encoder writes the addend in place; record the matching
        // R_ARM_MOVW_ABS_NC / R_ARM_MOVT_ABS so the linker adds the symbol address.
        if let ArmOp::MovwSym { symbol, .. } = &instr.op {
            relocations.push(CodeRelocation {
                offset: code.len() as u32,
                symbol: symbol.clone(),
                kind: synth_core::backend::RelocKind::MovwAbs,
            });
        }
        if let ArmOp::MovtSym { symbol, .. } = &instr.op {
            relocations.push(CodeRelocation {
                offset: code.len() as u32,
                symbol: symbol.clone(),
                kind: synth_core::backend::RelocKind::MovtAbs,
            });
        }
        // #345: defer the literal-pool word + reloc + offset patch to the
        // post-loop pass (the pool address is not yet known).
        if let ArmOp::LdrSym { symbol, addend, .. } = &instr.op {
            pending_literals.push(PendingLiteral {
                ldr_offset: code.len() as u32,
                symbol: symbol.clone(),
                addend: *addend,
            });
        }

        // The machine offset of this instruction is the current code length,
        // captured before the bytes are appended.
        line_map.push((code.len() as u32, instr.source_line));

        let encoded = encoder
            .encode(&instr.op)
            .map_err(|e| format!("ARM encoding failed: {}", e))?;
        code.extend_from_slice(&encoded);
    }

    // #345: place the literal pool at the end of this function's `.text`. Gated on
    // there being at least one `LdrSym` — functions without one are byte-identical
    // to before (no trailing padding, so downstream `func_offsets` are unchanged
    // and the frozen differential fixtures stay bit-for-bit equal).
    if !pending_literals.is_empty() {
        if !use_thumb2 {
            return Err("LdrSym literal-pool addressing requires Thumb-2".to_string());
        }
        // 4-byte align the pool start (Thumb-2 word loads require it, and
        // `Align(PC,4)` in the LDR-literal semantics assumes a word-aligned pool).
        while code.len() % 4 != 0 {
            code.push(0x00);
        }
        // One distinct pooled word per LdrSym (no dedup: different sites carry
        // different addends, and the REL addend lives in the word).
        for lit in &pending_literals {
            let word_offset = code.len() as u32;

            // REL semantics: the linker computes `S + A`, where A is the in-place
            // value of the relocated word. Initialize the word to the addend so
            // the final loaded address is `symbol + addend`.
            code.extend_from_slice(&(lit.addend as u32).to_le_bytes());
            relocations.push(CodeRelocation {
                offset: word_offset,
                symbol: lit.symbol.clone(),
                kind: synth_core::backend::RelocKind::Abs32,
            });

            // Patch the placeholder `LDR.W rd,[pc,#imm12]`. Thumb-2 LDR (literal):
            // address = Align(PC,4) + imm12, with PC = ldr_offset + 4. The pool is
            // always after the LDR, so U=1 (already set in hw1 = 0xF8DF).
            let pc = lit.ldr_offset + 4;
            let aligned_pc = pc & !3u32;
            // Cannot underflow: `word_offset` is at or past the end of the
            // 4-byte LDR.W, hence >= `ldr_offset + 4` >= `aligned_pc`.
            let imm12 = word_offset - aligned_pc;
            if imm12 > 0xFFF {
                // Wide LDR-literal range is ±4 KB; these function bodies are far
                // smaller, but fail cleanly rather than miscompile if exceeded.
                return Err(format!(
                    "LdrSym literal pool out of range (#345): imm12={} > 4095 \
                     for symbol {}",
                    imm12, lit.symbol
                ));
            }
            // The second halfword of the LDR.W sits 2 bytes into the
            // instruction, stored little-endian like the rest of `code`.
            let hw2_off = (lit.ldr_offset + 2) as usize;
            let mut hw2 = u16::from_le_bytes([code[hw2_off], code[hw2_off + 1]]);
            hw2 = (hw2 & 0xF000) | (imm12 as u16); // keep Rt, set imm12
            let hw2_bytes = hw2.to_le_bytes();
            code[hw2_off] = hw2_bytes[0];
            code[hw2_off + 1] = hw2_bytes[1];
        }
    }

    Ok((code, relocations, line_map))
}
645
646/// Resolve local label branches to byte-accurate offsets (#202).
647///
648/// `select_with_stack` emits conditional/unconditional branches as label
649/// placeholders (`Bcc`/`B`/`Bhs`/`Blo` + `Label`) and never resolves them; the
650/// encoder then emits a `0xD000`/`0xE000` placeholder with offset 0. Before #197
651/// this path only ran for `--no-optimize`/declined functions, so the latent bug
652/// stayed hidden — routing relocatable code through it surfaced branches that
653/// land mid-instruction (a Cortex-M UsageFault) whenever a 32-bit Thumb-2
654/// instruction sits between the branch and its target.
655///
656/// This pass encodes each instruction to learn its real byte length (so 16- vs
657/// 32-bit forms and multi-instruction expansions are exact), maps each `Label`
658/// to its byte position, and rewrites every label branch to the displacement
659/// the encoder consumes: `(target - branch - 4) / 2` halfwords. A bounded
660/// fixed-point handles an offset growing a branch from 16- to 32-bit (which
661/// shifts later positions). `BCondOffset`/`BOffset` already produced inline by
662/// the optimized path carry no label and are left untouched.
663fn resolve_label_branches(
664    arm_instrs: Vec<ArmInstruction>,
665    encoder: &ArmEncoder,
666) -> Result<Vec<ArmInstruction>, String> {
667    use std::collections::HashMap;
668    use synth_synthesis::Condition;
669
670    enum BKind {
671        Cond(Condition),
672        Uncond,
673    }
674    // Record each label branch ONCE — indices are stable across iterations.
675    let mut branches: Vec<(usize, BKind, String)> = Vec::new();
676    for (i, instr) in arm_instrs.iter().enumerate() {
677        match &instr.op {
678            ArmOp::Bcc { cond, label } => branches.push((i, BKind::Cond(*cond), label.clone())),
679            ArmOp::Bhs { label } => branches.push((i, BKind::Cond(Condition::HS), label.clone())),
680            ArmOp::Blo { label } => branches.push((i, BKind::Cond(Condition::LO), label.clone())),
681            ArmOp::B { label } => branches.push((i, BKind::Uncond, label.clone())),
682            _ => {}
683        }
684    }
685    if branches.is_empty() {
686        return Ok(arm_instrs);
687    }
688
689    let mut resolved = arm_instrs;
690    // Sizes only grow (16→32-bit), so this converges quickly; cap for safety.
691    for _ in 0..16 {
692        // 1. Byte position of each instruction (Label encodes to 0 bytes).
693        let mut positions = Vec::with_capacity(resolved.len());
694        let mut pos: i64 = 0;
695        for instr in &resolved {
696            positions.push(pos);
697            pos += encoder
698                .encode(&instr.op)
699                .map_err(|e| format!("branch-resolve size probe failed: {}", e))?
700                .len() as i64;
701        }
702        // 2. Label name -> byte position (owned keys so the borrow ends here).
703        let mut labels: HashMap<String, i64> = HashMap::new();
704        for (i, instr) in resolved.iter().enumerate() {
705            if let ArmOp::Label { name } = &instr.op {
706                labels.insert(name.clone(), positions[i]);
707            }
708        }
709        // 3. Rewrite each branch to its byte-accurate offset.
710        let mut changed = false;
711        for (idx, kind, label) in &branches {
712            // A label not defined locally is an EXTERNAL target (e.g.
713            // `Trap_Handler` resolved by a relocation / the vector table). Leave
714            // such branches as their placeholder for the existing relocation
715            // path — only local control-flow labels are byte-resolved here.
716            let Some(&target) = labels.get(label) else {
717                continue;
718            };
719            // Encoder consumes the field as (target - branch - 4) / 2 halfwords.
720            // Positions are always even, so this division is exact.
721            let halfword_offset = ((target - positions[*idx] - 4) / 2) as i32;
722            let new_op = match kind {
723                BKind::Cond(c) => ArmOp::BCondOffset {
724                    cond: *c,
725                    offset: halfword_offset,
726                },
727                BKind::Uncond => ArmOp::BOffset {
728                    offset: halfword_offset,
729                },
730            };
731            if resolved[*idx].op != new_op {
732                resolved[*idx].op = new_op;
733                changed = true;
734            }
735        }
736        if !changed {
737            break;
738        }
739    }
740    Ok(resolved)
741}
742
743#[cfg(test)]
744mod tests {
745    use super::*;
746
747    #[test]
748    fn test_arm_backend_name() {
749        let backend = ArmBackend::new();
750        assert_eq!(backend.name(), "arm");
751        assert!(backend.is_available());
752    }
753
754    #[test]
755    fn test_arm_backend_capabilities() {
756        let backend = ArmBackend::new();
757        let caps = backend.capabilities();
758        assert!(!caps.produces_elf);
759        assert!(caps.supports_rule_verification);
760        assert!(!caps.is_external);
761    }
762
763    #[test]
764    fn test_compile_add_function() {
765        let backend = ArmBackend::new();
766        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
767        let config = CompileConfig::default();
768
769        let result = backend.compile_function("add", &ops, &config);
770        assert!(result.is_ok());
771
772        let func = result.unwrap();
773        assert_eq!(func.name, "add");
774        assert!(!func.code.is_empty());
775        assert_eq!(func.wasm_ops, ops);
776    }
777
778    /// VCR-DBG-001: the per-instruction source map must cover the function with
779    /// monotonic, in-bounds machine offsets, and must not perturb the emitted
780    /// code (it is captured at encode time, never serialized here).
781    #[test]
782    fn test_line_map_is_wellformed_dbg001() {
783        let backend = ArmBackend::new();
784        let ops = vec![
785            WasmOp::LocalGet(0),
786            WasmOp::LocalGet(1),
787            WasmOp::I32Add,
788            WasmOp::End,
789        ];
790        let config = CompileConfig::default();
791        let func = backend.compile_function("add", &ops, &config).unwrap();
792
793        // Non-empty, and the first instruction starts at machine offset 0.
794        assert!(
795            !func.line_map.is_empty(),
796            "a non-trivial function captures a source map"
797        );
798        assert_eq!(func.line_map[0].0, 0, "first instruction at offset 0");
799
800        // Offsets strictly increase by at least one ARM/Thumb instruction (>= 2
801        // bytes) and every mapped offset lies inside the emitted `.text`.
802        for w in func.line_map.windows(2) {
803            assert!(w[1].0 > w[0].0, "instruction offsets strictly increase");
804            assert!(
805                w[1].0 - w[0].0 >= 2,
806                "each ARM/Thumb instruction is >= 2 bytes"
807            );
808        }
809        let last = func.line_map.last().unwrap().0 as usize;
810        assert!(
811            last < func.code.len(),
812            "every mapped offset lies inside .text"
813        );
814
815        // The side-table is additive: recompiling is deterministic and the map is
816        // consistent with that exact code (capturing it does not alter output).
817        let again = backend.compile_function("add", &ops, &config).unwrap();
818        assert_eq!(
819            again.code, func.code,
820            "compilation deterministic; map is additive"
821        );
822        assert_eq!(again.line_map, func.line_map);
823    }
824
825    #[test]
826    fn test_count_params() {
827        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
828        assert_eq!(count_params(&ops), 2);
829
830        let no_params = vec![WasmOp::I32Const(5), WasmOp::I32Const(3), WasmOp::I32Add];
831        assert_eq!(count_params(&no_params), 0);
832    }
833
834    #[test]
835    fn test_arm_backend_register() {
836        let mut registry = synth_core::BackendRegistry::new();
837        registry.register(Box::new(ArmBackend::new()));
838        assert!(registry.get("arm").is_some());
839        assert_eq!(registry.available().len(), 1);
840    }
841
842    #[test]
843    fn test_compile_import_call_produces_relocations() {
844        let backend = ArmBackend::new();
845        // Simulate a WASM module where func index 0 is an import.
846        // Call(0) should generate MOV R0, #0; BL __meld_dispatch_import
847        let ops = vec![WasmOp::Call(0)];
848        let config = CompileConfig {
849            num_imports: 1,
850            no_optimize: true, // Direct instruction selection to preserve Call semantics
851            ..CompileConfig::default()
852        };
853
854        let result = backend.compile_function("caller", &ops, &config);
855        assert!(result.is_ok());
856
857        let func = result.unwrap();
858        assert!(!func.code.is_empty());
859        assert_eq!(func.relocations.len(), 1);
860        assert_eq!(func.relocations[0].symbol, "__meld_dispatch_import");
861        // The BL is the second instruction (after MOV R0, #0), so offset should be > 0
862        assert!(func.relocations[0].offset > 0);
863    }
864
865    /// Regression test for #197: in `relocatable` mode, an import call must
866    /// relocate against the direct `func_N` symbol (rewritten to the wasm field
867    /// name by `build_relocatable_elf`), NOT `__meld_dispatch_import`. This is
868    /// the ABI half of the #197 fix — without it, a host linker cannot resolve
869    /// the call to the real kernel symbol (e.g. `k_spin_lock`).
870    #[test]
871    fn test_compile_relocatable_import_uses_direct_func_symbol_197() {
872        let backend = ArmBackend::new();
873        let ops = vec![WasmOp::Call(0)]; // func 0 is an import
874        let config = CompileConfig {
875            num_imports: 1,
876            relocatable: true,
877            ..CompileConfig::default()
878        };
879
880        let func = backend
881            .compile_function("caller", &ops, &config)
882            .expect("relocatable import call compiles");
883
884        assert_eq!(func.relocations.len(), 1);
885        assert_eq!(
886            func.relocations[0].symbol, "func_0",
887            "#197: relocatable import must relocate against func_0 (→ field name), not Meld dispatch"
888        );
889    }
890
891    #[test]
892    fn test_compile_no_imports_no_relocations() {
893        let backend = ArmBackend::new();
894        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
895        let config = CompileConfig::default();
896
897        let func = backend.compile_function("add", &ops, &config).unwrap();
898        assert!(func.relocations.is_empty());
899    }
900
901    /// Regression test for #167: a call to an INTERNAL function
902    /// (index `>= num_imports`) must record a relocation against `func_{index}`.
903    /// Before the fix, only `__meld_*` (import) BLs were relocated, so
904    /// internal `BL func_N` was emitted as an unpatched `bl #0` branching
905    /// to a garbage address — making the object non-linkable. This test
906    /// would have caught that regression.
907    #[test]
908    fn test_compile_internal_call_produces_relocation_167() {
909        let backend = ArmBackend::new();
910        // num_imports = 1, so Call(2) is an INTERNAL call → `BL func_2`.
911        let ops = vec![WasmOp::Call(2)];
912        let config = CompileConfig {
913            num_imports: 1,
914            no_optimize: true,
915            ..CompileConfig::default()
916        };
917
918        let func = backend
919            .compile_function("caller", &ops, &config)
920            .expect("internal call compiles");
921
922        assert_eq!(
923            func.relocations.len(),
924            1,
925            "an internal call must emit exactly one relocation (#167)"
926        );
927        assert_eq!(
928            func.relocations[0].symbol, "func_2",
929            "internal call must relocate against the callee's func_{{index}} symbol (#167)"
930        );
931    }
932
933    // ─── Phase 1 safety-bounds plumbing for ARM ──────────────────────────
934
935    #[test]
936    fn arm_safety_bounds_mpu_emits_same_code_as_none() {
937        // Mpu mode must not introduce any inline check on ARM — the MPU
938        // handles faults via hardware. The encoded bytes for an i32.load
939        // should be identical between None and Mpu.
940        let backend = ArmBackend::new();
941        let ops = vec![
942            WasmOp::LocalGet(0),
943            WasmOp::I32Load {
944                offset: 0,
945                align: 2,
946            },
947        ];
948        let cfg_none = CompileConfig {
949            no_optimize: true,
950            ..Default::default()
951        };
952        let cfg_mpu = CompileConfig {
953            no_optimize: true,
954            safety_bounds: SafetyBounds::Mpu,
955            ..Default::default()
956        };
957        let n = backend.compile_function("ld", &ops, &cfg_none).unwrap();
958        let m = backend.compile_function("ld", &ops, &cfg_mpu).unwrap();
959        assert_eq!(
960            n.code, m.code,
961            "Mpu and None should produce identical ARM bytes (Mpu relies on hardware)"
962        );
963    }
964
965    #[test]
966    fn arm_legacy_bounds_check_still_emits_software_check() {
967        // Legacy CLI users with `--bounds-check` should keep getting the
968        // software path even though the new SafetyBounds field defaults to None.
969        let backend = ArmBackend::new();
970        let ops = vec![
971            WasmOp::LocalGet(0),
972            WasmOp::I32Load {
973                offset: 0,
974                align: 2,
975            },
976        ];
977        let cfg_legacy = CompileConfig {
978            no_optimize: true,
979            bounds_check: true,
980            ..Default::default()
981        };
982        let cfg_software = CompileConfig {
983            no_optimize: true,
984            safety_bounds: SafetyBounds::Software,
985            ..Default::default()
986        };
987        let l = backend.compile_function("ld", &ops, &cfg_legacy).unwrap();
988        let s = backend.compile_function("ld", &ops, &cfg_software).unwrap();
989        assert_eq!(
990            l.code, s.code,
991            "--bounds-check should produce the same bytes as --safety-bounds=software"
992        );
993    }
994
995    // ========================================================================
996    // ISA feature gate tests — ensure the compiler never emits unsupported
997    // instructions for a given target
998    // ========================================================================
999
1000    #[test]
1001    fn test_f32_rejected_on_cortex_m3_no_fpu() {
1002        let backend = ArmBackend::new();
1003        let ops = vec![WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
1004        let config = CompileConfig {
1005            target: TargetSpec::cortex_m3(),
1006            no_optimize: true,
1007            ..CompileConfig::default()
1008        };
1009
1010        let result = backend.compile_function("fadd", &ops, &config);
1011        assert!(
1012            result.is_err(),
1013            "f32 operations should fail on Cortex-M3 (no FPU)"
1014        );
1015    }
1016
1017    #[test]
1018    fn test_f32_accepted_on_cortex_m4f() {
1019        let backend = ArmBackend::new();
1020        let ops = vec![WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
1021        let config = CompileConfig {
1022            target: TargetSpec::cortex_m4f(),
1023            no_optimize: true,
1024            ..CompileConfig::default()
1025        };
1026
1027        let result = backend.compile_function("fadd", &ops, &config);
1028        assert!(
1029            result.is_ok(),
1030            "f32 operations should succeed on Cortex-M4F, got: {:?}",
1031            result.unwrap_err()
1032        );
1033    }
1034
1035    #[test]
1036    fn test_i32_works_on_all_targets() {
1037        let backend = ArmBackend::new();
1038        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
1039
1040        // Cortex-M3 (no FPU)
1041        let config_m3 = CompileConfig {
1042            target: TargetSpec::cortex_m3(),
1043            no_optimize: true,
1044            ..CompileConfig::default()
1045        };
1046        assert!(
1047            backend.compile_function("add", &ops, &config_m3).is_ok(),
1048            "i32 ops should work on Cortex-M3"
1049        );
1050
1051        // Cortex-M4F (single FPU)
1052        let config_m4f = CompileConfig {
1053            target: TargetSpec::cortex_m4f(),
1054            no_optimize: true,
1055            ..CompileConfig::default()
1056        };
1057        assert!(
1058            backend.compile_function("add", &ops, &config_m4f).is_ok(),
1059            "i32 ops should work on Cortex-M4F"
1060        );
1061
1062        // Cortex-M7DP (double FPU)
1063        let config_m7dp = CompileConfig {
1064            target: TargetSpec::cortex_m7dp(),
1065            no_optimize: true,
1066            ..CompileConfig::default()
1067        };
1068        assert!(
1069            backend.compile_function("add", &ops, &config_m7dp).is_ok(),
1070            "i32 ops should work on Cortex-M7DP"
1071        );
1072    }
1073
1074    #[test]
1075    fn test_f32_rejected_on_cortex_m4_no_fpu() {
1076        // Cortex-M4 (without F suffix) has no FPU
1077        let backend = ArmBackend::new();
1078        let ops = vec![WasmOp::F32Const(1.5), WasmOp::F32Const(2.5), WasmOp::F32Mul];
1079        let config = CompileConfig {
1080            target: TargetSpec::cortex_m4(),
1081            no_optimize: true,
1082            ..CompileConfig::default()
1083        };
1084
1085        let result = backend.compile_function("fmul", &ops, &config);
1086        assert!(
1087            result.is_err(),
1088            "f32 operations should fail on Cortex-M4 (no FPU)"
1089        );
1090    }
1091
1092    // ========================================================================
1093    // Issue #120 — f32 ops in the optimized lowering path
1094    //
1095    // `OptimizerBridge::wasm_to_ir` has no handlers for f32/f64 ops, so a
1096    // value-producing float op fell through to `Opcode::Nop`, leaving a
1097    // downstream consumer with an unmapped vreg and tripping the PR #101
1098    // defensive panic in `ir_to_arm`. Customer reproducer: `compiler_builtins
1099    // float::div` and `gale_compute_ipi_mask` in the `falcon-rate-component`
1100    // module.
1101    //
1102    // Fix: `optimize_full` declines float modules with a typed `Err`;
1103    // `compile_wasm_to_arm` falls back to the non-optimized `select_with_stack`
1104    // path, which handles f32 via VFP/FPU. These tests use the *default*
1105    // (optimized) config — `no_optimize` is NOT set — which is the exact
1106    // configuration that panicked pre-fix.
1107    // ========================================================================
1108
1109    /// Pre-fix: this panicked with "vreg vN has no assigned ARM register and
1110    /// no spill slot" inside `ir_to_arm`. Post-fix: the optimized path declines
1111    /// the module and the backend falls back to direct selection, producing a
1112    /// non-empty f32.div lowering on a Cortex-M4F.
1113    #[test]
1114    fn test_issue120_f32_div_compiles_via_optimized_default() {
1115        let backend = ArmBackend::new();
1116        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Div];
1117        let config = CompileConfig {
1118            target: TargetSpec::cortex_m4f(),
1119            // no_optimize NOT set — this exercises the optimized path that
1120            // panicked in issue #120, then the fallback to direct selection.
1121            ..CompileConfig::default()
1122        };
1123
1124        let result = backend.compile_function("fdiv", &ops, &config);
1125        assert!(
1126            result.is_ok(),
1127            "f32.div must compile on Cortex-M4F via the optimized->direct \
1128             fallback (issue #120), got: {:?}",
1129            result.as_ref().err()
1130        );
1131        assert!(
1132            !result.unwrap().code.is_empty(),
1133            "f32.div must produce non-empty machine code"
1134        );
1135    }
1136
1137    /// A spread of f32 ops, all through the optimized (default) config, must
1138    /// compile via the fallback on an FPU target without panicking.
1139    #[test]
1140    fn test_issue120_assorted_f32_ops_compile_via_optimized_default() {
1141        let backend = ArmBackend::new();
1142        let config = CompileConfig {
1143            target: TargetSpec::cortex_m4f(),
1144            ..CompileConfig::default()
1145        };
1146
1147        let cases: Vec<(&str, Vec<WasmOp>)> = vec![
1148            (
1149                "fadd",
1150                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Add],
1151            ),
1152            (
1153                "fmul",
1154                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Mul],
1155            ),
1156            (
1157                "fsub",
1158                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Sub],
1159            ),
1160        ];
1161
1162        for (name, ops) in cases {
1163            let result = backend.compile_function(name, &ops, &config);
1164            assert!(
1165                result.is_ok(),
1166                "{name} must compile via the optimized->direct fallback \
1167                 (issue #120), got: {:?}",
1168                result.as_ref().err()
1169            );
1170            assert!(
1171                !result.unwrap().code.is_empty(),
1172                "{name} must produce non-empty machine code"
1173            );
1174        }
1175    }
1176
1177    /// The fallback must still honor the ISA feature gate: f32 on a no-FPU
1178    /// target must fail cleanly (not panic) even on the optimized path.
1179    #[test]
1180    fn test_issue120_f32_div_rejected_on_no_fpu_via_optimized() {
1181        let backend = ArmBackend::new();
1182        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Div];
1183        let config = CompileConfig {
1184            target: TargetSpec::cortex_m3(),
1185            ..CompileConfig::default()
1186        };
1187
1188        let result = backend.compile_function("fdiv", &ops, &config);
1189        assert!(
1190            result.is_err(),
1191            "f32.div must be rejected on Cortex-M3 (no FPU), not panic"
1192        );
1193    }
1194
1195    /// Issue #94: end-to-end byte-size check for the canonical u64-packed
1196    /// FFI-return hi32 extract pattern. Compiles two near-identical
1197    /// functions — one with the optimized shift-by-32, one with a generic
1198    /// shift-by-7 — and asserts the optimized form is meaningfully smaller.
1199    #[test]
1200    fn test_issue94_hi32_extract_is_smaller_than_generic_shift() {
1201        let backend = ArmBackend::new();
1202        let config = CompileConfig {
1203            target: TargetSpec::cortex_m4f(),
1204            ..CompileConfig::default()
1205        };
1206
1207        // Optimized path: `(local.get 0) >>> 32; wrap_i64`
1208        let ops_hi32 = vec![
1209            WasmOp::LocalGet(0), // i64 param in R0:R1
1210            WasmOp::I64Const(32),
1211            WasmOp::I64ShrU,
1212            WasmOp::I32WrapI64,
1213        ];
1214        let func_hi32 = backend
1215            .compile_function("hi32_extract", &ops_hi32, &config)
1216            .unwrap();
1217
1218        // Generic path: `(local.get 0) >>> 7; wrap_i64` — same shape, but the
1219        // shift amount is not a multiple of 32, so it falls through to the
1220        // 38-byte runtime shift.
1221        let ops_generic = vec![
1222            WasmOp::LocalGet(0),
1223            WasmOp::I64Const(7),
1224            WasmOp::I64ShrU,
1225            WasmOp::I32WrapI64,
1226        ];
1227        let func_generic = backend
1228            .compile_function("generic_shr", &ops_generic, &config)
1229            .unwrap();
1230
1231        let bytes_hi32 = func_hi32.code.len();
1232        let bytes_generic = func_generic.code.len();
1233        println!(
1234            "\n[issue #94] hi32 extract: {} bytes (vs generic shift: {} bytes; saved {})",
1235            bytes_hi32,
1236            bytes_generic,
1237            bytes_generic.saturating_sub(bytes_hi32)
1238        );
1239        let hex: String = func_hi32
1240            .code
1241            .iter()
1242            .map(|b| format!("{:02x}", b))
1243            .collect::<Vec<_>>()
1244            .join(" ");
1245        println!("[issue #94] hi32 bytes: {}", hex);
1246        // We expect the optimized form to be at least 30 bytes smaller than
1247        // the generic 64-bit shift sequence. (Empirically: 14 vs 50 bytes.)
1248        assert!(
1249            bytes_hi32 + 30 <= bytes_generic,
1250            "issue #94: hi32 extract = {} bytes, generic shift = {} bytes; \
1251             expected optimized form to be at least 30 bytes smaller",
1252            bytes_hi32,
1253            bytes_generic,
1254        );
1255    }
1256}