Skip to main content

synth_backend/
arm_backend.rs

1//! ARM Backend — wraps the instruction selector + optimizer + encoder as a Backend
2//!
3//! This is Synth's custom ARM compiler targeting Cortex-M (Thumb-2).
4//! It's the only backend that supports per-rule formal verification (ASIL D path).
5
6use crate::ArmEncoder;
7use synth_core::backend::{
8    Backend, BackendCapabilities, BackendError, CodeRelocation, CompilationResult, CompileConfig,
9    CompiledFunction, SafetyBounds,
10};
11use synth_core::target::{IsaVariant, TargetSpec};
12use synth_core::wasm_decoder::DecodedModule;
13use synth_core::wasm_op::WasmOp;
14use synth_synthesis::{
15    ArmInstruction, ArmOp, BoundsCheckConfig, InstructionSelector, OptimizationConfig,
16    OptimizerBridge, RuleDatabase, validate_instructions,
17};
18
/// ARM Cortex-M backend using Synth's custom compiler pipeline.
///
/// A field-less unit type: all compilation state is created per call inside
/// the `Backend` methods, so a value of this type carries no data.
///
/// Derives `Debug` so the backend can appear in diagnostics and in `Debug`
/// output of any container that holds it.
#[derive(Debug)]
pub struct ArmBackend;
21
22impl ArmBackend {
23    pub fn new() -> Self {
24        Self
25    }
26}
27
28impl Default for ArmBackend {
29    fn default() -> Self {
30        Self::new()
31    }
32}
33
34impl Backend for ArmBackend {
35    fn name(&self) -> &str {
36        "arm"
37    }
38
39    fn capabilities(&self) -> BackendCapabilities {
40        BackendCapabilities {
41            produces_elf: false,
42            supports_rule_verification: true,
43            supports_binary_verification: true,
44            is_external: false,
45        }
46    }
47
48    fn supported_targets(&self) -> Vec<TargetSpec> {
49        vec![
50            TargetSpec::cortex_m3(),
51            TargetSpec::cortex_m4(),
52            TargetSpec::cortex_m4f(),
53            TargetSpec::cortex_m7(),
54            TargetSpec::cortex_m7dp(),
55        ]
56    }
57
58    fn compile_module(
59        &self,
60        module: &DecodedModule,
61        config: &CompileConfig,
62    ) -> Result<CompilationResult, BackendError> {
63        let exports: Vec<_> = module
64            .functions
65            .iter()
66            .filter(|f| f.export_name.is_some())
67            .collect();
68
69        if exports.is_empty() {
70            return Err(BackendError::CompilationFailed(
71                "no exported functions found".into(),
72            ));
73        }
74
75        let mut functions = Vec::new();
76        for func in &exports {
77            let name = func.export_name.clone().unwrap();
78            // #359: copy THIS function's declared param widths into the config so
79            // `compile_function` (which carries no function index) can refuse a
80            // 64-bit param on the AAPCS stack-argument path. Cheap clone only when
81            // a signature table is present and this function has a width entry —
82            // otherwise reuse the shared config (every existing module unchanged).
83            let func_config = match config.func_params_i64.get(func.index as usize) {
84                Some(p) if !p.is_empty() => Some(CompileConfig {
85                    current_func_params_i64: p.clone(),
86                    ..config.clone()
87                }),
88                _ => None,
89            };
90            let cfg = func_config.as_ref().unwrap_or(config);
91            let compiled = self.compile_function(&name, &func.ops, cfg)?;
92            functions.push(compiled);
93        }
94
95        Ok(CompilationResult {
96            functions,
97            elf: None,
98            backend_name: self.name().to_string(),
99        })
100    }
101
102    fn compile_function(
103        &self,
104        name: &str,
105        ops: &[WasmOp],
106        config: &CompileConfig,
107    ) -> Result<CompiledFunction, BackendError> {
108        let (code, relocations) =
109            compile_wasm_to_arm(ops, config).map_err(BackendError::CompilationFailed)?;
110
111        Ok(CompiledFunction {
112            name: name.to_string(),
113            code,
114            wasm_ops: ops.to_vec(),
115            relocations,
116        })
117    }
118
119    fn is_available(&self) -> bool {
120        true // Always available — it's a library backend
121    }
122}
123
124/// Count the number of function parameters by analyzing LocalGet patterns
125fn count_params(wasm_ops: &[WasmOp]) -> u32 {
126    let mut first_access: std::collections::HashMap<u32, bool> = std::collections::HashMap::new();
127    for op in wasm_ops {
128        match op {
129            WasmOp::LocalGet(idx) => {
130                first_access.entry(*idx).or_insert(true);
131            }
132            WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => {
133                first_access.entry(*idx).or_insert(false);
134            }
135            _ => {}
136        }
137    }
138
139    first_access
140        .iter()
141        .filter_map(
142            |(&idx, &is_read_first)| {
143                if is_read_first { Some(idx + 1) } else { None }
144            },
145        )
146        .max()
147        .unwrap_or(0)
148}
149
150/// Core compilation: WASM ops → ARM machine code bytes + relocations
151///
152/// Returns (code_bytes, relocations) where relocations record BL instructions
153/// that target external symbols (e.g., `__meld_dispatch_import` for import calls).
154fn compile_wasm_to_arm(
155    wasm_ops: &[WasmOp],
156    config: &CompileConfig,
157) -> Result<(Vec<u8>, Vec<CodeRelocation>), String> {
158    let num_params = count_params(wasm_ops);
159
160    let bounds_config = match config.effective_safety_bounds() {
161        SafetyBounds::None => BoundsCheckConfig::None,
162        SafetyBounds::Mpu => BoundsCheckConfig::Mpu,
163        SafetyBounds::Software => BoundsCheckConfig::Software,
164        SafetyBounds::Mask => BoundsCheckConfig::Masking,
165    };
166
167    // The non-optimized (direct) instruction-selection path. Handles f32 via
168    // VFP/FPU. Used directly when `--no-optimize` is set, and as the fallback
169    // when the optimized path declines a module (see issue #120 below).
170    //
171    // VCR-RA-001 step 3b-lite (#242): a FRESH selector per attempt, with
172    // `spill_on_exhaustion` set only on the retry — the first pass is the
173    // unmodified default, so every function that compiles today is selected by
174    // exactly the code that compiled it yesterday (bit-identity is structural,
175    // not behavioural).
176    let select_direct_attempt = |spill_on_exhaustion: bool,
177                                 param_backing_on_exhaustion: bool|
178     -> Result<Vec<ArmInstruction>, synth_core::Error> {
179        let db = RuleDatabase::with_standard_rules();
180        let mut selector =
181            InstructionSelector::with_bounds_check(db.rules().to_vec(), bounds_config);
182        selector.set_target(config.target.fpu, &config.target.triple);
183        if config.num_imports > 0 {
184            selector.set_num_imports(config.num_imports);
185        }
186        // #195: plumb the callee argument-count tables so the direct selector can
187        // marshal call arguments into R0–R3 per AAPCS.
188        selector.set_func_arg_counts(
189            config.func_arg_counts.clone(),
190            config.type_arg_counts.clone(),
191        );
192        // #197: in relocatable host-link mode, emit direct `func_N` BLs for
193        // imports (rewritten to the wasm field name by build_relocatable_elf)
194        // instead of `__meld_dispatch_import`.
195        selector.set_relocatable(config.relocatable);
196        // #237: native-pointer ABI — wasm statics become __synth_wasm_data-relative.
197        selector.set_native_pointer_abi(config.native_pointer_abi, config.linear_memory_bytes);
198        // #311: i64 call results are register PAIRS — tag them.
199        selector.set_result_types(config.func_ret_i64.clone(), config.type_ret_i64.clone());
200        // #359: declared param widths of THIS function, so the AAPCS stack-arg
201        // path can refuse 64-bit params (Ok-or-Err). Empty ⇒ assume i32.
202        selector.set_params_i64(config.current_func_params_i64.clone());
203        // Stack-pointer promotion is meaningful only under the native-pointer ABI;
204        // gating here keeps every non-native compile (all frozen fixtures) on the
205        // legacy R9 globals-table path, bit-identical.
206        if config.native_pointer_abi
207            && let Some((sp_idx, sp_init)) = config.stack_pointer_global
208        {
209            selector.set_native_pointer_stack(sp_idx, sp_init);
210        }
211        selector.set_spill_on_exhaustion(spill_on_exhaustion);
212        selector.set_param_backing_on_exhaustion(param_backing_on_exhaustion);
213        selector.select_with_stack(wasm_ops, num_params)
214    };
215    let select_direct = || -> Result<Vec<ArmInstruction>, String> {
216        // The two recoverable exhaustion classes. NOT retried: the i64
217        // spill-slot-pool Err ("spill-slot pool exhausted") — the honest
218        // remaining bound of the 3b-lite allocator.
219        const SINGLE_EXHAUSTION: &str = "all allocatable registers are live on the stack";
220        const PAIR_EXHAUSTION: &str = "no consecutive pair of free registers for i64";
221        let mut attempt = select_direct_attempt(false, false);
222        // VCR-RA-001 step 3b-lite (#242): the i32 register-exhaustion
223        // hard-fail is recoverable — retry with spill-on-exhaustion, which
224        // reserves the spill area and spills the deepest stack value when the
225        // pool is full. Only functions that FAILED the first pass ever reach
226        // this, so existing output is untouched by construction.
227        if let Err(e) = &attempt
228            && e.to_string().contains(SINGLE_EXHAUSTION)
229        {
230            attempt = select_direct_attempt(true, false);
231        }
232        // VCR-RA-001 acceptance increment (#242): the i64 consecutive-PAIR
233        // exhaustion is recoverable too — but not by stack spilling (the pair
234        // allocator already spills stack values, #171): the blockers are the
235        // pinned param home registers. The final retry frame-backs the params
236        // (#204 machinery) so they stop pinning R0-R3, with spill-on-exhaustion
237        // kept on for the single-register pressure the reloads add. Reached
238        // only by functions that failed every earlier pass.
239        if let Err(e) = &attempt
240            && e.to_string().contains(PAIR_EXHAUSTION)
241        {
242            attempt = select_direct_attempt(true, true);
243        }
244        attempt.map_err(|e| format!("instruction selection failed: {}", e))
245    };
246
247    // Instruction selection: optimized or direct.
248    //
249    // #197: `--relocatable` (host-link ET_REL) forces the direct selector. The
250    // optimized path materializes an absolute linmem base (0x20000100) and does
251    // not preserve caller-saved registers across calls — both wrong for a
252    // host-linked object, where the linmem base arrives via `fp` at runtime and
253    // callees follow AAPCS. `select_with_stack` (now i64-spill capable after
254    // #171) handles fp-relative memory + caller-saved preservation correctly.
255    let arm_instrs = if config.no_optimize || config.relocatable {
256        select_direct()?
257    } else {
258        let opt_config = if config.loom_compat {
259            OptimizationConfig::loom_compat()
260        } else {
261            OptimizationConfig::all()
262        };
263
264        let mut bridge = OptimizerBridge::with_config(opt_config);
265        // #188: tell the bridge how many imports there are so it declines only
266        // LOCAL calls (and leaves import calls on the optimized path, keeping
267        // the #173 field-name relocation rewrite intact).
268        bridge.set_num_imports(config.num_imports);
269        // `ir_to_arm` now returns `Result` — an `Err` means the optimized path
270        // hit an unmapped vreg (issue-#93-class). Treat it identically to an
271        // `optimize_full` failure: fall back to the direct selector rather
272        // than propagating, so the function still compiles correctly.
273        match bridge
274            .optimize_full(wasm_ops)
275            .and_then(|(opt_ir, _cfg, _stats)| bridge.ir_to_arm(&opt_ir, num_params as usize))
276        {
277            Ok(arm_ops) => arm_ops
278                .into_iter()
279                .map(|op| ArmInstruction {
280                    op,
281                    source_line: None,
282                })
283                .collect(),
284            // Issue #120: the optimized path declines modules it cannot lower
285            // (notably scalar f32/f64 ops — the IR has no float opcodes). Fall
286            // back to the direct instruction selector, which handles f32 via
287            // VFP/FPU. This is honest degradation: the function still compiles
288            // correctly, just without IR-level optimization.
289            Err(_) => select_direct()?,
290        }
291    };
292
293    // #257/#277: `mul`+`add`→`mla` fusion is intentionally NOT wired here.
294    // The transform is correct and ready (`synth_synthesis::liveness::fuse_mul_add`,
295    // fully tested), but it is **register-allocation-coupled**: over the current
296    // greedy single-pass selector, folding `mul rM,..; add rD,rM,rX` → `mla`
297    // extends the live ranges of the mul inputs to the mla point, and the added
298    // pressure (extra moves/spills) costs more than the single-cycle MLA saves —
299    // gale measured a +2 cyc on-target REGRESSION (flat_flight 255→257, G474RE)
300    // even though it removes 2 instructions and the seam stays 0x07FDF307. So the
301    // fusion stays unwired until the spill-aware allocator (VCR-RA-001) chooses
302    // registers, at which point it becomes net-positive (per #272's plan and the
303    // wiring design note). Lesson (#277): a register-pressure-affecting transform
304    // needs an on-target/allocator-aware gate, not a byte-count gate, before it
305    // can default on.
306
307    // VCR-RA-001 const-CSE / rematerialization-avoidance (#209), the first
308    // allocator-analysis-driven CODEGEN change. Drops `movw` re-materializations
309    // of a constant already resident in another register and retargets the reads
310    // — every rewrite proven by the liveness analysis, and it ONLY removes
311    // materializations (pressure never rises), so unlike the mla fusion (#277) it
312    // cannot regress on-target. Runs on the selected stream before branch
313    // resolution (it removes instructions, shifting byte offsets). Behind
314    // `SYNTH_CONST_CSE=1` while it is validated against the differential oracle +
315    // gale's five on-target baselines; off by default keeps every fixture
316    // bit-identical.
317    let arm_instrs = if std::env::var("SYNTH_CONST_CSE").is_ok() {
318        synth_synthesis::liveness::apply_const_cse(&arm_instrs).0
319    } else {
320        arm_instrs
321    };
322
323    // VCR-RA-001 RANGE RE-ALLOCATION (#209/#242, wiring step 3a) — the first
324    // CONSEQUENTIAL allocator pass: re-colour each maximal straight-line
325    // segment over the R0-R8 pool with value ranges as the allocation unit
326    // (segment inputs + per-register live-outs pinned to their original
327    // registers, reserved R9-R12/SP identity-assigned — each segment is
328    // independently sound, no cross-segment liveness assumed). Renames
329    // registers only: never adds, removes, or reorders instructions, so
330    // labels/branch offsets are unaffected.
331    //
332    // DEFAULT-ON since v0.11.36: gale cleared the gate on-target (G474RE,
333    // #209 2026-06-10) — flag-on output byte-identical to flag-off on
334    // flat_flight/controller/control_step, fires on the filter family with
335    // zero cycle delta and a small size win, all selfchecks green on silicon.
336    // Opt out with `SYNTH_RANGE_REALLOC=0`; per-function stats with
337    // `SYNTH_REALLOC_STATS=1`.
338    //
339    // The companion dead callee-saved-save elimination (gale's "next
340    // consequential lever", same issue comment) then shrinks the prologue
341    // `push {r4-r8,lr}` / epilogue `pop {r4-r8,pc}` to the callee-saved
342    // registers the re-allocated body still touches (leaf-only,
343    // SP-untouched, even-count-padded — see shrink_callee_saved_saves):
344    // ~12 cycles of pure save/restore overhead removed on small leaves.
345    let realloc_on = std::env::var("SYNTH_RANGE_REALLOC").map_or(true, |v| v != "0");
346    let arm_instrs = if realloc_on {
347        use synth_synthesis::rules::Reg;
348        const POOL: [Reg; 9] = [
349            Reg::R0,
350            Reg::R1,
351            Reg::R2,
352            Reg::R3,
353            Reg::R4,
354            Reg::R5,
355            Reg::R6,
356            Reg::R7,
357            Reg::R8,
358        ];
359        let (out, stats) = synth_synthesis::liveness::reallocate_function(&arm_instrs, &POOL);
360        if std::env::var("SYNTH_REALLOC_STATS").is_ok() {
361            eprintln!(
362                "[range-realloc] {} segments: {} reallocated, {} declined ({} validator-rejected), {} need spill (step 4)",
363                stats.segments,
364                stats.reallocated,
365                stats.declined,
366                stats.validator_rejects,
367                stats.needs_spill
368            );
369        }
370        synth_synthesis::liveness::shrink_callee_saved_saves(&out).unwrap_or(out)
371    } else {
372        arm_instrs
373    };
374
375    // VCR-RA-001 SHADOW ALLOCATION (#209/#242): run the register allocator on
376    // the selected stream and LOG what it finds — without changing a single
377    // emitted byte. This is the measure-only bridge between the built analysis
378    // layer and the eventual virtual-register wiring: it shows, per real
379    // function, whether the allocator can colour it within the R0–R8 pool and
380    // how much const-CSE / rematerialization headroom exists (#209). Enable with
381    // `SYNTH_SHADOW_ALLOC=1`; off by default and side-effect-free either way.
382    if std::env::var("SYNTH_SHADOW_ALLOC").is_ok() {
383        use synth_synthesis::liveness::{
384            AllocationOutcome, allocate_function, function_peak_pressure,
385        };
386        // R9 globals / R10 mem-size / R11 mem-base / R12 IP-scratch are reserved;
387        // pin them above the 0..9 allocatable pool so the colourer keeps R0–R8.
388        let precolored = std::collections::BTreeMap::from([
389            (synth_synthesis::rules::Reg::R9, 9usize),
390            (synth_synthesis::rules::Reg::R10, 10),
391            (synth_synthesis::rules::Reg::R11, 11),
392            (synth_synthesis::rules::Reg::R12, 12),
393        ]);
394        // True VALUE pressure (one node per value, not per reused physical reg):
395        // a NeedsSpill with peak ≤ 9 is a SPURIOUS physical-register spill — the
396        // function fits once virtually allocated.
397        let peak = function_peak_pressure(&arm_instrs);
398        match allocate_function(&arm_instrs, 9, &precolored) {
399            AllocationOutcome::Allocated {
400                remat_opportunities,
401                coloring,
402            } => eprintln!(
403                "[shadow-alloc] OK: {} pregs coloured within R0-R8 pool, peak value-pressure {}, {} const-CSE/remat opportunities",
404                coloring.len(),
405                peak,
406                remat_opportunities
407            ),
408            AllocationOutcome::NeedsSpill(s) => eprintln!(
409                "[shadow-alloc] physical-graph would spill {:?}, but peak value-pressure is {} (≤9 ⇒ spurious; fits once virtually allocated)",
410                s, peak
411            ),
412            AllocationOutcome::Declined => {
413                eprintln!(
414                    "[shadow-alloc] declined (unmodeled construct — calls/i64/fp/offset-branch)"
415                )
416            }
417        }
418    }
419
420    // ISA feature gate: validate that all generated instructions are supported
421    // by the target. This catches FPU instructions on no-FPU targets, double-precision
422    // instructions on single-precision targets, etc.
423    validate_instructions(&arm_instrs, config.target.fpu, &config.target.triple)
424        .map_err(|e| format!("ISA validation failed: {}", e))?;
425
426    // Encode to binary — use Thumb-2 for Cortex-M targets
427    let use_thumb2 = matches!(config.target.isa, IsaVariant::Thumb2 | IsaVariant::Thumb);
428
429    let encoder = if use_thumb2 {
430        ArmEncoder::new_thumb2_with_fpu(config.target.fpu)
431    } else {
432        ArmEncoder::new_arm32()
433    };
434
435    // #202: resolve local label branches (Bcc/B/Bhs/Blo) to byte-accurate
436    // offsets before encoding. `select_with_stack` emits them as label
437    // placeholders and never resolves them — without this they encode as
438    // `bne.n #0` and land mid-instruction whenever a 32-bit Thumb-2 instruction
439    // sits between the branch and its target (UsageFault on real hardware).
440    // Only meaningful for Thumb-2 (the offset units are halfword/PC+4).
441    let arm_instrs = if use_thumb2 {
442        resolve_label_branches(arm_instrs, &encoder)?
443    } else {
444        arm_instrs
445    };
446
447    let mut code = Vec::new();
448    let mut relocations = Vec::new();
449
450    // #345: literal-pool address loads. Each `LdrSym` was encoded as a placeholder
451    // `LDR.W rd,[pc,#0]`; record where its instruction sits and what it loads so
452    // we can append a pooled word (carrying the symbol address via R_ARM_ABS32)
453    // and patch the PC-relative offset once the pool position is known.
454    struct PendingLiteral {
455        ldr_offset: u32,
456        symbol: String,
457        addend: i32,
458    }
459    let mut pending_literals: Vec<PendingLiteral> = Vec::new();
460
461    for instr in &arm_instrs {
462        // Record a relocation for every BL: the encoder emits `bl #0` and
463        // relies on a relocation to patch the target. This covers BOTH import
464        // dispatch stubs (`__meld_*`, undefined externals) AND internal calls
465        // (`func_N`, defined in this object). Previously only `__meld_*` was
466        // recorded, so internal `BL func_N` calls were left as unpatched
467        // `bl #0` placeholders branching to a garbage address (#167).
468        if let ArmOp::Bl { label } = &instr.op {
469            relocations.push(CodeRelocation {
470                offset: code.len() as u32,
471                symbol: label.clone(),
472                kind: synth_core::backend::RelocKind::ThmCall,
473            });
474        }
475        // #237: symbol-relative MOVW/MOVT (the `--native-pointer-abi` static-data
476        // addressing). The encoder writes the addend in place; record the matching
477        // R_ARM_MOVW_ABS_NC / R_ARM_MOVT_ABS so the linker adds the symbol address.
478        if let ArmOp::MovwSym { symbol, .. } = &instr.op {
479            relocations.push(CodeRelocation {
480                offset: code.len() as u32,
481                symbol: symbol.clone(),
482                kind: synth_core::backend::RelocKind::MovwAbs,
483            });
484        }
485        if let ArmOp::MovtSym { symbol, .. } = &instr.op {
486            relocations.push(CodeRelocation {
487                offset: code.len() as u32,
488                symbol: symbol.clone(),
489                kind: synth_core::backend::RelocKind::MovtAbs,
490            });
491        }
492        // #345: defer the literal-pool word + reloc + offset patch to the
493        // post-loop pass (the pool address is not yet known).
494        if let ArmOp::LdrSym { symbol, addend, .. } = &instr.op {
495            pending_literals.push(PendingLiteral {
496                ldr_offset: code.len() as u32,
497                symbol: symbol.clone(),
498                addend: *addend,
499            });
500        }
501
502        let encoded = encoder
503            .encode(&instr.op)
504            .map_err(|e| format!("ARM encoding failed: {}", e))?;
505        code.extend_from_slice(&encoded);
506    }
507
508    // #345: place the literal pool at the end of this function's `.text`. Gated on
509    // there being at least one `LdrSym` — functions without one are byte-identical
510    // to before (no trailing padding, so downstream `func_offsets` are unchanged
511    // and the frozen differential fixtures stay bit-for-bit equal).
512    if !pending_literals.is_empty() {
513        if !use_thumb2 {
514            return Err("LdrSym literal-pool addressing requires Thumb-2".to_string());
515        }
516        // 4-byte align the pool start (Thumb-2 word loads require it, and
517        // `Align(PC,4)` in the LDR-literal semantics assumes a word-aligned pool).
518        while code.len() % 4 != 0 {
519            code.push(0x00);
520        }
521        // One distinct pooled word per LdrSym (no dedup: different sites carry
522        // different addends, and the REL addend lives in the word).
523        for lit in &pending_literals {
524            let word_offset = code.len() as u32;
525
526            // REL semantics: the linker computes `S + A`, where A is the in-place
527            // value of the relocated word. Initialize the word to the addend so
528            // the final loaded address is `symbol + addend`.
529            code.extend_from_slice(&(lit.addend as u32).to_le_bytes());
530            relocations.push(CodeRelocation {
531                offset: word_offset,
532                symbol: lit.symbol.clone(),
533                kind: synth_core::backend::RelocKind::Abs32,
534            });
535
536            // Patch the placeholder `LDR.W rd,[pc,#imm12]`. Thumb-2 LDR (literal):
537            // address = Align(PC,4) + imm12, with PC = ldr_offset + 4. The pool is
538            // always after the LDR, so U=1 (already set in hw1 = 0xF8DF).
539            let pc = lit.ldr_offset + 4;
540            let aligned_pc = pc & !3u32;
541            let imm12 = word_offset - aligned_pc;
542            if imm12 > 0xFFF {
543                // Wide LDR-literal range is ±4 KB; these function bodies are far
544                // smaller, but fail cleanly rather than miscompile if exceeded.
545                return Err(format!(
546                    "LdrSym literal pool out of range (#345): imm12={} > 4095 \
547                     for symbol {}",
548                    imm12, lit.symbol
549                ));
550            }
551            let hw2_off = (lit.ldr_offset + 2) as usize;
552            let mut hw2 = u16::from_le_bytes([code[hw2_off], code[hw2_off + 1]]);
553            hw2 = (hw2 & 0xF000) | (imm12 as u16); // keep Rt, set imm12
554            let hw2_bytes = hw2.to_le_bytes();
555            code[hw2_off] = hw2_bytes[0];
556            code[hw2_off + 1] = hw2_bytes[1];
557        }
558    }
559
560    Ok((code, relocations))
561}
562
563/// Resolve local label branches to byte-accurate offsets (#202).
564///
565/// `select_with_stack` emits conditional/unconditional branches as label
566/// placeholders (`Bcc`/`B`/`Bhs`/`Blo` + `Label`) and never resolves them; the
567/// encoder then emits a `0xD000`/`0xE000` placeholder with offset 0. Before #197
568/// this path only ran for `--no-optimize`/declined functions, so the latent bug
569/// stayed hidden — routing relocatable code through it surfaced branches that
570/// land mid-instruction (a Cortex-M UsageFault) whenever a 32-bit Thumb-2
571/// instruction sits between the branch and its target.
572///
573/// This pass encodes each instruction to learn its real byte length (so 16- vs
574/// 32-bit forms and multi-instruction expansions are exact), maps each `Label`
575/// to its byte position, and rewrites every label branch to the displacement
576/// the encoder consumes: `(target - branch - 4) / 2` halfwords. A bounded
577/// fixed-point handles an offset growing a branch from 16- to 32-bit (which
578/// shifts later positions). `BCondOffset`/`BOffset` already produced inline by
579/// the optimized path carry no label and are left untouched.
580fn resolve_label_branches(
581    arm_instrs: Vec<ArmInstruction>,
582    encoder: &ArmEncoder,
583) -> Result<Vec<ArmInstruction>, String> {
584    use std::collections::HashMap;
585    use synth_synthesis::Condition;
586
587    enum BKind {
588        Cond(Condition),
589        Uncond,
590    }
591    // Record each label branch ONCE — indices are stable across iterations.
592    let mut branches: Vec<(usize, BKind, String)> = Vec::new();
593    for (i, instr) in arm_instrs.iter().enumerate() {
594        match &instr.op {
595            ArmOp::Bcc { cond, label } => branches.push((i, BKind::Cond(*cond), label.clone())),
596            ArmOp::Bhs { label } => branches.push((i, BKind::Cond(Condition::HS), label.clone())),
597            ArmOp::Blo { label } => branches.push((i, BKind::Cond(Condition::LO), label.clone())),
598            ArmOp::B { label } => branches.push((i, BKind::Uncond, label.clone())),
599            _ => {}
600        }
601    }
602    if branches.is_empty() {
603        return Ok(arm_instrs);
604    }
605
606    let mut resolved = arm_instrs;
607    // Sizes only grow (16→32-bit), so this converges quickly; cap for safety.
608    for _ in 0..16 {
609        // 1. Byte position of each instruction (Label encodes to 0 bytes).
610        let mut positions = Vec::with_capacity(resolved.len());
611        let mut pos: i64 = 0;
612        for instr in &resolved {
613            positions.push(pos);
614            pos += encoder
615                .encode(&instr.op)
616                .map_err(|e| format!("branch-resolve size probe failed: {}", e))?
617                .len() as i64;
618        }
619        // 2. Label name -> byte position (owned keys so the borrow ends here).
620        let mut labels: HashMap<String, i64> = HashMap::new();
621        for (i, instr) in resolved.iter().enumerate() {
622            if let ArmOp::Label { name } = &instr.op {
623                labels.insert(name.clone(), positions[i]);
624            }
625        }
626        // 3. Rewrite each branch to its byte-accurate offset.
627        let mut changed = false;
628        for (idx, kind, label) in &branches {
629            // A label not defined locally is an EXTERNAL target (e.g.
630            // `Trap_Handler` resolved by a relocation / the vector table). Leave
631            // such branches as their placeholder for the existing relocation
632            // path — only local control-flow labels are byte-resolved here.
633            let Some(&target) = labels.get(label) else {
634                continue;
635            };
636            // Encoder consumes the field as (target - branch - 4) / 2 halfwords.
637            // Positions are always even, so this division is exact.
638            let halfword_offset = ((target - positions[*idx] - 4) / 2) as i32;
639            let new_op = match kind {
640                BKind::Cond(c) => ArmOp::BCondOffset {
641                    cond: *c,
642                    offset: halfword_offset,
643                },
644                BKind::Uncond => ArmOp::BOffset {
645                    offset: halfword_offset,
646                },
647            };
648            if resolved[*idx].op != new_op {
649                resolved[*idx].op = new_op;
650                changed = true;
651            }
652        }
653        if !changed {
654            break;
655        }
656    }
657    Ok(resolved)
658}
659
660#[cfg(test)]
661mod tests {
662    use super::*;
663
664    #[test]
665    fn test_arm_backend_name() {
666        let backend = ArmBackend::new();
667        assert_eq!(backend.name(), "arm");
668        assert!(backend.is_available());
669    }
670
671    #[test]
672    fn test_arm_backend_capabilities() {
673        let backend = ArmBackend::new();
674        let caps = backend.capabilities();
675        assert!(!caps.produces_elf);
676        assert!(caps.supports_rule_verification);
677        assert!(!caps.is_external);
678    }
679
680    #[test]
681    fn test_compile_add_function() {
682        let backend = ArmBackend::new();
683        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
684        let config = CompileConfig::default();
685
686        let result = backend.compile_function("add", &ops, &config);
687        assert!(result.is_ok());
688
689        let func = result.unwrap();
690        assert_eq!(func.name, "add");
691        assert!(!func.code.is_empty());
692        assert_eq!(func.wasm_ops, ops);
693    }
694
695    #[test]
696    fn test_count_params() {
697        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
698        assert_eq!(count_params(&ops), 2);
699
700        let no_params = vec![WasmOp::I32Const(5), WasmOp::I32Const(3), WasmOp::I32Add];
701        assert_eq!(count_params(&no_params), 0);
702    }
703
704    #[test]
705    fn test_arm_backend_register() {
706        let mut registry = synth_core::BackendRegistry::new();
707        registry.register(Box::new(ArmBackend::new()));
708        assert!(registry.get("arm").is_some());
709        assert_eq!(registry.available().len(), 1);
710    }
711
712    #[test]
713    fn test_compile_import_call_produces_relocations() {
714        let backend = ArmBackend::new();
715        // Simulate a WASM module where func index 0 is an import.
716        // Call(0) should generate MOV R0, #0; BL __meld_dispatch_import
717        let ops = vec![WasmOp::Call(0)];
718        let config = CompileConfig {
719            num_imports: 1,
720            no_optimize: true, // Direct instruction selection to preserve Call semantics
721            ..CompileConfig::default()
722        };
723
724        let result = backend.compile_function("caller", &ops, &config);
725        assert!(result.is_ok());
726
727        let func = result.unwrap();
728        assert!(!func.code.is_empty());
729        assert_eq!(func.relocations.len(), 1);
730        assert_eq!(func.relocations[0].symbol, "__meld_dispatch_import");
731        // The BL is the second instruction (after MOV R0, #0), so offset should be > 0
732        assert!(func.relocations[0].offset > 0);
733    }
734
735    /// Regression test for #197: in `relocatable` mode, an import call must
736    /// relocate against the direct `func_N` symbol (rewritten to the wasm field
737    /// name by `build_relocatable_elf`), NOT `__meld_dispatch_import`. This is
738    /// the ABI half of the #197 fix — without it, a host linker cannot resolve
739    /// the call to the real kernel symbol (e.g. `k_spin_lock`).
740    #[test]
741    fn test_compile_relocatable_import_uses_direct_func_symbol_197() {
742        let backend = ArmBackend::new();
743        let ops = vec![WasmOp::Call(0)]; // func 0 is an import
744        let config = CompileConfig {
745            num_imports: 1,
746            relocatable: true,
747            ..CompileConfig::default()
748        };
749
750        let func = backend
751            .compile_function("caller", &ops, &config)
752            .expect("relocatable import call compiles");
753
754        assert_eq!(func.relocations.len(), 1);
755        assert_eq!(
756            func.relocations[0].symbol, "func_0",
757            "#197: relocatable import must relocate against func_0 (→ field name), not Meld dispatch"
758        );
759    }
760
761    #[test]
762    fn test_compile_no_imports_no_relocations() {
763        let backend = ArmBackend::new();
764        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
765        let config = CompileConfig::default();
766
767        let func = backend.compile_function("add", &ops, &config).unwrap();
768        assert!(func.relocations.is_empty());
769    }
770
771    /// Regression test for #167: a call to an INTERNAL function
772    /// (index `>= num_imports`) must record a relocation against `func_{index}`.
773    /// Before the fix, only `__meld_*` (import) BLs were relocated, so
774    /// internal `BL func_N` was emitted as an unpatched `bl #0` branching
775    /// to a garbage address — making the object non-linkable. This test
776    /// would have caught that regression.
777    #[test]
778    fn test_compile_internal_call_produces_relocation_167() {
779        let backend = ArmBackend::new();
780        // num_imports = 1, so Call(2) is an INTERNAL call → `BL func_2`.
781        let ops = vec![WasmOp::Call(2)];
782        let config = CompileConfig {
783            num_imports: 1,
784            no_optimize: true,
785            ..CompileConfig::default()
786        };
787
788        let func = backend
789            .compile_function("caller", &ops, &config)
790            .expect("internal call compiles");
791
792        assert_eq!(
793            func.relocations.len(),
794            1,
795            "an internal call must emit exactly one relocation (#167)"
796        );
797        assert_eq!(
798            func.relocations[0].symbol, "func_2",
799            "internal call must relocate against the callee's func_{{index}} symbol (#167)"
800        );
801    }
802
803    // ─── Phase 1 safety-bounds plumbing for ARM ──────────────────────────
804
805    #[test]
806    fn arm_safety_bounds_mpu_emits_same_code_as_none() {
807        // Mpu mode must not introduce any inline check on ARM — the MPU
808        // handles faults via hardware. The encoded bytes for an i32.load
809        // should be identical between None and Mpu.
810        let backend = ArmBackend::new();
811        let ops = vec![
812            WasmOp::LocalGet(0),
813            WasmOp::I32Load {
814                offset: 0,
815                align: 2,
816            },
817        ];
818        let cfg_none = CompileConfig {
819            no_optimize: true,
820            ..Default::default()
821        };
822        let cfg_mpu = CompileConfig {
823            no_optimize: true,
824            safety_bounds: SafetyBounds::Mpu,
825            ..Default::default()
826        };
827        let n = backend.compile_function("ld", &ops, &cfg_none).unwrap();
828        let m = backend.compile_function("ld", &ops, &cfg_mpu).unwrap();
829        assert_eq!(
830            n.code, m.code,
831            "Mpu and None should produce identical ARM bytes (Mpu relies on hardware)"
832        );
833    }
834
835    #[test]
836    fn arm_legacy_bounds_check_still_emits_software_check() {
837        // Legacy CLI users with `--bounds-check` should keep getting the
838        // software path even though the new SafetyBounds field defaults to None.
839        let backend = ArmBackend::new();
840        let ops = vec![
841            WasmOp::LocalGet(0),
842            WasmOp::I32Load {
843                offset: 0,
844                align: 2,
845            },
846        ];
847        let cfg_legacy = CompileConfig {
848            no_optimize: true,
849            bounds_check: true,
850            ..Default::default()
851        };
852        let cfg_software = CompileConfig {
853            no_optimize: true,
854            safety_bounds: SafetyBounds::Software,
855            ..Default::default()
856        };
857        let l = backend.compile_function("ld", &ops, &cfg_legacy).unwrap();
858        let s = backend.compile_function("ld", &ops, &cfg_software).unwrap();
859        assert_eq!(
860            l.code, s.code,
861            "--bounds-check should produce the same bytes as --safety-bounds=software"
862        );
863    }
864
865    // ========================================================================
866    // ISA feature gate tests — ensure the compiler never emits unsupported
867    // instructions for a given target
868    // ========================================================================
869
870    #[test]
871    fn test_f32_rejected_on_cortex_m3_no_fpu() {
872        let backend = ArmBackend::new();
873        let ops = vec![WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
874        let config = CompileConfig {
875            target: TargetSpec::cortex_m3(),
876            no_optimize: true,
877            ..CompileConfig::default()
878        };
879
880        let result = backend.compile_function("fadd", &ops, &config);
881        assert!(
882            result.is_err(),
883            "f32 operations should fail on Cortex-M3 (no FPU)"
884        );
885    }
886
887    #[test]
888    fn test_f32_accepted_on_cortex_m4f() {
889        let backend = ArmBackend::new();
890        let ops = vec![WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
891        let config = CompileConfig {
892            target: TargetSpec::cortex_m4f(),
893            no_optimize: true,
894            ..CompileConfig::default()
895        };
896
897        let result = backend.compile_function("fadd", &ops, &config);
898        assert!(
899            result.is_ok(),
900            "f32 operations should succeed on Cortex-M4F, got: {:?}",
901            result.unwrap_err()
902        );
903    }
904
905    #[test]
906    fn test_i32_works_on_all_targets() {
907        let backend = ArmBackend::new();
908        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
909
910        // Cortex-M3 (no FPU)
911        let config_m3 = CompileConfig {
912            target: TargetSpec::cortex_m3(),
913            no_optimize: true,
914            ..CompileConfig::default()
915        };
916        assert!(
917            backend.compile_function("add", &ops, &config_m3).is_ok(),
918            "i32 ops should work on Cortex-M3"
919        );
920
921        // Cortex-M4F (single FPU)
922        let config_m4f = CompileConfig {
923            target: TargetSpec::cortex_m4f(),
924            no_optimize: true,
925            ..CompileConfig::default()
926        };
927        assert!(
928            backend.compile_function("add", &ops, &config_m4f).is_ok(),
929            "i32 ops should work on Cortex-M4F"
930        );
931
932        // Cortex-M7DP (double FPU)
933        let config_m7dp = CompileConfig {
934            target: TargetSpec::cortex_m7dp(),
935            no_optimize: true,
936            ..CompileConfig::default()
937        };
938        assert!(
939            backend.compile_function("add", &ops, &config_m7dp).is_ok(),
940            "i32 ops should work on Cortex-M7DP"
941        );
942    }
943
944    #[test]
945    fn test_f32_rejected_on_cortex_m4_no_fpu() {
946        // Cortex-M4 (without F suffix) has no FPU
947        let backend = ArmBackend::new();
948        let ops = vec![WasmOp::F32Const(1.5), WasmOp::F32Const(2.5), WasmOp::F32Mul];
949        let config = CompileConfig {
950            target: TargetSpec::cortex_m4(),
951            no_optimize: true,
952            ..CompileConfig::default()
953        };
954
955        let result = backend.compile_function("fmul", &ops, &config);
956        assert!(
957            result.is_err(),
958            "f32 operations should fail on Cortex-M4 (no FPU)"
959        );
960    }
961
962    // ========================================================================
963    // Issue #120 — f32 ops in the optimized lowering path
964    //
965    // `OptimizerBridge::wasm_to_ir` has no handlers for f32/f64 ops, so a
966    // value-producing float op fell through to `Opcode::Nop`, leaving a
967    // downstream consumer with an unmapped vreg and tripping the PR #101
968    // defensive panic in `ir_to_arm`. Customer reproducer: `compiler_builtins
969    // float::div` and `gale_compute_ipi_mask` in the `falcon-rate-component`
970    // module.
971    //
972    // Fix: `optimize_full` declines float modules with a typed `Err`;
973    // `compile_wasm_to_arm` falls back to the non-optimized `select_with_stack`
974    // path, which handles f32 via VFP/FPU. These tests use the *default*
975    // (optimized) config — `no_optimize` is NOT set — which is the exact
976    // configuration that panicked pre-fix.
977    // ========================================================================
978
979    /// Pre-fix: this panicked with "vreg vN has no assigned ARM register and
980    /// no spill slot" inside `ir_to_arm`. Post-fix: the optimized path declines
981    /// the module and the backend falls back to direct selection, producing a
982    /// non-empty f32.div lowering on a Cortex-M4F.
983    #[test]
984    fn test_issue120_f32_div_compiles_via_optimized_default() {
985        let backend = ArmBackend::new();
986        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Div];
987        let config = CompileConfig {
988            target: TargetSpec::cortex_m4f(),
989            // no_optimize NOT set — this exercises the optimized path that
990            // panicked in issue #120, then the fallback to direct selection.
991            ..CompileConfig::default()
992        };
993
994        let result = backend.compile_function("fdiv", &ops, &config);
995        assert!(
996            result.is_ok(),
997            "f32.div must compile on Cortex-M4F via the optimized->direct \
998             fallback (issue #120), got: {:?}",
999            result.as_ref().err()
1000        );
1001        assert!(
1002            !result.unwrap().code.is_empty(),
1003            "f32.div must produce non-empty machine code"
1004        );
1005    }
1006
1007    /// A spread of f32 ops, all through the optimized (default) config, must
1008    /// compile via the fallback on an FPU target without panicking.
1009    #[test]
1010    fn test_issue120_assorted_f32_ops_compile_via_optimized_default() {
1011        let backend = ArmBackend::new();
1012        let config = CompileConfig {
1013            target: TargetSpec::cortex_m4f(),
1014            ..CompileConfig::default()
1015        };
1016
1017        let cases: Vec<(&str, Vec<WasmOp>)> = vec![
1018            (
1019                "fadd",
1020                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Add],
1021            ),
1022            (
1023                "fmul",
1024                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Mul],
1025            ),
1026            (
1027                "fsub",
1028                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Sub],
1029            ),
1030        ];
1031
1032        for (name, ops) in cases {
1033            let result = backend.compile_function(name, &ops, &config);
1034            assert!(
1035                result.is_ok(),
1036                "{name} must compile via the optimized->direct fallback \
1037                 (issue #120), got: {:?}",
1038                result.as_ref().err()
1039            );
1040            assert!(
1041                !result.unwrap().code.is_empty(),
1042                "{name} must produce non-empty machine code"
1043            );
1044        }
1045    }
1046
1047    /// The fallback must still honor the ISA feature gate: f32 on a no-FPU
1048    /// target must fail cleanly (not panic) even on the optimized path.
1049    #[test]
1050    fn test_issue120_f32_div_rejected_on_no_fpu_via_optimized() {
1051        let backend = ArmBackend::new();
1052        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Div];
1053        let config = CompileConfig {
1054            target: TargetSpec::cortex_m3(),
1055            ..CompileConfig::default()
1056        };
1057
1058        let result = backend.compile_function("fdiv", &ops, &config);
1059        assert!(
1060            result.is_err(),
1061            "f32.div must be rejected on Cortex-M3 (no FPU), not panic"
1062        );
1063    }
1064
1065    /// Issue #94: end-to-end byte-size check for the canonical u64-packed
1066    /// FFI-return hi32 extract pattern. Compiles two near-identical
1067    /// functions — one with the optimized shift-by-32, one with a generic
1068    /// shift-by-7 — and asserts the optimized form is meaningfully smaller.
1069    #[test]
1070    fn test_issue94_hi32_extract_is_smaller_than_generic_shift() {
1071        let backend = ArmBackend::new();
1072        let config = CompileConfig {
1073            target: TargetSpec::cortex_m4f(),
1074            ..CompileConfig::default()
1075        };
1076
1077        // Optimized path: `(local.get 0) >>> 32; wrap_i64`
1078        let ops_hi32 = vec![
1079            WasmOp::LocalGet(0), // i64 param in R0:R1
1080            WasmOp::I64Const(32),
1081            WasmOp::I64ShrU,
1082            WasmOp::I32WrapI64,
1083        ];
1084        let func_hi32 = backend
1085            .compile_function("hi32_extract", &ops_hi32, &config)
1086            .unwrap();
1087
1088        // Generic path: `(local.get 0) >>> 7; wrap_i64` — same shape, but the
1089        // shift amount is not a multiple of 32, so it falls through to the
1090        // 38-byte runtime shift.
1091        let ops_generic = vec![
1092            WasmOp::LocalGet(0),
1093            WasmOp::I64Const(7),
1094            WasmOp::I64ShrU,
1095            WasmOp::I32WrapI64,
1096        ];
1097        let func_generic = backend
1098            .compile_function("generic_shr", &ops_generic, &config)
1099            .unwrap();
1100
1101        let bytes_hi32 = func_hi32.code.len();
1102        let bytes_generic = func_generic.code.len();
1103        println!(
1104            "\n[issue #94] hi32 extract: {} bytes (vs generic shift: {} bytes; saved {})",
1105            bytes_hi32,
1106            bytes_generic,
1107            bytes_generic.saturating_sub(bytes_hi32)
1108        );
1109        let hex: String = func_hi32
1110            .code
1111            .iter()
1112            .map(|b| format!("{:02x}", b))
1113            .collect::<Vec<_>>()
1114            .join(" ");
1115        println!("[issue #94] hi32 bytes: {}", hex);
1116        // We expect the optimized form to be at least 30 bytes smaller than
1117        // the generic 64-bit shift sequence. (Empirically: 14 vs 50 bytes.)
1118        assert!(
1119            bytes_hi32 + 30 <= bytes_generic,
1120            "issue #94: hi32 extract = {} bytes, generic shift = {} bytes; \
1121             expected optimized form to be at least 30 bytes smaller",
1122            bytes_hi32,
1123            bytes_generic,
1124        );
1125    }
1126}