Skip to main content

synth_backend/
arm_backend.rs

1//! ARM Backend — wraps the instruction selector + optimizer + encoder as a Backend
2//!
3//! This is Synth's custom ARM compiler targeting Cortex-M (Thumb-2).
4//! It's the only backend that supports per-rule formal verification (ASIL D path).
5
6use crate::ArmEncoder;
7use synth_core::backend::{
8    Backend, BackendCapabilities, BackendError, CodeRelocation, CompilationResult, CompileConfig,
9    CompiledFunction, SafetyBounds,
10};
11use synth_core::target::{IsaVariant, TargetSpec};
12use synth_core::wasm_decoder::DecodedModule;
13use synth_core::wasm_op::WasmOp;
14use synth_synthesis::{
15    ArmInstruction, ArmOp, BoundsCheckConfig, InstructionSelector, OptimizationConfig,
16    OptimizerBridge, RuleDatabase, validate_instructions,
17};
18
/// ARM Cortex-M backend using Synth's custom compiler pipeline.
///
/// The type carries no state: it is a unit struct, so every value is
/// interchangeable and copying it is free. `Default` is derived rather than
/// hand-written because the default value is simply the unit value.
#[derive(Debug, Clone, Copy, Default)]
pub struct ArmBackend;

impl ArmBackend {
    /// Creates the backend.
    ///
    /// `const` so the backend can also be built in constant contexts; the
    /// result is identical to [`ArmBackend::default`].
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
33
34impl Backend for ArmBackend {
35    fn name(&self) -> &str {
36        "arm"
37    }
38
39    fn capabilities(&self) -> BackendCapabilities {
40        BackendCapabilities {
41            produces_elf: false,
42            supports_rule_verification: true,
43            supports_binary_verification: true,
44            is_external: false,
45        }
46    }
47
48    fn supported_targets(&self) -> Vec<TargetSpec> {
49        vec![
50            TargetSpec::cortex_m3(),
51            TargetSpec::cortex_m4(),
52            TargetSpec::cortex_m4f(),
53            TargetSpec::cortex_m7(),
54            TargetSpec::cortex_m7dp(),
55        ]
56    }
57
58    fn compile_module(
59        &self,
60        module: &DecodedModule,
61        config: &CompileConfig,
62    ) -> Result<CompilationResult, BackendError> {
63        let exports: Vec<_> = module
64            .functions
65            .iter()
66            .filter(|f| f.export_name.is_some())
67            .collect();
68
69        if exports.is_empty() {
70            return Err(BackendError::CompilationFailed(
71                "no exported functions found".into(),
72            ));
73        }
74
75        let mut functions = Vec::new();
76        for func in &exports {
77            let name = func.export_name.clone().unwrap();
78            let compiled = self.compile_function(&name, &func.ops, config)?;
79            functions.push(compiled);
80        }
81
82        Ok(CompilationResult {
83            functions,
84            elf: None,
85            backend_name: self.name().to_string(),
86        })
87    }
88
89    fn compile_function(
90        &self,
91        name: &str,
92        ops: &[WasmOp],
93        config: &CompileConfig,
94    ) -> Result<CompiledFunction, BackendError> {
95        let (code, relocations) =
96            compile_wasm_to_arm(ops, config).map_err(BackendError::CompilationFailed)?;
97
98        Ok(CompiledFunction {
99            name: name.to_string(),
100            code,
101            wasm_ops: ops.to_vec(),
102            relocations,
103        })
104    }
105
106    fn is_available(&self) -> bool {
107        true // Always available — it's a library backend
108    }
109}
110
111/// Count the number of function parameters by analyzing LocalGet patterns
112fn count_params(wasm_ops: &[WasmOp]) -> u32 {
113    let mut first_access: std::collections::HashMap<u32, bool> = std::collections::HashMap::new();
114    for op in wasm_ops {
115        match op {
116            WasmOp::LocalGet(idx) => {
117                first_access.entry(*idx).or_insert(true);
118            }
119            WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => {
120                first_access.entry(*idx).or_insert(false);
121            }
122            _ => {}
123        }
124    }
125
126    first_access
127        .iter()
128        .filter_map(
129            |(&idx, &is_read_first)| {
130                if is_read_first { Some(idx + 1) } else { None }
131            },
132        )
133        .max()
134        .unwrap_or(0)
135}
136
137/// Core compilation: WASM ops → ARM machine code bytes + relocations
138///
139/// Returns (code_bytes, relocations) where relocations record BL instructions
140/// that target external symbols (e.g., `__meld_dispatch_import` for import calls).
141fn compile_wasm_to_arm(
142    wasm_ops: &[WasmOp],
143    config: &CompileConfig,
144) -> Result<(Vec<u8>, Vec<CodeRelocation>), String> {
145    let num_params = count_params(wasm_ops);
146
147    let bounds_config = match config.effective_safety_bounds() {
148        SafetyBounds::None => BoundsCheckConfig::None,
149        SafetyBounds::Mpu => BoundsCheckConfig::Mpu,
150        SafetyBounds::Software => BoundsCheckConfig::Software,
151        SafetyBounds::Mask => BoundsCheckConfig::Masking,
152    };
153
154    // The non-optimized (direct) instruction-selection path. Handles f32 via
155    // VFP/FPU. Used directly when `--no-optimize` is set, and as the fallback
156    // when the optimized path declines a module (see issue #120 below).
157    //
158    // VCR-RA-001 step 3b-lite (#242): a FRESH selector per attempt, with
159    // `spill_on_exhaustion` set only on the retry — the first pass is the
160    // unmodified default, so every function that compiles today is selected by
161    // exactly the code that compiled it yesterday (bit-identity is structural,
162    // not behavioural).
163    let select_direct_attempt =
164        |spill_on_exhaustion: bool| -> Result<Vec<ArmInstruction>, synth_core::Error> {
165            let db = RuleDatabase::with_standard_rules();
166            let mut selector =
167                InstructionSelector::with_bounds_check(db.rules().to_vec(), bounds_config);
168            selector.set_target(config.target.fpu, &config.target.triple);
169            if config.num_imports > 0 {
170                selector.set_num_imports(config.num_imports);
171            }
172            // #195: plumb the callee argument-count tables so the direct selector can
173            // marshal call arguments into R0–R3 per AAPCS.
174            selector.set_func_arg_counts(
175                config.func_arg_counts.clone(),
176                config.type_arg_counts.clone(),
177            );
178            // #197: in relocatable host-link mode, emit direct `func_N` BLs for
179            // imports (rewritten to the wasm field name by build_relocatable_elf)
180            // instead of `__meld_dispatch_import`.
181            selector.set_relocatable(config.relocatable);
182            // #237: native-pointer ABI — wasm statics become __synth_wasm_data-relative.
183            selector.set_native_pointer_abi(config.native_pointer_abi, config.linear_memory_bytes);
184            // #311: i64 call results are register PAIRS — tag them.
185            selector.set_result_types(config.func_ret_i64.clone(), config.type_ret_i64.clone());
186            // Stack-pointer promotion is meaningful only under the native-pointer ABI;
187            // gating here keeps every non-native compile (all frozen fixtures) on the
188            // legacy R9 globals-table path, bit-identical.
189            if config.native_pointer_abi
190                && let Some((sp_idx, sp_init)) = config.stack_pointer_global
191            {
192                selector.set_native_pointer_stack(sp_idx, sp_init);
193            }
194            selector.set_spill_on_exhaustion(spill_on_exhaustion);
195            selector.select_with_stack(wasm_ops, num_params)
196        };
197    let select_direct = || -> Result<Vec<ArmInstruction>, String> {
198        match select_direct_attempt(false) {
199            Ok(instrs) => Ok(instrs),
200            // VCR-RA-001 step 3b-lite (#242): the i32 register-exhaustion
201            // hard-fail is recoverable — retry once with spill-on-exhaustion,
202            // which reserves the spill area and spills the deepest stack value
203            // when the pool is full. Only functions that FAILED the first pass
204            // ever reach this, so existing output is untouched by construction.
205            Err(e)
206                if e.to_string()
207                    .contains("all allocatable registers are live on the stack") =>
208            {
209                select_direct_attempt(true)
210                    .map_err(|e| format!("instruction selection failed: {}", e))
211            }
212            Err(e) => Err(format!("instruction selection failed: {}", e)),
213        }
214    };
215
216    // Instruction selection: optimized or direct.
217    //
218    // #197: `--relocatable` (host-link ET_REL) forces the direct selector. The
219    // optimized path materializes an absolute linmem base (0x20000100) and does
220    // not preserve caller-saved registers across calls — both wrong for a
221    // host-linked object, where the linmem base arrives via `fp` at runtime and
222    // callees follow AAPCS. `select_with_stack` (now i64-spill capable after
223    // #171) handles fp-relative memory + caller-saved preservation correctly.
224    let arm_instrs = if config.no_optimize || config.relocatable {
225        select_direct()?
226    } else {
227        let opt_config = if config.loom_compat {
228            OptimizationConfig::loom_compat()
229        } else {
230            OptimizationConfig::all()
231        };
232
233        let mut bridge = OptimizerBridge::with_config(opt_config);
234        // #188: tell the bridge how many imports there are so it declines only
235        // LOCAL calls (and leaves import calls on the optimized path, keeping
236        // the #173 field-name relocation rewrite intact).
237        bridge.set_num_imports(config.num_imports);
238        // `ir_to_arm` now returns `Result` — an `Err` means the optimized path
239        // hit an unmapped vreg (issue-#93-class). Treat it identically to an
240        // `optimize_full` failure: fall back to the direct selector rather
241        // than propagating, so the function still compiles correctly.
242        match bridge
243            .optimize_full(wasm_ops)
244            .and_then(|(opt_ir, _cfg, _stats)| bridge.ir_to_arm(&opt_ir, num_params as usize))
245        {
246            Ok(arm_ops) => arm_ops
247                .into_iter()
248                .map(|op| ArmInstruction {
249                    op,
250                    source_line: None,
251                })
252                .collect(),
253            // Issue #120: the optimized path declines modules it cannot lower
254            // (notably scalar f32/f64 ops — the IR has no float opcodes). Fall
255            // back to the direct instruction selector, which handles f32 via
256            // VFP/FPU. This is honest degradation: the function still compiles
257            // correctly, just without IR-level optimization.
258            Err(_) => select_direct()?,
259        }
260    };
261
262    // #257/#277: `mul`+`add`→`mla` fusion is intentionally NOT wired here.
263    // The transform is correct and ready (`synth_synthesis::liveness::fuse_mul_add`,
264    // fully tested), but it is **register-allocation-coupled**: over the current
265    // greedy single-pass selector, folding `mul rM,..; add rD,rM,rX` → `mla`
266    // extends the live ranges of the mul inputs to the mla point, and the added
267    // pressure (extra moves/spills) costs more than the single-cycle MLA saves —
268    // gale measured a +2 cyc on-target REGRESSION (flat_flight 255→257, G474RE)
269    // even though it removes 2 instructions and the seam stays 0x07FDF307. So the
270    // fusion stays unwired until the spill-aware allocator (VCR-RA-001) chooses
271    // registers, at which point it becomes net-positive (per #272's plan and the
272    // wiring design note). Lesson (#277): a register-pressure-affecting transform
273    // needs an on-target/allocator-aware gate, not a byte-count gate, before it
274    // can default on.
275
276    // VCR-RA-001 const-CSE / rematerialization-avoidance (#209), the first
277    // allocator-analysis-driven CODEGEN change. Drops `movw` re-materializations
278    // of a constant already resident in another register and retargets the reads
279    // — every rewrite proven by the liveness analysis, and it ONLY removes
280    // materializations (pressure never rises), so unlike the mla fusion (#277) it
281    // cannot regress on-target. Runs on the selected stream before branch
282    // resolution (it removes instructions, shifting byte offsets). Behind
283    // `SYNTH_CONST_CSE=1` while it is validated against the differential oracle +
284    // gale's five on-target baselines; off by default keeps every fixture
285    // bit-identical.
286    let arm_instrs = if std::env::var("SYNTH_CONST_CSE").is_ok() {
287        synth_synthesis::liveness::apply_const_cse(&arm_instrs).0
288    } else {
289        arm_instrs
290    };
291
292    // VCR-RA-001 RANGE RE-ALLOCATION (#209/#242, wiring step 3a) — the first
293    // CONSEQUENTIAL allocator pass: re-colour each maximal straight-line
294    // segment over the R0-R8 pool with value ranges as the allocation unit
295    // (segment inputs + per-register live-outs pinned to their original
296    // registers, reserved R9-R12/SP identity-assigned — each segment is
297    // independently sound, no cross-segment liveness assumed). Renames
298    // registers only: never adds, removes, or reorders instructions, so
299    // labels/branch offsets are unaffected.
300    //
301    // DEFAULT-ON since v0.11.36: gale cleared the gate on-target (G474RE,
302    // #209 2026-06-10) — flag-on output byte-identical to flag-off on
303    // flat_flight/controller/control_step, fires on the filter family with
304    // zero cycle delta and a small size win, all selfchecks green on silicon.
305    // Opt out with `SYNTH_RANGE_REALLOC=0`; per-function stats with
306    // `SYNTH_REALLOC_STATS=1`.
307    //
308    // The companion dead callee-saved-save elimination (gale's "next
309    // consequential lever", same issue comment) then shrinks the prologue
310    // `push {r4-r8,lr}` / epilogue `pop {r4-r8,pc}` to the callee-saved
311    // registers the re-allocated body still touches (leaf-only,
312    // SP-untouched, even-count-padded — see shrink_callee_saved_saves):
313    // ~12 cycles of pure save/restore overhead removed on small leaves.
314    let realloc_on = std::env::var("SYNTH_RANGE_REALLOC").map_or(true, |v| v != "0");
315    let arm_instrs = if realloc_on {
316        use synth_synthesis::rules::Reg;
317        const POOL: [Reg; 9] = [
318            Reg::R0,
319            Reg::R1,
320            Reg::R2,
321            Reg::R3,
322            Reg::R4,
323            Reg::R5,
324            Reg::R6,
325            Reg::R7,
326            Reg::R8,
327        ];
328        let (out, stats) = synth_synthesis::liveness::reallocate_function(&arm_instrs, &POOL);
329        if std::env::var("SYNTH_REALLOC_STATS").is_ok() {
330            eprintln!(
331                "[range-realloc] {} segments: {} reallocated, {} declined ({} validator-rejected), {} need spill (step 4)",
332                stats.segments,
333                stats.reallocated,
334                stats.declined,
335                stats.validator_rejects,
336                stats.needs_spill
337            );
338        }
339        synth_synthesis::liveness::shrink_callee_saved_saves(&out).unwrap_or(out)
340    } else {
341        arm_instrs
342    };
343
344    // VCR-RA-001 SHADOW ALLOCATION (#209/#242): run the register allocator on
345    // the selected stream and LOG what it finds — without changing a single
346    // emitted byte. This is the measure-only bridge between the built analysis
347    // layer and the eventual virtual-register wiring: it shows, per real
348    // function, whether the allocator can colour it within the R0–R8 pool and
349    // how much const-CSE / rematerialization headroom exists (#209). Enable with
350    // `SYNTH_SHADOW_ALLOC=1`; off by default and side-effect-free either way.
351    if std::env::var("SYNTH_SHADOW_ALLOC").is_ok() {
352        use synth_synthesis::liveness::{
353            AllocationOutcome, allocate_function, function_peak_pressure,
354        };
355        // R9 globals / R10 mem-size / R11 mem-base / R12 IP-scratch are reserved;
356        // pin them above the 0..9 allocatable pool so the colourer keeps R0–R8.
357        let precolored = std::collections::BTreeMap::from([
358            (synth_synthesis::rules::Reg::R9, 9usize),
359            (synth_synthesis::rules::Reg::R10, 10),
360            (synth_synthesis::rules::Reg::R11, 11),
361            (synth_synthesis::rules::Reg::R12, 12),
362        ]);
363        // True VALUE pressure (one node per value, not per reused physical reg):
364        // a NeedsSpill with peak ≤ 9 is a SPURIOUS physical-register spill — the
365        // function fits once virtually allocated.
366        let peak = function_peak_pressure(&arm_instrs);
367        match allocate_function(&arm_instrs, 9, &precolored) {
368            AllocationOutcome::Allocated {
369                remat_opportunities,
370                coloring,
371            } => eprintln!(
372                "[shadow-alloc] OK: {} pregs coloured within R0-R8 pool, peak value-pressure {}, {} const-CSE/remat opportunities",
373                coloring.len(),
374                peak,
375                remat_opportunities
376            ),
377            AllocationOutcome::NeedsSpill(s) => eprintln!(
378                "[shadow-alloc] physical-graph would spill {:?}, but peak value-pressure is {} (≤9 ⇒ spurious; fits once virtually allocated)",
379                s, peak
380            ),
381            AllocationOutcome::Declined => {
382                eprintln!(
383                    "[shadow-alloc] declined (unmodeled construct — calls/i64/fp/offset-branch)"
384                )
385            }
386        }
387    }
388
389    // ISA feature gate: validate that all generated instructions are supported
390    // by the target. This catches FPU instructions on no-FPU targets, double-precision
391    // instructions on single-precision targets, etc.
392    validate_instructions(&arm_instrs, config.target.fpu, &config.target.triple)
393        .map_err(|e| format!("ISA validation failed: {}", e))?;
394
395    // Encode to binary — use Thumb-2 for Cortex-M targets
396    let use_thumb2 = matches!(config.target.isa, IsaVariant::Thumb2 | IsaVariant::Thumb);
397
398    let encoder = if use_thumb2 {
399        ArmEncoder::new_thumb2_with_fpu(config.target.fpu)
400    } else {
401        ArmEncoder::new_arm32()
402    };
403
404    // #202: resolve local label branches (Bcc/B/Bhs/Blo) to byte-accurate
405    // offsets before encoding. `select_with_stack` emits them as label
406    // placeholders and never resolves them — without this they encode as
407    // `bne.n #0` and land mid-instruction whenever a 32-bit Thumb-2 instruction
408    // sits between the branch and its target (UsageFault on real hardware).
409    // Only meaningful for Thumb-2 (the offset units are halfword/PC+4).
410    let arm_instrs = if use_thumb2 {
411        resolve_label_branches(arm_instrs, &encoder)?
412    } else {
413        arm_instrs
414    };
415
416    let mut code = Vec::new();
417    let mut relocations = Vec::new();
418
419    for instr in &arm_instrs {
420        // Record a relocation for every BL: the encoder emits `bl #0` and
421        // relies on a relocation to patch the target. This covers BOTH import
422        // dispatch stubs (`__meld_*`, undefined externals) AND internal calls
423        // (`func_N`, defined in this object). Previously only `__meld_*` was
424        // recorded, so internal `BL func_N` calls were left as unpatched
425        // `bl #0` placeholders branching to a garbage address (#167).
426        if let ArmOp::Bl { label } = &instr.op {
427            relocations.push(CodeRelocation {
428                offset: code.len() as u32,
429                symbol: label.clone(),
430                kind: synth_core::backend::RelocKind::ThmCall,
431            });
432        }
433        // #237: symbol-relative MOVW/MOVT (the `--native-pointer-abi` static-data
434        // addressing). The encoder writes the addend in place; record the matching
435        // R_ARM_MOVW_ABS_NC / R_ARM_MOVT_ABS so the linker adds the symbol address.
436        if let ArmOp::MovwSym { symbol, .. } = &instr.op {
437            relocations.push(CodeRelocation {
438                offset: code.len() as u32,
439                symbol: symbol.clone(),
440                kind: synth_core::backend::RelocKind::MovwAbs,
441            });
442        }
443        if let ArmOp::MovtSym { symbol, .. } = &instr.op {
444            relocations.push(CodeRelocation {
445                offset: code.len() as u32,
446                symbol: symbol.clone(),
447                kind: synth_core::backend::RelocKind::MovtAbs,
448            });
449        }
450
451        let encoded = encoder
452            .encode(&instr.op)
453            .map_err(|e| format!("ARM encoding failed: {}", e))?;
454        code.extend_from_slice(&encoded);
455    }
456
457    Ok((code, relocations))
458}
459
460/// Resolve local label branches to byte-accurate offsets (#202).
461///
462/// `select_with_stack` emits conditional/unconditional branches as label
463/// placeholders (`Bcc`/`B`/`Bhs`/`Blo` + `Label`) and never resolves them; the
464/// encoder then emits a `0xD000`/`0xE000` placeholder with offset 0. Before #197
465/// this path only ran for `--no-optimize`/declined functions, so the latent bug
466/// stayed hidden — routing relocatable code through it surfaced branches that
467/// land mid-instruction (a Cortex-M UsageFault) whenever a 32-bit Thumb-2
468/// instruction sits between the branch and its target.
469///
470/// This pass encodes each instruction to learn its real byte length (so 16- vs
471/// 32-bit forms and multi-instruction expansions are exact), maps each `Label`
472/// to its byte position, and rewrites every label branch to the displacement
473/// the encoder consumes: `(target - branch - 4) / 2` halfwords. A bounded
474/// fixed-point handles an offset growing a branch from 16- to 32-bit (which
475/// shifts later positions). `BCondOffset`/`BOffset` already produced inline by
476/// the optimized path carry no label and are left untouched.
477fn resolve_label_branches(
478    arm_instrs: Vec<ArmInstruction>,
479    encoder: &ArmEncoder,
480) -> Result<Vec<ArmInstruction>, String> {
481    use std::collections::HashMap;
482    use synth_synthesis::Condition;
483
484    enum BKind {
485        Cond(Condition),
486        Uncond,
487    }
488    // Record each label branch ONCE — indices are stable across iterations.
489    let mut branches: Vec<(usize, BKind, String)> = Vec::new();
490    for (i, instr) in arm_instrs.iter().enumerate() {
491        match &instr.op {
492            ArmOp::Bcc { cond, label } => branches.push((i, BKind::Cond(*cond), label.clone())),
493            ArmOp::Bhs { label } => branches.push((i, BKind::Cond(Condition::HS), label.clone())),
494            ArmOp::Blo { label } => branches.push((i, BKind::Cond(Condition::LO), label.clone())),
495            ArmOp::B { label } => branches.push((i, BKind::Uncond, label.clone())),
496            _ => {}
497        }
498    }
499    if branches.is_empty() {
500        return Ok(arm_instrs);
501    }
502
503    let mut resolved = arm_instrs;
504    // Sizes only grow (16→32-bit), so this converges quickly; cap for safety.
505    for _ in 0..16 {
506        // 1. Byte position of each instruction (Label encodes to 0 bytes).
507        let mut positions = Vec::with_capacity(resolved.len());
508        let mut pos: i64 = 0;
509        for instr in &resolved {
510            positions.push(pos);
511            pos += encoder
512                .encode(&instr.op)
513                .map_err(|e| format!("branch-resolve size probe failed: {}", e))?
514                .len() as i64;
515        }
516        // 2. Label name -> byte position (owned keys so the borrow ends here).
517        let mut labels: HashMap<String, i64> = HashMap::new();
518        for (i, instr) in resolved.iter().enumerate() {
519            if let ArmOp::Label { name } = &instr.op {
520                labels.insert(name.clone(), positions[i]);
521            }
522        }
523        // 3. Rewrite each branch to its byte-accurate offset.
524        let mut changed = false;
525        for (idx, kind, label) in &branches {
526            // A label not defined locally is an EXTERNAL target (e.g.
527            // `Trap_Handler` resolved by a relocation / the vector table). Leave
528            // such branches as their placeholder for the existing relocation
529            // path — only local control-flow labels are byte-resolved here.
530            let Some(&target) = labels.get(label) else {
531                continue;
532            };
533            // Encoder consumes the field as (target - branch - 4) / 2 halfwords.
534            // Positions are always even, so this division is exact.
535            let halfword_offset = ((target - positions[*idx] - 4) / 2) as i32;
536            let new_op = match kind {
537                BKind::Cond(c) => ArmOp::BCondOffset {
538                    cond: *c,
539                    offset: halfword_offset,
540                },
541                BKind::Uncond => ArmOp::BOffset {
542                    offset: halfword_offset,
543                },
544            };
545            if resolved[*idx].op != new_op {
546                resolved[*idx].op = new_op;
547                changed = true;
548            }
549        }
550        if !changed {
551            break;
552        }
553    }
554    Ok(resolved)
555}
556
557#[cfg(test)]
558mod tests {
559    use super::*;
560
561    #[test]
562    fn test_arm_backend_name() {
563        let backend = ArmBackend::new();
564        assert_eq!(backend.name(), "arm");
565        assert!(backend.is_available());
566    }
567
568    #[test]
569    fn test_arm_backend_capabilities() {
570        let backend = ArmBackend::new();
571        let caps = backend.capabilities();
572        assert!(!caps.produces_elf);
573        assert!(caps.supports_rule_verification);
574        assert!(!caps.is_external);
575    }
576
577    #[test]
578    fn test_compile_add_function() {
579        let backend = ArmBackend::new();
580        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
581        let config = CompileConfig::default();
582
583        let result = backend.compile_function("add", &ops, &config);
584        assert!(result.is_ok());
585
586        let func = result.unwrap();
587        assert_eq!(func.name, "add");
588        assert!(!func.code.is_empty());
589        assert_eq!(func.wasm_ops, ops);
590    }
591
592    #[test]
593    fn test_count_params() {
594        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
595        assert_eq!(count_params(&ops), 2);
596
597        let no_params = vec![WasmOp::I32Const(5), WasmOp::I32Const(3), WasmOp::I32Add];
598        assert_eq!(count_params(&no_params), 0);
599    }
600
601    #[test]
602    fn test_arm_backend_register() {
603        let mut registry = synth_core::BackendRegistry::new();
604        registry.register(Box::new(ArmBackend::new()));
605        assert!(registry.get("arm").is_some());
606        assert_eq!(registry.available().len(), 1);
607    }
608
609    #[test]
610    fn test_compile_import_call_produces_relocations() {
611        let backend = ArmBackend::new();
612        // Simulate a WASM module where func index 0 is an import.
613        // Call(0) should generate MOV R0, #0; BL __meld_dispatch_import
614        let ops = vec![WasmOp::Call(0)];
615        let config = CompileConfig {
616            num_imports: 1,
617            no_optimize: true, // Direct instruction selection to preserve Call semantics
618            ..CompileConfig::default()
619        };
620
621        let result = backend.compile_function("caller", &ops, &config);
622        assert!(result.is_ok());
623
624        let func = result.unwrap();
625        assert!(!func.code.is_empty());
626        assert_eq!(func.relocations.len(), 1);
627        assert_eq!(func.relocations[0].symbol, "__meld_dispatch_import");
628        // The BL is the second instruction (after MOV R0, #0), so offset should be > 0
629        assert!(func.relocations[0].offset > 0);
630    }
631
632    /// Regression test for #197: in `relocatable` mode, an import call must
633    /// relocate against the direct `func_N` symbol (rewritten to the wasm field
634    /// name by `build_relocatable_elf`), NOT `__meld_dispatch_import`. This is
635    /// the ABI half of the #197 fix — without it, a host linker cannot resolve
636    /// the call to the real kernel symbol (e.g. `k_spin_lock`).
637    #[test]
638    fn test_compile_relocatable_import_uses_direct_func_symbol_197() {
639        let backend = ArmBackend::new();
640        let ops = vec![WasmOp::Call(0)]; // func 0 is an import
641        let config = CompileConfig {
642            num_imports: 1,
643            relocatable: true,
644            ..CompileConfig::default()
645        };
646
647        let func = backend
648            .compile_function("caller", &ops, &config)
649            .expect("relocatable import call compiles");
650
651        assert_eq!(func.relocations.len(), 1);
652        assert_eq!(
653            func.relocations[0].symbol, "func_0",
654            "#197: relocatable import must relocate against func_0 (→ field name), not Meld dispatch"
655        );
656    }
657
658    #[test]
659    fn test_compile_no_imports_no_relocations() {
660        let backend = ArmBackend::new();
661        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
662        let config = CompileConfig::default();
663
664        let func = backend.compile_function("add", &ops, &config).unwrap();
665        assert!(func.relocations.is_empty());
666    }
667
668    /// Regression test for #167: a call to an INTERNAL function
669    /// (index `>= num_imports`) must record a relocation against `func_{index}`.
670    /// Before the fix, only `__meld_*` (import) BLs were relocated, so
671    /// internal `BL func_N` was emitted as an unpatched `bl #0` branching
672    /// to a garbage address — making the object non-linkable. This test
673    /// would have caught that regression.
674    #[test]
675    fn test_compile_internal_call_produces_relocation_167() {
676        let backend = ArmBackend::new();
677        // num_imports = 1, so Call(2) is an INTERNAL call → `BL func_2`.
678        let ops = vec![WasmOp::Call(2)];
679        let config = CompileConfig {
680            num_imports: 1,
681            no_optimize: true,
682            ..CompileConfig::default()
683        };
684
685        let func = backend
686            .compile_function("caller", &ops, &config)
687            .expect("internal call compiles");
688
689        assert_eq!(
690            func.relocations.len(),
691            1,
692            "an internal call must emit exactly one relocation (#167)"
693        );
694        assert_eq!(
695            func.relocations[0].symbol, "func_2",
696            "internal call must relocate against the callee's func_{{index}} symbol (#167)"
697        );
698    }
699
700    // ─── Phase 1 safety-bounds plumbing for ARM ──────────────────────────
701
702    #[test]
703    fn arm_safety_bounds_mpu_emits_same_code_as_none() {
704        // Mpu mode must not introduce any inline check on ARM — the MPU
705        // handles faults via hardware. The encoded bytes for an i32.load
706        // should be identical between None and Mpu.
707        let backend = ArmBackend::new();
708        let ops = vec![
709            WasmOp::LocalGet(0),
710            WasmOp::I32Load {
711                offset: 0,
712                align: 2,
713            },
714        ];
715        let cfg_none = CompileConfig {
716            no_optimize: true,
717            ..Default::default()
718        };
719        let cfg_mpu = CompileConfig {
720            no_optimize: true,
721            safety_bounds: SafetyBounds::Mpu,
722            ..Default::default()
723        };
724        let n = backend.compile_function("ld", &ops, &cfg_none).unwrap();
725        let m = backend.compile_function("ld", &ops, &cfg_mpu).unwrap();
726        assert_eq!(
727            n.code, m.code,
728            "Mpu and None should produce identical ARM bytes (Mpu relies on hardware)"
729        );
730    }
731
732    #[test]
733    fn arm_legacy_bounds_check_still_emits_software_check() {
734        // Legacy CLI users with `--bounds-check` should keep getting the
735        // software path even though the new SafetyBounds field defaults to None.
736        let backend = ArmBackend::new();
737        let ops = vec![
738            WasmOp::LocalGet(0),
739            WasmOp::I32Load {
740                offset: 0,
741                align: 2,
742            },
743        ];
744        let cfg_legacy = CompileConfig {
745            no_optimize: true,
746            bounds_check: true,
747            ..Default::default()
748        };
749        let cfg_software = CompileConfig {
750            no_optimize: true,
751            safety_bounds: SafetyBounds::Software,
752            ..Default::default()
753        };
754        let l = backend.compile_function("ld", &ops, &cfg_legacy).unwrap();
755        let s = backend.compile_function("ld", &ops, &cfg_software).unwrap();
756        assert_eq!(
757            l.code, s.code,
758            "--bounds-check should produce the same bytes as --safety-bounds=software"
759        );
760    }
761
762    // ========================================================================
763    // ISA feature gate tests — ensure the compiler never emits unsupported
764    // instructions for a given target
765    // ========================================================================
766
767    #[test]
768    fn test_f32_rejected_on_cortex_m3_no_fpu() {
769        let backend = ArmBackend::new();
770        let ops = vec![WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
771        let config = CompileConfig {
772            target: TargetSpec::cortex_m3(),
773            no_optimize: true,
774            ..CompileConfig::default()
775        };
776
777        let result = backend.compile_function("fadd", &ops, &config);
778        assert!(
779            result.is_err(),
780            "f32 operations should fail on Cortex-M3 (no FPU)"
781        );
782    }
783
784    #[test]
785    fn test_f32_accepted_on_cortex_m4f() {
786        let backend = ArmBackend::new();
787        let ops = vec![WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
788        let config = CompileConfig {
789            target: TargetSpec::cortex_m4f(),
790            no_optimize: true,
791            ..CompileConfig::default()
792        };
793
794        let result = backend.compile_function("fadd", &ops, &config);
795        assert!(
796            result.is_ok(),
797            "f32 operations should succeed on Cortex-M4F, got: {:?}",
798            result.unwrap_err()
799        );
800    }
801
802    #[test]
803    fn test_i32_works_on_all_targets() {
804        let backend = ArmBackend::new();
805        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
806
807        // Cortex-M3 (no FPU)
808        let config_m3 = CompileConfig {
809            target: TargetSpec::cortex_m3(),
810            no_optimize: true,
811            ..CompileConfig::default()
812        };
813        assert!(
814            backend.compile_function("add", &ops, &config_m3).is_ok(),
815            "i32 ops should work on Cortex-M3"
816        );
817
818        // Cortex-M4F (single FPU)
819        let config_m4f = CompileConfig {
820            target: TargetSpec::cortex_m4f(),
821            no_optimize: true,
822            ..CompileConfig::default()
823        };
824        assert!(
825            backend.compile_function("add", &ops, &config_m4f).is_ok(),
826            "i32 ops should work on Cortex-M4F"
827        );
828
829        // Cortex-M7DP (double FPU)
830        let config_m7dp = CompileConfig {
831            target: TargetSpec::cortex_m7dp(),
832            no_optimize: true,
833            ..CompileConfig::default()
834        };
835        assert!(
836            backend.compile_function("add", &ops, &config_m7dp).is_ok(),
837            "i32 ops should work on Cortex-M7DP"
838        );
839    }
840
841    #[test]
842    fn test_f32_rejected_on_cortex_m4_no_fpu() {
843        // Cortex-M4 (without F suffix) has no FPU
844        let backend = ArmBackend::new();
845        let ops = vec![WasmOp::F32Const(1.5), WasmOp::F32Const(2.5), WasmOp::F32Mul];
846        let config = CompileConfig {
847            target: TargetSpec::cortex_m4(),
848            no_optimize: true,
849            ..CompileConfig::default()
850        };
851
852        let result = backend.compile_function("fmul", &ops, &config);
853        assert!(
854            result.is_err(),
855            "f32 operations should fail on Cortex-M4 (no FPU)"
856        );
857    }
858
859    // ========================================================================
860    // Issue #120 — f32 ops in the optimized lowering path
861    //
862    // `OptimizerBridge::wasm_to_ir` has no handlers for f32/f64 ops, so a
863    // value-producing float op fell through to `Opcode::Nop`, leaving a
864    // downstream consumer with an unmapped vreg and tripping the PR #101
865    // defensive panic in `ir_to_arm`. Customer reproducer: `compiler_builtins
866    // float::div` and `gale_compute_ipi_mask` in the `falcon-rate-component`
867    // module.
868    //
869    // Fix: `optimize_full` declines float modules with a typed `Err`;
870    // `compile_wasm_to_arm` falls back to the non-optimized `select_with_stack`
871    // path, which handles f32 via VFP/FPU. These tests use the *default*
872    // (optimized) config — `no_optimize` is NOT set — which is the exact
873    // configuration that panicked pre-fix.
874    // ========================================================================
875
876    /// Pre-fix: this panicked with "vreg vN has no assigned ARM register and
877    /// no spill slot" inside `ir_to_arm`. Post-fix: the optimized path declines
878    /// the module and the backend falls back to direct selection, producing a
879    /// non-empty f32.div lowering on a Cortex-M4F.
880    #[test]
881    fn test_issue120_f32_div_compiles_via_optimized_default() {
882        let backend = ArmBackend::new();
883        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Div];
884        let config = CompileConfig {
885            target: TargetSpec::cortex_m4f(),
886            // no_optimize NOT set — this exercises the optimized path that
887            // panicked in issue #120, then the fallback to direct selection.
888            ..CompileConfig::default()
889        };
890
891        let result = backend.compile_function("fdiv", &ops, &config);
892        assert!(
893            result.is_ok(),
894            "f32.div must compile on Cortex-M4F via the optimized->direct \
895             fallback (issue #120), got: {:?}",
896            result.as_ref().err()
897        );
898        assert!(
899            !result.unwrap().code.is_empty(),
900            "f32.div must produce non-empty machine code"
901        );
902    }
903
904    /// A spread of f32 ops, all through the optimized (default) config, must
905    /// compile via the fallback on an FPU target without panicking.
906    #[test]
907    fn test_issue120_assorted_f32_ops_compile_via_optimized_default() {
908        let backend = ArmBackend::new();
909        let config = CompileConfig {
910            target: TargetSpec::cortex_m4f(),
911            ..CompileConfig::default()
912        };
913
914        let cases: Vec<(&str, Vec<WasmOp>)> = vec![
915            (
916                "fadd",
917                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Add],
918            ),
919            (
920                "fmul",
921                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Mul],
922            ),
923            (
924                "fsub",
925                vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Sub],
926            ),
927        ];
928
929        for (name, ops) in cases {
930            let result = backend.compile_function(name, &ops, &config);
931            assert!(
932                result.is_ok(),
933                "{name} must compile via the optimized->direct fallback \
934                 (issue #120), got: {:?}",
935                result.as_ref().err()
936            );
937            assert!(
938                !result.unwrap().code.is_empty(),
939                "{name} must produce non-empty machine code"
940            );
941        }
942    }
943
944    /// The fallback must still honor the ISA feature gate: f32 on a no-FPU
945    /// target must fail cleanly (not panic) even on the optimized path.
946    #[test]
947    fn test_issue120_f32_div_rejected_on_no_fpu_via_optimized() {
948        let backend = ArmBackend::new();
949        let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Div];
950        let config = CompileConfig {
951            target: TargetSpec::cortex_m3(),
952            ..CompileConfig::default()
953        };
954
955        let result = backend.compile_function("fdiv", &ops, &config);
956        assert!(
957            result.is_err(),
958            "f32.div must be rejected on Cortex-M3 (no FPU), not panic"
959        );
960    }
961
962    /// Issue #94: end-to-end byte-size check for the canonical u64-packed
963    /// FFI-return hi32 extract pattern. Compiles two near-identical
964    /// functions — one with the optimized shift-by-32, one with a generic
965    /// shift-by-7 — and asserts the optimized form is meaningfully smaller.
966    #[test]
967    fn test_issue94_hi32_extract_is_smaller_than_generic_shift() {
968        let backend = ArmBackend::new();
969        let config = CompileConfig {
970            target: TargetSpec::cortex_m4f(),
971            ..CompileConfig::default()
972        };
973
974        // Optimized path: `(local.get 0) >>> 32; wrap_i64`
975        let ops_hi32 = vec![
976            WasmOp::LocalGet(0), // i64 param in R0:R1
977            WasmOp::I64Const(32),
978            WasmOp::I64ShrU,
979            WasmOp::I32WrapI64,
980        ];
981        let func_hi32 = backend
982            .compile_function("hi32_extract", &ops_hi32, &config)
983            .unwrap();
984
985        // Generic path: `(local.get 0) >>> 7; wrap_i64` — same shape, but the
986        // shift amount is not a multiple of 32, so it falls through to the
987        // 38-byte runtime shift.
988        let ops_generic = vec![
989            WasmOp::LocalGet(0),
990            WasmOp::I64Const(7),
991            WasmOp::I64ShrU,
992            WasmOp::I32WrapI64,
993        ];
994        let func_generic = backend
995            .compile_function("generic_shr", &ops_generic, &config)
996            .unwrap();
997
998        let bytes_hi32 = func_hi32.code.len();
999        let bytes_generic = func_generic.code.len();
1000        println!(
1001            "\n[issue #94] hi32 extract: {} bytes (vs generic shift: {} bytes; saved {})",
1002            bytes_hi32,
1003            bytes_generic,
1004            bytes_generic.saturating_sub(bytes_hi32)
1005        );
1006        let hex: String = func_hi32
1007            .code
1008            .iter()
1009            .map(|b| format!("{:02x}", b))
1010            .collect::<Vec<_>>()
1011            .join(" ");
1012        println!("[issue #94] hi32 bytes: {}", hex);
1013        // We expect the optimized form to be at least 30 bytes smaller than
1014        // the generic 64-bit shift sequence. (Empirically: 14 vs 50 bytes.)
1015        assert!(
1016            bytes_hi32 + 30 <= bytes_generic,
1017            "issue #94: hi32 extract = {} bytes, generic shift = {} bytes; \
1018             expected optimized form to be at least 30 bytes smaller",
1019            bytes_hi32,
1020            bytes_generic,
1021        );
1022    }
1023}