synth_backend/arm_backend.rs
1//! ARM Backend — wraps the instruction selector + optimizer + encoder as a Backend
2//!
3//! This is Synth's custom ARM compiler targeting Cortex-M (Thumb-2).
4//! It's the only backend that supports per-rule formal verification (ASIL D path).
5
6use crate::ArmEncoder;
7use synth_core::backend::{
8 Backend, BackendCapabilities, BackendError, CodeRelocation, CompilationResult, CompileConfig,
9 CompiledFunction, SafetyBounds,
10};
11use synth_core::target::{IsaVariant, TargetSpec};
12use synth_core::wasm_decoder::DecodedModule;
13use synth_core::wasm_op::WasmOp;
14use synth_synthesis::{
15 ArmInstruction, ArmOp, BoundsCheckConfig, InstructionSelector, OptimizationConfig,
16 OptimizerBridge, RuleDatabase, validate_instructions,
17};
18
/// Synth's in-house ARM Cortex-M code generator, exposed as a [`Backend`].
///
/// The type is a field-less unit struct: it carries no state, so every value
/// of it is interchangeable.
pub struct ArmBackend;

impl ArmBackend {
    /// Builds the backend handle.
    pub fn new() -> Self {
        ArmBackend
    }
}

impl Default for ArmBackend {
    /// Produces the same value as [`ArmBackend::new`].
    fn default() -> Self {
        ArmBackend
    }
}
33
34impl Backend for ArmBackend {
35 fn name(&self) -> &str {
36 "arm"
37 }
38
39 fn capabilities(&self) -> BackendCapabilities {
40 BackendCapabilities {
41 produces_elf: false,
42 supports_rule_verification: true,
43 supports_binary_verification: true,
44 is_external: false,
45 }
46 }
47
48 fn supported_targets(&self) -> Vec<TargetSpec> {
49 vec![
50 TargetSpec::cortex_m3(),
51 TargetSpec::cortex_m4(),
52 TargetSpec::cortex_m4f(),
53 TargetSpec::cortex_m7(),
54 TargetSpec::cortex_m7dp(),
55 ]
56 }
57
58 fn compile_module(
59 &self,
60 module: &DecodedModule,
61 config: &CompileConfig,
62 ) -> Result<CompilationResult, BackendError> {
63 let exports: Vec<_> = module
64 .functions
65 .iter()
66 .filter(|f| f.export_name.is_some())
67 .collect();
68
69 if exports.is_empty() {
70 return Err(BackendError::CompilationFailed(
71 "no exported functions found".into(),
72 ));
73 }
74
75 let mut functions = Vec::new();
76 for func in &exports {
77 let name = func.export_name.clone().unwrap();
78 let compiled = self.compile_function(&name, &func.ops, config)?;
79 functions.push(compiled);
80 }
81
82 Ok(CompilationResult {
83 functions,
84 elf: None,
85 backend_name: self.name().to_string(),
86 })
87 }
88
89 fn compile_function(
90 &self,
91 name: &str,
92 ops: &[WasmOp],
93 config: &CompileConfig,
94 ) -> Result<CompiledFunction, BackendError> {
95 let (code, relocations) =
96 compile_wasm_to_arm(ops, config).map_err(BackendError::CompilationFailed)?;
97
98 Ok(CompiledFunction {
99 name: name.to_string(),
100 code,
101 wasm_ops: ops.to_vec(),
102 relocations,
103 })
104 }
105
106 fn is_available(&self) -> bool {
107 true // Always available — it's a library backend
108 }
109}
110
111/// Count the number of function parameters by analyzing LocalGet patterns
112fn count_params(wasm_ops: &[WasmOp]) -> u32 {
113 let mut first_access: std::collections::HashMap<u32, bool> = std::collections::HashMap::new();
114 for op in wasm_ops {
115 match op {
116 WasmOp::LocalGet(idx) => {
117 first_access.entry(*idx).or_insert(true);
118 }
119 WasmOp::LocalSet(idx) | WasmOp::LocalTee(idx) => {
120 first_access.entry(*idx).or_insert(false);
121 }
122 _ => {}
123 }
124 }
125
126 first_access
127 .iter()
128 .filter_map(
129 |(&idx, &is_read_first)| {
130 if is_read_first { Some(idx + 1) } else { None }
131 },
132 )
133 .max()
134 .unwrap_or(0)
135}
136
137/// Core compilation: WASM ops → ARM machine code bytes + relocations
138///
139/// Returns (code_bytes, relocations) where relocations record BL instructions
140/// that target external symbols (e.g., `__meld_dispatch_import` for import calls).
141fn compile_wasm_to_arm(
142 wasm_ops: &[WasmOp],
143 config: &CompileConfig,
144) -> Result<(Vec<u8>, Vec<CodeRelocation>), String> {
145 let num_params = count_params(wasm_ops);
146
147 let bounds_config = match config.effective_safety_bounds() {
148 SafetyBounds::None => BoundsCheckConfig::None,
149 SafetyBounds::Mpu => BoundsCheckConfig::Mpu,
150 SafetyBounds::Software => BoundsCheckConfig::Software,
151 SafetyBounds::Mask => BoundsCheckConfig::Masking,
152 };
153
154 // The non-optimized (direct) instruction-selection path. Handles f32 via
155 // VFP/FPU. Used directly when `--no-optimize` is set, and as the fallback
156 // when the optimized path declines a module (see issue #120 below).
157 //
158 // VCR-RA-001 step 3b-lite (#242): a FRESH selector per attempt, with
159 // `spill_on_exhaustion` set only on the retry — the first pass is the
160 // unmodified default, so every function that compiles today is selected by
161 // exactly the code that compiled it yesterday (bit-identity is structural,
162 // not behavioural).
163 let select_direct_attempt = |spill_on_exhaustion: bool,
164 param_backing_on_exhaustion: bool|
165 -> Result<Vec<ArmInstruction>, synth_core::Error> {
166 let db = RuleDatabase::with_standard_rules();
167 let mut selector =
168 InstructionSelector::with_bounds_check(db.rules().to_vec(), bounds_config);
169 selector.set_target(config.target.fpu, &config.target.triple);
170 if config.num_imports > 0 {
171 selector.set_num_imports(config.num_imports);
172 }
173 // #195: plumb the callee argument-count tables so the direct selector can
174 // marshal call arguments into R0–R3 per AAPCS.
175 selector.set_func_arg_counts(
176 config.func_arg_counts.clone(),
177 config.type_arg_counts.clone(),
178 );
179 // #197: in relocatable host-link mode, emit direct `func_N` BLs for
180 // imports (rewritten to the wasm field name by build_relocatable_elf)
181 // instead of `__meld_dispatch_import`.
182 selector.set_relocatable(config.relocatable);
183 // #237: native-pointer ABI — wasm statics become __synth_wasm_data-relative.
184 selector.set_native_pointer_abi(config.native_pointer_abi, config.linear_memory_bytes);
185 // #311: i64 call results are register PAIRS — tag them.
186 selector.set_result_types(config.func_ret_i64.clone(), config.type_ret_i64.clone());
187 // Stack-pointer promotion is meaningful only under the native-pointer ABI;
188 // gating here keeps every non-native compile (all frozen fixtures) on the
189 // legacy R9 globals-table path, bit-identical.
190 if config.native_pointer_abi
191 && let Some((sp_idx, sp_init)) = config.stack_pointer_global
192 {
193 selector.set_native_pointer_stack(sp_idx, sp_init);
194 }
195 selector.set_spill_on_exhaustion(spill_on_exhaustion);
196 selector.set_param_backing_on_exhaustion(param_backing_on_exhaustion);
197 selector.select_with_stack(wasm_ops, num_params)
198 };
199 let select_direct = || -> Result<Vec<ArmInstruction>, String> {
200 // The two recoverable exhaustion classes. NOT retried: the i64
201 // spill-slot-pool Err ("spill-slot pool exhausted") — the honest
202 // remaining bound of the 3b-lite allocator.
203 const SINGLE_EXHAUSTION: &str = "all allocatable registers are live on the stack";
204 const PAIR_EXHAUSTION: &str = "no consecutive pair of free registers for i64";
205 let mut attempt = select_direct_attempt(false, false);
206 // VCR-RA-001 step 3b-lite (#242): the i32 register-exhaustion
207 // hard-fail is recoverable — retry with spill-on-exhaustion, which
208 // reserves the spill area and spills the deepest stack value when the
209 // pool is full. Only functions that FAILED the first pass ever reach
210 // this, so existing output is untouched by construction.
211 if let Err(e) = &attempt
212 && e.to_string().contains(SINGLE_EXHAUSTION)
213 {
214 attempt = select_direct_attempt(true, false);
215 }
216 // VCR-RA-001 acceptance increment (#242): the i64 consecutive-PAIR
217 // exhaustion is recoverable too — but not by stack spilling (the pair
218 // allocator already spills stack values, #171): the blockers are the
219 // pinned param home registers. The final retry frame-backs the params
220 // (#204 machinery) so they stop pinning R0-R3, with spill-on-exhaustion
221 // kept on for the single-register pressure the reloads add. Reached
222 // only by functions that failed every earlier pass.
223 if let Err(e) = &attempt
224 && e.to_string().contains(PAIR_EXHAUSTION)
225 {
226 attempt = select_direct_attempt(true, true);
227 }
228 attempt.map_err(|e| format!("instruction selection failed: {}", e))
229 };
230
231 // Instruction selection: optimized or direct.
232 //
233 // #197: `--relocatable` (host-link ET_REL) forces the direct selector. The
234 // optimized path materializes an absolute linmem base (0x20000100) and does
235 // not preserve caller-saved registers across calls — both wrong for a
236 // host-linked object, where the linmem base arrives via `fp` at runtime and
237 // callees follow AAPCS. `select_with_stack` (now i64-spill capable after
238 // #171) handles fp-relative memory + caller-saved preservation correctly.
239 let arm_instrs = if config.no_optimize || config.relocatable {
240 select_direct()?
241 } else {
242 let opt_config = if config.loom_compat {
243 OptimizationConfig::loom_compat()
244 } else {
245 OptimizationConfig::all()
246 };
247
248 let mut bridge = OptimizerBridge::with_config(opt_config);
249 // #188: tell the bridge how many imports there are so it declines only
250 // LOCAL calls (and leaves import calls on the optimized path, keeping
251 // the #173 field-name relocation rewrite intact).
252 bridge.set_num_imports(config.num_imports);
253 // `ir_to_arm` now returns `Result` — an `Err` means the optimized path
254 // hit an unmapped vreg (issue-#93-class). Treat it identically to an
255 // `optimize_full` failure: fall back to the direct selector rather
256 // than propagating, so the function still compiles correctly.
257 match bridge
258 .optimize_full(wasm_ops)
259 .and_then(|(opt_ir, _cfg, _stats)| bridge.ir_to_arm(&opt_ir, num_params as usize))
260 {
261 Ok(arm_ops) => arm_ops
262 .into_iter()
263 .map(|op| ArmInstruction {
264 op,
265 source_line: None,
266 })
267 .collect(),
268 // Issue #120: the optimized path declines modules it cannot lower
269 // (notably scalar f32/f64 ops — the IR has no float opcodes). Fall
270 // back to the direct instruction selector, which handles f32 via
271 // VFP/FPU. This is honest degradation: the function still compiles
272 // correctly, just without IR-level optimization.
273 Err(_) => select_direct()?,
274 }
275 };
276
277 // #257/#277: `mul`+`add`→`mla` fusion is intentionally NOT wired here.
278 // The transform is correct and ready (`synth_synthesis::liveness::fuse_mul_add`,
279 // fully tested), but it is **register-allocation-coupled**: over the current
280 // greedy single-pass selector, folding `mul rM,..; add rD,rM,rX` → `mla`
281 // extends the live ranges of the mul inputs to the mla point, and the added
282 // pressure (extra moves/spills) costs more than the single-cycle MLA saves —
283 // gale measured a +2 cyc on-target REGRESSION (flat_flight 255→257, G474RE)
284 // even though it removes 2 instructions and the seam stays 0x07FDF307. So the
285 // fusion stays unwired until the spill-aware allocator (VCR-RA-001) chooses
286 // registers, at which point it becomes net-positive (per #272's plan and the
287 // wiring design note). Lesson (#277): a register-pressure-affecting transform
288 // needs an on-target/allocator-aware gate, not a byte-count gate, before it
289 // can default on.
290
291 // VCR-RA-001 const-CSE / rematerialization-avoidance (#209), the first
292 // allocator-analysis-driven CODEGEN change. Drops `movw` re-materializations
293 // of a constant already resident in another register and retargets the reads
294 // — every rewrite proven by the liveness analysis, and it ONLY removes
295 // materializations (pressure never rises), so unlike the mla fusion (#277) it
296 // cannot regress on-target. Runs on the selected stream before branch
297 // resolution (it removes instructions, shifting byte offsets). Behind
298 // `SYNTH_CONST_CSE=1` while it is validated against the differential oracle +
299 // gale's five on-target baselines; off by default keeps every fixture
300 // bit-identical.
301 let arm_instrs = if std::env::var("SYNTH_CONST_CSE").is_ok() {
302 synth_synthesis::liveness::apply_const_cse(&arm_instrs).0
303 } else {
304 arm_instrs
305 };
306
307 // VCR-RA-001 RANGE RE-ALLOCATION (#209/#242, wiring step 3a) — the first
308 // CONSEQUENTIAL allocator pass: re-colour each maximal straight-line
309 // segment over the R0-R8 pool with value ranges as the allocation unit
310 // (segment inputs + per-register live-outs pinned to their original
311 // registers, reserved R9-R12/SP identity-assigned — each segment is
312 // independently sound, no cross-segment liveness assumed). Renames
313 // registers only: never adds, removes, or reorders instructions, so
314 // labels/branch offsets are unaffected.
315 //
316 // DEFAULT-ON since v0.11.36: gale cleared the gate on-target (G474RE,
317 // #209 2026-06-10) — flag-on output byte-identical to flag-off on
318 // flat_flight/controller/control_step, fires on the filter family with
319 // zero cycle delta and a small size win, all selfchecks green on silicon.
320 // Opt out with `SYNTH_RANGE_REALLOC=0`; per-function stats with
321 // `SYNTH_REALLOC_STATS=1`.
322 //
323 // The companion dead callee-saved-save elimination (gale's "next
324 // consequential lever", same issue comment) then shrinks the prologue
325 // `push {r4-r8,lr}` / epilogue `pop {r4-r8,pc}` to the callee-saved
326 // registers the re-allocated body still touches (leaf-only,
327 // SP-untouched, even-count-padded — see shrink_callee_saved_saves):
328 // ~12 cycles of pure save/restore overhead removed on small leaves.
329 let realloc_on = std::env::var("SYNTH_RANGE_REALLOC").map_or(true, |v| v != "0");
330 let arm_instrs = if realloc_on {
331 use synth_synthesis::rules::Reg;
332 const POOL: [Reg; 9] = [
333 Reg::R0,
334 Reg::R1,
335 Reg::R2,
336 Reg::R3,
337 Reg::R4,
338 Reg::R5,
339 Reg::R6,
340 Reg::R7,
341 Reg::R8,
342 ];
343 let (out, stats) = synth_synthesis::liveness::reallocate_function(&arm_instrs, &POOL);
344 if std::env::var("SYNTH_REALLOC_STATS").is_ok() {
345 eprintln!(
346 "[range-realloc] {} segments: {} reallocated, {} declined ({} validator-rejected), {} need spill (step 4)",
347 stats.segments,
348 stats.reallocated,
349 stats.declined,
350 stats.validator_rejects,
351 stats.needs_spill
352 );
353 }
354 synth_synthesis::liveness::shrink_callee_saved_saves(&out).unwrap_or(out)
355 } else {
356 arm_instrs
357 };
358
359 // VCR-RA-001 SHADOW ALLOCATION (#209/#242): run the register allocator on
360 // the selected stream and LOG what it finds — without changing a single
361 // emitted byte. This is the measure-only bridge between the built analysis
362 // layer and the eventual virtual-register wiring: it shows, per real
363 // function, whether the allocator can colour it within the R0–R8 pool and
364 // how much const-CSE / rematerialization headroom exists (#209). Enable with
365 // `SYNTH_SHADOW_ALLOC=1`; off by default and side-effect-free either way.
366 if std::env::var("SYNTH_SHADOW_ALLOC").is_ok() {
367 use synth_synthesis::liveness::{
368 AllocationOutcome, allocate_function, function_peak_pressure,
369 };
370 // R9 globals / R10 mem-size / R11 mem-base / R12 IP-scratch are reserved;
371 // pin them above the 0..9 allocatable pool so the colourer keeps R0–R8.
372 let precolored = std::collections::BTreeMap::from([
373 (synth_synthesis::rules::Reg::R9, 9usize),
374 (synth_synthesis::rules::Reg::R10, 10),
375 (synth_synthesis::rules::Reg::R11, 11),
376 (synth_synthesis::rules::Reg::R12, 12),
377 ]);
378 // True VALUE pressure (one node per value, not per reused physical reg):
379 // a NeedsSpill with peak ≤ 9 is a SPURIOUS physical-register spill — the
380 // function fits once virtually allocated.
381 let peak = function_peak_pressure(&arm_instrs);
382 match allocate_function(&arm_instrs, 9, &precolored) {
383 AllocationOutcome::Allocated {
384 remat_opportunities,
385 coloring,
386 } => eprintln!(
387 "[shadow-alloc] OK: {} pregs coloured within R0-R8 pool, peak value-pressure {}, {} const-CSE/remat opportunities",
388 coloring.len(),
389 peak,
390 remat_opportunities
391 ),
392 AllocationOutcome::NeedsSpill(s) => eprintln!(
393 "[shadow-alloc] physical-graph would spill {:?}, but peak value-pressure is {} (≤9 ⇒ spurious; fits once virtually allocated)",
394 s, peak
395 ),
396 AllocationOutcome::Declined => {
397 eprintln!(
398 "[shadow-alloc] declined (unmodeled construct — calls/i64/fp/offset-branch)"
399 )
400 }
401 }
402 }
403
404 // ISA feature gate: validate that all generated instructions are supported
405 // by the target. This catches FPU instructions on no-FPU targets, double-precision
406 // instructions on single-precision targets, etc.
407 validate_instructions(&arm_instrs, config.target.fpu, &config.target.triple)
408 .map_err(|e| format!("ISA validation failed: {}", e))?;
409
410 // Encode to binary — use Thumb-2 for Cortex-M targets
411 let use_thumb2 = matches!(config.target.isa, IsaVariant::Thumb2 | IsaVariant::Thumb);
412
413 let encoder = if use_thumb2 {
414 ArmEncoder::new_thumb2_with_fpu(config.target.fpu)
415 } else {
416 ArmEncoder::new_arm32()
417 };
418
419 // #202: resolve local label branches (Bcc/B/Bhs/Blo) to byte-accurate
420 // offsets before encoding. `select_with_stack` emits them as label
421 // placeholders and never resolves them — without this they encode as
422 // `bne.n #0` and land mid-instruction whenever a 32-bit Thumb-2 instruction
423 // sits between the branch and its target (UsageFault on real hardware).
424 // Only meaningful for Thumb-2 (the offset units are halfword/PC+4).
425 let arm_instrs = if use_thumb2 {
426 resolve_label_branches(arm_instrs, &encoder)?
427 } else {
428 arm_instrs
429 };
430
431 let mut code = Vec::new();
432 let mut relocations = Vec::new();
433
434 for instr in &arm_instrs {
435 // Record a relocation for every BL: the encoder emits `bl #0` and
436 // relies on a relocation to patch the target. This covers BOTH import
437 // dispatch stubs (`__meld_*`, undefined externals) AND internal calls
438 // (`func_N`, defined in this object). Previously only `__meld_*` was
439 // recorded, so internal `BL func_N` calls were left as unpatched
440 // `bl #0` placeholders branching to a garbage address (#167).
441 if let ArmOp::Bl { label } = &instr.op {
442 relocations.push(CodeRelocation {
443 offset: code.len() as u32,
444 symbol: label.clone(),
445 kind: synth_core::backend::RelocKind::ThmCall,
446 });
447 }
448 // #237: symbol-relative MOVW/MOVT (the `--native-pointer-abi` static-data
449 // addressing). The encoder writes the addend in place; record the matching
450 // R_ARM_MOVW_ABS_NC / R_ARM_MOVT_ABS so the linker adds the symbol address.
451 if let ArmOp::MovwSym { symbol, .. } = &instr.op {
452 relocations.push(CodeRelocation {
453 offset: code.len() as u32,
454 symbol: symbol.clone(),
455 kind: synth_core::backend::RelocKind::MovwAbs,
456 });
457 }
458 if let ArmOp::MovtSym { symbol, .. } = &instr.op {
459 relocations.push(CodeRelocation {
460 offset: code.len() as u32,
461 symbol: symbol.clone(),
462 kind: synth_core::backend::RelocKind::MovtAbs,
463 });
464 }
465
466 let encoded = encoder
467 .encode(&instr.op)
468 .map_err(|e| format!("ARM encoding failed: {}", e))?;
469 code.extend_from_slice(&encoded);
470 }
471
472 Ok((code, relocations))
473}
474
475/// Resolve local label branches to byte-accurate offsets (#202).
476///
477/// `select_with_stack` emits conditional/unconditional branches as label
478/// placeholders (`Bcc`/`B`/`Bhs`/`Blo` + `Label`) and never resolves them; the
479/// encoder then emits a `0xD000`/`0xE000` placeholder with offset 0. Before #197
480/// this path only ran for `--no-optimize`/declined functions, so the latent bug
481/// stayed hidden — routing relocatable code through it surfaced branches that
482/// land mid-instruction (a Cortex-M UsageFault) whenever a 32-bit Thumb-2
483/// instruction sits between the branch and its target.
484///
485/// This pass encodes each instruction to learn its real byte length (so 16- vs
486/// 32-bit forms and multi-instruction expansions are exact), maps each `Label`
487/// to its byte position, and rewrites every label branch to the displacement
488/// the encoder consumes: `(target - branch - 4) / 2` halfwords. A bounded
489/// fixed-point handles an offset growing a branch from 16- to 32-bit (which
490/// shifts later positions). `BCondOffset`/`BOffset` already produced inline by
491/// the optimized path carry no label and are left untouched.
492fn resolve_label_branches(
493 arm_instrs: Vec<ArmInstruction>,
494 encoder: &ArmEncoder,
495) -> Result<Vec<ArmInstruction>, String> {
496 use std::collections::HashMap;
497 use synth_synthesis::Condition;
498
499 enum BKind {
500 Cond(Condition),
501 Uncond,
502 }
503 // Record each label branch ONCE — indices are stable across iterations.
504 let mut branches: Vec<(usize, BKind, String)> = Vec::new();
505 for (i, instr) in arm_instrs.iter().enumerate() {
506 match &instr.op {
507 ArmOp::Bcc { cond, label } => branches.push((i, BKind::Cond(*cond), label.clone())),
508 ArmOp::Bhs { label } => branches.push((i, BKind::Cond(Condition::HS), label.clone())),
509 ArmOp::Blo { label } => branches.push((i, BKind::Cond(Condition::LO), label.clone())),
510 ArmOp::B { label } => branches.push((i, BKind::Uncond, label.clone())),
511 _ => {}
512 }
513 }
514 if branches.is_empty() {
515 return Ok(arm_instrs);
516 }
517
518 let mut resolved = arm_instrs;
519 // Sizes only grow (16→32-bit), so this converges quickly; cap for safety.
520 for _ in 0..16 {
521 // 1. Byte position of each instruction (Label encodes to 0 bytes).
522 let mut positions = Vec::with_capacity(resolved.len());
523 let mut pos: i64 = 0;
524 for instr in &resolved {
525 positions.push(pos);
526 pos += encoder
527 .encode(&instr.op)
528 .map_err(|e| format!("branch-resolve size probe failed: {}", e))?
529 .len() as i64;
530 }
531 // 2. Label name -> byte position (owned keys so the borrow ends here).
532 let mut labels: HashMap<String, i64> = HashMap::new();
533 for (i, instr) in resolved.iter().enumerate() {
534 if let ArmOp::Label { name } = &instr.op {
535 labels.insert(name.clone(), positions[i]);
536 }
537 }
538 // 3. Rewrite each branch to its byte-accurate offset.
539 let mut changed = false;
540 for (idx, kind, label) in &branches {
541 // A label not defined locally is an EXTERNAL target (e.g.
542 // `Trap_Handler` resolved by a relocation / the vector table). Leave
543 // such branches as their placeholder for the existing relocation
544 // path — only local control-flow labels are byte-resolved here.
545 let Some(&target) = labels.get(label) else {
546 continue;
547 };
548 // Encoder consumes the field as (target - branch - 4) / 2 halfwords.
549 // Positions are always even, so this division is exact.
550 let halfword_offset = ((target - positions[*idx] - 4) / 2) as i32;
551 let new_op = match kind {
552 BKind::Cond(c) => ArmOp::BCondOffset {
553 cond: *c,
554 offset: halfword_offset,
555 },
556 BKind::Uncond => ArmOp::BOffset {
557 offset: halfword_offset,
558 },
559 };
560 if resolved[*idx].op != new_op {
561 resolved[*idx].op = new_op;
562 changed = true;
563 }
564 }
565 if !changed {
566 break;
567 }
568 }
569 Ok(resolved)
570}
571
572#[cfg(test)]
573mod tests {
574 use super::*;
575
#[test]
fn test_arm_backend_name() {
    // The backend reports the name "arm" and is always available.
    let arm = ArmBackend::new();
    let reported = arm.name();
    assert_eq!(reported, "arm");
    assert!(arm.is_available());
}
582
#[test]
fn test_arm_backend_capabilities() {
    // Pin all four advertised capability flags, not just a subset, so a
    // silent change to any of them is caught here.
    let backend = ArmBackend::new();
    let caps = backend.capabilities();
    assert!(!caps.produces_elf);
    assert!(caps.supports_rule_verification);
    assert!(caps.supports_binary_verification);
    assert!(!caps.is_external);
}
591
#[test]
fn test_compile_add_function() {
    // A two-operand i32 add compiles under the default configuration and the
    // result echoes back the name and the input ops.
    let ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
    let compiled = ArmBackend::new()
        .compile_function("add", &ops, &CompileConfig::default())
        .unwrap();

    assert_eq!(compiled.name, "add");
    assert!(!compiled.code.is_empty());
    assert_eq!(compiled.wasm_ops, ops);
}
606
#[test]
fn test_count_params() {
    // Two locals that are read before being written count as two params.
    let reads_two_locals = [WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
    assert_eq!(count_params(&reads_two_locals), 2);

    // A body that never touches a local has no params.
    let constants_only = [WasmOp::I32Const(5), WasmOp::I32Const(3), WasmOp::I32Add];
    assert_eq!(count_params(&constants_only), 0);
}
615
#[test]
fn test_arm_backend_register() {
    // Registering the backend makes it retrievable by name and lists it as
    // the single available backend.
    let mut backends = synth_core::BackendRegistry::new();
    let arm: Box<ArmBackend> = Box::new(ArmBackend::new());
    backends.register(arm);

    assert!(backends.get("arm").is_some());
    assert_eq!(backends.available().len(), 1);
}
623
#[test]
fn test_compile_import_call_produces_relocations() {
    // Simulate a WASM module where func index 0 is an import.
    // Call(0) should generate MOV R0, #0; BL __meld_dispatch_import
    let config = CompileConfig {
        no_optimize: true, // Direct instruction selection to preserve Call semantics
        num_imports: 1,
        ..CompileConfig::default()
    };
    let compiled = ArmBackend::new()
        .compile_function("caller", &[WasmOp::Call(0)], &config)
        .unwrap();

    assert!(!compiled.code.is_empty());
    assert_eq!(compiled.relocations.len(), 1);

    let reloc = &compiled.relocations[0];
    assert_eq!(reloc.symbol, "__meld_dispatch_import");
    // The BL is the second instruction (after MOV R0, #0), so offset should be > 0
    assert!(reloc.offset > 0);
}
646
/// Regression test for #197: with `relocatable` set, a call to an import is
/// relocated against the direct `func_N` symbol (which `build_relocatable_elf`
/// later rewrites to the wasm field name) rather than
/// `__meld_dispatch_import`. Without this ABI half of the fix a host linker
/// cannot resolve the call to the real kernel symbol (e.g. `k_spin_lock`).
#[test]
fn test_compile_relocatable_import_uses_direct_func_symbol_197() {
    let config = CompileConfig {
        relocatable: true,
        num_imports: 1,
        ..CompileConfig::default()
    };
    // func 0 is an import
    let import_call = [WasmOp::Call(0)];

    let compiled = ArmBackend::new()
        .compile_function("caller", &import_call, &config)
        .expect("relocatable import call compiles");

    assert_eq!(compiled.relocations.len(), 1);
    let reloc = &compiled.relocations[0];
    assert_eq!(
        reloc.symbol, "func_0",
        "#197: relocatable import must relocate against func_0 (→ field name), not Meld dispatch"
    );
}
672
#[test]
fn test_compile_no_imports_no_relocations() {
    // A call-free body under the default config records no relocation.
    let add_body = [WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];
    let compiled = ArmBackend::new()
        .compile_function("add", &add_body, &CompileConfig::default())
        .unwrap();

    assert!(compiled.relocations.is_empty());
}
682
/// Regression test for #167: calling an INTERNAL function (index
/// `>= num_imports`) has to record a relocation against `func_{index}`.
/// Before the fix only `__meld_*` (import) BLs were relocated, which left
/// internal `BL func_N` as an unpatched `bl #0` branching to a garbage
/// address and made the object non-linkable.
#[test]
fn test_compile_internal_call_produces_relocation_167() {
    // num_imports = 1, so Call(2) is an INTERNAL call → `BL func_2`.
    let internal_call = [WasmOp::Call(2)];
    let config = CompileConfig {
        no_optimize: true,
        num_imports: 1,
        ..CompileConfig::default()
    };

    let compiled = ArmBackend::new()
        .compile_function("caller", &internal_call, &config)
        .expect("internal call compiles");

    assert_eq!(
        compiled.relocations.len(),
        1,
        "an internal call must emit exactly one relocation (#167)"
    );
    let reloc = &compiled.relocations[0];
    assert_eq!(
        reloc.symbol, "func_2",
        "internal call must relocate against the callee's func_{{index}} symbol (#167)"
    );
}
714
715 // ─── Phase 1 safety-bounds plumbing for ARM ──────────────────────────
716
#[test]
fn arm_safety_bounds_mpu_emits_same_code_as_none() {
    // In Mpu mode the hardware MPU handles faults, so no inline check may be
    // added on ARM: an i32.load must encode to the same bytes as with the
    // default configuration.
    let load = [
        WasmOp::LocalGet(0),
        WasmOp::I32Load {
            offset: 0,
            align: 2,
        },
    ];
    let backend = ArmBackend::new();
    let bytes_for = |cfg: &CompileConfig| backend.compile_function("ld", &load, cfg).unwrap().code;

    let plain = bytes_for(&CompileConfig {
        no_optimize: true,
        ..Default::default()
    });
    let mpu = bytes_for(&CompileConfig {
        no_optimize: true,
        safety_bounds: SafetyBounds::Mpu,
        ..Default::default()
    });

    assert_eq!(
        plain, mpu,
        "Mpu and None should produce identical ARM bytes (Mpu relies on hardware)"
    );
}
746
#[test]
fn arm_legacy_bounds_check_still_emits_software_check() {
    // The legacy `bounds_check` flag must keep selecting the software path:
    // it has to produce the same bytes as `SafetyBounds::Software`.
    let load = [
        WasmOp::LocalGet(0),
        WasmOp::I32Load {
            offset: 0,
            align: 2,
        },
    ];
    let backend = ArmBackend::new();
    let bytes_for = |cfg: &CompileConfig| backend.compile_function("ld", &load, cfg).unwrap().code;

    let legacy = bytes_for(&CompileConfig {
        no_optimize: true,
        bounds_check: true,
        ..Default::default()
    });
    let software = bytes_for(&CompileConfig {
        no_optimize: true,
        safety_bounds: SafetyBounds::Software,
        ..Default::default()
    });

    assert_eq!(
        legacy, software,
        "--bounds-check should produce the same bytes as --safety-bounds=software"
    );
}
776
777 // ========================================================================
778 // ISA feature gate tests — ensure the compiler never emits unsupported
779 // instructions for a given target
780 // ========================================================================
781
#[test]
fn test_f32_rejected_on_cortex_m3_no_fpu() {
    // An f32 add must not compile for the Cortex-M3 target.
    let fadd = [WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
    let config = CompileConfig {
        no_optimize: true,
        target: TargetSpec::cortex_m3(),
        ..CompileConfig::default()
    };

    let outcome = ArmBackend::new().compile_function("fadd", &fadd, &config);
    assert!(
        outcome.is_err(),
        "f32 operations should fail on Cortex-M3 (no FPU)"
    );
}
798
#[test]
fn test_f32_accepted_on_cortex_m4f() {
    // The same f32 add compiles for the Cortex-M4F target.
    let fadd = [WasmOp::F32Const(1.0), WasmOp::F32Const(2.0), WasmOp::F32Add];
    let config = CompileConfig {
        no_optimize: true,
        target: TargetSpec::cortex_m4f(),
        ..CompileConfig::default()
    };

    // Report the backend error in the failure message when compilation fails.
    if let Err(err) = ArmBackend::new().compile_function("fadd", &fadd, &config) {
        panic!(
            "f32 operations should succeed on Cortex-M4F, got: {:?}",
            err
        );
    }
}
816
#[test]
fn test_i32_works_on_all_targets() {
    // Integer-only code compiles for each of these targets: Cortex-M3,
    // Cortex-M4F and Cortex-M7DP.
    let backend = ArmBackend::new();
    let ops = [WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::I32Add];

    let cases = [
        // Cortex-M3 (no FPU)
        (TargetSpec::cortex_m3(), "i32 ops should work on Cortex-M3"),
        // Cortex-M4F (single FPU)
        (TargetSpec::cortex_m4f(), "i32 ops should work on Cortex-M4F"),
        // Cortex-M7DP (double FPU)
        (
            TargetSpec::cortex_m7dp(),
            "i32 ops should work on Cortex-M7DP",
        ),
    ];

    for (target, expectation) in cases {
        let config = CompileConfig {
            target,
            no_optimize: true,
            ..CompileConfig::default()
        };
        assert!(
            backend.compile_function("add", &ops, &config).is_ok(),
            "{}",
            expectation
        );
    }
}
855
/// The plain Cortex-M4 target (no `F` suffix) has no FPU, so an f32 multiply
/// sequence must be refused with an error.
#[test]
fn test_f32_rejected_on_cortex_m4_no_fpu() {
    let m4_config = CompileConfig {
        target: TargetSpec::cortex_m4(),
        no_optimize: true,
        ..CompileConfig::default()
    };
    let wasm_ops = vec![WasmOp::F32Const(1.5), WasmOp::F32Const(2.5), WasmOp::F32Mul];
    let arm = ArmBackend::new();

    assert!(
        arm.compile_function("fmul", &wasm_ops, &m4_config).is_err(),
        "f32 operations should fail on Cortex-M4 (no FPU)"
    );
}
873
874 // ========================================================================
875 // Issue #120 — f32 ops in the optimized lowering path
876 //
877 // `OptimizerBridge::wasm_to_ir` has no handlers for f32/f64 ops, so a
878 // value-producing float op fell through to `Opcode::Nop`, leaving a
879 // downstream consumer with an unmapped vreg and tripping the PR #101
880 // defensive panic in `ir_to_arm`. Customer reproducer: `compiler_builtins
881 // float::div` and `gale_compute_ipi_mask` in the `falcon-rate-component`
882 // module.
883 //
884 // Fix: `optimize_full` declines float modules with a typed `Err`;
885 // `compile_wasm_to_arm` falls back to the non-optimized `select_with_stack`
886 // path, which handles f32 via VFP/FPU. These tests use the *default*
887 // (optimized) config — `no_optimize` is NOT set — which is the exact
888 // configuration that panicked pre-fix.
889 // ========================================================================
890
/// Regression test for issue #120 (f32.div on the default, optimized config).
///
/// Before the fix this input panicked inside `ir_to_arm` with "vreg vN has no
/// assigned ARM register and no spill slot". After the fix the optimized path
/// declines the module and the backend falls back to direct selection, which
/// must yield a non-empty f32.div lowering on Cortex-M4F.
#[test]
fn test_issue120_f32_div_compiles_via_optimized_default() {
    // `no_optimize` is deliberately left at its default: the optimized path is
    // the one that used to panic, and the fallback is only reached through it.
    let m4f_config = CompileConfig {
        target: TargetSpec::cortex_m4f(),
        ..CompileConfig::default()
    };
    let wasm_ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Div];
    let arm = ArmBackend::new();

    let outcome = arm.compile_function("fdiv", &wasm_ops, &m4f_config);
    assert!(
        outcome.is_ok(),
        "f32.div must compile on Cortex-M4F via the optimized->direct \
         fallback (issue #120), got: {:?}",
        outcome.as_ref().err()
    );

    let compiled = outcome.unwrap();
    assert!(
        !compiled.code.is_empty(),
        "f32.div must produce non-empty machine code"
    );
}
918
/// Issue #120, broader coverage: several f32 binary operators, each compiled
/// with the default (optimized) config on an FPU target, must all compile via
/// the fallback without panicking and must emit machine code.
#[test]
fn test_issue120_assorted_f32_ops_compile_via_optimized_default() {
    let arm = ArmBackend::new();
    let m4f_config = CompileConfig {
        target: TargetSpec::cortex_m4f(),
        ..CompileConfig::default()
    };

    // Each case is `local.get 0; local.get 1; <op>` — only the operator varies.
    let binary_ops = [
        ("fadd", WasmOp::F32Add),
        ("fmul", WasmOp::F32Mul),
        ("fsub", WasmOp::F32Sub),
    ];

    for (name, operator) in binary_ops {
        let wasm_ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), operator];

        let outcome = arm.compile_function(name, &wasm_ops, &m4f_config);
        assert!(
            outcome.is_ok(),
            "{name} must compile via the optimized->direct fallback \
             (issue #120), got: {:?}",
            outcome.as_ref().err()
        );

        let compiled = outcome.unwrap();
        assert!(
            !compiled.code.is_empty(),
            "{name} must produce non-empty machine code"
        );
    }
}
958
/// Issue #120, negative case: falling back from the optimized path must not
/// bypass the ISA feature gate. f32.div on a target without an FPU has to
/// surface as an error (not a panic) with the default, optimized config.
#[test]
fn test_issue120_f32_div_rejected_on_no_fpu_via_optimized() {
    let m3_config = CompileConfig {
        target: TargetSpec::cortex_m3(),
        ..CompileConfig::default()
    };
    let wasm_ops = vec![WasmOp::LocalGet(0), WasmOp::LocalGet(1), WasmOp::F32Div];
    let arm = ArmBackend::new();

    assert!(
        arm.compile_function("fdiv", &wasm_ops, &m3_config).is_err(),
        "f32.div must be rejected on Cortex-M3 (no FPU), not panic"
    );
}
976
/// Issue #94: end-to-end code-size check for the u64-packed FFI-return hi32
/// extract pattern.
///
/// Two functions of identical shape are compiled — `local.get 0; i64.const N;
/// i64.shr_u; i32.wrap_i64` — once with `N = 32` (the optimized case) and once
/// with `N = 7` (the generic case). The test passes when the shift-by-32 form
/// is at least 30 bytes shorter than the shift-by-7 form.
#[test]
fn test_issue94_hi32_extract_is_smaller_than_generic_shift() {
    // Minimum number of bytes the optimized form must save over the generic one.
    const MIN_SAVING: usize = 30;

    let arm = ArmBackend::new();
    let m4f_config = CompileConfig {
        target: TargetSpec::cortex_m4f(),
        ..CompileConfig::default()
    };

    // Both inputs differ only in the constant shift amount.
    let shr_then_wrap = |shift| {
        vec![
            WasmOp::LocalGet(0), // i64 param in R0:R1
            WasmOp::I64Const(shift),
            WasmOp::I64ShrU,
            WasmOp::I32WrapI64,
        ]
    };

    // Optimized path: `(local.get 0) >>> 32; wrap_i64`
    let hi32_ops = shr_then_wrap(32);
    let hi32_func = arm
        .compile_function("hi32_extract", &hi32_ops, &m4f_config)
        .unwrap();

    // Generic path: the shift amount is not a multiple of 32, so the compiler
    // has to emit the full runtime 64-bit shift sequence.
    let generic_ops = shr_then_wrap(7);
    let generic_func = arm
        .compile_function("generic_shr", &generic_ops, &m4f_config)
        .unwrap();

    let bytes_hi32 = hi32_func.code.len();
    let bytes_generic = generic_func.code.len();
    let saved = bytes_generic.saturating_sub(bytes_hi32);
    println!(
        "\n[issue #94] hi32 extract: {} bytes (vs generic shift: {} bytes; saved {})",
        bytes_hi32, bytes_generic, saved
    );

    // Dump the optimized encoding as space-separated hex for manual inspection.
    let hex_bytes: Vec<String> = hi32_func
        .code
        .iter()
        .map(|byte| format!("{:02x}", byte))
        .collect();
    println!("[issue #94] hi32 bytes: {}", hex_bytes.join(" "));

    // Empirically the two forms measured 14 vs 50 bytes, so a 30-byte margin
    // leaves headroom while still catching a lost optimization.
    assert!(
        bytes_generic >= bytes_hi32 + MIN_SAVING,
        "issue #94: hi32 extract = {} bytes, generic shift = {} bytes; \
         expected optimized form to be at least 30 bytes smaller",
        bytes_hi32,
        bytes_generic,
    );
}
1038}