Skip to main content

synth_backend/
arm_encoder.rs

1//! ARM Code Encoder - Converts ARM instructions to binary machine code
2//!
3//! Generates ARM32/Thumb-2 machine code from ARM instruction structures
4
5use synth_core::Result;
6use synth_core::target::FPUPrecision;
7use synth_synthesis::contracts::encoding as encoding_contracts;
8use synth_synthesis::{ArmOp, MemAddr, MveSize, Operand2, QReg, Reg, VfpReg};
9
/// Encoder that turns `ArmOp` instructions into machine-code bytes.
///
/// The instruction set is fixed when the encoder is constructed: `encode`
/// takes the Thumb path when `thumb_mode` is set and the ARM32 path otherwise.
pub struct ArmEncoder {
    /// Use Thumb mode (vs ARM mode)
    thumb_mode: bool,
    /// FPU capability for VFP instruction encoding
    // NOTE(review): `dead_code` is allowed here, presumably because this field
    // is not read everywhere yet — confirm, and drop the attribute once it is.
    #[allow(dead_code)]
    fpu: Option<FPUPrecision>,
}
18
19impl ArmEncoder {
20    /// Create a new ARM encoder in ARM32 mode
21    pub fn new_arm32() -> Self {
22        Self {
23            thumb_mode: false,
24            fpu: None,
25        }
26    }
27
28    /// Create a new ARM encoder in Thumb-2 mode
29    pub fn new_thumb2() -> Self {
30        Self {
31            thumb_mode: true,
32            fpu: None,
33        }
34    }
35
36    /// Create a new Thumb-2 encoder with FPU capability
37    pub fn new_thumb2_with_fpu(fpu: Option<FPUPrecision>) -> Self {
38        Self {
39            thumb_mode: true,
40            fpu,
41        }
42    }
43
44    /// Encode a single ARM instruction to bytes
45    pub fn encode(&self, op: &ArmOp) -> Result<Vec<u8>> {
46        if self.thumb_mode {
47            self.encode_thumb(op)
48        } else {
49            self.encode_arm(op)
50        }
51    }
52
53    /// Encode an ARM instruction in ARM32 mode (32-bit instructions)
54    /// #206: encode an ARM32 (A32) load/store whose address uses a register
55    /// offset (`[rn, rm{, #off}]`). Returns `None` for ops with no register
56    /// offset (the caller falls through to the immediate-form arms). Computes
57    /// `ip = base + rm` then re-encodes the op against `[ip, #off]`, which works
58    /// uniformly for word/byte/halfword/signed forms. IP (R12) is the scratch
59    /// register the selector already treats as clobberable across memory ops.
60    fn encode_arm_reg_offset_mem(&self, op: &ArmOp) -> Result<Option<Vec<u8>>> {
61        use synth_synthesis::Reg;
62        let addr = match op {
63            ArmOp::Ldr { addr, .. }
64            | ArmOp::Str { addr, .. }
65            | ArmOp::Ldrb { addr, .. }
66            | ArmOp::Strb { addr, .. }
67            | ArmOp::Ldrh { addr, .. }
68            | ArmOp::Strh { addr, .. }
69            | ArmOp::Ldrsb { addr, .. }
70            | ArmOp::Ldrsh { addr, .. } => addr,
71            _ => return Ok(None),
72        };
73        let Some(rm) = addr.offset_reg else {
74            return Ok(None);
75        };
76        let ip = Reg::R12;
77        // ADD ip, base, rm  (cond=AL, opcode=ADD, S=0, register operand2)
78        let add: u32 = 0xE0800000
79            | (reg_to_bits(&addr.base) << 16)
80            | (reg_to_bits(&ip) << 12)
81            | reg_to_bits(&rm);
82        let mut bytes = add.to_le_bytes().to_vec();
83        // Re-encode the op against [ip, #off] (immediate form → no offset_reg,
84        // so this recursion hits the immediate arms, not this helper again).
85        let imm_addr = MemAddr::imm(ip, addr.offset);
86        let imm_op = match op {
87            ArmOp::Ldr { rd, .. } => ArmOp::Ldr {
88                rd: *rd,
89                addr: imm_addr,
90            },
91            ArmOp::Str { rd, .. } => ArmOp::Str {
92                rd: *rd,
93                addr: imm_addr,
94            },
95            ArmOp::Ldrb { rd, .. } => ArmOp::Ldrb {
96                rd: *rd,
97                addr: imm_addr,
98            },
99            ArmOp::Strb { rd, .. } => ArmOp::Strb {
100                rd: *rd,
101                addr: imm_addr,
102            },
103            ArmOp::Ldrh { rd, .. } => ArmOp::Ldrh {
104                rd: *rd,
105                addr: imm_addr,
106            },
107            ArmOp::Strh { rd, .. } => ArmOp::Strh {
108                rd: *rd,
109                addr: imm_addr,
110            },
111            ArmOp::Ldrsb { rd, .. } => ArmOp::Ldrsb {
112                rd: *rd,
113                addr: imm_addr,
114            },
115            ArmOp::Ldrsh { rd, .. } => ArmOp::Ldrsh {
116                rd: *rd,
117                addr: imm_addr,
118            },
119            _ => unreachable!(),
120        };
121        bytes.extend(self.encode_arm(&imm_op)?);
122        Ok(Some(bytes))
123    }
124
125    fn encode_arm(&self, op: &ArmOp) -> Result<Vec<u8>> {
126        // #206: ARM32 register-offset loads/stores. `encode_mem_addr` only
127        // returns the 12-bit immediate, so the immediate-form arms below
128        // silently DROP `addr.offset_reg` — a runtime address index vanished,
129        // turning `ldr rd,[rn,rm,#off]` into `ldr rd,[rn,#off]` (the access went
130        // to the wrong address). Compute the effective base into IP and re-encode
131        // against `[ip, #off]`, which is uniform for word/byte/halfword/signed.
132        if let Some(bytes) = self.encode_arm_reg_offset_mem(op)? {
133            return Ok(bytes);
134        }
135        let instr: u32 = match op {
136            // Data processing instructions
137            ArmOp::Add { rd, rn, op2 } => {
138                let rd_bits = reg_to_bits(rd);
139                let rn_bits = reg_to_bits(rn);
140                let (op2_bits, i_flag) = encode_operand2(op2);
141
142                // ADD encoding: cond(4) | 00 | I(1) | 0100 | S(1) | Rn(4) | Rd(4) | operand2(12)
143                0xE0800000 // condition=always(E), opcode=ADD(0100), S=0
144                    | (i_flag << 25)
145                    | (rn_bits << 16)
146                    | (rd_bits << 12)
147                    | op2_bits
148            }
149
150            ArmOp::Sub { rd, rn, op2 } => {
151                let rd_bits = reg_to_bits(rd);
152                let rn_bits = reg_to_bits(rn);
153                let (op2_bits, i_flag) = encode_operand2(op2);
154
155                // SUB encoding: opcode=0010
156                0xE0400000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
157            }
158
159            // i64 support: ADDS, ADC, SUBS, SBC for ARM32
160            ArmOp::Adds { rd, rn, op2 } => {
161                let rd_bits = reg_to_bits(rd);
162                let rn_bits = reg_to_bits(rn);
163                let (op2_bits, i_flag) = encode_operand2(op2);
164
165                // ADDS encoding: opcode=0100, S=1
166                0xE0900000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
167            }
168
169            ArmOp::Adc { rd, rn, op2 } => {
170                let rd_bits = reg_to_bits(rd);
171                let rn_bits = reg_to_bits(rn);
172                let (op2_bits, i_flag) = encode_operand2(op2);
173
174                // ADC encoding: opcode=0101
175                0xE0A00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
176            }
177
178            ArmOp::Subs { rd, rn, op2 } => {
179                let rd_bits = reg_to_bits(rd);
180                let rn_bits = reg_to_bits(rn);
181                let (op2_bits, i_flag) = encode_operand2(op2);
182
183                // SUBS encoding: opcode=0010, S=1
184                0xE0500000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
185            }
186
187            ArmOp::Sbc { rd, rn, op2 } => {
188                let rd_bits = reg_to_bits(rd);
189                let rn_bits = reg_to_bits(rn);
190                let (op2_bits, i_flag) = encode_operand2(op2);
191
192                // SBC encoding: opcode=0110
193                0xE0C00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
194            }
195
196            ArmOp::Mul { rd, rn, rm } => {
197                let rd_bits = reg_to_bits(rd);
198                let rn_bits = reg_to_bits(rn);
199                let rm_bits = reg_to_bits(rm);
200
201                // MUL encoding: cond(4) | 000000 | A(1) | S(1) | Rd(4) | Rn(4) | Rs(4) | 1001 | Rm(4)
202                0xE0000090 | (rd_bits << 16) | (rn_bits << 8) | rm_bits
203            }
204
205            ArmOp::Umull { rdlo, rdhi, rn, rm } => {
206                let rdlo_bits = reg_to_bits(rdlo);
207                let rdhi_bits = reg_to_bits(rdhi);
208                let rn_bits = reg_to_bits(rn);
209                let rm_bits = reg_to_bits(rm);
210
211                // UMULL encoding: cond(4) | 0000 1000 | RdHi(4) | RdLo(4) | Rm(4) | 1001 | Rn(4)
212                0xE0800090 | (rdhi_bits << 16) | (rdlo_bits << 12) | (rm_bits << 8) | rn_bits
213            }
214
215            ArmOp::Sdiv { rd, rn, rm } => {
216                let rd_bits = reg_to_bits(rd);
217                let rn_bits = reg_to_bits(rn);
218                let rm_bits = reg_to_bits(rm);
219
220                // SDIV encoding: cond(4) | 01110001 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
221                // ARMv7-M and above
222                0xE710F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
223            }
224
225            ArmOp::Udiv { rd, rn, rm } => {
226                let rd_bits = reg_to_bits(rd);
227                let rn_bits = reg_to_bits(rn);
228                let rm_bits = reg_to_bits(rm);
229
230                // UDIV encoding: cond(4) | 01110011 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
231                // ARMv7-M and above
232                0xE730F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
233            }
234
235            ArmOp::Mls { rd, rn, rm, ra } => {
236                let rd_bits = reg_to_bits(rd);
237                let rn_bits = reg_to_bits(rn);
238                let rm_bits = reg_to_bits(rm);
239                let ra_bits = reg_to_bits(ra);
240
241                // MLS encoding: cond(4) | 00000110 | Rd(4) | Ra(4) | Rm(4) | 1001 | Rn(4)
242                // Rd = Ra - (Rn * Rm)
243                0xE0600090 | (rd_bits << 16) | (ra_bits << 12) | (rm_bits << 8) | rn_bits
244            }
245
246            ArmOp::Mla { rd, rn, rm, ra } => {
247                let rd_bits = reg_to_bits(rd);
248                let rn_bits = reg_to_bits(rn);
249                let rm_bits = reg_to_bits(rm);
250                let ra_bits = reg_to_bits(ra);
251
252                // MLA encoding: cond(4) | 0000001 S | Rd(4) | Ra(4) | Rm(4) | 1001 | Rn(4)
253                // Rd = Ra + (Rn * Rm). Base 0xE0200090 (S=0).
254                0xE0200090 | (rd_bits << 16) | (ra_bits << 12) | (rm_bits << 8) | rn_bits
255            }
256
257            ArmOp::And { rd, rn, op2 } => {
258                let rd_bits = reg_to_bits(rd);
259                let rn_bits = reg_to_bits(rn);
260                let (op2_bits, i_flag) = encode_operand2(op2);
261
262                // AND encoding: opcode=0000
263                0xE0000000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
264            }
265
266            ArmOp::Orr { rd, rn, op2 } => {
267                let rd_bits = reg_to_bits(rd);
268                let rn_bits = reg_to_bits(rn);
269                let (op2_bits, i_flag) = encode_operand2(op2);
270
271                // ORR encoding: opcode=1100
272                0xE1800000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
273            }
274
275            ArmOp::Eor { rd, rn, op2 } => {
276                let rd_bits = reg_to_bits(rd);
277                let rn_bits = reg_to_bits(rn);
278                let (op2_bits, i_flag) = encode_operand2(op2);
279
280                // EOR encoding: opcode=0001
281                0xE0200000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
282            }
283
284            // Shift instructions
285            ArmOp::Lsl { rd, rn, shift } => {
286                let rd_bits = reg_to_bits(rd);
287                let rn_bits = reg_to_bits(rn);
288                let shift_bits = *shift & 0x1F;
289
290                // LSL encoding: MOV with shift
291                0xE1A00000 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
292            }
293
294            ArmOp::Lsr { rd, rn, shift } => {
295                let rd_bits = reg_to_bits(rd);
296                let rn_bits = reg_to_bits(rn);
297                let shift_bits = *shift & 0x1F;
298
299                // LSR encoding
300                0xE1A00020 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
301            }
302
303            ArmOp::Asr { rd, rn, shift } => {
304                let rd_bits = reg_to_bits(rd);
305                let rn_bits = reg_to_bits(rn);
306                let shift_bits = *shift & 0x1F;
307
308                // ASR encoding
309                0xE1A00040 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
310            }
311
312            ArmOp::Ror { rd, rn, shift } => {
313                let rd_bits = reg_to_bits(rd);
314                let rn_bits = reg_to_bits(rn);
315                let shift_bits = *shift & 0x1F;
316
317                // ROR encoding: MOV with ROR shift
318                0xE1A00060 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
319            }
320
321            // Register-based shifts (ARM32)
322            // LSL Rd, Rn, Rm: cond 0001101S 0000 Rd Rs 0001 Rn
323            ArmOp::LslReg { rd, rn, rm } => {
324                let rd_bits = reg_to_bits(rd);
325                let rn_bits = reg_to_bits(rn);
326                let rm_bits = reg_to_bits(rm);
327                0xE1A00010 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
328            }
329            ArmOp::LsrReg { rd, rn, rm } => {
330                let rd_bits = reg_to_bits(rd);
331                let rn_bits = reg_to_bits(rn);
332                let rm_bits = reg_to_bits(rm);
333                0xE1A00030 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
334            }
335            ArmOp::AsrReg { rd, rn, rm } => {
336                let rd_bits = reg_to_bits(rd);
337                let rn_bits = reg_to_bits(rn);
338                let rm_bits = reg_to_bits(rm);
339                0xE1A00050 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
340            }
341            ArmOp::RorReg { rd, rn, rm } => {
342                let rd_bits = reg_to_bits(rd);
343                let rn_bits = reg_to_bits(rn);
344                let rm_bits = reg_to_bits(rm);
345                0xE1A00070 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
346            }
347
348            // RSB (Reverse Subtract): Rd = imm - Rn
349            ArmOp::Rsb { rd, rn, imm } => {
350                let rd_bits = reg_to_bits(rd);
351                let rn_bits = reg_to_bits(rn);
352                // RSB encoding: cond(4) | 00 1 0011 S | Rn(4) | Rd(4) | imm12
353                // Opcode for RSB = 0011, I=1 (immediate), S=0
354                0xE2600000 | (rn_bits << 16) | (rd_bits << 12) | (*imm & 0xFF)
355            }
356
357            // Bit manipulation instructions
358            ArmOp::Clz { rd, rm } => {
359                let rd_bits = reg_to_bits(rd);
360                let rm_bits = reg_to_bits(rm);
361
362                // CLZ encoding: cond(4) | 00010110 | 1111 | Rd(4) | 1111 | 0001 | Rm(4)
363                // ARMv5T and above
364                0xE16F0F10 | (rd_bits << 12) | rm_bits
365            }
366
367            ArmOp::Rbit { rd, rm } => {
368                let rd_bits = reg_to_bits(rd);
369                let rm_bits = reg_to_bits(rm);
370
371                // RBIT encoding: cond(4) | 01101111 | 1111 | Rd(4) | 1111 | 0011 | Rm(4)
372                // ARMv6T2 and above
373                0xE6FF0F30 | (rd_bits << 12) | rm_bits
374            }
375
376            ArmOp::Sxtb { rd, rm } => {
377                let rd_bits = reg_to_bits(rd);
378                let rm_bits = reg_to_bits(rm);
379
380                // SXTB encoding: cond(4) | 01101010 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
381                // ARMv6 and above. rotate=00 for no rotation
382                0xE6AF0070 | (rd_bits << 12) | rm_bits
383            }
384
385            ArmOp::Sxth { rd, rm } => {
386                let rd_bits = reg_to_bits(rd);
387                let rm_bits = reg_to_bits(rm);
388
389                // SXTH encoding: cond(4) | 01101011 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
390                // ARMv6 and above. rotate=00 for no rotation
391                0xE6BF0070 | (rd_bits << 12) | rm_bits
392            }
393
394            // Move instructions
395            ArmOp::Mov { rd, op2 } => {
396                let rd_bits = reg_to_bits(rd);
397                let (op2_bits, i_flag) = encode_operand2(op2);
398
399                // MOV encoding: opcode=1101
400                0xE1A00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
401            }
402
403            ArmOp::Mvn { rd, op2 } => {
404                let rd_bits = reg_to_bits(rd);
405                let (op2_bits, i_flag) = encode_operand2(op2);
406
407                // MVN encoding: opcode=1111
408                0xE1E00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
409            }
410
411            // MOVW - Move Wide (ARM32)
412            // Encoding: cond(4) | 0011 0000 | imm4(4) | Rd(4) | imm12(12)
413            ArmOp::Movw { rd, imm16 } => {
414                let rd_bits = reg_to_bits(rd);
415                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
416                let imm12 = (*imm16 as u32) & 0xFFF;
417                0xE3000000 | (imm4 << 16) | (rd_bits << 12) | imm12
418            }
419
420            // MOVT - Move Top (ARM32)
421            // Encoding: cond(4) | 0011 0100 | imm4(4) | Rd(4) | imm12(12)
422            ArmOp::Movt { rd, imm16 } => {
423                let rd_bits = reg_to_bits(rd);
424                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
425                let imm12 = (*imm16 as u32) & 0xFFF;
426                0xE3400000 | (imm4 << 16) | (rd_bits << 12) | imm12
427            }
428
429            // #237: symbol-relative MOVW/MOVT (ARM mode) — addend in place, the
430            // backend records the MOVW_ABS/MOVT_ABS relocation against `symbol`.
431            ArmOp::MovwSym { rd, addend, .. } => {
432                let rd_bits = reg_to_bits(rd);
433                let v = (*addend as u32) & 0xffff;
434                0xE3000000 | (((v >> 12) & 0xF) << 16) | (rd_bits << 12) | (v & 0xFFF)
435            }
436            ArmOp::MovtSym { rd, addend, .. } => {
437                let rd_bits = reg_to_bits(rd);
438                let v = ((*addend as u32) >> 16) & 0xffff;
439                0xE3400000 | (((v >> 12) & 0xF) << 16) | (rd_bits << 12) | (v & 0xFFF)
440            }
441
442            // #345: LdrSym is the Thumb-2 literal-pool address load. A32 mode is
443            // not used for relocatable native-pointer objects; fail loudly rather
444            // than miscompile if it is ever reached here.
445            ArmOp::LdrSym { .. } => {
446                return Err(synth_core::Error::synthesis(
447                    "LdrSym (literal-pool address load) is Thumb-2-only",
448                ));
449            }
450
451            // Compare
452            ArmOp::Cmp { rn, op2 } => {
453                let rn_bits = reg_to_bits(rn);
454                let (op2_bits, i_flag) = encode_operand2(op2);
455
456                // CMP encoding: opcode=1010, S=1
457                0xE1500000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
458            }
459
460            // Compare Negative (CMN) - computes Rn + op2 and sets flags
461            ArmOp::Cmn { rn, op2 } => {
462                let rn_bits = reg_to_bits(rn);
463                let (op2_bits, i_flag) = encode_operand2(op2);
464
465                // CMN encoding: opcode=1011, S=1
466                0xE1700000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
467            }
468
469            // Load/Store
470            ArmOp::Ldr { rd, addr } => {
471                let rd_bits = reg_to_bits(rd);
472                let (base_bits, offset_bits) = encode_mem_addr(addr);
473
474                // LDR encoding: cond(4) | 01 | I(1) | P(1) | U(1) | B(1) | W(1) | L(1) | Rn(4) | Rd(4) | offset(12)
475                // P=1 (pre-indexed), U=1 (add offset), L=1 (load)
476                0xE5900000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
477            }
478
479            ArmOp::Str { rd, addr } => {
480                let rd_bits = reg_to_bits(rd);
481                let (base_bits, offset_bits) = encode_mem_addr(addr);
482
483                // STR encoding: L=0 (store)
484                0xE5800000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
485            }
486
487            // Sub-word loads (ARM32 encoding)
488            ArmOp::Ldrb { rd, addr } => {
489                let rd_bits = reg_to_bits(rd);
490                let (base_bits, offset_bits) = encode_mem_addr(addr);
491                // LDRB: LDR with B=1 (byte): cond|01|I|P|U|1|W|L|Rn|Rd|offset
492                0xE5D00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
493            }
494
495            ArmOp::Ldrsb { rd, addr } => {
496                let rd_bits = reg_to_bits(rd);
497                let (base_bits, offset_bits) = encode_mem_addr(addr);
498                // LDRSB (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1101|imm4L
499                // Simplified with immediate offset
500                let offset_val = offset_bits & 0xFF;
501                let imm4h = (offset_val >> 4) & 0xF;
502                let imm4l = offset_val & 0xF;
503                0xE1D000D0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
504            }
505
506            ArmOp::Ldrh { rd, addr } => {
507                let rd_bits = reg_to_bits(rd);
508                let (base_bits, offset_bits) = encode_mem_addr(addr);
509                // LDRH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1011|imm4L
510                let offset_val = offset_bits & 0xFF;
511                let imm4h = (offset_val >> 4) & 0xF;
512                let imm4l = offset_val & 0xF;
513                0xE1D000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
514            }
515
516            ArmOp::Ldrsh { rd, addr } => {
517                let rd_bits = reg_to_bits(rd);
518                let (base_bits, offset_bits) = encode_mem_addr(addr);
519                // LDRSH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1111|imm4L
520                let offset_val = offset_bits & 0xFF;
521                let imm4h = (offset_val >> 4) & 0xF;
522                let imm4l = offset_val & 0xF;
523                0xE1D000F0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
524            }
525
526            // Sub-word stores (ARM32 encoding)
527            ArmOp::Strb { rd, addr } => {
528                let rd_bits = reg_to_bits(rd);
529                let (base_bits, offset_bits) = encode_mem_addr(addr);
530                // STRB: STR with B=1 (byte): cond|01|I|P|U|1|W|0|Rn|Rd|offset
531                0xE5C00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
532            }
533
534            ArmOp::Strh { rd, addr } => {
535                let rd_bits = reg_to_bits(rd);
536                let (base_bits, offset_bits) = encode_mem_addr(addr);
537                // STRH (misc store): cond|000|P|U|1|W|0|Rn|Rd|imm4H|1011|imm4L
538                let offset_val = offset_bits & 0xFF;
539                let imm4h = (offset_val >> 4) & 0xF;
540                let imm4l = offset_val & 0xF;
541                0xE1C000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
542            }
543
544            // Memory management (ARM32 encoding)
545            ArmOp::MemorySize { rd } => {
546                let rd_bits = reg_to_bits(rd);
547                // MOV rd, R10, LSR #16  (memory size in bytes / 65536 = pages)
548                // cond|000|1101|S|0000|Rd|shift5|type|0|Rm
549                // LSR #16: shift5=10000, type=01
550                0xE1A00820 | (rd_bits << 12) | 0x0A // Rm=R10, shift=16, LSR
551            }
552
553            ArmOp::MemoryGrow { rd, .. } => {
554                let rd_bits = reg_to_bits(rd);
555                // On embedded, always fail: MOV rd, #-1
556                0xE3E00000 | (rd_bits << 12) // MVN rd, #0 = MOV rd, #-1
557            }
558
559            // Label pseudo-instruction: emits no machine code
560            ArmOp::Label { .. } => {
561                return Ok(Vec::new());
562            }
563
564            // Branch instructions
565            ArmOp::B { label: _ } => {
566                // B encoding: cond(4) | 1010 | offset(24)
567                // Simplified: branch to offset 0 (will be patched by linker/resolver)
568                0xEA000000
569            }
570
571            // Conditional branch to label (generic)
572            ArmOp::Bcc { cond, label: _ } => {
573                use synth_synthesis::Condition;
574                let cond_bits: u32 = match cond {
575                    Condition::EQ => 0x0,
576                    Condition::NE => 0x1,
577                    Condition::HS => 0x2,
578                    Condition::LO => 0x3,
579                    Condition::HI => 0x8,
580                    Condition::LS => 0x9,
581                    Condition::GE => 0xA,
582                    Condition::LT => 0xB,
583                    Condition::GT => 0xC,
584                    Condition::LE => 0xD,
585                };
586                // B<cond> with offset 0 (will be patched)
587                (cond_bits << 28) | 0x0A000000
588            }
589
590            // BHS (Branch if Higher or Same) - used for bounds checking
591            ArmOp::Bhs { label: _ } => {
592                // BHS encoding: cond(2=HS) | 1010 | offset(24)
593                0x2A000000 // BHS with offset 0
594            }
595
596            // BLO (Branch if Lower) - complementary to BHS
597            ArmOp::Blo { label: _ } => {
598                // BLO encoding: cond(3=LO) | 1010 | offset(24)
599                0x3A000000 // BLO with offset 0
600            }
601
602            // Branch with numeric offset (in instructions)
603            // ARM32 B instruction: offset is in instructions, stored as words
604            // The offset is relative to PC+8 (due to ARM pipeline)
605            ArmOp::BOffset { offset } => {
606                // B encoding: cond(4) | 1010 | offset(24)
607                // Offset is signed, in words (4-byte units)
608                // ARM adds PC+8 to the offset, so we need to adjust:
609                // target = PC + 8 + (offset * 4)
610                // For backward branch of N instructions: offset = -(N + 2)
611                // wrapping_sub keeps the encoder total under fuzzing (#186): an
612                // extreme i32::MIN offset would otherwise overflow-panic; for any
613                // real branch offset this is identical to `- 2`.
614                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
615                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
616                0xEA000000 | offset_bits
617            }
618
619            // Conditional branch with numeric offset
620            ArmOp::BCondOffset { cond, offset } => {
621                use synth_synthesis::Condition;
622                let cond_bits: u32 = match cond {
623                    Condition::EQ => 0x0,
624                    Condition::NE => 0x1,
625                    Condition::HS => 0x2,
626                    Condition::LO => 0x3,
627                    Condition::HI => 0x8,
628                    Condition::LS => 0x9,
629                    Condition::GE => 0xA,
630                    Condition::LT => 0xB,
631                    Condition::GT => 0xC,
632                    Condition::LE => 0xD,
633                };
634                // B<cond> encoding: cond(4) | 1010 | offset(24)
635                // wrapping_sub: total under fuzzing (#186), identical for real offsets.
636                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
637                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
638                (cond_bits << 28) | 0x0A000000 | offset_bits
639            }
640
641            ArmOp::Bl { label: _ } => {
642                // BL encoding: cond(4) | 1011 | offset(24)
643                0xEB000000
644            }
645
646            ArmOp::Bx { rm } => {
647                let rm_bits = reg_to_bits(rm);
648
649                // BX encoding: cond(4) | 000100101111111111110001 | Rm(4)
650                0xE12FFF10 | rm_bits
651            }
652
653            ArmOp::Blx { rm } => {
654                let rm_bits = reg_to_bits(rm);
655
656                // BLX (register) encoding: cond(4) | 000100101111111111110011 | Rm(4)
657                0xE12FFF30 | rm_bits
658            }
659
660            ArmOp::Push { regs } => {
661                // STMDB SP!, {regs} encoding: cond(4) | 100100 | 10 | 1101 | register_list(16)
662                let mut reg_list: u32 = 0;
663                for r in regs {
664                    reg_list |= 1 << reg_to_bits(r);
665                }
666                0xE92D0000 | reg_list
667            }
668
669            ArmOp::Pop { regs } => {
670                // LDMIA SP!, {regs} encoding: cond(4) | 100010 | 11 | 1101 | register_list(16)
671                let mut reg_list: u32 = 0;
672                for r in regs {
673                    reg_list |= 1 << reg_to_bits(r);
674                }
675                0xE8BD0000 | reg_list
676            }
677
678            ArmOp::Nop => {
679                // NOP encoding: MOV R0, R0
680                0xE1A00000
681            }
682
683            ArmOp::Udf { imm } => {
684                // UDF (Undefined) encoding in ARM: 0xE7F000F0 | (imm12_hi << 8) | imm4_lo
685                // We only use imm8, so split into imm4_hi and imm4_lo
686                let imm8 = *imm as u32;
687                0xE7F000F0 | ((imm8 & 0xF0) << 4) | (imm8 & 0x0F)
688            }
689
690            // Pseudo-instructions for verification - encode as NOP
691            // These are used in formal verification but not actual code generation
692            ArmOp::Popcnt { .. } => {
693                // Population count pseudo-instruction
694                // Not a real ARM instruction, would be expanded to actual code
695                0xE1A00000 // NOP for now
696            }
697
698            ArmOp::SetCond { .. } => {
699                // Condition evaluation pseudo-instruction
700                // Not a real ARM instruction, would be expanded to actual code
701                0xE1A00000 // NOP for now
702            }
703
704            ArmOp::SelectMove { .. } => {
705                // Conditional move pseudo-instruction for ARM32
706                // Would use MOV{cond} instruction
707                0xE1A00000 // NOP for now
708            }
709
710            ArmOp::Select { .. } => {
711                // Select pseudo-instruction
712                // Not a real ARM instruction, would be expanded to conditional moves
713                0xE1A00000 // NOP for now
714            }
715
716            ArmOp::LocalGet { .. } => {
717                // Local variable get pseudo-instruction
718                // Not a real ARM instruction, would be expanded to memory access
719                0xE1A00000 // NOP for now
720            }
721
722            ArmOp::LocalSet { .. } => {
723                // Local variable set pseudo-instruction
724                // Not a real ARM instruction, would be expanded to memory access
725                0xE1A00000 // NOP for now
726            }
727
728            ArmOp::LocalTee { .. } => {
729                // Local variable tee pseudo-instruction
730                // Not a real ARM instruction, would be expanded to memory access
731                0xE1A00000 // NOP for now
732            }
733
734            ArmOp::GlobalGet { .. } => {
735                // Global variable get pseudo-instruction
736                // Not a real ARM instruction, would be expanded to memory access
737                0xE1A00000 // NOP for now
738            }
739
740            ArmOp::GlobalSet { .. } => {
741                // Global variable set pseudo-instruction
742                // Not a real ARM instruction, would be expanded to memory access
743                0xE1A00000 // NOP for now
744            }
745
746            ArmOp::BrTable { .. } => {
747                // Branch table pseudo-instruction
748                // Not a real ARM instruction, would be expanded to jump table
749                0xE1A00000 // NOP for now
750            }
751
752            ArmOp::Call { .. } => {
753                // Function call pseudo-instruction
754                // Not a real ARM instruction, would be expanded to BL
755                0xE1A00000 // NOP for now
756            }
757
758            ArmOp::CallIndirect { .. } => {
759                // Indirect function call pseudo-instruction
760                // Not a real ARM instruction, would be expanded to indirect branch
761                0xE1A00000 // NOP for now
762            }
763
764            // i64 pseudo-instructions (Phase 2) - encode as NOP for now
765            // Real compiler would expand these to multi-instruction sequences
766            ArmOp::I64Add { .. } => 0xE1A00000,        // NOP
767            ArmOp::I64Sub { .. } => 0xE1A00000,        // NOP
768            ArmOp::I64DivS { .. } => 0xE1A00000,       // NOP
769            ArmOp::I64DivU { .. } => 0xE1A00000,       // NOP
770            ArmOp::I64RemS { .. } => 0xE1A00000,       // NOP
771            ArmOp::I64RemU { .. } => 0xE1A00000,       // NOP
772            ArmOp::I64Clz { .. } => 0xE1A00000,        // NOP
773            ArmOp::I64Ctz { .. } => 0xE1A00000,        // NOP
774            ArmOp::I64Popcnt { .. } => 0xE1A00000,     // NOP
775            ArmOp::I64And { .. } => 0xE1A00000,        // NOP
776            ArmOp::I64Or { .. } => 0xE1A00000,         // NOP
777            ArmOp::I64Xor { .. } => 0xE1A00000,        // NOP
778            ArmOp::I64Eqz { .. } => 0xE1A00000,        // NOP
779            ArmOp::I64Eq { .. } => 0xE1A00000,         // NOP
780            ArmOp::I64Ne { .. } => 0xE1A00000,         // NOP
781            ArmOp::I64LtS { .. } => 0xE1A00000,        // NOP
782            ArmOp::I64LtU { .. } => 0xE1A00000,        // NOP
783            ArmOp::I64LeS { .. } => 0xE1A00000,        // NOP
784            ArmOp::I64LeU { .. } => 0xE1A00000,        // NOP
785            ArmOp::I64GtS { .. } => 0xE1A00000,        // NOP
786            ArmOp::I64GtU { .. } => 0xE1A00000,        // NOP
787            ArmOp::I64GeS { .. } => 0xE1A00000,        // NOP
788            ArmOp::I64GeU { .. } => 0xE1A00000,        // NOP
789            ArmOp::I64Const { .. } => 0xE1A00000,      // NOP
790            ArmOp::I64Ldr { .. } => 0xE1A00000,        // NOP
791            ArmOp::I64Str { .. } => 0xE1A00000,        // NOP
792            ArmOp::I64ExtendI32S { .. } => 0xE1A00000, // NOP
793            ArmOp::I64ExtendI32U { .. } => 0xE1A00000, // NOP
794            ArmOp::I64Extend8S { .. } => 0xE1A00000,   // NOP (Thumb-2 only)
795            ArmOp::I64Extend16S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
796            ArmOp::I64Extend32S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
797            ArmOp::I32WrapI64 { .. } => 0xE1A00000,    // NOP
798
799            // f32 VFP single-precision instructions
800            ArmOp::F32Add { sd, sn, sm } => encode_vfp_3reg(0xEE300A00, sd, sn, sm)?,
801            ArmOp::F32Sub { sd, sn, sm } => encode_vfp_3reg(0xEE300A40, sd, sn, sm)?,
802            ArmOp::F32Mul { sd, sn, sm } => encode_vfp_3reg(0xEE200A00, sd, sn, sm)?,
803            ArmOp::F32Div { sd, sn, sm } => encode_vfp_3reg(0xEE800A00, sd, sn, sm)?,
804            ArmOp::F32Abs { sd, sm } => encode_vfp_2reg(0xEEB00AC0, sd, sm)?,
805            ArmOp::F32Neg { sd, sm } => encode_vfp_2reg(0xEEB10A40, sd, sm)?,
806            ArmOp::F32Sqrt { sd, sm } => encode_vfp_2reg(0xEEB10AC0, sd, sm)?,
807
808            // f32 pseudo-ops — multi-instruction sequences
809            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
810            ArmOp::F32Ceil { sd, sm } => {
811                return self.encode_arm_f32_rounding(sd, sm, 0b01); // Round toward +Inf
812            }
813            ArmOp::F32Floor { sd, sm } => {
814                return self.encode_arm_f32_rounding(sd, sm, 0b10); // Round toward -Inf
815            }
816            ArmOp::F32Trunc { sd, sm } => {
817                return self.encode_arm_f32_rounding(sd, sm, 0b11); // VCVT toward zero
818            }
819            ArmOp::F32Nearest { sd, sm } => {
820                return self.encode_arm_f32_rounding(sd, sm, 0b00); // VCVT to nearest
821            }
822            ArmOp::F32Min { sd, sn, sm } => {
823                return self.encode_arm_f32_minmax(sd, sn, sm, true);
824            }
825            ArmOp::F32Max { sd, sn, sm } => {
826                return self.encode_arm_f32_minmax(sd, sn, sm, false);
827            }
828            ArmOp::F32Copysign { sd, sn, sm } => {
829                return self.encode_arm_f32_copysign(sd, sn, sm);
830            }
831
832            // f32 comparisons — multi-instruction: VCMP + VMRS + conditional MOV
833            ArmOp::F32Eq { rd, sn, sm } => {
834                return self.encode_arm_f32_compare(rd, sn, sm, 0x0); // EQ
835            }
836            ArmOp::F32Ne { rd, sn, sm } => {
837                return self.encode_arm_f32_compare(rd, sn, sm, 0x1); // NE
838            }
839            ArmOp::F32Lt { rd, sn, sm } => {
840                return self.encode_arm_f32_compare(rd, sn, sm, 0x4); // MI (less than)
841            }
842            ArmOp::F32Le { rd, sn, sm } => {
843                return self.encode_arm_f32_compare(rd, sn, sm, 0x9); // LS (less or same)
844            }
845            ArmOp::F32Gt { rd, sn, sm } => {
846                return self.encode_arm_f32_compare(rd, sn, sm, 0xC); // GT
847            }
848            ArmOp::F32Ge { rd, sn, sm } => {
849                return self.encode_arm_f32_compare(rd, sn, sm, 0xA); // GE
850            }
851
852            // f32 const — multi-instruction: MOVW + MOVT + VMOV
853            ArmOp::F32Const { sd, value } => {
854                return self.encode_arm_f32_const(sd, *value);
855            }
856
857            ArmOp::F32Load { sd, addr } => encode_vfp_ldst(0xED900A00, sd, addr)?,
858            ArmOp::F32Store { sd, addr } => encode_vfp_ldst(0xED800A00, sd, addr)?,
859
860            // f32 conversions — multi-instruction sequences
861            ArmOp::F32ConvertI32S { sd, rm } => {
862                return self.encode_arm_f32_convert_i32(sd, rm, true);
863            }
864            ArmOp::F32ConvertI32U { sd, rm } => {
865                return self.encode_arm_f32_convert_i32(sd, rm, false);
866            }
867            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
868                return Err(synth_core::Error::synthesis(
869                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
870                ));
871            }
872            ArmOp::F32ReinterpretI32 { sd, rm } => encode_vmov_core_sreg(true, sd, rm)?,
873            ArmOp::I32ReinterpretF32 { rd, sm } => encode_vmov_core_sreg(false, sm, rd)?,
874            ArmOp::I32TruncF32S { rd, sm } => {
875                return self.encode_arm_i32_trunc_f32(rd, sm, true);
876            }
877            ArmOp::I32TruncF32U { rd, sm } => {
878                return self.encode_arm_i32_trunc_f32(rd, sm, false);
879            }
880
881            // f64 VFP double-precision instructions (ARM32)
882            // F64 arithmetic: same as F32 but with sz=1 (bit 8 = 1, cp11 = 0xB)
883            ArmOp::F64Add { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B00, dd, dn, dm)?,
884            ArmOp::F64Sub { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B40, dd, dn, dm)?,
885            ArmOp::F64Mul { dd, dn, dm } => encode_vfp_3reg_f64(0xEE200B00, dd, dn, dm)?,
886            ArmOp::F64Div { dd, dn, dm } => encode_vfp_3reg_f64(0xEE800B00, dd, dn, dm)?,
887            ArmOp::F64Abs { dd, dm } => encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?,
888            ArmOp::F64Neg { dd, dm } => encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?,
889            ArmOp::F64Sqrt { dd, dm } => encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?,
890
891            // f64 pseudo-ops
892            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
893            ArmOp::F64Ceil { dd, dm } => {
894                return self.encode_arm_f64_rounding(dd, dm, 0b01);
895            }
896            ArmOp::F64Floor { dd, dm } => {
897                return self.encode_arm_f64_rounding(dd, dm, 0b10);
898            }
899            ArmOp::F64Trunc { dd, dm } => {
900                return self.encode_arm_f64_rounding(dd, dm, 0b11);
901            }
902            ArmOp::F64Nearest { dd, dm } => {
903                return self.encode_arm_f64_rounding(dd, dm, 0b00);
904            }
905            ArmOp::F64Min { dd, dn, dm } => {
906                return self.encode_arm_f64_minmax(dd, dn, dm, true);
907            }
908            ArmOp::F64Max { dd, dn, dm } => {
909                return self.encode_arm_f64_minmax(dd, dn, dm, false);
910            }
911            ArmOp::F64Copysign { dd, dn, dm } => {
912                return self.encode_arm_f64_copysign(dd, dn, dm);
913            }
914
915            // f64 comparisons
916            ArmOp::F64Eq { rd, dn, dm } => {
917                return self.encode_arm_f64_compare(rd, dn, dm, 0x0);
918            }
919            ArmOp::F64Ne { rd, dn, dm } => {
920                return self.encode_arm_f64_compare(rd, dn, dm, 0x1);
921            }
922            ArmOp::F64Lt { rd, dn, dm } => {
923                return self.encode_arm_f64_compare(rd, dn, dm, 0x4);
924            }
925            ArmOp::F64Le { rd, dn, dm } => {
926                return self.encode_arm_f64_compare(rd, dn, dm, 0x9);
927            }
928            ArmOp::F64Gt { rd, dn, dm } => {
929                return self.encode_arm_f64_compare(rd, dn, dm, 0xC);
930            }
931            ArmOp::F64Ge { rd, dn, dm } => {
932                return self.encode_arm_f64_compare(rd, dn, dm, 0xA);
933            }
934
935            ArmOp::F64Const { dd, value } => {
936                return self.encode_arm_f64_const(dd, *value);
937            }
938
939            ArmOp::F64Load { dd, addr } => encode_vfp_ldst_f64(0xED900B00, dd, addr)?,
940            ArmOp::F64Store { dd, addr } => encode_vfp_ldst_f64(0xED800B00, dd, addr)?,
941
942            ArmOp::F64ConvertI32S { dd, rm } => {
943                return self.encode_arm_f64_convert_i32(dd, rm, true);
944            }
945            ArmOp::F64ConvertI32U { dd, rm } => {
946                return self.encode_arm_f64_convert_i32(dd, rm, false);
947            }
948            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
949                return Err(synth_core::Error::synthesis(
950                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
951                ));
952            }
953            ArmOp::F64PromoteF32 { dd, sm } => {
954                return self.encode_arm_f64_promote_f32(dd, sm);
955            }
956            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => {
957                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?
958            }
959            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => {
960                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?
961            }
962            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
963                return Err(synth_core::Error::synthesis(
964                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
965                ));
966            }
967            ArmOp::I32TruncF64S { rd, dm } => {
968                return self.encode_arm_i32_trunc_f64(rd, dm, true);
969            }
970            ArmOp::I32TruncF64U { rd, dm } => {
971                return self.encode_arm_i32_trunc_f64(rd, dm, false);
972            }
973            // Multi-instruction sequences - only meaningful in Thumb-2 mode
974            ArmOp::I64SetCond { .. }
975            | ArmOp::I64SetCondZ { .. }
976            | ArmOp::I64Mul { .. }
977            | ArmOp::I64Shl { .. }
978            | ArmOp::I64ShrS { .. }
979            | ArmOp::I64ShrU { .. }
980            | ArmOp::I64Rotl { .. }
981            | ArmOp::I64Rotr { .. } => 0xE1A00000, // NOP (Thumb-2 only)
982
983            // MVE instructions — Thumb-2 only (Cortex-M55 is always Thumb-2)
984            ArmOp::MveLoad { .. }
985            | ArmOp::MveStore { .. }
986            | ArmOp::MveConst { .. }
987            | ArmOp::MveAnd { .. }
988            | ArmOp::MveOrr { .. }
989            | ArmOp::MveEor { .. }
990            | ArmOp::MveMvn { .. }
991            | ArmOp::MveBic { .. }
992            | ArmOp::MveAddI { .. }
993            | ArmOp::MveSubI { .. }
994            | ArmOp::MveMulI { .. }
995            | ArmOp::MveNegI { .. }
996            | ArmOp::MveCmpEqI { .. }
997            | ArmOp::MveCmpNeI { .. }
998            | ArmOp::MveCmpLtS { .. }
999            | ArmOp::MveCmpLtU { .. }
1000            | ArmOp::MveCmpGtS { .. }
1001            | ArmOp::MveCmpGtU { .. }
1002            | ArmOp::MveCmpLeS { .. }
1003            | ArmOp::MveCmpLeU { .. }
1004            | ArmOp::MveCmpGeS { .. }
1005            | ArmOp::MveCmpGeU { .. }
1006            | ArmOp::MveDup { .. }
1007            | ArmOp::MveExtractLane { .. }
1008            | ArmOp::MveInsertLane { .. }
1009            | ArmOp::MveAddF32 { .. }
1010            | ArmOp::MveSubF32 { .. }
1011            | ArmOp::MveMulF32 { .. }
1012            | ArmOp::MveNegF32 { .. }
1013            | ArmOp::MveAbsF32 { .. }
1014            | ArmOp::MveCmpEqF32 { .. }
1015            | ArmOp::MveCmpNeF32 { .. }
1016            | ArmOp::MveCmpLtF32 { .. }
1017            | ArmOp::MveCmpLeF32 { .. }
1018            | ArmOp::MveCmpGtF32 { .. }
1019            | ArmOp::MveCmpGeF32 { .. }
1020            | ArmOp::MveDupF32 { .. }
1021            | ArmOp::MveExtractLaneF32 { .. }
1022            | ArmOp::MveReplaceLaneF32 { .. }
1023            | ArmOp::MveDivF32 { .. }
1024            | ArmOp::MveSqrtF32 { .. } => 0xE1A00000, // NOP (MVE = Thumb-2 only)
1025        };
1026
1027        // ARM32 instructions are little-endian
1028        Ok(instr.to_le_bytes().to_vec())
1029    }
1030
1031    // === ARM32 VFP multi-instruction helpers ===
1032
1033    /// Encode F32 comparison as ARM32: VCMP.F32 + VMRS + MOV rd,#0 + MOVcond rd,#1
1034    fn encode_arm_f32_compare(
1035        &self,
1036        rd: &Reg,
1037        sn: &VfpReg,
1038        sm: &VfpReg,
1039        cond_code: u32,
1040    ) -> Result<Vec<u8>> {
1041        let mut bytes = Vec::new();
1042
1043        // VCMP.F32 Sn, Sm: 0xEEB40A40 with Sn in Vd position, Sm in Vm position
1044        let sn_num = vfp_sreg_to_num(sn)?;
1045        let sm_num = vfp_sreg_to_num(sm)?;
1046        let (vd, d) = encode_sreg(sn_num);
1047        let (vm, m) = encode_sreg(sm_num);
1048        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1049        bytes.extend_from_slice(&vcmp.to_le_bytes());
1050
1051        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
1052        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1053
1054        // MOV rd, #0: 0xE3A0_0000 | (rd << 12)
1055        let rd_bits = reg_to_bits(rd);
1056        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1057        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1058
1059        // MOVcond rd, #1: cond(4) | 0011 1010 0000 rd(4) 0000 0000 0001
1060        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1061        bytes.extend_from_slice(&mov_one.to_le_bytes());
1062
1063        Ok(bytes)
1064    }
1065
1066    /// Encode F32 constant load as ARM32: MOVW Rt,#lo16 + MOVT Rt,#hi16 + VMOV Sd,Rt
1067    fn encode_arm_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
1068        let mut bytes = Vec::new();
1069        let bits = value.to_bits();
1070
1071        // Use R12 as temp register for constant loading
1072        let rt: u32 = 12; // R12/IP
1073
1074        // MOVW R12, #lo16: 0xE300_C000 | (imm4 << 16) | imm12
1075        let lo16 = bits & 0xFFFF;
1076        let movw = 0xE3000000 | (rt << 12) | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1077        bytes.extend_from_slice(&movw.to_le_bytes());
1078
1079        // MOVT R12, #hi16: 0xE340_C000 | (imm4 << 16) | imm12
1080        let hi16 = (bits >> 16) & 0xFFFF;
1081        let movt = 0xE3400000 | (rt << 12) | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1082        bytes.extend_from_slice(&movt.to_le_bytes());
1083
1084        // VMOV Sd, R12
1085        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
1086        bytes.extend_from_slice(&vmov.to_le_bytes());
1087
1088        Ok(bytes)
1089    }
1090
1091    /// Encode VMOV + VCVT.F32.S32/U32 as ARM32
1092    fn encode_arm_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1093        let mut bytes = Vec::new();
1094
1095        // VMOV Sd, Rm — move integer to VFP register
1096        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
1097        bytes.extend_from_slice(&vmov.to_le_bytes());
1098
1099        // VCVT.F32.S32 Sd, Sd (signed) or VCVT.F32.U32 Sd, Sd (unsigned)
1100        // Base: 0xEEB80A40 (signed) or 0xEEB80AC0 (unsigned)
1101        let sd_num = vfp_sreg_to_num(sd)?;
1102        let (vd, d) = encode_sreg(sd_num);
1103        let (vm, m) = encode_sreg(sd_num); // same register as source
1104        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
1105        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1106        bytes.extend_from_slice(&vcvt.to_le_bytes());
1107
1108        Ok(bytes)
1109    }
1110
1111    /// Encode F32 rounding pseudo-op as ARM32 via VCVT to integer and back.
1112    /// mode: 0b00=nearest, 0b01=floor(-Inf), 0b10=ceil(+Inf), 0b11=trunc(zero)
1113    /// Strategy: VCVT.S32.F32 Sd, Sm (toward zero), then VCVT.F32.S32 Sd, Sd
1114    /// For ceil/floor/nearest, we use VCVTR (round toward mode) + convert back.
1115    /// Simplified: convert to int (toward zero for trunc) then back to float.
1116    /// Encode F32 rounding as ARM32.
1117    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1118    ///
1119    /// For trunc (mode=0b11): uses VCVTR.S32.F32 (always rounds toward zero).
1120    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant
1121    /// which honours FPSCR rmode), then restores FPSCR.
1122    fn encode_arm_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1123        let mut bytes = Vec::new();
1124        let sm_num = vfp_sreg_to_num(sm)?;
1125        let sd_num = vfp_sreg_to_num(sd)?;
1126        let (vd_s, d_s) = encode_sreg(sd_num);
1127        let (vm_s, m_s) = encode_sreg(sm_num);
1128
1129        if mode == 0b11 {
1130            // Trunc (toward zero): VCVTR.S32.F32 — the "R" variant always truncates.
1131            // 0xEEBD0AC0: bit[7]=1 => round toward zero regardless of FPSCR
1132            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1133            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1134        } else {
1135            // ceil/floor/nearest: manipulate FPSCR rounding mode
1136            let rt: u32 = 12; // R12/IP as temp
1137
1138            // VMRS R12, FPSCR
1139            let vmrs = 0xEEF10A10 | (rt << 12);
1140            bytes.extend_from_slice(&vmrs.to_le_bytes());
1141
1142            // BIC R12, R12, #(3 << 22) — clear RMode bits [23:22]
1143            // 3<<22 = 0x00C00000. ARM rotated imm: 0x03 ror 10 (rotation=5, imm8=0x03)
1144            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1145            bytes.extend_from_slice(&bic.to_le_bytes());
1146
1147            // ORR R12, R12, #(mode << 22) — set desired rounding mode
1148            if mode != 0 {
1149                // mode<<22: rotation=5, imm8=mode
1150                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1151                bytes.extend_from_slice(&orr.to_le_bytes());
1152            }
1153
1154            // VMSR FPSCR, R12
1155            let vmsr = 0xEEE10A10 | (rt << 12);
1156            bytes.extend_from_slice(&vmsr.to_le_bytes());
1157
1158            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rounding mode
1159            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1160            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1161
1162            // Restore FPSCR: clear rmode bits back to nearest (default)
1163            bytes.extend_from_slice(&vmrs.to_le_bytes());
1164            bytes.extend_from_slice(&bic.to_le_bytes());
1165            bytes.extend_from_slice(&vmsr.to_le_bytes());
1166        }
1167
1168        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
1169        let (vd2, d2) = encode_sreg(sd_num);
1170        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
1171        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1172
1173        Ok(bytes)
1174    }
1175
1176    /// Encode F32 min/max as ARM32: VCMP + VMRS + conditional VMOV
1177    fn encode_arm_f32_minmax(
1178        &self,
1179        sd: &VfpReg,
1180        sn: &VfpReg,
1181        sm: &VfpReg,
1182        is_min: bool,
1183    ) -> Result<Vec<u8>> {
1184        let mut bytes = Vec::new();
1185        let sn_num = vfp_sreg_to_num(sn)?;
1186        let sm_num = vfp_sreg_to_num(sm)?;
1187        let sd_num = vfp_sreg_to_num(sd)?;
1188
1189        // VMOV Sd, Sn (start with first operand)
1190        let (vd, d) = encode_sreg(sd_num);
1191        let (vn, n) = encode_sreg(sn_num);
1192        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1193        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1194
1195        // VCMP.F32 Sn, Sm
1196        let (vm, m) = encode_sreg(sm_num);
1197        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1198        bytes.extend_from_slice(&vcmp.to_le_bytes());
1199
1200        // VMRS APSR_nzcv, FPSCR
1201        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1202
1203        // For min: if Sn > Sm (GT), use Sm. Condition = GT (0xC)
1204        // For max: if Sn < Sm (MI/LT), use Sm. Condition = MI (0x4)
1205        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1206
1207        // VMOV{cond} Sd, Sm — conditional VMOV
1208        let vmov_cond = (cond << 28) | 0x0EB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1209        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1210
1211        Ok(bytes)
1212    }
1213
1214    /// Encode F32 copysign as ARM32: extract sign from Sm, magnitude from Sn
1215    fn encode_arm_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1216        let mut bytes = Vec::new();
1217
1218        // VMOV R12, Sm (get sign source bits)
1219        let vmov_sm = encode_vmov_core_sreg(false, sm, &Reg::R12)?;
1220        bytes.extend_from_slice(&vmov_sm.to_le_bytes());
1221
1222        // VMOV R0, Sn (get magnitude source bits) — use R0 as temp
1223        let vmov_sn = encode_vmov_core_sreg(false, sn, &Reg::R0)?;
1224        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1225
1226        // AND R12, R12, #0x80000000 (keep only sign bit)
1227        // Thumb-2 constant 0x80000000 needs special encoding; in ARM32 use rotated imm
1228        // 0x80000000 = 0x02 rotated right by 2 (rotation=1, imm8=0x02)
1229        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1230        bytes.extend_from_slice(&and_sign.to_le_bytes());
1231
1232        // BIC R0, R0, #0x80000000 (clear sign bit from magnitude)
1233        // R0 = register 0, so Rn and Rd fields are 0
1234        let bic_sign = 0xE3C00000u32 | (1 << 8) | 0x02;
1235        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1236
1237        // ORR R0, R0, R12 (combine sign + magnitude)
1238        // R0 = register 0, so Rn and Rd fields are 0
1239        let orr = 0xE1800000u32 | 12;
1240        bytes.extend_from_slice(&orr.to_le_bytes());
1241
1242        // VMOV Sd, R0
1243        let vmov_result = encode_vmov_core_sreg(true, sd, &Reg::R0)?;
1244        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1245
1246        Ok(bytes)
1247    }
1248
1249    /// Encode F64 comparison as ARM32: VCMP.F64 + VMRS + MOV rd,#0 + MOVcond rd,#1
1250    fn encode_arm_f64_compare(
1251        &self,
1252        rd: &Reg,
1253        dn: &VfpReg,
1254        dm: &VfpReg,
1255        cond_code: u32,
1256    ) -> Result<Vec<u8>> {
1257        let mut bytes = Vec::new();
1258
1259        // VCMP.F64 Dn, Dm: 0xEEB40B40 with Dn in Vd position, Dm in Vm position
1260        let dn_num = vfp_dreg_to_num(dn)?;
1261        let dm_num = vfp_dreg_to_num(dm)?;
1262        let (vd, d) = encode_dreg(dn_num);
1263        let (vm, m) = encode_dreg(dm_num);
1264        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1265        bytes.extend_from_slice(&vcmp.to_le_bytes());
1266
1267        // VMRS APSR_nzcv, FPSCR
1268        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1269
1270        // MOV rd, #0
1271        let rd_bits = reg_to_bits(rd);
1272        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1273        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1274
1275        // MOVcond rd, #1
1276        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1277        bytes.extend_from_slice(&mov_one.to_le_bytes());
1278
1279        Ok(bytes)
1280    }
1281
1282    /// Encode F64 constant load as ARM32: MOVW + MOVT + MOVW + MOVT + VMOV
1283    fn encode_arm_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
1284        let mut bytes = Vec::new();
1285        let bits = value.to_bits();
1286        let lo32 = bits as u32;
1287        let hi32 = (bits >> 32) as u32;
1288
1289        // Load low 32 bits into R0 (Rd field = 0 for R0)
1290        let lo16 = lo32 & 0xFFFF;
1291        let movw_r0 = 0xE3000000 | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1292        bytes.extend_from_slice(&movw_r0.to_le_bytes());
1293        let hi16 = (lo32 >> 16) & 0xFFFF;
1294        let movt_r0 = 0xE3400000 | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1295        bytes.extend_from_slice(&movt_r0.to_le_bytes());
1296
1297        // Load high 32 bits into R12
1298        let lo16 = hi32 & 0xFFFF;
1299        let movw_r12 = 0xE3000000 | ((lo16 >> 12) << 16) | (12 << 12) | (lo16 & 0xFFF);
1300        bytes.extend_from_slice(&movw_r12.to_le_bytes());
1301        let hi16 = (hi32 >> 16) & 0xFFFF;
1302        let movt_r12 = 0xE3400000 | ((hi16 >> 12) << 16) | (12 << 12) | (hi16 & 0xFFF);
1303        bytes.extend_from_slice(&movt_r12.to_le_bytes());
1304
1305        // VMOV Dd, R0, R12
1306        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
1307        bytes.extend_from_slice(&vmov.to_le_bytes());
1308
1309        Ok(bytes)
1310    }
1311
1312    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as ARM32
1313    fn encode_arm_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1314        let mut bytes = Vec::new();
1315
1316        // Use S0 as intermediate: VMOV S0, Rm
1317        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
1318        bytes.extend_from_slice(&vmov.to_le_bytes());
1319
1320        // VCVT.F64.S32 Dd, S0 (signed) or VCVT.F64.U32 Dd, S0 (unsigned)
1321        // Base: 0xEEB80B40 (signed) or 0xEEB80BC0 (unsigned)
1322        let dd_num = vfp_dreg_to_num(dd)?;
1323        let (vd, d) = encode_dreg(dd_num);
1324        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
1325        // S0 is register 0: Vm=0, M=0
1326        let vcvt = base | (d << 22) | (vd << 12);
1327        bytes.extend_from_slice(&vcvt.to_le_bytes());
1328
1329        Ok(bytes)
1330    }
1331
1332    /// Encode VCVT.F64.F32 Dd, Sm as ARM32 (f32 to f64 promotion)
1333    fn encode_arm_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1334        let dd_num = vfp_dreg_to_num(dd)?;
1335        let sm_num = vfp_sreg_to_num(sm)?;
1336        let (vd, d) = encode_dreg(dd_num);
1337        let (vm, m) = encode_sreg(sm_num);
1338
1339        // VCVT.F64.F32 Dd, Sm: 0xEEB70AC0
1340        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
1341        Ok(vcvt.to_le_bytes().to_vec())
1342    }
1343
1344    /// Encode VCVT.S32/U32.F64 Sd, Dm + VMOV Rd, Sd as ARM32
1345    fn encode_arm_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1346        let mut bytes = Vec::new();
1347        let dm_num = vfp_dreg_to_num(dm)?;
1348        let (vm, m) = encode_dreg(dm_num);
1349
1350        // VCVT.S32.F64 S0, Dm (toward zero) or VCVT.U32.F64 S0, Dm
1351        // S0: Vd=0, D=0
1352        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
1353        let vcvt = base | (m << 5) | vm;
1354        bytes.extend_from_slice(&vcvt.to_le_bytes());
1355
1356        // VMOV Rd, S0
1357        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
1358        bytes.extend_from_slice(&vmov.to_le_bytes());
1359
1360        Ok(bytes)
1361    }
1362
1363    /// Encode F64 rounding pseudo-op as ARM32 via VCVT to integer and back.
1364    /// Encode F64 rounding as ARM32.
1365    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1366    ///
1367    /// For trunc: uses VCVTR.S32.F64 (always truncates).
1368    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F64 (non-R variant),
1369    /// then restores FPSCR.
1370    fn encode_arm_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1371        let mut bytes = Vec::new();
1372        let dm_num = vfp_dreg_to_num(dm)?;
1373        let dd_num = vfp_dreg_to_num(dd)?;
1374        let (vm, m) = encode_dreg(dm_num);
1375        let (vd, d) = encode_dreg(dd_num);
1376
1377        if mode == 0b11 {
1378            // Trunc (toward zero): VCVTR.S32.F64 — bit[7]=1, always truncates
1379            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
1380            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1381        } else {
1382            // ceil/floor/nearest: manipulate FPSCR rounding mode
1383            let rt: u32 = 12;
1384
1385            // VMRS R12, FPSCR
1386            let vmrs = 0xEEF10A10 | (rt << 12);
1387            bytes.extend_from_slice(&vmrs.to_le_bytes());
1388
1389            // BIC R12, R12, #(3 << 22)
1390            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1391            bytes.extend_from_slice(&bic.to_le_bytes());
1392
1393            // ORR R12, R12, #(mode << 22)
1394            if mode != 0 {
1395                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1396                bytes.extend_from_slice(&orr.to_le_bytes());
1397            }
1398
1399            // VMSR FPSCR, R12
1400            let vmsr = 0xEEE10A10 | (rt << 12);
1401            bytes.extend_from_slice(&vmsr.to_le_bytes());
1402
1403            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0), uses FPSCR rmode
1404            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
1405            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1406
1407            // Restore FPSCR
1408            bytes.extend_from_slice(&vmrs.to_le_bytes());
1409            bytes.extend_from_slice(&bic.to_le_bytes());
1410            bytes.extend_from_slice(&vmsr.to_le_bytes());
1411        }
1412
1413        // VCVT.F64.S32 Dd, S0 (convert back to double)
1414        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
1415        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1416
1417        Ok(bytes)
1418    }
1419
1420    /// Encode F64 min/max as ARM32: VMOV + VCMP + VMRS + conditional VMOV
1421    fn encode_arm_f64_minmax(
1422        &self,
1423        dd: &VfpReg,
1424        dn: &VfpReg,
1425        dm: &VfpReg,
1426        is_min: bool,
1427    ) -> Result<Vec<u8>> {
1428        let mut bytes = Vec::new();
1429        let dn_num = vfp_dreg_to_num(dn)?;
1430        let dm_num = vfp_dreg_to_num(dm)?;
1431        let dd_num = vfp_dreg_to_num(dd)?;
1432
1433        // VMOV.F64 Dd, Dn (start with first operand)
1434        let (vd, d) = encode_dreg(dd_num);
1435        let (vn, n) = encode_dreg(dn_num);
1436        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1437        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1438
1439        // VCMP.F64 Dn, Dm
1440        let (vm, m) = encode_dreg(dm_num);
1441        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1442        bytes.extend_from_slice(&vcmp.to_le_bytes());
1443
1444        // VMRS APSR_nzcv, FPSCR
1445        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1446
1447        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1448        let vmov_cond = (cond << 28) | 0x0EB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1449        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1450
1451        Ok(bytes)
1452    }
1453
1454    /// Encode F64 copysign as ARM32
1455    fn encode_arm_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
1456        let mut bytes = Vec::new();
1457
1458        // VMOV R0, R12, Dm (get sign source bits)
1459        let vmov_dm = encode_vmov_core_dreg(false, dm, &Reg::R0, &Reg::R12)?;
1460        bytes.extend_from_slice(&vmov_dm.to_le_bytes());
1461
1462        // VMOV R1, R2, Dn (get magnitude source bits)
1463        // We use R1 (lo) and R2 (hi) for the magnitude
1464        let vmov_dn = encode_vmov_core_dreg(false, dn, &Reg::R1, &Reg::R2)?;
1465        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1466
1467        // AND R12, R12, #0x80000000 (keep only sign bit from hi word)
1468        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1469        bytes.extend_from_slice(&and_sign.to_le_bytes());
1470
1471        // BIC R2, R2, #0x80000000 (clear sign bit from magnitude hi word)
1472        let bic_sign = 0xE3C00000u32 | (2 << 16) | (2 << 12) | (1 << 8) | 0x02;
1473        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1474
1475        // ORR R2, R2, R12 (combine sign + magnitude)
1476        let orr = 0xE1800000u32 | (2 << 16) | (2 << 12) | 12;
1477        bytes.extend_from_slice(&orr.to_le_bytes());
1478
1479        // VMOV Dd, R1, R2
1480        let vmov_result = encode_vmov_core_dreg(true, dd, &Reg::R1, &Reg::R2)?;
1481        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1482
1483        Ok(bytes)
1484    }
1485
1486    /// Encode VCVT.S32/U32.F32 + VMOV as ARM32
1487    fn encode_arm_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1488        let mut bytes = Vec::new();
1489
1490        // VCVT.S32.F32 Sd, Sm (toward zero) or VCVT.U32.F32 Sd, Sm
1491        // We use Sm as both source and destination for the intermediate result
1492        let sm_num = vfp_sreg_to_num(sm)?;
1493        let (vd, d) = encode_sreg(sm_num);
1494        let (vm, m) = encode_sreg(sm_num);
1495        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
1496        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1497        bytes.extend_from_slice(&vcvt.to_le_bytes());
1498
1499        // VMOV Rd, Sm — move result back to core register
1500        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
1501        bytes.extend_from_slice(&vmov.to_le_bytes());
1502
1503        Ok(bytes)
1504    }
1505
1506    /// Encode an ARM instruction in Thumb-2 mode (16-bit or 32-bit instructions)
1507    fn encode_thumb(&self, op: &ArmOp) -> Result<Vec<u8>> {
1508        // Thumb-2 supports both 16-bit and 32-bit instructions
1509        // 32-bit instructions are encoded as two 16-bit halfwords (big-endian order)
1510        match op {
1511            // === 16-bit Thumb encodings ===
1512            ArmOp::Add { rd, rn, op2 } => {
1513                let rd_bits = reg_to_bits(rd) as u16;
1514                let rn_bits = reg_to_bits(rn) as u16;
1515
1516                if let Operand2::Reg(rm) = op2 {
1517                    let rm_bits = reg_to_bits(rm) as u16;
1518                    // 16-bit ADDS only has 3-bit register fields (R0-R7). For
1519                    // high registers (e.g. R12, the MemLoad/MemStore base
1520                    // scratch) the bits overflow into adjacent fields, silently
1521                    // corrupting the operands — issue #178/#180: `add ip,ip,r0`
1522                    // was emitted as `adds r4,r5,r1`. Guard on all three regs
1523                    // being low and fall back to 32-bit ADD.W otherwise, exactly
1524                    // as the Sub handler below does.
1525                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1526                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1527                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1528                        Ok(instr.to_le_bytes().to_vec())
1529                    } else {
1530                        // ADD.W Rd, Rn, Rm (32-bit) for high registers
1531                        self.encode_thumb32_add_reg_raw(
1532                            rd_bits as u32,
1533                            rn_bits as u32,
1534                            rm_bits as u32,
1535                        )
1536                    }
1537                } else if let Operand2::Imm(imm) = op2 {
1538                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1539                        // ADDS Rd, Rn, #imm3 (16-bit): 0001 110 imm3 Rn Rd
1540                        let instr: u16 = 0x1C00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1541                        Ok(instr.to_le_bytes().to_vec())
1542                    } else {
1543                        // Use 32-bit ADD for larger immediates
1544                        self.encode_thumb32_add(rd, rn, *imm as u32)
1545                    }
1546                } else {
1547                    // Fallback to 32-bit encoding
1548                    self.encode_thumb32_add(rd, rn, 0)
1549                }
1550            }
1551
1552            ArmOp::Sub { rd, rn, op2 } => {
1553                let rd_bits = reg_to_bits(rd) as u16;
1554                let rn_bits = reg_to_bits(rn) as u16;
1555
1556                if let Operand2::Reg(rm) = op2 {
1557                    let rm_bits = reg_to_bits(rm) as u16;
1558                    // 16-bit SUBS can only use low registers (R0-R7)
1559                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1560                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1561                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1562                        Ok(instr.to_le_bytes().to_vec())
1563                    } else {
1564                        // Use 32-bit SUB.W for high registers
1565                        self.encode_thumb32_sub_reg_raw(
1566                            rd_bits as u32,
1567                            rn_bits as u32,
1568                            rm_bits as u32,
1569                        )
1570                    }
1571                } else if let Operand2::Imm(imm) = op2 {
1572                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1573                        // SUBS Rd, Rn, #imm3 (16-bit): 0001 111 imm3 Rn Rd
1574                        let instr: u16 = 0x1E00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1575                        Ok(instr.to_le_bytes().to_vec())
1576                    } else {
1577                        self.encode_thumb32_sub(rd, rn, *imm as u32)
1578                    }
1579                } else {
1580                    self.encode_thumb32_sub(rd, rn, 0)
1581                }
1582            }
1583
1584            ArmOp::Mov { rd, op2 } => {
1585                let rd_bits = reg_to_bits(rd) as u16;
1586
1587                if let Operand2::Imm(imm) = op2 {
1588                    if *imm <= 255 && rd_bits < 8 {
1589                        // MOVS Rd, #imm8 (16-bit): 0010 0 Rd imm8
1590                        let imm_bits = (*imm as u16) & 0xFF;
1591                        let instr: u16 = 0x2000 | (rd_bits << 8) | imm_bits;
1592                        Ok(instr.to_le_bytes().to_vec())
1593                    } else {
1594                        // Use 32-bit MOVW for larger immediates
1595                        self.encode_thumb32_movw(rd, *imm as u32)
1596                    }
1597                } else if let Operand2::Reg(rm) = op2 {
1598                    let rm_bits = reg_to_bits(rm) as u16;
1599                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
1600                    // D = Rd[3], Rd[2:0] in lower bits
1601                    let d_bit = (rd_bits >> 3) & 1;
1602                    let instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
1603                    Ok(instr.to_le_bytes().to_vec())
1604                } else {
1605                    let instr: u16 = 0xBF00; // NOP fallback
1606                    Ok(instr.to_le_bytes().to_vec())
1607                }
1608            }
1609
1610            ArmOp::Push { regs } => {
1611                // Thumb-2 PUSH encoding:
1612                // If all regs in R0-R7 + LR, use 16-bit: 1011 010 M rrrrrrrr
1613                // Otherwise use 32-bit: STMDB SP!, {regs} = 1110 1001 0010 1101 | 0M0 reglist(13)
1614                let mut reg_list: u16 = 0;
1615                let mut need_32bit = false;
1616                for r in regs {
1617                    let bit = reg_to_bits(r);
1618                    if bit >= 8 && *r != Reg::LR {
1619                        need_32bit = true;
1620                    }
1621                    reg_list |= 1 << bit;
1622                }
1623                if !need_32bit {
1624                    // 16-bit PUSH: 1011 010 M rrrrrrrr
1625                    let m_bit = if reg_list & (1 << 14) != 0 {
1626                        1u16
1627                    } else {
1628                        0u16
1629                    };
1630                    let low_regs = reg_list & 0xFF;
1631                    let instr: u16 = 0xB400 | (m_bit << 8) | low_regs;
1632                    Ok(instr.to_le_bytes().to_vec())
1633                } else {
1634                    // 32-bit STMDB SP!, {regs}: E92D | reglist(16)
1635                    let hw1: u16 = 0xE92D;
1636                    let hw2: u16 = reg_list;
1637                    let mut bytes = hw1.to_le_bytes().to_vec();
1638                    bytes.extend_from_slice(&hw2.to_le_bytes());
1639                    Ok(bytes)
1640                }
1641            }
1642
1643            ArmOp::Pop { regs } => {
1644                // Thumb-2 POP encoding:
1645                // If all regs in R0-R7 + PC, use 16-bit: 1011 110 P rrrrrrrr
1646                // Otherwise use 32-bit: LDMIA SP!, {regs} = 1110 1000 1011 1101 | PM0 reglist(13)
1647                let mut reg_list: u16 = 0;
1648                let mut need_32bit = false;
1649                for r in regs {
1650                    let bit = reg_to_bits(r);
1651                    if bit >= 8 && *r != Reg::PC {
1652                        need_32bit = true;
1653                    }
1654                    reg_list |= 1 << bit;
1655                }
1656                if !need_32bit {
1657                    // 16-bit POP: 1011 110 P rrrrrrrr
1658                    let p_bit = if reg_list & (1 << 15) != 0 {
1659                        1u16
1660                    } else {
1661                        0u16
1662                    };
1663                    let low_regs = reg_list & 0xFF;
1664                    let instr: u16 = 0xBC00 | (p_bit << 8) | low_regs;
1665                    Ok(instr.to_le_bytes().to_vec())
1666                } else {
1667                    // 32-bit LDMIA SP!, {regs}: E8BD | reglist(16)
1668                    let hw1: u16 = 0xE8BD;
1669                    let hw2: u16 = reg_list;
1670                    let mut bytes = hw1.to_le_bytes().to_vec();
1671                    bytes.extend_from_slice(&hw2.to_le_bytes());
1672                    Ok(bytes)
1673                }
1674            }
1675
1676            ArmOp::Nop => {
1677                let instr: u16 = 0xBF00; // NOP in Thumb-2
1678                Ok(instr.to_le_bytes().to_vec())
1679            }
1680
1681            ArmOp::Udf { imm } => {
1682                // UDF (Undefined) in Thumb-2: 16-bit encoding is 0xDE00 | imm8
1683                // This triggers UsageFault/HardFault, used for WASM traps
1684                let instr: u16 = 0xDE00 | (*imm as u16);
1685                let bytes = instr.to_le_bytes().to_vec();
1686                encoding_contracts::verify_thumb16(&bytes);
1687                Ok(bytes)
1688            }
1689
1690            // i64 support: ADDS, ADC, SUBS, SBC for register pair arithmetic
1691            // ADDS sets flags (carry), ADC uses carry from previous ADDS
1692            ArmOp::Adds { rd, rn, op2 } => {
1693                let rd_bits = reg_to_bits(rd) as u16;
1694                let rn_bits = reg_to_bits(rn) as u16;
1695
1696                if let Operand2::Reg(rm) = op2 {
1697                    let rm_bits = reg_to_bits(rm) as u16;
1698                    // 16-bit ADDS is R0-R7 only; i64 pair allocation can place
1699                    // operands in R8-R11, which would overflow the 3-bit fields
1700                    // and corrupt the operands (#178/#180 class). Guard and fall
1701                    // back to 32-bit ADDS.W for high registers.
1702                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1703                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1704                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1705                        Ok(instr.to_le_bytes().to_vec())
1706                    } else {
1707                        self.encode_thumb32_adds_reg_raw(
1708                            rd_bits as u32,
1709                            rn_bits as u32,
1710                            rm_bits as u32,
1711                        )
1712                    }
1713                } else {
1714                    // 32-bit Thumb-2 ADDS with immediate
1715                    self.encode_thumb32_adds(rd, rn, 0)
1716                }
1717            }
1718
1719            // ADC: Add with Carry (Thumb-2 32-bit)
1720            // ADC.W Rd, Rn, Rm: EB40 Rn | 00 Rd 00 Rm
1721            ArmOp::Adc { rd, rn, op2 } => {
1722                let rd_bits = reg_to_bits(rd);
1723                let rn_bits = reg_to_bits(rn);
1724
1725                if let Operand2::Reg(rm) = op2 {
1726                    let rm_bits = reg_to_bits(rm);
1727                    // ADC.W Rd, Rn, Rm (T2): 1110 1011 0100 Rn | 0 000 Rd 00 00 Rm
1728                    let hw1: u16 = (0xEB40 | rn_bits) as u16;
1729                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1730
1731                    let mut bytes = hw1.to_le_bytes().to_vec();
1732                    bytes.extend_from_slice(&hw2.to_le_bytes());
1733                    Ok(bytes)
1734                } else {
1735                    // ADC with immediate - use 32-bit encoding
1736                    let hw1: u16 = (0xF140 | rn_bits) as u16;
1737                    let hw2: u16 = (rd_bits << 8) as u16;
1738                    let mut bytes = hw1.to_le_bytes().to_vec();
1739                    bytes.extend_from_slice(&hw2.to_le_bytes());
1740                    Ok(bytes)
1741                }
1742            }
1743
1744            // SUBS sets flags (borrow), SBC uses borrow from previous SUBS
1745            ArmOp::Subs { rd, rn, op2 } => {
1746                let rd_bits = reg_to_bits(rd) as u16;
1747                let rn_bits = reg_to_bits(rn) as u16;
1748
1749                if let Operand2::Reg(rm) = op2 {
1750                    let rm_bits = reg_to_bits(rm) as u16;
1751                    // 16-bit SUBS is R0-R7 only; high-register i64 pair operands
1752                    // would overflow the 3-bit fields (#178/#180 class). Guard
1753                    // and fall back to 32-bit SUBS.W for high registers.
1754                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1755                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1756                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1757                        Ok(instr.to_le_bytes().to_vec())
1758                    } else {
1759                        self.encode_thumb32_subs_reg_raw(
1760                            rd_bits as u32,
1761                            rn_bits as u32,
1762                            rm_bits as u32,
1763                        )
1764                    }
1765                } else {
1766                    // 32-bit Thumb-2 SUBS with immediate
1767                    self.encode_thumb32_subs(rd, rn, 0)
1768                }
1769            }
1770
1771            // SBC: Subtract with Carry (Thumb-2 32-bit)
1772            // SBC.W Rd, Rn, Rm: EB60 Rn | 00 Rd 00 Rm
1773            ArmOp::Sbc { rd, rn, op2 } => {
1774                let rd_bits = reg_to_bits(rd);
1775                let rn_bits = reg_to_bits(rn);
1776
1777                if let Operand2::Reg(rm) = op2 {
1778                    let rm_bits = reg_to_bits(rm);
1779                    // SBC.W Rd, Rn, Rm (T2): 1110 1011 0110 Rn | 0 000 Rd 00 00 Rm
1780                    let hw1: u16 = (0xEB60 | rn_bits) as u16;
1781                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1782
1783                    let mut bytes = hw1.to_le_bytes().to_vec();
1784                    bytes.extend_from_slice(&hw2.to_le_bytes());
1785                    Ok(bytes)
1786                } else {
1787                    // SBC with immediate - use 32-bit encoding
1788                    let hw1: u16 = (0xF160 | rn_bits) as u16;
1789                    let hw2: u16 = (rd_bits << 8) as u16;
1790                    let mut bytes = hw1.to_le_bytes().to_vec();
1791                    bytes.extend_from_slice(&hw2.to_le_bytes());
1792                    Ok(bytes)
1793                }
1794            }
1795
1796            // === 32-bit Thumb-2 encodings ===
1797
1798            // SDIV: 11111011 1001 Rn 1111 Rd 1111 Rm
1799            ArmOp::Sdiv { rd, rn, rm } => {
1800                let rd_bits = reg_to_bits(rd);
1801                let rn_bits = reg_to_bits(rn);
1802                let rm_bits = reg_to_bits(rm);
1803                reg_bits_checked(rd_bits)?;
1804                reg_bits_checked(rn_bits)?;
1805                reg_bits_checked(rm_bits)?;
1806
1807                // Thumb-2 SDIV: FB90 F0F0 | Rn<<16 | Rd<<8 | Rm
1808                // First halfword: 1111 1011 1001 Rn = 0xFB90 | Rn
1809                // Second halfword: 1111 Rd 1111 Rm = 0xF0F0 | Rd<<8 | Rm
1810                let hw1: u16 = (0xFB90 | rn_bits) as u16;
1811                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1812
1813                // Thumb-2 32-bit instructions: first halfword, then second halfword (little-endian each)
1814                let mut bytes = hw1.to_le_bytes().to_vec();
1815                bytes.extend_from_slice(&hw2.to_le_bytes());
1816                encoding_contracts::verify_thumb32(&bytes);
1817                Ok(bytes)
1818            }
1819
1820            // UDIV: 11111011 1011 Rn 1111 Rd 1111 Rm
1821            ArmOp::Udiv { rd, rn, rm } => {
1822                let rd_bits = reg_to_bits(rd);
1823                let rn_bits = reg_to_bits(rn);
1824                let rm_bits = reg_to_bits(rm);
1825                reg_bits_checked(rd_bits)?;
1826                reg_bits_checked(rn_bits)?;
1827                reg_bits_checked(rm_bits)?;
1828
1829                // Thumb-2 UDIV: FBB0 F0F0 | Rn<<16 | Rd<<8 | Rm
1830                let hw1: u16 = (0xFBB0 | rn_bits) as u16;
1831                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1832
1833                let mut bytes = hw1.to_le_bytes().to_vec();
1834                bytes.extend_from_slice(&hw2.to_le_bytes());
1835                encoding_contracts::verify_thumb32(&bytes);
1836                Ok(bytes)
1837            }
1838
1839            ArmOp::Umull { rdlo, rdhi, rn, rm } => {
1840                let rdlo_bits = reg_to_bits(rdlo);
1841                let rdhi_bits = reg_to_bits(rdhi);
1842                let rn_bits = reg_to_bits(rn);
1843                let rm_bits = reg_to_bits(rm);
1844                reg_bits_checked(rdlo_bits)?;
1845                reg_bits_checked(rdhi_bits)?;
1846                reg_bits_checked(rn_bits)?;
1847                reg_bits_checked(rm_bits)?;
1848
1849                // Thumb-2 UMULL: 1111 1011 1010 Rn | RdLo RdHi 0000 Rm
1850                let hw1: u16 = (0xFBA0 | rn_bits) as u16;
1851                let hw2: u16 = ((rdlo_bits << 12) | (rdhi_bits << 8) | rm_bits) as u16;
1852
1853                let mut bytes = hw1.to_le_bytes().to_vec();
1854                bytes.extend_from_slice(&hw2.to_le_bytes());
1855                encoding_contracts::verify_thumb32(&bytes);
1856                Ok(bytes)
1857            }
1858
1859            // MUL (Thumb-2 32-bit): MUL Rd, Rn, Rm
1860            ArmOp::Mul { rd, rn, rm } => {
1861                let rd_bits = reg_to_bits(rd);
1862                let rn_bits = reg_to_bits(rn);
1863                let rm_bits = reg_to_bits(rm);
1864
1865                // Thumb-2 MUL: FB00 F000 | Rn | Rd<<8 | Rm
1866                // 11111011 0000 Rn | 1111 Rd 0000 Rm
1867                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1868                let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
1869
1870                let mut bytes = hw1.to_le_bytes().to_vec();
1871                bytes.extend_from_slice(&hw2.to_le_bytes());
1872                Ok(bytes)
1873            }
1874
1875            // MLS: Rd = Ra - Rn * Rm
1876            ArmOp::Mls { rd, rn, rm, ra } => {
1877                let rd_bits = reg_to_bits(rd);
1878                let rn_bits = reg_to_bits(rn);
1879                let rm_bits = reg_to_bits(rm);
1880                let ra_bits = reg_to_bits(ra);
1881
1882                // Thumb-2 MLS: FB00 Rn | Ra Rd 0001 Rm
1883                // 11111011 0000 Rn | Ra Rd 0001 Rm
1884                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1885                let hw2: u16 = ((ra_bits << 12) | (rd_bits << 8) | 0x10 | rm_bits) as u16;
1886
1887                let mut bytes = hw1.to_le_bytes().to_vec();
1888                bytes.extend_from_slice(&hw2.to_le_bytes());
1889                Ok(bytes)
1890            }
1891
1892            ArmOp::Mla { rd, rn, rm, ra } => {
1893                let rd_bits = reg_to_bits(rd);
1894                let rn_bits = reg_to_bits(rn);
1895                let rm_bits = reg_to_bits(rm);
1896                let ra_bits = reg_to_bits(ra);
1897
1898                // Thumb-2 MLA: FB00 Rn | Ra Rd 0000 Rm — same as MLS without the
1899                // bit-4 (0x10) op flag. rd = ra + rn*rm.
1900                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1901                let hw2: u16 = ((ra_bits << 12) | (rd_bits << 8) | rm_bits) as u16;
1902
1903                let mut bytes = hw1.to_le_bytes().to_vec();
1904                bytes.extend_from_slice(&hw2.to_le_bytes());
1905                Ok(bytes)
1906            }
1907
1908            // AND (Thumb-2 32-bit)
1909            ArmOp::And { rd, rn, op2 } => {
1910                if let Operand2::Reg(rm) = op2 {
1911                    let rd_bits = reg_to_bits(rd);
1912                    let rn_bits = reg_to_bits(rn);
1913                    let rm_bits = reg_to_bits(rm);
1914
1915                    // Thumb-2 AND register: EA00 Rn | 0 Rd 00 00 Rm
1916                    let hw1: u16 = (0xEA00 | rn_bits) as u16;
1917                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1918
1919                    let mut bytes = hw1.to_le_bytes().to_vec();
1920                    bytes.extend_from_slice(&hw2.to_le_bytes());
1921                    Ok(bytes)
1922                } else if let Operand2::Imm(imm) = op2 {
1923                    let rd_bits = reg_to_bits(rd);
1924                    let rn_bits = reg_to_bits(rn);
1925
1926                    // Thumb-2 AND.W immediate T1: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8.
1927                    // The i:imm3:imm8 field is a ThumbExpandImm modified immediate —
1928                    // encode it correctly (or error on an un-encodable value)
1929                    // rather than packing raw bits, closing the silent-miscompile
1930                    // class for AND alongside ORR/EOR (#251) / ADD/SUB (#253) /
1931                    // CMP (#255).
1932                    let field = try_thumb_expand_imm(*imm as u32).ok_or_else(|| {
1933                        synth_core::Error::synthesis(
1934                            "AND immediate is not a valid ThumbExpandImm — materialize into a register",
1935                        )
1936                    })?;
1937                    let i_bit = (field >> 11) & 1;
1938                    let imm3 = (field >> 8) & 0x7;
1939                    let imm8 = field & 0xFF;
1940
1941                    let hw1: u16 = (0xF000 | (i_bit << 10) | rn_bits) as u16;
1942                    let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
1943
1944                    let mut bytes = hw1.to_le_bytes().to_vec();
1945                    bytes.extend_from_slice(&hw2.to_le_bytes());
1946                    Ok(bytes)
1947                } else {
1948                    // RegShift variant - fallback to NOP
1949                    let instr: u16 = 0xBF00;
1950                    Ok(instr.to_le_bytes().to_vec())
1951                }
1952            }
1953
1954            // ORR (Thumb-2 32-bit)
1955            ArmOp::Orr { rd, rn, op2 } => {
1956                if let Operand2::Reg(rm) = op2 {
1957                    let rd_bits = reg_to_bits(rd);
1958                    let rn_bits = reg_to_bits(rn);
1959                    let rm_bits = reg_to_bits(rm);
1960
1961                    // Thumb-2 ORR: EA40 Rn | 0 Rd 00 00 Rm
1962                    let hw1: u16 = (0xEA40 | rn_bits) as u16;
1963                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1964
1965                    let mut bytes = hw1.to_le_bytes().to_vec();
1966                    bytes.extend_from_slice(&hw2.to_le_bytes());
1967                    Ok(bytes)
1968                } else if let Operand2::Imm(imm) = op2 {
1969                    // ORR.W immediate T1: 11110 i 0 0010 S Rn | 0 imm3 Rd imm8.
1970                    // Only the zero-extended byte form (imm <= 0xFF) is encoded;
1971                    // larger modified immediates need ThumbExpandImm — return an
1972                    // error rather than silently emit a NOP (Ok-or-Err, #180/#185).
1973                    let imm_val = *imm as u32;
1974                    if imm_val > 0xFF {
1975                        return Err(synth_core::Error::synthesis(
1976                            "ORR immediate > 0xFF requires ThumbExpandImm (not yet implemented)",
1977                        ));
1978                    }
1979                    let rd_bits = reg_to_bits(rd);
1980                    let rn_bits = reg_to_bits(rn);
1981                    let hw1: u16 = (0xF040 | rn_bits) as u16;
1982                    let hw2: u16 = ((rd_bits << 8) | (imm_val & 0xFF)) as u16;
1983                    let mut bytes = hw1.to_le_bytes().to_vec();
1984                    bytes.extend_from_slice(&hw2.to_le_bytes());
1985                    Ok(bytes)
1986                } else {
1987                    let instr: u16 = 0xBF00;
1988                    Ok(instr.to_le_bytes().to_vec())
1989                }
1990            }
1991
1992            // EOR (Thumb-2 32-bit)
1993            ArmOp::Eor { rd, rn, op2 } => {
1994                if let Operand2::Reg(rm) = op2 {
1995                    let rd_bits = reg_to_bits(rd);
1996                    let rn_bits = reg_to_bits(rn);
1997                    let rm_bits = reg_to_bits(rm);
1998
1999                    // Thumb-2 EOR: EA80 Rn | 0 Rd 00 00 Rm
2000                    let hw1: u16 = (0xEA80 | rn_bits) as u16;
2001                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
2002
2003                    let mut bytes = hw1.to_le_bytes().to_vec();
2004                    bytes.extend_from_slice(&hw2.to_le_bytes());
2005                    Ok(bytes)
2006                } else if let Operand2::Imm(imm) = op2 {
2007                    // EOR.W immediate T1: 11110 i 0 0100 S Rn | 0 imm3 Rd imm8.
2008                    // Byte form only (imm <= 0xFF); larger needs ThumbExpandImm —
2009                    // error, not a silent NOP (Ok-or-Err, #180/#185).
2010                    let imm_val = *imm as u32;
2011                    if imm_val > 0xFF {
2012                        return Err(synth_core::Error::synthesis(
2013                            "EOR immediate > 0xFF requires ThumbExpandImm (not yet implemented)",
2014                        ));
2015                    }
2016                    let rd_bits = reg_to_bits(rd);
2017                    let rn_bits = reg_to_bits(rn);
2018                    let hw1: u16 = (0xF080 | rn_bits) as u16;
2019                    let hw2: u16 = ((rd_bits << 8) | (imm_val & 0xFF)) as u16;
2020                    let mut bytes = hw1.to_le_bytes().to_vec();
2021                    bytes.extend_from_slice(&hw2.to_le_bytes());
2022                    Ok(bytes)
2023                } else {
2024                    let instr: u16 = 0xBF00;
2025                    Ok(instr.to_le_bytes().to_vec())
2026                }
2027            }
2028
2029            // Shift operations (16-bit for low registers)
2030            ArmOp::Lsl { rd, rn, shift } => {
2031                let rd_bits = reg_to_bits(rd) as u16;
2032                let rn_bits = reg_to_bits(rn) as u16;
2033                let shift_bits = (*shift as u16) & 0x1F;
2034
2035                if rd_bits < 8 && rn_bits < 8 {
2036                    // LSLS Rd, Rm, #imm5 (16-bit): 0000 0 imm5 Rm Rd
2037                    let instr: u16 = (shift_bits << 6) | (rn_bits << 3) | rd_bits;
2038                    Ok(instr.to_le_bytes().to_vec())
2039                } else {
2040                    // Use 32-bit encoding for high registers
2041                    self.encode_thumb32_shift(rd, rn, *shift, 0b00) // LSL type
2042                }
2043            }
2044
2045            ArmOp::Lsr { rd, rn, shift } => {
2046                let rd_bits = reg_to_bits(rd) as u16;
2047                let rn_bits = reg_to_bits(rn) as u16;
2048                let shift_bits = (*shift as u16) & 0x1F;
2049
2050                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
2051                    // LSRS Rd, Rm, #imm5 (16-bit): 0000 1 imm5 Rm Rd
2052                    let instr: u16 = 0x0800 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
2053                    Ok(instr.to_le_bytes().to_vec())
2054                } else {
2055                    self.encode_thumb32_shift(rd, rn, *shift, 0b01) // LSR type
2056                }
2057            }
2058
2059            ArmOp::Asr { rd, rn, shift } => {
2060                let rd_bits = reg_to_bits(rd) as u16;
2061                let rn_bits = reg_to_bits(rn) as u16;
2062                let shift_bits = (*shift as u16) & 0x1F;
2063
2064                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
2065                    // ASRS Rd, Rm, #imm5 (16-bit): 0001 0 imm5 Rm Rd
2066                    let instr: u16 = 0x1000 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
2067                    Ok(instr.to_le_bytes().to_vec())
2068                } else {
2069                    self.encode_thumb32_shift(rd, rn, *shift, 0b10) // ASR type
2070                }
2071            }
2072
2073            ArmOp::Ror { rd, rn, shift } => {
2074                // ROR doesn't have a 16-bit immediate form, use 32-bit
2075                self.encode_thumb32_shift(rd, rn, *shift, 0b11) // ROR type
2076            }
2077
2078            // Register-based shifts (Thumb-2 32-bit)
2079            // Encoding: 11111010 0xxS Rn 1111 Rd 0000 Rm
2080            // xx = shift type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
2081            ArmOp::LslReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b00),
2082            ArmOp::LsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b01),
2083            ArmOp::AsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b10),
2084            ArmOp::RorReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b11),
2085
2086            // RSB (Reverse Subtract): Rd = imm - Rn
2087            // Thumb-2 T2 encoding: 11110 i 0 1110 S Rn | 0 imm3 Rd imm8
2088            ArmOp::Rsb { rd, rn, imm } => {
2089                let rd_bits = reg_to_bits(rd);
2090                let rn_bits = reg_to_bits(rn);
2091                let imm_val = *imm;
2092
2093                let i_bit = (imm_val >> 11) & 1;
2094                let imm3 = (imm_val >> 8) & 0x7;
2095                let imm8 = imm_val & 0xFF;
2096
2097                // hw1: 11110 i 01110 0 Rn  (S=0)
2098                let hw1: u16 = (0xF1C0 | (i_bit << 10) | rn_bits) as u16;
2099                // hw2: 0 imm3 Rd imm8
2100                let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
2101
2102                let mut bytes = hw1.to_le_bytes().to_vec();
2103                bytes.extend_from_slice(&hw2.to_le_bytes());
2104                Ok(bytes)
2105            }
2106
2107            // CLZ (Thumb-2 32-bit)
2108            ArmOp::Clz { rd, rm } => {
2109                let rd_bits = reg_to_bits(rd);
2110                let rm_bits = reg_to_bits(rm);
2111
2112                // Thumb-2 CLZ: FAB0 Rm | F8 Rd Rm
2113                // 11111010 1011 Rm | 1111 1000 Rd Rm
2114                let hw1: u16 = (0xFAB0 | rm_bits) as u16;
2115                let hw2: u16 = (0xF080 | (rd_bits << 8) | rm_bits) as u16;
2116
2117                let mut bytes = hw1.to_le_bytes().to_vec();
2118                bytes.extend_from_slice(&hw2.to_le_bytes());
2119                Ok(bytes)
2120            }
2121
2122            // RBIT (Thumb-2 32-bit)
2123            ArmOp::Rbit { rd, rm } => {
2124                let rd_bits = reg_to_bits(rd);
2125                let rm_bits = reg_to_bits(rm);
2126
2127                // Thumb-2 RBIT: FA90 Rm | F0 Rd A0 Rm
2128                // 11111010 1001 Rm | 1111 Rd 1010 Rm
2129                let hw1: u16 = (0xFA90 | rm_bits) as u16;
2130                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rm_bits) as u16;
2131
2132                let mut bytes = hw1.to_le_bytes().to_vec();
2133                bytes.extend_from_slice(&hw2.to_le_bytes());
2134                Ok(bytes)
2135            }
2136
2137            // SXTB (16-bit for low registers)
2138            ArmOp::Sxtb { rd, rm } => {
2139                let rd_bits = reg_to_bits(rd) as u16;
2140                let rm_bits = reg_to_bits(rm) as u16;
2141
2142                if rd_bits < 8 && rm_bits < 8 {
2143                    // SXTB Rd, Rm (16-bit): 1011 0010 01 Rm Rd
2144                    let instr: u16 = 0xB240 | (rm_bits << 3) | rd_bits;
2145                    Ok(instr.to_le_bytes().to_vec())
2146                } else {
2147                    // Thumb-2 SXTB.W: FA4F F(rd)80 (rm)
2148                    // 11111010 0100 1111 | 1111 Rd 10 rotate Rm
2149                    let rd_bits32 = rd_bits as u32;
2150                    let rm_bits32 = rm_bits as u32;
2151                    let hw1: u16 = 0xFA4F;
2152                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
2153                    let mut bytes = hw1.to_le_bytes().to_vec();
2154                    bytes.extend_from_slice(&hw2.to_le_bytes());
2155                    Ok(bytes)
2156                }
2157            }
2158
2159            // SXTH (16-bit for low registers)
2160            ArmOp::Sxth { rd, rm } => {
2161                let rd_bits = reg_to_bits(rd) as u16;
2162                let rm_bits = reg_to_bits(rm) as u16;
2163
2164                if rd_bits < 8 && rm_bits < 8 {
2165                    // SXTH Rd, Rm (16-bit): 1011 0010 00 Rm Rd
2166                    let instr: u16 = 0xB200 | (rm_bits << 3) | rd_bits;
2167                    Ok(instr.to_le_bytes().to_vec())
2168                } else {
2169                    // Thumb-2 SXTH.W: FA0F F(rd)80 (rm)
2170                    // 11111010 0000 1111 | 1111 Rd 10 rotate Rm
2171                    let rd_bits32 = rd_bits as u32;
2172                    let rm_bits32 = rm_bits as u32;
2173                    let hw1: u16 = 0xFA0F;
2174                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
2175                    let mut bytes = hw1.to_le_bytes().to_vec();
2176                    bytes.extend_from_slice(&hw2.to_le_bytes());
2177                    Ok(bytes)
2178                }
2179            }
2180
2181            // CMP (can be 16-bit for low registers)
2182            ArmOp::Cmp { rn, op2 } => {
2183                let rn_bits = reg_to_bits(rn) as u16;
2184
2185                if let Operand2::Imm(imm) = op2 {
2186                    // Only use 16-bit encoding for non-negative immediates 0-255
2187                    // Negative immediates must use 32-bit encoding
2188                    if *imm >= 0 && *imm <= 255 && rn_bits < 8 {
2189                        // CMP Rn, #imm8 (16-bit): 0010 1 Rn imm8
2190                        let instr: u16 = 0x2800 | (rn_bits << 8) | (*imm as u16 & 0xFF);
2191                        Ok(instr.to_le_bytes().to_vec())
2192                    } else {
2193                        self.encode_thumb32_cmp_imm(rn, *imm as u32)
2194                    }
2195                } else if let Operand2::Reg(rm) = op2 {
2196                    let rm_bits = reg_to_bits(rm) as u16;
2197                    if rn_bits < 8 && rm_bits < 8 {
2198                        // CMP Rn, Rm (16-bit low): 0100 0010 10 Rm Rn
2199                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2200                        Ok(instr.to_le_bytes().to_vec())
2201                    } else {
2202                        // CMP Rn, Rm (16-bit high): 0100 0101 N Rm Rn[2:0]
2203                        let n_bit = (rn_bits >> 3) & 1;
2204                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2205                        Ok(instr.to_le_bytes().to_vec())
2206                    }
2207                } else {
2208                    let instr: u16 = 0xBF00;
2209                    Ok(instr.to_le_bytes().to_vec())
2210                }
2211            }
2212
2213            // CMN (Compare Negative) - computes Rn + op2 and sets flags
2214            // CMN Rn, #1 sets Z flag if Rn == -1 (since -1 + 1 = 0)
2215            ArmOp::Cmn { rn, op2 } => {
2216                let rn_bits = reg_to_bits(rn) as u16;
2217
2218                if let Operand2::Imm(imm) = op2 {
2219                    // CMN.W Rn, #imm (32-bit): i:imm3:imm8 is a ThumbExpandImm
2220                    // modified immediate (the field sits in imm3=hw2[14:12],
2221                    // imm8=hw2[7:0], i=hw1[10]). Encode it correctly, or error on
2222                    // an un-encodable value — replacing the old silent `0xBF00`
2223                    // NOP (the last of the silent-miscompile data-proc encoders).
2224                    let field = try_thumb_expand_imm(*imm as u32).ok_or_else(|| {
2225                        synth_core::Error::synthesis(
2226                            "CMN immediate is not a valid ThumbExpandImm — materialize into a register",
2227                        )
2228                    })?;
2229                    let i_bit = (field >> 11) & 1;
2230                    let imm3 = (field >> 8) & 0x7;
2231                    let imm8 = field & 0xFF;
2232                    let hw1: u16 = (0xF110 | (i_bit << 10) as u16) | rn_bits;
2233                    let hw2: u16 = (imm3 << 12) as u16 | 0x0F00 | imm8 as u16;
2234                    let mut bytes = hw1.to_le_bytes().to_vec();
2235                    bytes.extend_from_slice(&hw2.to_le_bytes());
2236                    Ok(bytes)
2237                } else if let Operand2::Reg(rm) = op2 {
2238                    let rm_bits = reg_to_bits(rm) as u16;
2239                    // 16-bit CMN (T1) only encodes R0-R7; high registers overflow
2240                    // the 3-bit fields and corrupt the operands (#184, the #180
2241                    // class). CMN has no high-register 16-bit form, so fall back
2242                    // to 32-bit CMN.W (T2): EB10 Rn | 0F00 Rm (ADD.W with S=1 and
2243                    // Rd discarded as PC/1111).
2244                    if rn_bits < 8 && rm_bits < 8 {
2245                        // CMN Rn, Rm (16-bit): 0100 0010 11 Rm Rn
2246                        let instr: u16 = 0x42C0 | (rm_bits << 3) | rn_bits;
2247                        Ok(instr.to_le_bytes().to_vec())
2248                    } else {
2249                        let hw1: u16 = 0xEB10 | rn_bits;
2250                        let hw2: u16 = 0x0F00 | rm_bits;
2251                        let mut bytes = hw1.to_le_bytes().to_vec();
2252                        bytes.extend_from_slice(&hw2.to_le_bytes());
2253                        Ok(bytes)
2254                    }
2255                } else {
2256                    Ok(vec![0xBF, 0x00])
2257                }
2258            }
2259
2260            // LDR (can be 16-bit for simple cases)
2261            ArmOp::Ldr { rd, addr } => {
2262                let rd_bits = reg_to_bits(rd);
2263                let base_bits = reg_to_bits(&addr.base);
2264
2265                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2266                if let Some(offset_reg) = &addr.offset_reg {
2267                    let rm_bits = reg_to_bits(offset_reg);
2268
2269                    // If there's also an immediate offset, we need to ADD it first
2270                    if addr.offset != 0 {
2271                        // Use R12 (IP) as scratch to avoid clobbering the address register
2272                        // ADD R12, Rm, #offset; LDR Rd, [base, R12]
2273                        let scratch = Reg::R12;
2274                        let mut bytes =
2275                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2276                        bytes.extend(self.encode_thumb32_ldr_reg(rd, &addr.base, &scratch)?);
2277                        return Ok(bytes);
2278                    }
2279
2280                    // Simple register offset: LDR Rd, [Rn, Rm]
2281                    // 16-bit: only if Rd, Rn, Rm < R8
2282                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2283                        // LDR Rd, [Rn, Rm] (16-bit): 0101 100 Rm Rn Rd
2284                        let instr: u16 = 0x5800
2285                            | ((rm_bits as u16) << 6)
2286                            | ((base_bits as u16) << 3)
2287                            | (rd_bits as u16);
2288                        return Ok(instr.to_le_bytes().to_vec());
2289                    }
2290
2291                    // 32-bit register offset
2292                    return self.encode_thumb32_ldr_reg(rd, &addr.base, offset_reg);
2293                }
2294
2295                // Immediate offset mode [base, #imm]
2296                let offset = addr.offset as u32;
2297
2298                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2299                    // LDR Rd, [Rn, #imm5*4] (16-bit): 0110 1 imm5 Rn Rd
2300                    let imm5 = (offset >> 2) as u16;
2301                    let instr: u16 =
2302                        0x6800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2303                    Ok(instr.to_le_bytes().to_vec())
2304                } else {
2305                    self.encode_thumb32_ldr(rd, &addr.base, offset)
2306                }
2307            }
2308
2309            // STR (can be 16-bit for simple cases)
2310            ArmOp::Str { rd, addr } => {
2311                let rd_bits = reg_to_bits(rd);
2312                let base_bits = reg_to_bits(&addr.base);
2313
2314                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2315                if let Some(offset_reg) = &addr.offset_reg {
2316                    let rm_bits = reg_to_bits(offset_reg);
2317
2318                    // If there's also an immediate offset, we need to ADD it first
2319                    if addr.offset != 0 {
2320                        // Use R12 (IP) as scratch to avoid clobbering the address register
2321                        // ADD R12, Rm, #offset; STR Rd, [base, R12]
2322                        let scratch = Reg::R12;
2323                        let mut bytes =
2324                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2325                        bytes.extend(self.encode_thumb32_str_reg(rd, &addr.base, &scratch)?);
2326                        return Ok(bytes);
2327                    }
2328
2329                    // Simple register offset: STR Rd, [Rn, Rm]
2330                    // 16-bit: only if Rd, Rn, Rm < R8
2331                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2332                        // STR Rd, [Rn, Rm] (16-bit): 0101 000 Rm Rn Rd
2333                        let instr: u16 = 0x5000
2334                            | ((rm_bits as u16) << 6)
2335                            | ((base_bits as u16) << 3)
2336                            | (rd_bits as u16);
2337                        return Ok(instr.to_le_bytes().to_vec());
2338                    }
2339
2340                    // 32-bit register offset
2341                    return self.encode_thumb32_str_reg(rd, &addr.base, offset_reg);
2342                }
2343
2344                // Immediate offset mode [base, #imm]
2345                let offset = addr.offset as u32;
2346
2347                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2348                    // STR Rd, [Rn, #imm5*4] (16-bit): 0110 0 imm5 Rn Rd
2349                    let imm5 = (offset >> 2) as u16;
2350                    let instr: u16 =
2351                        0x6000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2352                    Ok(instr.to_le_bytes().to_vec())
2353                } else {
2354                    self.encode_thumb32_str(rd, &addr.base, offset)
2355                }
2356            }
2357
2358            // LDRB (Thumb-2)
2359            ArmOp::Ldrb { rd, addr } => {
2360                let rd_bits = reg_to_bits(rd);
2361                let base_bits = reg_to_bits(&addr.base);
2362
2363                if let Some(offset_reg) = &addr.offset_reg {
2364                    if addr.offset != 0 {
2365                        let scratch = Reg::R12;
2366                        let mut bytes =
2367                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2368                        bytes.extend(self.encode_thumb32_ldrb_reg(rd, &addr.base, &scratch)?);
2369                        return Ok(bytes);
2370                    }
2371                    return self.encode_thumb32_ldrb_reg(rd, &addr.base, offset_reg);
2372                }
2373
2374                let offset = addr.offset as u32;
2375                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2376                    // LDRB Rd, [Rn, #imm5] (16-bit): 0111 1 imm5 Rn Rd
2377                    let instr: u16 = 0x7800
2378                        | ((offset as u16) << 6)
2379                        | ((base_bits as u16) << 3)
2380                        | (rd_bits as u16);
2381                    Ok(instr.to_le_bytes().to_vec())
2382                } else {
2383                    self.encode_thumb32_ldrb_imm(rd, &addr.base, offset)
2384                }
2385            }
2386
2387            // LDRSB (Thumb-2)
2388            ArmOp::Ldrsb { rd, addr } => {
2389                let rd_bits = reg_to_bits(rd);
2390                let base_bits = reg_to_bits(&addr.base);
2391
2392                if let Some(offset_reg) = &addr.offset_reg {
2393                    if addr.offset != 0 {
2394                        let scratch = Reg::R12;
2395                        let mut bytes =
2396                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2397                        bytes.extend(self.encode_thumb32_ldrsb_reg(rd, &addr.base, &scratch)?);
2398                        return Ok(bytes);
2399                    }
2400                    return self.encode_thumb32_ldrsb_reg(rd, &addr.base, offset_reg);
2401                }
2402
2403                let offset = addr.offset as u32;
2404                // LDRSB has no 16-bit immediate form (only register)
2405                // For 16-bit reg form: only if Rd, Rn, Rm < R8
2406                if rd_bits < 8 && base_bits < 8 && offset == 0 {
2407                    // No immediate 16-bit encoding for LDRSB; use 32-bit
2408                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2409                } else {
2410                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2411                }
2412            }
2413
2414            // LDRH (Thumb-2)
2415            ArmOp::Ldrh { rd, addr } => {
2416                let rd_bits = reg_to_bits(rd);
2417                let base_bits = reg_to_bits(&addr.base);
2418
2419                if let Some(offset_reg) = &addr.offset_reg {
2420                    if addr.offset != 0 {
2421                        let scratch = Reg::R12;
2422                        let mut bytes =
2423                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2424                        bytes.extend(self.encode_thumb32_ldrh_reg(rd, &addr.base, &scratch)?);
2425                        return Ok(bytes);
2426                    }
2427                    return self.encode_thumb32_ldrh_reg(rd, &addr.base, offset_reg);
2428                }
2429
2430                let offset = addr.offset as u32;
2431                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2432                    // LDRH Rd, [Rn, #imm5*2] (16-bit): 1000 1 imm5 Rn Rd
2433                    let imm5 = (offset >> 1) as u16;
2434                    let instr: u16 =
2435                        0x8800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2436                    Ok(instr.to_le_bytes().to_vec())
2437                } else {
2438                    self.encode_thumb32_ldrh_imm(rd, &addr.base, offset)
2439                }
2440            }
2441
2442            // LDRSH (Thumb-2)
2443            ArmOp::Ldrsh { rd, addr } => {
2444                if let Some(offset_reg) = &addr.offset_reg {
2445                    if addr.offset != 0 {
2446                        let scratch = Reg::R12;
2447                        let mut bytes =
2448                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2449                        bytes.extend(self.encode_thumb32_ldrsh_reg(rd, &addr.base, &scratch)?);
2450                        return Ok(bytes);
2451                    }
2452                    return self.encode_thumb32_ldrsh_reg(rd, &addr.base, offset_reg);
2453                }
2454
2455                let offset = addr.offset as u32;
2456                self.encode_thumb32_ldrsh_imm(rd, &addr.base, offset)
2457            }
2458
2459            // STRB (Thumb-2)
2460            ArmOp::Strb { rd, addr } => {
2461                let rd_bits = reg_to_bits(rd);
2462                let base_bits = reg_to_bits(&addr.base);
2463
2464                if let Some(offset_reg) = &addr.offset_reg {
2465                    if addr.offset != 0 {
2466                        let scratch = Reg::R12;
2467                        let mut bytes =
2468                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2469                        bytes.extend(self.encode_thumb32_strb_reg(rd, &addr.base, &scratch)?);
2470                        return Ok(bytes);
2471                    }
2472                    return self.encode_thumb32_strb_reg(rd, &addr.base, offset_reg);
2473                }
2474
2475                let offset = addr.offset as u32;
2476                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2477                    // STRB Rd, [Rn, #imm5] (16-bit): 0111 0 imm5 Rn Rd
2478                    let instr: u16 = 0x7000
2479                        | ((offset as u16) << 6)
2480                        | ((base_bits as u16) << 3)
2481                        | (rd_bits as u16);
2482                    Ok(instr.to_le_bytes().to_vec())
2483                } else {
2484                    self.encode_thumb32_strb_imm(rd, &addr.base, offset)
2485                }
2486            }
2487
2488            // STRH (Thumb-2)
2489            ArmOp::Strh { rd, addr } => {
2490                let rd_bits = reg_to_bits(rd);
2491                let base_bits = reg_to_bits(&addr.base);
2492
2493                if let Some(offset_reg) = &addr.offset_reg {
2494                    if addr.offset != 0 {
2495                        let scratch = Reg::R12;
2496                        let mut bytes =
2497                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2498                        bytes.extend(self.encode_thumb32_strh_reg(rd, &addr.base, &scratch)?);
2499                        return Ok(bytes);
2500                    }
2501                    return self.encode_thumb32_strh_reg(rd, &addr.base, offset_reg);
2502                }
2503
2504                let offset = addr.offset as u32;
2505                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2506                    // STRH Rd, [Rn, #imm5*2] (16-bit): 1000 0 imm5 Rn Rd
2507                    let imm5 = (offset >> 1) as u16;
2508                    let instr: u16 =
2509                        0x8000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2510                    Ok(instr.to_le_bytes().to_vec())
2511                } else {
2512                    self.encode_thumb32_strh_imm(rd, &addr.base, offset)
2513                }
2514            }
2515
2516            // MemorySize (Thumb-2)
2517            ArmOp::MemorySize { rd } => {
2518                // LSR rd, R10, #16 — memory size in bytes / 65536 = pages
2519                // Thumb-2 16-bit: LSRS Rd, Rm, #imm5 — 0000 1 imm5 Rm Rd
2520                let rd_bits = reg_to_bits(rd);
2521                let r10_bits = reg_to_bits(&Reg::R10);
2522                if rd_bits < 8 && r10_bits < 8 {
2523                    let instr: u16 =
2524                        0x0800 | (16u16 << 6) | ((r10_bits as u16) << 3) | (rd_bits as u16);
2525                    Ok(instr.to_le_bytes().to_vec())
2526                } else {
2527                    // Thumb-2 32-bit LSR: 1110 1010 010 0 1111 | 0 imm3 Rd imm2 01 Rm
2528                    let imm5: u32 = 16;
2529                    let imm3 = (imm5 >> 2) & 0x7;
2530                    let imm2 = imm5 & 0x3;
2531                    let hw1: u16 = 0xEA4F;
2532                    let hw2: u16 =
2533                        ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | 0x10 | r10_bits) as u16;
2534                    let mut bytes = hw1.to_le_bytes().to_vec();
2535                    bytes.extend_from_slice(&hw2.to_le_bytes());
2536                    Ok(bytes)
2537                }
2538            }
2539
2540            // MemoryGrow (Thumb-2)
2541            ArmOp::MemoryGrow { rd, .. } => {
2542                // On embedded with fixed memory, always return -1 (failure)
2543                // MVN rd, #0 → MOV rd, #-1
2544                // Thumb-2 32-bit: MVN: 1111 0 i 0 0 0 1 1 0 1111 | 0 imm3 Rd imm8
2545                let rd_bits = reg_to_bits(rd);
2546                let hw1: u16 = 0xF06F; // MVN with i=0
2547                let hw2: u16 = (rd_bits << 8) as u16; // imm8=0 → ~0 = 0xFFFFFFFF = -1
2548                let mut bytes = hw1.to_le_bytes().to_vec();
2549                bytes.extend_from_slice(&hw2.to_le_bytes());
2550                Ok(bytes)
2551            }
2552
2553            // BX (16-bit)
2554            ArmOp::Bx { rm } => {
2555                let rm_bits = reg_to_bits(rm) as u16;
2556                // BX Rm (16-bit): 0100 0111 0 Rm 000
2557                let instr: u16 = 0x4700 | (rm_bits << 3);
2558                Ok(instr.to_le_bytes().to_vec())
2559            }
2560
2561            // BLX (16-bit) - Branch with Link and Exchange
2562            // BLX Rm: 0100 0111 1 Rm 000
2563            ArmOp::Blx { rm } => {
2564                let rm_bits = reg_to_bits(rm) as u16;
2565                let instr: u16 = 0x4780 | (rm_bits << 3);
2566                Ok(instr.to_le_bytes().to_vec())
2567            }
2568
2569            // CallIndirect - indirect function call via table lookup
2570            // table_index_reg contains the table index
2571            // Generates: LSL R12, idx, #2; LDR R12, [R12, table_base]; BLX R12
2572            ArmOp::CallIndirect {
2573                rd: _,
2574                type_idx: _,
2575                table_index_reg,
2576            } => {
2577                let idx_reg = reg_to_bits(table_index_reg);
2578                let mut bytes = Vec::new();
2579
2580                // For now, we generate code that:
2581                // 1. Multiplies index by 4 (function pointer size)
2582                // 2. Loads function pointer from table (assumes table base in R11)
2583                // 3. Calls the function via BLX
2584                //
2585                // Table base setup must be done by caller/runtime.
2586                // This is a simplified implementation - full support needs:
2587                // - Table base address resolution
2588                // - Type signature checking
2589                // - Bounds checking
2590
2591                // LSL R12, idx_reg, #2 (multiply index by 4)
2592                // Thumb-2 MOV with shift: 11101010 010 S 1111 | 0 imm3 Rd imm2 type Rm
2593                // LSL: type=00, imm5=2 -> imm3=0, imm2=10
2594                let hw1: u16 = 0xEA4F_u16; // MOV.W R12, Rm, LSL #2
2595                let hw2: u16 = ((0x0C00 | (0b10 << 4)) | idx_reg) as u16;
2596                bytes.extend_from_slice(&hw1.to_le_bytes());
2597                bytes.extend_from_slice(&hw2.to_le_bytes());
2598
2599                // LDR R12, [R11, R12] - load function pointer
2600                // Thumb-2 LDR (register): 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
2601                // Rn=R11, Rt=R12, Rm=R12, imm2=00 (no shift)
2602                let ldr_hw1: u16 = 0xF85B; // LDR.W Rt, [R11, Rm]
2603                let ldr_hw2: u16 = 0xC00C; // Rt=R12, imm2=00, Rm=R12
2604                bytes.extend_from_slice(&ldr_hw1.to_le_bytes());
2605                bytes.extend_from_slice(&ldr_hw2.to_le_bytes());
2606
2607                // BLX R12 (call function indirectly)
2608                // BLX Rm (16-bit): 0100 0111 1 Rm 000
2609                let blx: u16 = 0x47E0; // BLX R12
2610                bytes.extend_from_slice(&blx.to_le_bytes());
2611
2612                Ok(bytes)
2613            }
2614
2615            // Label pseudo-instruction: emits no machine code
2616            ArmOp::Label { .. } => Ok(Vec::new()),
2617
2618            // Conditional branch to label (generic) - offset 0, will be patched
2619            ArmOp::Bcc { cond, label: _ } => {
2620                use synth_synthesis::Condition;
2621                let cond_bits: u16 = match cond {
2622                    Condition::EQ => 0x0,
2623                    Condition::NE => 0x1,
2624                    Condition::HS => 0x2,
2625                    Condition::LO => 0x3,
2626                    Condition::HI => 0x8,
2627                    Condition::LS => 0x9,
2628                    Condition::GE => 0xA,
2629                    Condition::LT => 0xB,
2630                    Condition::GT => 0xC,
2631                    Condition::LE => 0xD,
2632                };
2633                // 16-bit B<cond> with offset 0: 1101 cond imm8
2634                let instr: u16 = 0xD000 | (cond_bits << 8);
2635                Ok(instr.to_le_bytes().to_vec())
2636            }
2637
2638            // Branch instructions
2639            ArmOp::B { label: _ } => {
2640                // Simplified: B.N with offset 0
2641                // For real usage, would need label resolution
2642                let instr: u16 = 0xE000; // B.N #0
2643                Ok(instr.to_le_bytes().to_vec())
2644            }
2645
2646            // BHS (Branch if Higher or Same) - used for bounds checking
2647            // Condition code: 0x2 (C set)
2648            ArmOp::Bhs { label: _ } => {
2649                // 16-bit B<cond> with offset 0: 1101 cond imm8
2650                // cond = 0x2 (HS)
2651                let instr: u16 = 0xD200; // BHS.N #0
2652                Ok(instr.to_le_bytes().to_vec())
2653            }
2654
2655            // BLO (Branch if Lower) - complementary to BHS
2656            // Condition code: 0x3 (C clear)
2657            ArmOp::Blo { label: _ } => {
2658                // 16-bit B<cond> with offset 0: 1101 cond imm8
2659                // cond = 0x3 (LO)
2660                let instr: u16 = 0xD300; // BLO.N #0
2661                Ok(instr.to_le_bytes().to_vec())
2662            }
2663
2664            // Branch with numeric offset (Thumb-2)
2665            // Thumb-2 B.W instruction: 32-bit with +-16MB range
2666            ArmOp::BOffset { offset } => {
2667                // offset is already the halfword displacement: (target - branch - 4) / 2
2668                // This is the raw encoded value, accounting for variable-length instructions
2669                let halfword_offset = *offset;
2670
2671                // 16-bit B.N encoding: 1110 0 imm11 (11-bit signed halfword offset)
2672                // Range: -1024 to +1022 halfwords
2673                if (-1024..=1022).contains(&halfword_offset) {
2674                    // 16-bit B.N encoding: 1110 0 imm11
2675                    let imm11 = (halfword_offset as u16) & 0x7FF;
2676                    let instr: u16 = 0xE000 | imm11;
2677                    Ok(instr.to_le_bytes().to_vec())
2678                } else {
2679                    // 32-bit B.W encoding for larger offsets
2680                    // First halfword: 1111 0 S imm10
2681                    // Second halfword: 10 J1 0 J2 imm11
2682                    // Total offset = SignExtend(S:I1:I2:imm10:imm11:0)
2683                    // where I1 = NOT(J1 XOR S), I2 = NOT(J2 XOR S)
2684
2685                    // The B.W (T4) encoding packs the signed offset as:
2686                    //   S:I1:I2:imm10:imm11:0  (25-bit signed, halfword-aligned)
2687                    // where J1 = NOT(I1 XOR S), J2 = NOT(I2 XOR S)
2688                    // Input halfword_offset already equals (target - PC - 4) / 2,
2689                    // so the full byte offset = halfword_offset << 1.
2690                    // The encoding fields split that 25-bit signed value (including the
2691                    // implicit trailing zero) as: S | imm10 | imm11
2692                    // with I1 = bit 23 and I2 = bit 22 of the signed offset.
2693                    let signed_offset = halfword_offset << 1; // byte offset
2694                    let s = if signed_offset < 0 { 1u32 } else { 0u32 };
2695                    let uoffset = signed_offset as u32;
2696                    let imm10 = (uoffset >> 12) & 0x3FF; // bits [21:12]
2697                    let imm11 = (uoffset >> 1) & 0x7FF; // bits [11:1]
2698                    let i1 = (uoffset >> 23) & 1; // bit 23
2699                    let i2 = (uoffset >> 22) & 1; // bit 22
2700                    let j1 = (!(i1 ^ s)) & 1; // J1 = NOT(I1 XOR S)
2701                    let j2 = (!(i2 ^ s)) & 1; // J2 = NOT(I2 XOR S)
2702
2703                    let hw1: u16 = (0xF000 | (s << 10) | imm10) as u16;
2704                    let hw2: u16 = (0x9000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2705
2706                    let mut bytes = hw1.to_le_bytes().to_vec();
2707                    bytes.extend_from_slice(&hw2.to_le_bytes());
2708                    Ok(bytes)
2709                }
2710            }
2711
2712            // Conditional branch with numeric offset (Thumb-2)
2713            ArmOp::BCondOffset { cond, offset } => {
2714                use synth_synthesis::Condition;
2715                let cond_bits: u16 = match cond {
2716                    Condition::EQ => 0x0,
2717                    Condition::NE => 0x1,
2718                    Condition::HS => 0x2,
2719                    Condition::LO => 0x3,
2720                    Condition::HI => 0x8,
2721                    Condition::LS => 0x9,
2722                    Condition::GE => 0xA,
2723                    Condition::LT => 0xB,
2724                    Condition::GT => 0xC,
2725                    Condition::LE => 0xD,
2726                };
2727
2728                // offset is already the halfword displacement: (target - branch - 4) / 2
2729                // This is the raw imm8 value for 16-bit B<cond> encoding
2730                let halfword_offset = *offset;
2731
2732                // 16-bit B<cond> encoding: 1101 cond imm8
2733                // Range: -256 to +254 halfwords (imm8 is sign-extended and shifted left 1)
2734                if (-128..=127).contains(&halfword_offset) {
2735                    let imm8 = (halfword_offset as u16) & 0xFF;
2736                    let instr: u16 = 0xD000 | (cond_bits << 8) | imm8;
2737                    Ok(instr.to_le_bytes().to_vec())
2738                } else {
2739                    // 32-bit B<cond>.W for larger offsets
2740                    // First halfword: 1111 0 S cond imm6
2741                    // Second halfword: 10 J1 0 J2 imm11
2742                    let offset = halfword_offset >> 1;
2743                    let s = if offset < 0 { 1u32 } else { 0u32 };
2744                    let imm6 = ((offset >> 11) as u32) & 0x3F;
2745                    let imm11 = (offset as u32) & 0x7FF;
2746                    let j1 = if s == 1 { 1 } else { 0 };
2747                    let j2 = if s == 1 { 1 } else { 0 };
2748
2749                    let hw1: u16 = (0xF000 | (s << 10) | ((cond_bits as u32) << 6) | imm6) as u16;
2750                    let hw2: u16 = (0x8000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2751
2752                    let mut bytes = hw1.to_le_bytes().to_vec();
2753                    bytes.extend_from_slice(&hw2.to_le_bytes());
2754                    Ok(bytes)
2755                }
2756            }
2757
2758            ArmOp::Bl { label: _ } => {
2759                // BL is always 32-bit in Thumb-2, encoded here as a relocatable
2760                // placeholder; an R_ARM_THM_CALL relocation patches the target
2761                // (see arm_backend.rs). The placeholder must carry an embedded
2762                // addend of -4 so the relocation nets to exactly the symbol S.
2763                //
2764                // Thumb BL computes `target = (P + 4) + signed_offset`. Under
2765                // R_ARM_THM_CALL the linker resolves using the in-place addend;
2766                // a 0xF800 placeholder (addend 0) lands at S+4 — every call one
2767                // instruction past the callee entry (#174). The correct
2768                // placeholder is what `gas` emits for `bl <extern>`:
2769                //   f7ff fffe  ->  `bl <self>`  (S=1, J1=J2=1, imm = -4 addend),
2770                // i.e. hw1=0xF7FF, hw2=0xFFFE. This nets to S, not S+4.
2771                // (The earlier 0xD000 was worse still — a ~+0x600000 addend,
2772                // the garbage `bl c0000c` and "truncated to fit" of #167.)
2773                let hw1: u16 = 0xF7FF;
2774                let hw2: u16 = 0xFFFE;
2775                let mut bytes = hw1.to_le_bytes().to_vec();
2776                bytes.extend_from_slice(&hw2.to_le_bytes());
2777                Ok(bytes)
2778            }
2779
2780            // MVN
2781            ArmOp::Mvn { rd, op2 } => {
2782                if let Operand2::Reg(rm) = op2 {
2783                    let rd_bits = reg_to_bits(rd) as u16;
2784                    let rm_bits = reg_to_bits(rm) as u16;
2785
2786                    if rd_bits < 8 && rm_bits < 8 {
2787                        // MVNS Rd, Rm (16-bit): 0100 0011 11 Rm Rd
2788                        let instr: u16 = 0x43C0 | (rm_bits << 3) | rd_bits;
2789                        Ok(instr.to_le_bytes().to_vec())
2790                    } else {
2791                        // 32-bit MVN
2792                        let hw1: u16 = 0xEA6F_u16;
2793                        let hw2: u16 = ((reg_to_bits(rd) << 8) | reg_to_bits(rm)) as u16;
2794                        let mut bytes = hw1.to_le_bytes().to_vec();
2795                        bytes.extend_from_slice(&hw2.to_le_bytes());
2796                        Ok(bytes)
2797                    }
2798                } else {
2799                    let instr: u16 = 0xBF00;
2800                    Ok(instr.to_le_bytes().to_vec())
2801                }
2802            }
2803
2804            // MOVW - Move Wide (Thumb-2 32-bit)
2805            ArmOp::Movw { rd, imm16 } => {
2806                self.encode_thumb32_movw_raw(reg_to_bits(rd), *imm16 as u32)
2807            }
2808
2809            // MOVT - Move Top (Thumb-2 32-bit)
2810            ArmOp::Movt { rd, imm16 } => {
2811                self.encode_thumb32_movt_raw(reg_to_bits(rd), *imm16 as u32)
2812            }
2813
2814            // #237: symbol-relative MOVW/MOVT. Encode the addend's low/high 16
2815            // bits in place; the backend records an R_ARM_MOVW_ABS_NC /
2816            // R_ARM_MOVT_ABS relocation against `symbol`, so the linker adds the
2817            // symbol's final address to the in-place addend (REL semantics).
2818            ArmOp::MovwSym { rd, addend, .. } => {
2819                self.encode_thumb32_movw_raw(reg_to_bits(rd), (*addend as u32) & 0xffff)
2820            }
2821            ArmOp::MovtSym { rd, addend, .. } => {
2822                self.encode_thumb32_movt_raw(reg_to_bits(rd), ((*addend as u32) >> 16) & 0xffff)
2823            }
2824
2825            // #345: literal-pool address load — emit a PLACEHOLDER `LDR.W rd,
2826            // [pc, #0]` (U=1, imm12=0). The backend (arm_backend.rs) places the
2827            // 4-byte pool word at the end of the function, records the R_ARM_ABS32
2828            // relocation against `symbol+addend`, and patches the imm12 with the
2829            // real PC-relative distance once the pool offset is known.
2830            // Encoding T2: 1111 1000 1101 1111 | Rt(4) imm12(12), with the literal
2831            // base = Align(PC,4) and PC = address of this instruction + 4.
2832            ArmOp::LdrSym { rd, .. } => {
2833                let rt = reg_to_bits(rd) as u16;
2834                let hw1: u16 = 0xF8DF; // LDR.W (literal), U=1
2835                let hw2: u16 = rt << 12; // imm12 = 0 placeholder
2836                let mut bytes = Vec::with_capacity(4);
2837                bytes.extend_from_slice(&hw1.to_le_bytes());
2838                bytes.extend_from_slice(&hw2.to_le_bytes());
2839                Ok(bytes)
2840            }
2841
2842            // SetCond: Materialize condition flag into register (0 or 1)
2843            // Strategy: ITE <cond>; MOV Rd, #1; MOV Rd, #0
2844            // IMPORTANT: Must use ITE (If-Then-Else) because 16-bit Thumb MOV
2845            // always sets flags (MOVS). We need to evaluate the condition BEFORE
2846            // any MOV instruction clobbers the flags from CMP.
2847            ArmOp::SetCond { rd, cond } => {
2848                let rd_bits = reg_to_bits(rd) as u16;
2849
2850                // Condition code encoding for IT block
2851                use synth_synthesis::Condition;
2852                let cond_bits: u16 = match cond {
2853                    Condition::EQ => 0x0,
2854                    Condition::NE => 0x1,
2855                    Condition::LT => 0xB,
2856                    Condition::LE => 0xD,
2857                    Condition::GT => 0xC,
2858                    Condition::GE => 0xA,
2859                    Condition::LO => 0x3, // CC/LO (unsigned <)
2860                    Condition::LS => 0x9, // LS (unsigned <=)
2861                    Condition::HI => 0x8, // HI (unsigned >)
2862                    Condition::HS => 0x2, // CS/HS (unsigned >=)
2863                };
2864
2865                // ITE <cond>: encodes If-Then-Else block
2866                // The mask field depends on firstcond[0]:
2867                // - If firstcond[0] = 0: mask = 0xC for TE pattern (ITE EQ = BF0C)
2868                // - If firstcond[0] = 1: mask = 0x4 for TE pattern (ITE NE = BF14)
2869                let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2870                let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2871
2872                // Materialize 0/1 into Rd. The 16-bit MOVS (T1) encodes Rd in a
2873                // 3-bit field (bits[10:8]) — only R0–R7. For a high register
2874                // (R8–R12) `rd_bits << 8` overflows into bit 11 and silently
2875                // turns MOVS into CMP (00100 → 00101), corrupting the result
2876                // (this mis-materialized gale's `has_waiter`, so its `local.set`
2877                // stored a stale register → the binary-sem WAKE dispatch read
2878                // garbage). Use the 32-bit MOV.W (T2) for high registers, which
2879                // has a 4-bit Rd field. MOV.W with S=0 doesn't set flags, which
2880                // is fine inside the ITE (the materialized value is the result;
2881                // the flags are not consumed afterwards).
2882                let mut bytes = ite_instr.to_le_bytes().to_vec();
2883                let push_mov = |bytes: &mut Vec<u8>, imm: u16| {
2884                    if rd_bits <= 7 {
2885                        let m: u16 = 0x2000 | (rd_bits << 8) | imm; // 16-bit MOVS Rd,#imm
2886                        bytes.extend_from_slice(&m.to_le_bytes());
2887                    } else {
2888                        // 32-bit MOV.W Rd, #imm (T2): F04F | (Rd<<8) | imm8
2889                        let hw1: u16 = 0xF04F;
2890                        let hw2: u16 = (rd_bits << 8) | imm;
2891                        bytes.extend_from_slice(&hw1.to_le_bytes());
2892                        bytes.extend_from_slice(&hw2.to_le_bytes());
2893                    }
2894                };
2895                push_mov(&mut bytes, 1); // Then branch (condition true)  → 1
2896                push_mov(&mut bytes, 0); // Else branch (condition false) → 0
2897                Ok(bytes)
2898            }
2899
2900            // I64SetCond: Compare two i64 register pairs, result 0/1 in rd
2901            // EQ/NE: CMP lo,lo; IT EQ; CMPEQ hi,hi; ITE <cond>; MOV 1; MOV 0
2902            // LT: CMP lo,lo; SBCS rd,hi,hi; ITE LT; MOV 1; MOV 0
2903            // GT: CMP lo,lo (swapped); SBCS rd,hi,hi (swapped); ITE LT; MOV 1; MOV 0
2904            ArmOp::I64SetCond {
2905                rd,
2906                rn_lo,
2907                rn_hi,
2908                rm_lo,
2909                rm_hi,
2910                cond,
2911            } => {
2912                use synth_synthesis::Condition;
2913                let rd_bits = reg_to_bits(rd) as u16;
2914                let mut bytes = Vec::new();
2915
2916                // Helper: encode CMP Rn, Rm (16-bit)
2917                let encode_cmp_reg = |rn: &synth_synthesis::Reg,
2918                                      rm: &synth_synthesis::Reg|
2919                 -> Vec<u8> {
2920                    let rn_bits = reg_to_bits(rn) as u16;
2921                    let rm_bits = reg_to_bits(rm) as u16;
2922                    if rn_bits < 8 && rm_bits < 8 {
2923                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2924                        instr.to_le_bytes().to_vec()
2925                    } else {
2926                        let n_bit = (rn_bits >> 3) & 1;
2927                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2928                        instr.to_le_bytes().to_vec()
2929                    }
2930                };
2931
2932                // Helper: encode ITE <cond> (2 bytes)
2933                let encode_ite = |cond_bits: u16| -> Vec<u8> {
2934                    let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2935                    let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2936                    ite_instr.to_le_bytes().to_vec()
2937                };
2938
2939                // Helper: encode SetCond (ITE + MOV #1 + MOV #0) for given condition
2940                let encode_setcond = |cond_bits: u16, rd_bits: u16| -> Vec<u8> {
2941                    let mut b = encode_ite(cond_bits);
2942                    if rd_bits < 8 {
2943                        let mov_one: u16 = 0x2001 | (rd_bits << 8);
2944                        let mov_zero: u16 = 0x2000 | (rd_bits << 8);
2945                        b.extend_from_slice(&mov_one.to_le_bytes());
2946                        b.extend_from_slice(&mov_zero.to_le_bytes());
2947                    } else {
2948                        // #311: rd >= R8 — the 16-bit MOV imm8 form has a 3-bit
2949                        // rd field; rd_bits<<8 overflows into bit 11 and
2950                        // TRANSMUTES the MOV into CMP (0x2001|0x0800 = 0x2801 =
2951                        // CMP r0,#1): the boolean dies in the flags and the
2952                        // consumer reads a stale register. Use the 32-bit
2953                        // MOV.W (T2: F04F 0000|rd<<8|imm8) — IT-legal,
2954                        // flag-preserving. Same class as H-CODE-9 / #180.
2955                        for imm in [1u16, 0u16] {
2956                            let hw1: u16 = 0xF04F;
2957                            let hw2: u16 = (rd_bits << 8) | imm;
2958                            b.extend_from_slice(&hw1.to_le_bytes());
2959                            b.extend_from_slice(&hw2.to_le_bytes());
2960                        }
2961                    }
2962                    b
2963                };
2964
2965                match cond {
2966                    Condition::EQ | Condition::NE => {
2967                        // CMP rn_lo, rm_lo (compare low words)
2968                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2969
2970                        // IT EQ (execute next instruction only if Z=1)
2971                        let it_eq: u16 = 0xBF08; // IT EQ: cond=0000, mask=1000
2972                        bytes.extend_from_slice(&it_eq.to_le_bytes());
2973
2974                        // CMPEQ rn_hi, rm_hi (compare high words, only if low equal)
2975                        bytes.extend_from_slice(&encode_cmp_reg(rn_hi, rm_hi));
2976
2977                        // ITE <cond>; MOV rd, #1; MOV rd, #0
2978                        let cond_bits: u16 = match cond {
2979                            Condition::EQ => 0x0,
2980                            Condition::NE => 0x1,
2981                            _ => unreachable!(),
2982                        };
2983                        bytes.extend_from_slice(&encode_setcond(cond_bits, rd_bits));
2984                    }
2985
2986                    Condition::LT => {
2987                        // CMP rn_lo, rm_lo (sets C flag for borrow)
2988                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2989
2990                        // SBCS rd, rn_hi, rm_hi (subtract with carry, sets N,V flags)
2991                        // SBCS.W Rd, Rn, Rm: EB70 Rn | 0000 Rd 0000 Rm
2992                        let rn_hi_bits = reg_to_bits(rn_hi);
2993                        let rm_hi_bits = reg_to_bits(rm_hi);
2994                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2995                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2996                        bytes.extend_from_slice(&hw1.to_le_bytes());
2997                        bytes.extend_from_slice(&hw2.to_le_bytes());
2998
2999                        // ITE LT; MOV rd, #1; MOV rd, #0
3000                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
3001                    }
3002
3003                    Condition::GT => {
3004                        // GT(a,b) = LT(b,a): swap operands
3005                        // CMP rm_lo, rn_lo (swapped)
3006                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
3007
3008                        // SBCS rd, rm_hi, rn_hi (swapped)
3009                        let rm_hi_bits = reg_to_bits(rm_hi);
3010                        let rn_hi_bits = reg_to_bits(rn_hi);
3011                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
3012                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
3013                        bytes.extend_from_slice(&hw1.to_le_bytes());
3014                        bytes.extend_from_slice(&hw2.to_le_bytes());
3015
3016                        // ITE LT; MOV rd, #1; MOV rd, #0
3017                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
3018                    }
3019
3020                    Condition::LE => {
3021                        // LE(a,b) = !GT(a,b): use GT logic but invert result
3022                        // GT(a,b) = LT(b,a): so we do CMP(b,a) and check LT, then invert
3023                        // CMP rm_lo, rn_lo (swapped, same as GT)
3024                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
3025
3026                        // SBCS rd, rm_hi, rn_hi (swapped)
3027                        let rm_hi_bits = reg_to_bits(rm_hi);
3028                        let rn_hi_bits = reg_to_bits(rn_hi);
3029                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
3030                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
3031                        bytes.extend_from_slice(&hw1.to_le_bytes());
3032                        bytes.extend_from_slice(&hw2.to_le_bytes());
3033
3034                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT, so inverting GT result)
3035                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
3036                    }
3037
3038                    Condition::GE => {
3039                        // GE(a,b) = !LT(a,b): use LT logic but invert result
3040                        // CMP rn_lo, rm_lo (same as LT)
3041                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
3042
3043                        // SBCS rd, rn_hi, rm_hi (same as LT)
3044                        let rn_hi_bits = reg_to_bits(rn_hi);
3045                        let rm_hi_bits = reg_to_bits(rm_hi);
3046                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
3047                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
3048                        bytes.extend_from_slice(&hw1.to_le_bytes());
3049                        bytes.extend_from_slice(&hw2.to_le_bytes());
3050
3051                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT)
3052                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
3053                    }
3054
3055                    // Unsigned comparisons - same instruction sequence, different conditions
3056                    Condition::LO => {
3057                        // LO (unsigned LT): CMP lo, SBCS hi, check C=0
3058                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
3059                        let rn_hi_bits = reg_to_bits(rn_hi);
3060                        let rm_hi_bits = reg_to_bits(rm_hi);
3061                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
3062                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
3063                        bytes.extend_from_slice(&hw1.to_le_bytes());
3064                        bytes.extend_from_slice(&hw2.to_le_bytes());
3065                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
3066                    }
3067
3068                    Condition::HI => {
3069                        // HI (unsigned GT): swap operands and check LO
3070                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
3071                        let rm_hi_bits = reg_to_bits(rm_hi);
3072                        let rn_hi_bits = reg_to_bits(rn_hi);
3073                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
3074                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
3075                        bytes.extend_from_slice(&hw1.to_le_bytes());
3076                        bytes.extend_from_slice(&hw2.to_le_bytes());
3077                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
3078                    }
3079
3080                    Condition::LS => {
3081                        // LS (unsigned LE): !(a > b) = !(HI), so do HI and invert
3082                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
3083                        let rm_hi_bits = reg_to_bits(rm_hi);
3084                        let rn_hi_bits = reg_to_bits(rn_hi);
3085                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
3086                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
3087                        bytes.extend_from_slice(&hw1.to_le_bytes());
3088                        bytes.extend_from_slice(&hw2.to_le_bytes());
3089                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
3090                    }
3091
3092                    Condition::HS => {
3093                        // HS (unsigned GE): !(a < b) = !(LO)
3094                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
3095                        let rn_hi_bits = reg_to_bits(rn_hi);
3096                        let rm_hi_bits = reg_to_bits(rm_hi);
3097                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
3098                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
3099                        bytes.extend_from_slice(&hw1.to_le_bytes());
3100                        bytes.extend_from_slice(&hw2.to_le_bytes());
3101                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
3102                    }
3103                }
3104
3105                Ok(bytes)
3106            }
3107
3108            // I64SetCondZ: Test if i64 register pair is zero, result 0/1 in rd
3109            // ORR.W rd, rn_lo, rn_hi; CMP rd, #0; ITE EQ; MOV 1; MOV 0
3110            ArmOp::I64SetCondZ { rd, rn_lo, rn_hi } => {
3111                let rd_bits = reg_to_bits(rd);
3112                let rn_lo_bits = reg_to_bits(rn_lo);
3113                let rn_hi_bits = reg_to_bits(rn_hi);
3114                let mut bytes = Vec::new();
3115
3116                // ORR.W rd, rn_lo, rn_hi: EA40 rn_lo | 0000 rd 0000 rn_hi
3117                let hw1: u16 = (0xEA40 | rn_lo_bits) as u16;
3118                let hw2: u16 = ((rd_bits << 8) | rn_hi_bits) as u16;
3119                bytes.extend_from_slice(&hw1.to_le_bytes());
3120                bytes.extend_from_slice(&hw2.to_le_bytes());
3121
3122                // CMP rd, #0 — 16-bit form only for r0-r7 (3-bit rd field);
3123                // high registers take CMP.W (T2: F1B0|rn 0F00|imm8). This was
3124                // H-CODE-9: rd_bits<<8 overflowing the field compared the
3125                // WRONG register. Same hardening as the #311 SetCond fix.
3126                if rd_bits < 8 {
3127                    let cmp_instr: u16 = 0x2800 | ((rd_bits as u16) << 8);
3128                    bytes.extend_from_slice(&cmp_instr.to_le_bytes());
3129                } else {
3130                    let hw1: u16 = 0xF1B0 | (rd_bits as u16);
3131                    let hw2: u16 = 0x0F00;
3132                    bytes.extend_from_slice(&hw1.to_le_bytes());
3133                    bytes.extend_from_slice(&hw2.to_le_bytes());
3134                }
3135
3136                // ITE EQ; MOV rd, #1; MOV rd, #0 (32-bit MOV.W for rd >= R8,
3137                // #311 — see I64SetCond)
3138                let mask = 0xC_u16; // ITE EQ mask: firstcond[0]=0, mask=0xC
3139                let ite_instr: u16 = 0xBF00 | mask;
3140                bytes.extend_from_slice(&ite_instr.to_le_bytes());
3141                if rd_bits < 8 {
3142                    let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
3143                    let mov_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
3144                    bytes.extend_from_slice(&mov_one.to_le_bytes());
3145                    bytes.extend_from_slice(&mov_zero.to_le_bytes());
3146                } else {
3147                    for imm in [1u16, 0u16] {
3148                        let hw1: u16 = 0xF04F;
3149                        let hw2: u16 = ((rd_bits as u16) << 8) | imm;
3150                        bytes.extend_from_slice(&hw1.to_le_bytes());
3151                        bytes.extend_from_slice(&hw2.to_le_bytes());
3152                    }
3153                }
3154
3155                Ok(bytes)
3156            }
3157
3158            // I64Mul: 64-bit multiply using UMULL + MLA cross products
3159            // Formula: result = (a_lo * b_lo) + ((a_lo * b_hi + a_hi * b_lo) << 32)
3160            // Uses R12 as scratch register
3161            ArmOp::I64Mul {
3162                rd_lo,
3163                rd_hi,
3164                rn_lo,
3165                rn_hi,
3166                rm_lo,
3167                rm_hi,
3168            } => {
3169                let rd_lo_bits = reg_to_bits(rd_lo);
3170                let rd_hi_bits = reg_to_bits(rd_hi);
3171                let rn_lo_bits = reg_to_bits(rn_lo);
3172                let rn_hi_bits = reg_to_bits(rn_hi);
3173                let rm_lo_bits = reg_to_bits(rm_lo);
3174                let rm_hi_bits = reg_to_bits(rm_hi);
3175                let r12: u32 = 12; // IP scratch register
3176                let mut bytes = Vec::new();
3177
3178                // 1. MUL R12, rn_lo, rm_hi  (R12 = a_lo * b_hi)
3179                // Thumb-2 MUL: hw1=0xFB00|Rn, hw2=0xF000|(Rd<<8)|Rm
3180                let hw1: u16 = (0xFB00 | rn_lo_bits) as u16;
3181                let hw2: u16 = (0xF000 | (r12 << 8) | rm_hi_bits) as u16;
3182                bytes.extend_from_slice(&hw1.to_le_bytes());
3183                bytes.extend_from_slice(&hw2.to_le_bytes());
3184
3185                // 2. MLA R12, rn_hi, rm_lo, R12  (R12 += a_hi * b_lo)
3186                // Thumb-2 MLA: hw1=0xFB00|Rn, hw2=(Ra<<12)|(Rd<<8)|Rm
3187                let hw1: u16 = (0xFB00 | rn_hi_bits) as u16;
3188                let hw2: u16 = ((r12 << 12) | (r12 << 8) | rm_lo_bits) as u16;
3189                bytes.extend_from_slice(&hw1.to_le_bytes());
3190                bytes.extend_from_slice(&hw2.to_le_bytes());
3191
3192                // 3. UMULL rd_lo, rd_hi, rn_lo, rm_lo  (rd_lo:rd_hi = a_lo * b_lo)
3193                // Thumb-2 UMULL: hw1=0xFBA0|Rn, hw2=(RdLo<<12)|(RdHi<<8)|Rm
3194                let hw1: u16 = (0xFBA0 | rn_lo_bits) as u16;
3195                let hw2: u16 = ((rd_lo_bits << 12) | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3196                bytes.extend_from_slice(&hw1.to_le_bytes());
3197                bytes.extend_from_slice(&hw2.to_le_bytes());
3198
3199                // 4. ADD rd_hi, R12  (rd_hi += cross products)
3200                // 16-bit high reg ADD: 01000100 D Rm Rdn[2:0]
3201                let d_bit = (rd_hi_bits >> 3) & 1;
3202                let add_instr: u16 =
3203                    (0x4400 | (d_bit << 7) | (r12 << 3) | (rd_hi_bits & 0x7)) as u16;
3204                bytes.extend_from_slice(&add_instr.to_le_bytes());
3205
3206                Ok(bytes)
3207            }
3208
3209            // I64Shl: 64-bit shift left with branch for n<32 vs n>=32
3210            // rm_hi (R3) is used as temp register
3211            ArmOp::I64Shl {
3212                rd_lo,
3213                rd_hi,
3214                rn_lo,
3215                rn_hi,
3216                rm_lo,
3217                rm_hi,
3218            } => {
3219                let rd_lo_bits = reg_to_bits(rd_lo);
3220                let rd_hi_bits = reg_to_bits(rd_hi);
3221                let rn_lo_bits = reg_to_bits(rn_lo);
3222                let rn_hi_bits = reg_to_bits(rn_hi);
3223                let rm_lo_bits = reg_to_bits(rm_lo);
3224                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3225                let mut bytes = Vec::new();
3226
3227                // AND.W rm_lo, rm_lo, #63  (mask shift amount to 6 bits)
3228                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3229                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3230                bytes.extend_from_slice(&hw1.to_le_bytes());
3231                bytes.extend_from_slice(&hw2.to_le_bytes());
3232
3233                // SUBS.W rm_hi, rm_lo, #32  (rm_hi = n-32, sets flags)
3234                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3235                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3236                bytes.extend_from_slice(&hw1.to_le_bytes());
3237                bytes.extend_from_slice(&hw2.to_le_bytes());
3238
3239                // BPL .large (branch if n >= 32, offset = +10 halfwords)
3240                let bpl: u16 = 0xD50A;
3241                bytes.extend_from_slice(&bpl.to_le_bytes());
3242
3243                // --- Small shift (n < 32) ---
3244                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3245                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3246                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3247                bytes.extend_from_slice(&hw1.to_le_bytes());
3248                bytes.extend_from_slice(&hw2.to_le_bytes());
3249
3250                // LSR.W rm_hi, rn_lo, rm_hi  (rm_hi = lo >> (32-n), overflow bits)
3251                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3252                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3253                bytes.extend_from_slice(&hw1.to_le_bytes());
3254                bytes.extend_from_slice(&hw2.to_le_bytes());
3255
3256                // LSL.W rd_hi, rn_hi, rm_lo  (hi <<= n)
3257                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3258                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3259                bytes.extend_from_slice(&hw1.to_le_bytes());
3260                bytes.extend_from_slice(&hw2.to_le_bytes());
3261
3262                // ORR.W rd_hi, rd_hi, rm_hi  (hi |= overflow bits from lo)
3263                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3264                let hw2: u16 = ((rd_hi_bits << 8) | rm_hi_bits) as u16;
3265                bytes.extend_from_slice(&hw1.to_le_bytes());
3266                bytes.extend_from_slice(&hw2.to_le_bytes());
3267
3268                // LSL.W rd_lo, rn_lo, rm_lo  (lo <<= n)
3269                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3270                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3271                bytes.extend_from_slice(&hw1.to_le_bytes());
3272                bytes.extend_from_slice(&hw2.to_le_bytes());
3273
3274                // B .done (skip large shift: +2 halfwords)
3275                let b_done: u16 = 0xE002;
3276                bytes.extend_from_slice(&b_done.to_le_bytes());
3277
3278                // --- Large shift (n >= 32) ---
3279                // LSL.W rd_hi, rn_lo, rm_hi  (hi = lo << (n-32))
3280                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3281                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_hi_bits) as u16;
3282                bytes.extend_from_slice(&hw1.to_le_bytes());
3283                bytes.extend_from_slice(&hw2.to_le_bytes());
3284
3285                // MOV rd_lo, #0
3286                let mov_zero: u16 = 0x2000 | ((rd_lo_bits as u16) << 8);
3287                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3288
3289                Ok(bytes) // Total: 38 bytes
3290            }
3291
3292            // I64ShrU: 64-bit logical shift right with branch for n<32 vs n>=32
3293            ArmOp::I64ShrU {
3294                rd_lo,
3295                rd_hi,
3296                rn_lo,
3297                rn_hi,
3298                rm_lo,
3299                rm_hi,
3300            } => {
3301                let rd_lo_bits = reg_to_bits(rd_lo);
3302                let rd_hi_bits = reg_to_bits(rd_hi);
3303                let rn_lo_bits = reg_to_bits(rn_lo);
3304                let rn_hi_bits = reg_to_bits(rn_hi);
3305                let rm_lo_bits = reg_to_bits(rm_lo);
3306                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3307                let mut bytes = Vec::new();
3308
3309                // AND.W rm_lo, rm_lo, #63
3310                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3311                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3312                bytes.extend_from_slice(&hw1.to_le_bytes());
3313                bytes.extend_from_slice(&hw2.to_le_bytes());
3314
3315                // SUBS.W rm_hi, rm_lo, #32
3316                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3317                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3318                bytes.extend_from_slice(&hw1.to_le_bytes());
3319                bytes.extend_from_slice(&hw2.to_le_bytes());
3320
3321                // BPL .large (+10 halfwords)
3322                let bpl: u16 = 0xD50A;
3323                bytes.extend_from_slice(&bpl.to_le_bytes());
3324
3325                // --- Small shift (n < 32) ---
3326                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3327                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3328                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3329                bytes.extend_from_slice(&hw1.to_le_bytes());
3330                bytes.extend_from_slice(&hw2.to_le_bytes());
3331
3332                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3333                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3334                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3335                bytes.extend_from_slice(&hw1.to_le_bytes());
3336                bytes.extend_from_slice(&hw2.to_le_bytes());
3337
3338                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n)
3339                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3340                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3341                bytes.extend_from_slice(&hw1.to_le_bytes());
3342                bytes.extend_from_slice(&hw2.to_le_bytes());
3343
3344                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3345                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3346                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3347                bytes.extend_from_slice(&hw1.to_le_bytes());
3348                bytes.extend_from_slice(&hw2.to_le_bytes());
3349
3350                // LSR.W rd_hi, rn_hi, rm_lo  (hi >>= n, logical)
3351                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3352                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3353                bytes.extend_from_slice(&hw1.to_le_bytes());
3354                bytes.extend_from_slice(&hw2.to_le_bytes());
3355
3356                // B .done (+2 halfwords)
3357                let b_done: u16 = 0xE002;
3358                bytes.extend_from_slice(&b_done.to_le_bytes());
3359
3360                // --- Large shift (n >= 32) ---
3361                // LSR.W rd_lo, rn_hi, rm_hi  (lo = hi >> (n-32))
3362                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3363                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3364                bytes.extend_from_slice(&hw1.to_le_bytes());
3365                bytes.extend_from_slice(&hw2.to_le_bytes());
3366
3367                // MOV rd_hi, #0
3368                let mov_zero: u16 = 0x2000 | ((rd_hi_bits as u16) << 8);
3369                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3370
3371                Ok(bytes) // Total: 38 bytes
3372            }
3373
3374            // I64ShrS: 64-bit arithmetic shift right with branch for n<32 vs n>=32
3375            ArmOp::I64ShrS {
3376                rd_lo,
3377                rd_hi,
3378                rn_lo,
3379                rn_hi,
3380                rm_lo,
3381                rm_hi,
3382            } => {
3383                let rd_lo_bits = reg_to_bits(rd_lo);
3384                let rd_hi_bits = reg_to_bits(rd_hi);
3385                let rn_lo_bits = reg_to_bits(rn_lo);
3386                let rn_hi_bits = reg_to_bits(rn_hi);
3387                let rm_lo_bits = reg_to_bits(rm_lo);
3388                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3389                let mut bytes = Vec::new();
3390
3391                // AND.W rm_lo, rm_lo, #63
3392                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3393                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3394                bytes.extend_from_slice(&hw1.to_le_bytes());
3395                bytes.extend_from_slice(&hw2.to_le_bytes());
3396
3397                // SUBS.W rm_hi, rm_lo, #32
3398                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3399                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3400                bytes.extend_from_slice(&hw1.to_le_bytes());
3401                bytes.extend_from_slice(&hw2.to_le_bytes());
3402
3403                // BPL .large (+10 halfwords)
3404                let bpl: u16 = 0xD50A;
3405                bytes.extend_from_slice(&bpl.to_le_bytes());
3406
3407                // --- Small shift (n < 32) ---
3408                // RSB.W rm_hi, rm_lo, #32
3409                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3410                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3411                bytes.extend_from_slice(&hw1.to_le_bytes());
3412                bytes.extend_from_slice(&hw2.to_le_bytes());
3413
3414                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3415                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3416                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3417                bytes.extend_from_slice(&hw1.to_le_bytes());
3418                bytes.extend_from_slice(&hw2.to_le_bytes());
3419
3420                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n, logical for lo word)
3421                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3422                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3423                bytes.extend_from_slice(&hw1.to_le_bytes());
3424                bytes.extend_from_slice(&hw2.to_le_bytes());
3425
3426                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3427                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3428                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3429                bytes.extend_from_slice(&hw1.to_le_bytes());
3430                bytes.extend_from_slice(&hw2.to_le_bytes());
3431
3432                // ASR.W rd_hi, rn_hi, rm_lo  (hi >>= n, arithmetic/sign-extending)
3433                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3434                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3435                bytes.extend_from_slice(&hw1.to_le_bytes());
3436                bytes.extend_from_slice(&hw2.to_le_bytes());
3437
3438                // B .done (+3 halfwords, large shift is 8 bytes)
3439                let b_done: u16 = 0xE003;
3440                bytes.extend_from_slice(&b_done.to_le_bytes());
3441
3442                // --- Large shift (n >= 32) ---
3443                // ASR.W rd_lo, rn_hi, rm_hi  (lo = hi >>> (n-32))
3444                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3445                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3446                bytes.extend_from_slice(&hw1.to_le_bytes());
3447                bytes.extend_from_slice(&hw2.to_le_bytes());
3448
3449                // ASR.W rd_hi, rn_hi, #31  (hi = sign extension, all 0s or all 1s)
3450                // Thumb-2 ASR immediate: hw1=0xEA4F, hw2=imm3:Rd:imm2:10:Rm
3451                // imm5=31=11111 → imm3=111, imm2=11
3452                let hw1: u16 = 0xEA4F;
3453                let hw2: u16 = (0x7000 | (rd_hi_bits << 8) | 0x00E0 | rn_hi_bits) as u16;
3454                bytes.extend_from_slice(&hw1.to_le_bytes());
3455                bytes.extend_from_slice(&hw2.to_le_bytes());
3456
3457                Ok(bytes) // Total: 40 bytes
3458            }
3459
3460            // I64Rotl: 64-bit rotate left
3461            // For n < 32: new_hi = (hi << n) | (lo >> (32-n)), new_lo = (lo << n) | (hi >> (32-n))
3462            // For n >= 32: same formula but with lo/hi conceptually swapped, shift by (n-32)
3463            // Uses R4 (saved/restored) and R12 as scratch
3464            ArmOp::I64Rotl {
3465                rdlo,
3466                rdhi,
3467                rnlo,
3468                rnhi,
3469                shift,
3470            } => {
3471                let rd_lo_bits = reg_to_bits(rdlo);
3472                let rd_hi_bits = reg_to_bits(rdhi);
3473                let rn_lo_bits = reg_to_bits(rnlo);
3474                let rn_hi_bits = reg_to_bits(rnhi);
3475                let shift_bits = reg_to_bits(shift);
3476                let r12: u32 = 12; // IP scratch
3477                let r3: u32 = 3; // Scratch (high word of shift amount, unused)
3478                let r4: u32 = 4; // Scratch (saved/restored)
3479                let mut bytes = Vec::new();
3480
3481                // PUSH {R4}
3482                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3483
3484                // AND.W shift, shift, #63 (mask to 6 bits)
3485                let hw1: u16 = (0xF000 | shift_bits) as u16;
3486                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3487                bytes.extend_from_slice(&hw1.to_le_bytes());
3488                bytes.extend_from_slice(&hw2.to_le_bytes());
3489
3490                // SUBS.W R3, shift, #32 (R3 = n-32, sets flags)
3491                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3492                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3493                bytes.extend_from_slice(&hw1.to_le_bytes());
3494                bytes.extend_from_slice(&hw2.to_le_bytes());
3495
3496                // BPL .large (branch if n >= 32, offset = +14 halfwords)
3497                let bpl: u16 = 0xD50E;
3498                bytes.extend_from_slice(&bpl.to_le_bytes());
3499
3500                // === Small rotation (n < 32) ===
3501                // RSB.W R3, shift, #32 (R3 = 32-n)
3502                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3503                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3504                bytes.extend_from_slice(&hw1.to_le_bytes());
3505                bytes.extend_from_slice(&hw2.to_le_bytes());
3506
3507                // LSR.W R4, rn_lo, R3 (R4 = lo >> (32-n), will go to new_hi)
3508                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3509                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3510                bytes.extend_from_slice(&hw1.to_le_bytes());
3511                bytes.extend_from_slice(&hw2.to_le_bytes());
3512
3513                // LSR.W R12, rn_hi, R3 (R12 = hi >> (32-n), will go to new_lo)
3514                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3515                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3516                bytes.extend_from_slice(&hw1.to_le_bytes());
3517                bytes.extend_from_slice(&hw2.to_le_bytes());
3518
3519                // LSL.W rd_hi, rn_hi, shift (rd_hi = hi << n)
3520                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3521                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3522                bytes.extend_from_slice(&hw1.to_le_bytes());
3523                bytes.extend_from_slice(&hw2.to_le_bytes());
3524
3525                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (hi << n) | (lo >> (32-n)))
3526                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3527                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3528                bytes.extend_from_slice(&hw1.to_le_bytes());
3529                bytes.extend_from_slice(&hw2.to_le_bytes());
3530
3531                // LSL.W rd_lo, rn_lo, shift (rd_lo = lo << n)
3532                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3533                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3534                bytes.extend_from_slice(&hw1.to_le_bytes());
3535                bytes.extend_from_slice(&hw2.to_le_bytes());
3536
3537                // ORR.W rd_lo, rd_lo, R12 (rd_lo = (lo << n) | (hi >> (32-n)))
3538                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3539                let hw2: u16 = ((rd_lo_bits << 8) | r12) as u16;
3540                bytes.extend_from_slice(&hw1.to_le_bytes());
3541                bytes.extend_from_slice(&hw2.to_le_bytes());
3542
3543                // B .done (skip large block, offset = +14 halfwords)
3544                let b_done: u16 = 0xE00E;
3545                bytes.extend_from_slice(&b_done.to_le_bytes());
3546
3547                // === Large rotation (n >= 32) ===
3548                // R3 already has n-32 from the SUBS
3549                // RSB.W R4, R3, #32 (R4 = 32-(n-32) = 64-n)
3550                let hw1: u16 = (0xF1C0 | r3) as u16;
3551                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3552                bytes.extend_from_slice(&hw1.to_le_bytes());
3553                bytes.extend_from_slice(&hw2.to_le_bytes());
3554
3555                // LSR.W R12, rn_hi, R4 (R12 = hi >> (64-n), goes to new_hi low bits)
3556                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3557                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3558                bytes.extend_from_slice(&hw1.to_le_bytes());
3559                bytes.extend_from_slice(&hw2.to_le_bytes());
3560
3561                // LSR.W R4, rn_lo, R4 (R4 = lo >> (64-n), goes to new_lo low bits)
3562                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3563                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3564                bytes.extend_from_slice(&hw1.to_le_bytes());
3565                bytes.extend_from_slice(&hw2.to_le_bytes());
3566
3567                // LSL.W shift, rn_lo, R3 (shift = lo << (n-32), new_hi high bits)
3568                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3569                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3570                bytes.extend_from_slice(&hw1.to_le_bytes());
3571                bytes.extend_from_slice(&hw2.to_le_bytes());
3572
3573                // ORR.W shift, shift, R12 (shift = (lo << (n-32)) | (hi >> (64-n)) = new_hi)
3574                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3575                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3576                bytes.extend_from_slice(&hw1.to_le_bytes());
3577                bytes.extend_from_slice(&hw2.to_le_bytes());
3578
3579                // LSL.W rd_lo, rn_hi, R3 (rd_lo = hi << (n-32), new_lo high bits)
3580                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3581                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | r3) as u16;
3582                bytes.extend_from_slice(&hw1.to_le_bytes());
3583                bytes.extend_from_slice(&hw2.to_le_bytes());
3584
3585                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (hi << (n-32)) | (lo >> (64-n)) = new_lo)
3586                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3587                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3588                bytes.extend_from_slice(&hw1.to_le_bytes());
3589                bytes.extend_from_slice(&hw2.to_le_bytes());
3590
3591                // MOV rd_hi, shift (rd_hi = new_hi)
3592                let d_bit = (rd_hi_bits >> 3) & 1;
3593                let mov_instr: u16 =
3594                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_hi_bits & 0x7)) as u16;
3595                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3596
3597                // POP {R4}
3598                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3599
3600                Ok(bytes) // Total: 74 bytes
3601            }
3602
3603            // I64Rotr: 64-bit rotate right
3604            // rotr(x, n) = rotl(x, 64-n)
3605            // For n < 32: new_lo = (lo >> n) | (hi << (32-n)), new_hi = (hi >> n) | (lo << (32-n))
3606            // For n >= 32: same formula but with lo/hi swapped, shift by (n-32)
3607            ArmOp::I64Rotr {
3608                rdlo,
3609                rdhi,
3610                rnlo,
3611                rnhi,
3612                shift,
3613            } => {
3614                let rd_lo_bits = reg_to_bits(rdlo);
3615                let rd_hi_bits = reg_to_bits(rdhi);
3616                let rn_lo_bits = reg_to_bits(rnlo);
3617                let rn_hi_bits = reg_to_bits(rnhi);
3618                let shift_bits = reg_to_bits(shift);
3619                let r12: u32 = 12;
3620                let r3: u32 = 3;
3621                let r4: u32 = 4;
3622                let mut bytes = Vec::new();
3623
3624                // PUSH {R4}
3625                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3626
3627                // AND.W shift, shift, #63
3628                let hw1: u16 = (0xF000 | shift_bits) as u16;
3629                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3630                bytes.extend_from_slice(&hw1.to_le_bytes());
3631                bytes.extend_from_slice(&hw2.to_le_bytes());
3632
3633                // SUBS.W R3, shift, #32
3634                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3635                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3636                bytes.extend_from_slice(&hw1.to_le_bytes());
3637                bytes.extend_from_slice(&hw2.to_le_bytes());
3638
3639                // BPL .large (+14 halfwords)
3640                let bpl: u16 = 0xD50E;
3641                bytes.extend_from_slice(&bpl.to_le_bytes());
3642
3643                // === Small rotation (n < 32) ===
3644                // RSB.W R3, shift, #32 (R3 = 32-n)
3645                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3646                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3647                bytes.extend_from_slice(&hw1.to_le_bytes());
3648                bytes.extend_from_slice(&hw2.to_le_bytes());
3649
3650                // LSL.W R4, rn_hi, R3 (R4 = hi << (32-n), will go to new_lo)
3651                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3652                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3653                bytes.extend_from_slice(&hw1.to_le_bytes());
3654                bytes.extend_from_slice(&hw2.to_le_bytes());
3655
3656                // LSL.W R12, rn_lo, R3 (R12 = lo << (32-n), will go to new_hi)
3657                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3658                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3659                bytes.extend_from_slice(&hw1.to_le_bytes());
3660                bytes.extend_from_slice(&hw2.to_le_bytes());
3661
3662                // LSR.W rd_lo, rn_lo, shift (rd_lo = lo >> n)
3663                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3664                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3665                bytes.extend_from_slice(&hw1.to_le_bytes());
3666                bytes.extend_from_slice(&hw2.to_le_bytes());
3667
3668                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (lo >> n) | (hi << (32-n)))
3669                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3670                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3671                bytes.extend_from_slice(&hw1.to_le_bytes());
3672                bytes.extend_from_slice(&hw2.to_le_bytes());
3673
3674                // LSR.W rd_hi, rn_hi, shift (rd_hi = hi >> n)
3675                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3676                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3677                bytes.extend_from_slice(&hw1.to_le_bytes());
3678                bytes.extend_from_slice(&hw2.to_le_bytes());
3679
3680                // ORR.W rd_hi, rd_hi, R12 (rd_hi = (hi >> n) | (lo << (32-n)))
3681                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3682                let hw2: u16 = ((rd_hi_bits << 8) | r12) as u16;
3683                bytes.extend_from_slice(&hw1.to_le_bytes());
3684                bytes.extend_from_slice(&hw2.to_le_bytes());
3685
3686                // B .done (+14 halfwords)
3687                let b_done: u16 = 0xE00E;
3688                bytes.extend_from_slice(&b_done.to_le_bytes());
3689
3690                // === Large rotation (n >= 32) ===
3691                // RSB.W R4, R3, #32 (R4 = 64-n)
3692                let hw1: u16 = (0xF1C0 | r3) as u16;
3693                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3694                bytes.extend_from_slice(&hw1.to_le_bytes());
3695                bytes.extend_from_slice(&hw2.to_le_bytes());
3696
3697                // LSL.W R12, rn_lo, R4 (R12 = lo << (64-n), goes to new_lo low bits)
3698                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3699                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3700                bytes.extend_from_slice(&hw1.to_le_bytes());
3701                bytes.extend_from_slice(&hw2.to_le_bytes());
3702
3703                // LSL.W R4, rn_hi, R4 (R4 = hi << (64-n), goes to new_hi low bits)
3704                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3705                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3706                bytes.extend_from_slice(&hw1.to_le_bytes());
3707                bytes.extend_from_slice(&hw2.to_le_bytes());
3708
3709                // LSR.W shift, rn_hi, R3 (shift = hi >> (n-32), new_lo high bits)
3710                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3711                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3712                bytes.extend_from_slice(&hw1.to_le_bytes());
3713                bytes.extend_from_slice(&hw2.to_le_bytes());
3714
3715                // ORR.W shift, shift, R12 (shift = (hi >> (n-32)) | (lo << (64-n)) = new_lo)
3716                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3717                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3718                bytes.extend_from_slice(&hw1.to_le_bytes());
3719                bytes.extend_from_slice(&hw2.to_le_bytes());
3720
3721                // LSR.W rd_hi, rn_lo, R3 (rd_hi = lo >> (n-32), new_hi high bits)
3722                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3723                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | r3) as u16;
3724                bytes.extend_from_slice(&hw1.to_le_bytes());
3725                bytes.extend_from_slice(&hw2.to_le_bytes());
3726
3727                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (lo >> (n-32)) | (hi << (64-n)) = new_hi)
3728                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3729                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3730                bytes.extend_from_slice(&hw1.to_le_bytes());
3731                bytes.extend_from_slice(&hw2.to_le_bytes());
3732
3733                // MOV rd_lo, shift (rd_lo = new_lo)
3734                let d_bit = (rd_lo_bits >> 3) & 1;
3735                let mov_instr: u16 =
3736                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_lo_bits & 0x7)) as u16;
3737                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3738
3739                // POP {R4}
3740                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3741
3742                Ok(bytes) // Total: 74 bytes
3743            }
3744
3745            // I64Clz: Count leading zeros in 64-bit value
3746            // If hi != 0: result = CLZ(hi)
3747            // If hi == 0: result = 32 + CLZ(lo)
3748            //
3749            // Layout (using CMP+BNE approach for consistency):
3750            // 0: CMP.W rnhi, #0 (4 bytes)
3751            // 4: BEQ .hi_zero (2 bytes) - branch forward to offset 14
3752            // 6: CLZ.W rd, rnhi (4 bytes)
3753            // 10: B .done (2 bytes) - branch forward to offset 22
3754            // 12: NOP (2 bytes) - padding for alignment
3755            // 14: .hi_zero: CLZ.W rd, rnlo (4 bytes)
3756            // 18: ADD.W rd, rd, #32 (4 bytes)
3757            // 22: .done
3758            ArmOp::I64Clz { rd, rnlo, rnhi } => {
3759                let rd_bits = reg_to_bits(rd);
3760                let rn_lo_bits = reg_to_bits(rnlo);
3761                let rn_hi_bits = reg_to_bits(rnhi);
3762                let mut bytes = Vec::new();
3763
3764                // CMP.W rnhi, #0 (4 bytes at offset 0)
3765                let hw1: u16 = (0xF1B0 | rn_hi_bits) as u16;
3766                let hw2: u16 = 0x0F00;
3767                bytes.extend_from_slice(&hw1.to_le_bytes());
3768                bytes.extend_from_slice(&hw2.to_le_bytes());
3769
3770                // BEQ .hi_zero (2 bytes at offset 4)
3771                // PC = 4 + 4 = 8, target = 14, offset = 6, imm8 = 3
3772                let beq: u16 = 0xD003;
3773                bytes.extend_from_slice(&beq.to_le_bytes());
3774
3775                // CLZ.W rd, rnhi (4 bytes at offset 6)
3776                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3777                let hw1: u16 = (0xFAB0 | rn_hi_bits) as u16;
3778                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_hi_bits) as u16;
3779                bytes.extend_from_slice(&hw1.to_le_bytes());
3780                bytes.extend_from_slice(&hw2.to_le_bytes());
3781
3782                // B .done (2 bytes at offset 10)
3783                // PC = 10 + 4 = 14, target = 22, offset = 8, imm11 = 4
3784                let b_done: u16 = 0xE004;
3785                bytes.extend_from_slice(&b_done.to_le_bytes());
3786
3787                // NOP (2 bytes at offset 12) - padding
3788                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3789
3790                // .hi_zero: (offset 14)
3791                // CLZ.W rd, rnlo (4 bytes)
3792                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3793                let hw1: u16 = (0xFAB0 | rn_lo_bits) as u16;
3794                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_lo_bits) as u16;
3795                bytes.extend_from_slice(&hw1.to_le_bytes());
3796                bytes.extend_from_slice(&hw2.to_le_bytes());
3797
3798                // ADD.W rd, rd, #32 (4 bytes at offset 18)
3799                let hw1: u16 = (0xF100 | rd_bits) as u16;
3800                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3801                bytes.extend_from_slice(&hw1.to_le_bytes());
3802                bytes.extend_from_slice(&hw2.to_le_bytes());
3803
3804                // .done: (offset 22)
3805                // i64.clz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3806                // MOVS Rn, #0: 0010 0 Rn 00000000
3807                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3808                bytes.extend_from_slice(&mov0.to_le_bytes());
3809
3810                Ok(bytes)
3811            }
3812
3813            // I64Ctz: Count trailing zeros in 64-bit value
3814            // If lo != 0: result = CTZ(lo) = CLZ(RBIT(lo))
3815            // If lo == 0: result = 32 + CTZ(hi) = 32 + CLZ(RBIT(hi))
3816            //
3817            // Layout:
3818            // 0: CMP.W rnlo, #0 (4 bytes)
3819            // 4: BEQ .lo_zero (2 bytes) - branch to offset 18
3820            // 6: RBIT.W rd, rnlo (4 bytes)
3821            // 10: CLZ.W rd, rd (4 bytes)
3822            // 14: B .done (2 bytes) - branch to offset 30
3823            // 16: NOP (2 bytes) - padding
3824            // 18: .lo_zero: RBIT.W rd, rnhi (4 bytes)
3825            // 22: CLZ.W rd, rd (4 bytes)
3826            // 26: ADD.W rd, rd, #32 (4 bytes)
3827            // 30: .done
3828            ArmOp::I64Ctz { rd, rnlo, rnhi } => {
3829                let rd_bits = reg_to_bits(rd);
3830                let rn_lo_bits = reg_to_bits(rnlo);
3831                let rn_hi_bits = reg_to_bits(rnhi);
3832                let mut bytes = Vec::new();
3833
3834                // CMP.W rnlo, #0 (4 bytes at offset 0)
3835                let hw1: u16 = (0xF1B0 | rn_lo_bits) as u16;
3836                let hw2: u16 = 0x0F00;
3837                bytes.extend_from_slice(&hw1.to_le_bytes());
3838                bytes.extend_from_slice(&hw2.to_le_bytes());
3839
3840                // BEQ .lo_zero (2 bytes at offset 4)
3841                // PC = 4 + 4 = 8, target = 18, offset = 10, imm8 = 5
3842                let beq: u16 = 0xD005;
3843                bytes.extend_from_slice(&beq.to_le_bytes());
3844
3845                // RBIT.W rd, rnlo (4 bytes at offset 6)
3846                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3847                let hw1: u16 = (0xFA90 | rn_lo_bits) as u16;
3848                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_lo_bits) as u16;
3849                bytes.extend_from_slice(&hw1.to_le_bytes());
3850                bytes.extend_from_slice(&hw2.to_le_bytes());
3851
3852                // CLZ.W rd, rd (4 bytes at offset 10)
3853                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3854                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3855                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3856                bytes.extend_from_slice(&hw1.to_le_bytes());
3857                bytes.extend_from_slice(&hw2.to_le_bytes());
3858
3859                // B .done (2 bytes at offset 14)
3860                // PC = 14 + 4 = 18, target = 30, offset = 12, imm11 = 6
3861                let b_done: u16 = 0xE006;
3862                bytes.extend_from_slice(&b_done.to_le_bytes());
3863
3864                // NOP (2 bytes at offset 16) - padding
3865                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3866
3867                // .lo_zero: (offset 18)
3868                // RBIT.W rd, rnhi (4 bytes)
3869                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3870                let hw1: u16 = (0xFA90 | rn_hi_bits) as u16;
3871                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_hi_bits) as u16;
3872                bytes.extend_from_slice(&hw1.to_le_bytes());
3873                bytes.extend_from_slice(&hw2.to_le_bytes());
3874
3875                // CLZ.W rd, rd (4 bytes at offset 22)
3876                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3877                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3878                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3879                bytes.extend_from_slice(&hw1.to_le_bytes());
3880                bytes.extend_from_slice(&hw2.to_le_bytes());
3881
3882                // ADD.W rd, rd, #32 (4 bytes at offset 26)
3883                let hw1: u16 = (0xF100 | rd_bits) as u16;
3884                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3885                bytes.extend_from_slice(&hw1.to_le_bytes());
3886                bytes.extend_from_slice(&hw2.to_le_bytes());
3887
3888                // .done: (offset 30)
3889                // i64.ctz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3890                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3891                bytes.extend_from_slice(&mov0.to_le_bytes());
3892
3893                Ok(bytes)
3894            }
3895
3896            // I64Popcnt: Population count of 64-bit value
3897            // result = POPCNT(lo) + POPCNT(hi)
3898            // Using SIMD-style parallel bit counting algorithm
3899            ArmOp::I64Popcnt { rd, rnlo, rnhi } => {
3900                let rd_bits = reg_to_bits(rd);
3901                let rn_lo_bits = reg_to_bits(rnlo);
3902                let rn_hi_bits = reg_to_bits(rnhi);
3903                let r12: u32 = 12; // IP scratch
3904                let r3: u32 = 3; // Scratch for hi popcnt result
3905                let mut bytes = Vec::new();
3906
3907                // PUSH {R3, R4, R5} - save scratch registers
3908                bytes.extend_from_slice(&0xB438u16.to_le_bytes());
3909
3910                // Strategy: compute popcnt(lo) -> R4, popcnt(hi) -> R5, add them -> rd
3911                // Using lookup table approach for each byte would be too large
3912                // Using shift-and-add approach instead
3913
3914                // For simplicity and correctness, use the efficient parallel algorithm
3915                // but implement it as a series of inline operations
3916
3917                // MOV R4, rnlo
3918                let d_bit: u32 = 0; // R4 < 8, so high bit is 0
3919                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_lo_bits << 3) | (4 & 0x7)) as u16;
3920                bytes.extend_from_slice(&mov.to_le_bytes());
3921
3922                // MOV R5, rnhi
3923                let d_bit: u32 = 0; // R5 < 8, so high bit is 0
3924                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_hi_bits << 3) | (5 & 0x7)) as u16;
3925                bytes.extend_from_slice(&mov.to_le_bytes());
3926
3927                // --- POPCNT for R4 (lo word) ---
3928                // Step 1: x = x - ((x >> 1) & 0x55555555)
3929                // LSR.W R12, R4, #1
3930                let hw1: u16 = 0xEA4F;
3931                let hw2: u16 = ((r12 << 8) | 0x50 | 4) as u16;
3932                bytes.extend_from_slice(&hw1.to_le_bytes());
3933                bytes.extend_from_slice(&hw2.to_le_bytes());
3934
3935                // Load 0x55555555 into R3 using MOVW/MOVT
3936                // MOVW R3, #0x5555
3937                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3938                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3939                // MOVT R3, #0x5555
3940                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3941                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3942
3943                // AND.W R12, R12, R3
3944                let hw1: u16 = (0xEA00 | r12) as u16;
3945                let hw2: u16 = ((r12 << 8) | r3) as u16;
3946                bytes.extend_from_slice(&hw1.to_le_bytes());
3947                bytes.extend_from_slice(&hw2.to_le_bytes());
3948
3949                // SUB.W R4, R4, R12
3950                let hw1: u16 = (0xEBA0 | 4) as u16;
3951                let hw2: u16 = ((4 << 8) | r12) as u16;
3952                bytes.extend_from_slice(&hw1.to_le_bytes());
3953                bytes.extend_from_slice(&hw2.to_le_bytes());
3954
3955                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
3956                // Load 0x33333333 into R3
3957                // MOVW R3, #0x3333
3958                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3959                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3960                // MOVT R3, #0x3333
3961                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3962                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3963
3964                // AND.W R12, R4, R3
3965                let hw1: u16 = (0xEA00 | 4) as u16;
3966                let hw2: u16 = ((r12 << 8) | r3) as u16;
3967                bytes.extend_from_slice(&hw1.to_le_bytes());
3968                bytes.extend_from_slice(&hw2.to_le_bytes());
3969
3970                // LSR.W R4, R4, #2
3971                let hw1: u16 = 0xEA4F;
3972                let hw2: u16 = ((4 << 8) | 0x90 | 4) as u16;
3973                bytes.extend_from_slice(&hw1.to_le_bytes());
3974                bytes.extend_from_slice(&hw2.to_le_bytes());
3975
3976                // AND.W R4, R4, R3
3977                let hw1: u16 = (0xEA00 | 4) as u16;
3978                let hw2: u16 = ((4 << 8) | r3) as u16;
3979                bytes.extend_from_slice(&hw1.to_le_bytes());
3980                bytes.extend_from_slice(&hw2.to_le_bytes());
3981
3982                // ADD.W R4, R4, R12
3983                let hw1: u16 = (0xEB00 | 4) as u16;
3984                let hw2: u16 = ((4 << 8) | r12) as u16;
3985                bytes.extend_from_slice(&hw1.to_le_bytes());
3986                bytes.extend_from_slice(&hw2.to_le_bytes());
3987
3988                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
3989                // LSR.W R12, R4, #4
3990                // hw2 = (imm3 << 12) | (Rd << 8) | (imm2 << 6) | (type << 4) | Rm
3991                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3992                let hw1: u16 = 0xEA4F;
3993                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 4) as u16;
3994                bytes.extend_from_slice(&hw1.to_le_bytes());
3995                bytes.extend_from_slice(&hw2.to_le_bytes());
3996
3997                // ADD.W R4, R4, R12
3998                let hw1: u16 = (0xEB00 | 4) as u16;
3999                let hw2: u16 = ((4 << 8) | r12) as u16;
4000                bytes.extend_from_slice(&hw1.to_le_bytes());
4001                bytes.extend_from_slice(&hw2.to_le_bytes());
4002
4003                // Load 0x0F0F0F0F into R3
4004                // MOVW R3, #0x0F0F (imm4=0, i=1, imm3=7, imm8=0x0F)
4005                // hw1 = 11110 1 10 0100 0000 = 0xF640
4006                // hw2 = 0 111 0011 00001111 = 0x730F
4007                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
4008                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
4009                // MOVT R3, #0x0F0F
4010                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
4011                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
4012
4013                // AND.W R4, R4, R3
4014                let hw1: u16 = (0xEA00 | 4) as u16;
4015                let hw2: u16 = ((4 << 8) | r3) as u16;
4016                bytes.extend_from_slice(&hw1.to_le_bytes());
4017                bytes.extend_from_slice(&hw2.to_le_bytes());
4018
4019                // Step 4: x = x * 0x01010101 >> 24
4020                // Load 0x01010101 into R3
4021                // MOVW R3, #0x0101
4022                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
4023                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
4024                // MOVT R3, #0x0101
4025                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
4026                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
4027
4028                // MUL R4, R4, R3
4029                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
4030                let hw1: u16 = (0xFB00 | 4) as u16;
4031                let hw2: u16 = (0xF000 | (4 << 8) | r3) as u16;
4032                bytes.extend_from_slice(&hw1.to_le_bytes());
4033                bytes.extend_from_slice(&hw2.to_le_bytes());
4034
4035                // LSR.W R4, R4, #24
4036                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
4037                let hw1: u16 = 0xEA4F;
4038                let hw2: u16 = (0x6000 | (4 << 8) | 0x10 | 4) as u16;
4039                bytes.extend_from_slice(&hw1.to_le_bytes());
4040                bytes.extend_from_slice(&hw2.to_le_bytes());
4041
4042                // --- POPCNT for R5 (hi word) - same algorithm ---
4043                // Step 1
4044                let hw1: u16 = 0xEA4F;
4045                let hw2: u16 = ((r12 << 8) | 0x50 | 5) as u16;
4046                bytes.extend_from_slice(&hw1.to_le_bytes());
4047                bytes.extend_from_slice(&hw2.to_le_bytes());
4048
4049                // Load 0x55555555 into R3
4050                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
4051                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
4052                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
4053                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
4054
4055                let hw1: u16 = (0xEA00 | r12) as u16;
4056                let hw2: u16 = ((r12 << 8) | r3) as u16;
4057                bytes.extend_from_slice(&hw1.to_le_bytes());
4058                bytes.extend_from_slice(&hw2.to_le_bytes());
4059
4060                let hw1: u16 = (0xEBA0 | 5) as u16;
4061                let hw2: u16 = ((5 << 8) | r12) as u16;
4062                bytes.extend_from_slice(&hw1.to_le_bytes());
4063                bytes.extend_from_slice(&hw2.to_le_bytes());
4064
4065                // Step 2
4066                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
4067                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
4068                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
4069                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
4070
4071                let hw1: u16 = (0xEA00 | 5) as u16;
4072                let hw2: u16 = ((r12 << 8) | r3) as u16;
4073                bytes.extend_from_slice(&hw1.to_le_bytes());
4074                bytes.extend_from_slice(&hw2.to_le_bytes());
4075
4076                let hw1: u16 = 0xEA4F;
4077                let hw2: u16 = ((5 << 8) | 0x90 | 5) as u16;
4078                bytes.extend_from_slice(&hw1.to_le_bytes());
4079                bytes.extend_from_slice(&hw2.to_le_bytes());
4080
4081                let hw1: u16 = (0xEA00 | 5) as u16;
4082                let hw2: u16 = ((5 << 8) | r3) as u16;
4083                bytes.extend_from_slice(&hw1.to_le_bytes());
4084                bytes.extend_from_slice(&hw2.to_le_bytes());
4085
4086                let hw1: u16 = (0xEB00 | 5) as u16;
4087                let hw2: u16 = ((5 << 8) | r12) as u16;
4088                bytes.extend_from_slice(&hw1.to_le_bytes());
4089                bytes.extend_from_slice(&hw2.to_le_bytes());
4090
4091                // Step 3: LSR.W R12, R5, #4
4092                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
4093                let hw1: u16 = 0xEA4F;
4094                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 5) as u16;
4095                bytes.extend_from_slice(&hw1.to_le_bytes());
4096                bytes.extend_from_slice(&hw2.to_le_bytes());
4097
4098                let hw1: u16 = (0xEB00 | 5) as u16;
4099                let hw2: u16 = ((5 << 8) | r12) as u16;
4100                bytes.extend_from_slice(&hw1.to_le_bytes());
4101                bytes.extend_from_slice(&hw2.to_le_bytes());
4102
4103                // Load 0x0F0F0F0F into R3 (for hi-word)
4104                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
4105                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
4106                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
4107                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
4108
4109                let hw1: u16 = (0xEA00 | 5) as u16;
4110                let hw2: u16 = ((5 << 8) | r3) as u16;
4111                bytes.extend_from_slice(&hw1.to_le_bytes());
4112                bytes.extend_from_slice(&hw2.to_le_bytes());
4113
4114                // Step 4
4115                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
4116                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
4117                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
4118                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
4119
4120                // MUL R5, R5, R3
4121                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
4122                let hw1: u16 = (0xFB00 | 5) as u16;
4123                let hw2: u16 = (0xF000 | (5 << 8) | r3) as u16;
4124                bytes.extend_from_slice(&hw1.to_le_bytes());
4125                bytes.extend_from_slice(&hw2.to_le_bytes());
4126
4127                // LSR.W R5, R5, #24
4128                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
4129                let hw1: u16 = 0xEA4F;
4130                let hw2: u16 = (0x6000 | (5 << 8) | 0x10 | 5) as u16;
4131                bytes.extend_from_slice(&hw1.to_le_bytes());
4132                bytes.extend_from_slice(&hw2.to_le_bytes());
4133
4134                // ADD rd, R4, R5 (combine lo and hi counts)
4135                // ADDS Rd, Rn, Rm (T1): 0001 100 Rm Rn Rd = 0x1800 | (Rm<<6) | (Rn<<3) | Rd
4136                let rd_bits_u16 = rd_bits as u16;
4137                let instr: u16 = 0x1800 | (5 << 6) | (4 << 3) | rd_bits_u16;
4138                bytes.extend_from_slice(&instr.to_le_bytes());
4139
4140                // POP {R3, R4, R5}
4141                bytes.extend_from_slice(&0xBC38u16.to_le_bytes());
4142
4143                // i64.popcnt returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
4144                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
4145                bytes.extend_from_slice(&mov0.to_le_bytes());
4146
4147                Ok(bytes)
4148            }
4149
4150            // I64Extend8S: Sign-extend low 8 bits to 64 bits
4151            // Result: rdlo = sign_extend_8(rnlo), rdhi = rdlo >> 31
4152            ArmOp::I64Extend8S { rdlo, rdhi, rnlo } => {
4153                let rdlo_bits = reg_to_bits(rdlo);
4154                let rdhi_bits = reg_to_bits(rdhi);
4155                let rnlo_bits = reg_to_bits(rnlo);
4156                let mut bytes = Vec::new();
4157
4158                // SXTB.W rdlo, rnlo (sign-extend byte to 32-bit)
4159                // SXTB T2: hw1 = 0xFA4F, hw2 = 0xF0<Rd><Rm>
4160                let hw1: u16 = 0xFA4F_u16;
4161                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
4162                bytes.extend_from_slice(&hw1.to_le_bytes());
4163                bytes.extend_from_slice(&hw2.to_le_bytes());
4164
4165                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
4166                // ASR (immediate): hw1 = 0xEA4F, hw2 = imm3:Rd:imm2:type:Rm
4167                // For imm5=31: imm3=111, imm2=11, type=10 (ASR)
4168                // hw2 = (7 << 12) | (rdhi << 8) | (3 << 6) | (2 << 4) | rdlo
4169                let hw1: u16 = 0xEA4F;
4170                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
4171                bytes.extend_from_slice(&hw1.to_le_bytes());
4172                bytes.extend_from_slice(&hw2.to_le_bytes());
4173
4174                Ok(bytes)
4175            }
4176
4177            // I64Extend16S: Sign-extend low 16 bits to 64 bits
4178            // Result: rdlo = sign_extend_16(rnlo), rdhi = rdlo >> 31
4179            ArmOp::I64Extend16S { rdlo, rdhi, rnlo } => {
4180                let rdlo_bits = reg_to_bits(rdlo);
4181                let rdhi_bits = reg_to_bits(rdhi);
4182                let rnlo_bits = reg_to_bits(rnlo);
4183                let mut bytes = Vec::new();
4184
4185                // SXTH.W rdlo, rnlo (sign-extend halfword to 32-bit)
4186                // SXTH T2: hw1 = 0xFA0F, hw2 = 0xF0<Rd><Rm>
4187                let hw1: u16 = 0xFA0F_u16;
4188                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
4189                bytes.extend_from_slice(&hw1.to_le_bytes());
4190                bytes.extend_from_slice(&hw2.to_le_bytes());
4191
4192                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
4193                let hw1: u16 = 0xEA4F;
4194                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
4195                bytes.extend_from_slice(&hw1.to_le_bytes());
4196                bytes.extend_from_slice(&hw2.to_le_bytes());
4197
4198                Ok(bytes)
4199            }
4200
4201            // I64Extend32S: Sign-extend low 32 bits to 64 bits
4202            // Result: rdlo = rnlo, rdhi = rnlo >> 31
4203            ArmOp::I64Extend32S { rdlo, rdhi, rnlo } => {
4204                let rdlo_bits = reg_to_bits(rdlo);
4205                let rdhi_bits = reg_to_bits(rdhi);
4206                let rnlo_bits = reg_to_bits(rnlo);
4207                let mut bytes = Vec::new();
4208
4209                // MOV rdlo, rnlo (if different)
4210                if rdlo_bits != rnlo_bits {
4211                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4212                    let d_bit = ((rdlo_bits >> 3) & 1) as u16;
4213                    let mov: u16 = 0x4600
4214                        | (d_bit << 7)
4215                        | ((rnlo_bits as u16) << 3)
4216                        | ((rdlo_bits & 0x7) as u16);
4217                    bytes.extend_from_slice(&mov.to_le_bytes());
4218                }
4219
4220                // ASR.W rdhi, rnlo, #31 (sign-extend to high word)
4221                let hw1: u16 = 0xEA4F;
4222                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rnlo_bits) as u16;
4223                bytes.extend_from_slice(&hw1.to_le_bytes());
4224                bytes.extend_from_slice(&hw2.to_le_bytes());
4225
4226                Ok(bytes)
4227            }
4228
4229            // SelectMove: IT <cond>; MOV{cond} rd, rm
4230            // Conditional move: only execute MOV if condition is true
4231            ArmOp::SelectMove { rd, rm, cond } => {
4232                let rd_bits = reg_to_bits(rd) as u16;
4233                let rm_bits = reg_to_bits(rm) as u16;
4234
4235                // Condition code encoding for IT block
4236                use synth_synthesis::Condition;
4237                let cond_bits: u16 = match cond {
4238                    Condition::EQ => 0x0, // Equal
4239                    Condition::NE => 0x1, // Not equal
4240                    Condition::HS => 0x2, // Higher or same (unsigned >=)
4241                    Condition::LO => 0x3, // Lower (unsigned <)
4242                    Condition::HI => 0x8, // Higher (unsigned >)
4243                    Condition::LS => 0x9, // Lower or same (unsigned <=)
4244                    Condition::GE => 0xA, // Greater or equal (signed)
4245                    Condition::LT => 0xB, // Less than (signed)
4246                    Condition::GT => 0xC, // Greater than (signed)
4247                    Condition::LE => 0xD, // Less or equal (signed)
4248                };
4249
4250                // IT <cond>: single Then block (mask = 0x8 for T only)
4251                // IT instruction: 1011 1111 firstcond mask
4252                let it_instr: u16 = 0xBF00 | (cond_bits << 4) | 0x8;
4253
4254                // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4255                // This MOV will only execute if condition is true due to IT block
4256                let d_bit = (rd_bits >> 3) & 1;
4257                let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4258
4259                // Emit: IT <cond>, MOV rd, rm
4260                let mut bytes = it_instr.to_le_bytes().to_vec();
4261                bytes.extend_from_slice(&mov_instr.to_le_bytes());
4262                Ok(bytes)
4263            }
4264
4265            // Popcnt: Population count (count set bits)
4266            // ARM Cortex-M has no native POPCNT, so we implement the bit manipulation algorithm:
4267            // x = x - ((x >> 1) & 0x55555555);
4268            // x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
4269            // x = (x + (x >> 4)) & 0x0F0F0F0F;
4270            // x = x + (x >> 8);
4271            // x = x + (x >> 16);
4272            // return x & 0x3F;
4273            //
4274            // Uses rd as working register and R12 as scratch for constants
4275            ArmOp::Popcnt { rd, rm } => {
4276                let mut bytes = Vec::new();
4277
4278                // First, move rm to rd if they're different
4279                if rd != rm {
4280                    let rd_bits = reg_to_bits(rd) as u16;
4281                    let rm_bits = reg_to_bits(rm) as u16;
4282                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4283                    let d_bit = (rd_bits >> 3) & 1;
4284                    let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4285                    bytes.extend_from_slice(&mov_instr.to_le_bytes());
4286                }
4287
4288                // Step 1: x = x - ((x >> 1) & 0x55555555)
4289                // Load 0x55555555 into R12
4290                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x5555)?);
4291                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x5555)?);
4292
4293                // R12_temp = rd >> 1
4294                // We need a second scratch register. Use R11.
4295                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 1)?);
4296
4297                // R11 = R11 & R12 (R11 = (x >> 1) & 0x55555555)
4298                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(11, 11, 12)?);
4299
4300                // rd = rd - R11
4301                bytes.extend_from_slice(&self.encode_thumb32_sub_reg_raw(
4302                    reg_to_bits(rd),
4303                    reg_to_bits(rd),
4304                    11,
4305                )?);
4306
4307                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
4308                // Load 0x33333333 into R12
4309                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x3333)?);
4310                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x3333)?);
4311
4312                // R11 = rd & R12
4313                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4314                    11,
4315                    reg_to_bits(rd),
4316                    12,
4317                )?);
4318
4319                // rd = rd >> 2
4320                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(
4321                    reg_to_bits(rd),
4322                    reg_to_bits(rd),
4323                    2,
4324                )?);
4325
4326                // rd = rd & R12
4327                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4328                    reg_to_bits(rd),
4329                    reg_to_bits(rd),
4330                    12,
4331                )?);
4332
4333                // rd = rd + R11
4334                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4335                    reg_to_bits(rd),
4336                    reg_to_bits(rd),
4337                    11,
4338                )?);
4339
4340                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
4341                // R11 = rd >> 4
4342                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 4)?);
4343
4344                // rd = rd + R11
4345                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4346                    reg_to_bits(rd),
4347                    reg_to_bits(rd),
4348                    11,
4349                )?);
4350
4351                // Load 0x0F0F0F0F into R12
4352                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x0F0F)?);
4353                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x0F0F)?);
4354
4355                // rd = rd & R12
4356                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4357                    reg_to_bits(rd),
4358                    reg_to_bits(rd),
4359                    12,
4360                )?);
4361
4362                // Step 4: x = x + (x >> 8)
4363                // R11 = rd >> 8
4364                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 8)?);
4365
4366                // rd = rd + R11
4367                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4368                    reg_to_bits(rd),
4369                    reg_to_bits(rd),
4370                    11,
4371                )?);
4372
4373                // Step 5: x = x + (x >> 16)
4374                // R11 = rd >> 16
4375                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 16)?);
4376
4377                // rd = rd + R11
4378                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4379                    reg_to_bits(rd),
4380                    reg_to_bits(rd),
4381                    11,
4382                )?);
4383
4384                // Step 6: return x & 0x3F
4385                // AND with 0x3F (small immediate, can use BIC or AND with immediate)
4386                bytes.extend_from_slice(&self.encode_thumb32_and_imm_raw(
4387                    reg_to_bits(rd),
4388                    reg_to_bits(rd),
4389                    0x3F,
4390                )?);
4391
4392                Ok(bytes)
4393            }
4394
4395            // I64DivU: 64-bit unsigned division using binary long division
4396            // Input: R0:R1 = dividend, R2:R3 = divisor
4397            // Output: R0:R1 = quotient
4398            // Uses: R4-R7, R12 as loop counter (avoid R8 for Renode compatibility)
4399            ArmOp::I64DivU {
4400                rdlo: _,
4401                rdhi: _,
4402                rnlo: _,
4403                rnhi: _,
4404                rmlo: _,
4405                rmhi: _,
4406            } => {
4407                let mut bytes = Vec::new();
4408
4409                // PUSH {R4-R7} - save scratch registers (NO LR — this is inline code)
4410                // 16-bit PUSH: 1011 010 M rrrrrrrr where M=0 (no LR), r=R4-R7 = 0xF0
4411                // Encoding: 1011 0100 1111 0000 = 0xB4F0
4412                bytes.extend_from_slice(&0xB4F0u16.to_le_bytes());
4413
4414                // Initialize quotient (R4:R5) = 0
4415                bytes.extend_from_slice(&0x2400u16.to_le_bytes()); // MOV R4, #0
4416                bytes.extend_from_slice(&0x2500u16.to_le_bytes()); // MOV R5, #0
4417
4418                // Initialize remainder (R6:R7) = 0
4419                bytes.extend_from_slice(&0x2600u16.to_le_bytes()); // MOV R6, #0
4420                bytes.extend_from_slice(&0x2700u16.to_le_bytes()); // MOV R7, #0
4421
4422                // Initialize loop counter R12 = 64 (use R12 scratch instead of R8)
4423                // MOV.W R12, #64: F04F 0C40
4424                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4425                bytes.extend_from_slice(&0x0C40u16.to_le_bytes());
4426
4427                // Loop start
4428                let loop_start = bytes.len();
4429
4430                // === Loop body: process one bit ===
4431
4432                // 1. Shift quotient R4:R5 left by 1
4433                // LSLS R5, R5, #1 (16-bit: 0000 0010 1010 1101 = 0x006D -> actually 0x002D for LSL R5,R5,#1)
4434                // LSL Rd, Rm, #imm5: 000 00 imm5 Rm Rd = 000 00 00001 101 101 = 0x006D
4435                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4436                // Get carry from R4 into R5: ORR R5, R5, R4 LSR #31
4437                // Thumb-2 ORR with shifted register: EA45 75D4 = ORR.W R5, R5, R4, LSR #31
4438                // 11101010 010 S Rn | 0 imm3 Rd imm2 type Rm
4439                // type=01 (LSR), imm5=31 (imm3=111, imm2=11)
4440                bytes.extend_from_slice(&0xEA45u16.to_le_bytes());
4441                bytes.extend_from_slice(&0x75D4u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4442                // LSLS R4, R4, #1: 000 00 00001 100 100 = 0x0064
4443                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4444
4445                // 2. Shift remainder R6:R7 left by 1, OR in MSB of dividend R1
4446                // LSLS R7, R7, #1
4447                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4448                // ORR.W R7, R7, R6, LSR #31
4449                bytes.extend_from_slice(&0xEA47u16.to_le_bytes());
4450                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4451                // LSLS R6, R6, #1
4452                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4453                // ORR.W R6, R6, R1, LSR #31 (bring in MSB of dividend high)
4454                bytes.extend_from_slice(&0xEA46u16.to_le_bytes());
4455                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4456
4457                // 3. Shift dividend R0:R1 left by 1
4458                // LSLS R1, R1, #1
4459                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4460                // ORR.W R1, R1, R0, LSR #31
4461                bytes.extend_from_slice(&0xEA41u16.to_le_bytes());
4462                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4463                // LSLS R0, R0, #1
4464                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4465
4466                // 4. Compare remainder >= divisor (64-bit unsigned comparison)
4467                // Compare high words first: CMP R7, R3
4468                // CMP Rn, Rm encoding: 0x4280 | (Rm << 3) | Rn
4469                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3 (16-bit)
4470                // BHI means R7 > R3 (unsigned) - definitely subtract
4471                // BLO means R7 < R3 - definitely don't subtract
4472                // BEQ means need to check low words
4473
4474                // If high > divisor high: branch to subtract (forward +offset)
4475                // BHI.N +6 (skip CMP, skip BLO, do subtract)
4476                // BHI: 1101 1000 offset8 where cond=1000 (HI)
4477                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4 (to subtract block)
4478
4479                // If high < divisor high: branch past subtract
4480                // BLO.N +10 (skip to decrement)
4481                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BLO/BCC +12 (past subtract)
4482
4483                // High words equal, compare low: CMP R6, R2
4484                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2 (16-bit)
4485                // BLO/BCC past subtract (skip SUBS+SBC.W+ORR.W = 10 bytes = 4 halfwords from PC+4)
4486                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords (past subtract)
4487
4488                // === Subtract block: remainder -= divisor, quotient |= 1 ===
4489                // SUBS R6, R6, R2
4490                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2 (16-bit)
4491                // SBC R7, R7, R3 (with borrow)
4492                // Thumb-2 SBC.W: EB67 0703 = SBC.W R7, R7, R3
4493                bytes.extend_from_slice(&0xEB67u16.to_le_bytes());
4494                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4495                // ORR R4, R4, #1 (set bit 0 of quotient low)
4496                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4497                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4498
4499                // === Decrement counter and loop ===
4500                // SUBS.W R12, R12, #1 (decrement loop counter)
4501                // SUBS.W R12, R12, #1: F1BC 0C01
4502                bytes.extend_from_slice(&0xF1BCu16.to_le_bytes());
4503                bytes.extend_from_slice(&0x0C01u16.to_le_bytes());
4504
4505                // BNE back to loop_start
4506                let branch_offset_bytes = bytes.len() - loop_start + 4; // +4 for pipeline
4507                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4508                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4509                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4510
4511                // === Loop done, move quotient to R0:R1 ===
4512                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4513                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4514
4515                // POP {R4-R7} - restore scratch registers (NO PC — inline code continues)
4516                // 16-bit POP: 1011 110 P rrrrrrrr where P=0 (no PC), r=R4-R7 = 0xF0
4517                // Encoding: 1011 1100 1111 0000 = 0xBCF0
4518                bytes.extend_from_slice(&0xBCF0u16.to_le_bytes());
4519
4520                Ok(bytes)
4521            }
4522
4523            // I64DivS: 64-bit signed division
4524            // Converts to unsigned, divides, then applies sign
4525            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4526            // Output: R0:R1 = quotient (signed)
4527            ArmOp::I64DivS {
4528                rdlo: _,
4529                rdhi: _,
4530                rnlo: _,
4531                rnhi: _,
4532                rmlo: _,
4533                rmhi: _,
4534            } => {
4535                let mut bytes = Vec::new();
4536
4537                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4538                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4539                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4540
4541                // Save result sign in R9: R9 = R1 XOR R3 (sign bit = MSB)
4542                // EOR.W R9, R1, R3
4543                bytes.extend_from_slice(&0xEA81u16.to_le_bytes());
4544                bytes.extend_from_slice(&0x0903u16.to_le_bytes());
4545
4546                // If dividend negative (R1 MSB set), negate it
4547                // TST R1, R1 (check sign)
4548                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4549                // BPL skip_neg_dividend (+10 bytes = 5 halfwords)
4550                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4551
4552                // Negate R0:R1 (64-bit): RSBS R0, R0, #0; SBC R1, R1, R1 LSL #1
4553                // Actually: MVN R0, R0; MVN R1, R1; ADDS R0, R0, #1; ADC R1, R1, #0
4554                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4555                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4556                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4557                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4558                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4559
4560                // If divisor negative (R3 MSB set), negate it
4561                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4562                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4563
4564                // Negate R2:R3
4565                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4566                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4567                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4568                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4569                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4570
4571                // === Now do unsigned division (same as I64DivU) ===
4572                // Initialize quotient (R4:R5) = 0
4573                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4574                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4575                // Initialize remainder (R6:R7) = 0
4576                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4577                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4578                // Initialize loop counter R8 = 64
4579                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4580                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4581
4582                let loop_start = bytes.len();
4583
4584                // Shift quotient left
4585                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4586                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4587                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4588                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4589
4590                // Shift remainder left, OR in MSB of dividend
4591                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4592                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4593                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4594                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4595                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4596                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4597
4598                // Shift dividend left
4599                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4600                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4601                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4602                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4603
4604                // Compare and conditionally subtract
4605                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4606                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4607                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4608                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4609                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4610
4611                // Subtract and set quotient bit
4612                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4613                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4614                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4615                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4616                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4617
4618                // Decrement and loop
4619                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4620                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4621
4622                let branch_offset_bytes = bytes.len() - loop_start + 4;
4623                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4624                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4625                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4626
4627                // Move quotient to R0:R1
4628                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4629                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4630
4631                // If result should be negative (R9 MSB set), negate R0:R1
4632                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9 (check MSB)
4633                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4634                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8 (skip negation)
4635
4636                // Negate result R0:R1
4637                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4638                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4639                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4640                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4641                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4642
4643                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4644                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4645                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4646
4647                Ok(bytes)
4648            }
4649
4650            // I64RemU: 64-bit unsigned remainder using binary long division
4651            // Same algorithm as I64DivU but returns remainder instead of quotient
4652            // Input: R0:R1 = dividend, R2:R3 = divisor
4653            // Output: R0:R1 = remainder
4654            ArmOp::I64RemU {
4655                rdlo: _,
4656                rdhi: _,
4657                rnlo: _,
4658                rnhi: _,
4659                rmlo: _,
4660                rmhi: _,
4661            } => {
4662                let mut bytes = Vec::new();
4663
4664                // PUSH {R4-R8} - save scratch registers (NO LR — inline code)
4665                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4666                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4667
4668                // Initialize quotient (R4:R5) = 0 (computed but not returned)
4669                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4670                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4671                // Initialize remainder (R6:R7) = 0
4672                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4673                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4674                // Initialize loop counter R8 = 64
4675                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4676                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4677
4678                let loop_start = bytes.len();
4679
4680                // Shift quotient left (not needed for result, but keeps algorithm same)
4681                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4682                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4683                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4684                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4685
4686                // Shift remainder left, OR in MSB of dividend
4687                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4688                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4689                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4690                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4691                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4692                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4693
4694                // Shift dividend left
4695                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4696                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4697                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4698                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4699
4700                // Compare and conditionally subtract
4701                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4702                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4703                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4704                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4705                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4706
4707                // Subtract and set quotient bit
4708                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4709                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4710                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4711                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4712                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4713
4714                // Decrement and loop
4715                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4716                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4717
4718                let branch_offset_bytes = bytes.len() - loop_start + 4;
4719                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4720                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4721                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4722
4723                // Move REMAINDER to R0:R1 (difference from I64DivU)
4724                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4725                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4726
4727                // POP {R4-R8} - restore scratch registers (NO PC — inline code continues)
4728                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4729                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4730
4731                Ok(bytes)
4732            }
4733
4734            // I64RemS: 64-bit signed remainder
4735            // Remainder sign follows dividend sign (not quotient rule)
4736            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4737            // Output: R0:R1 = remainder (signed, same sign as dividend)
4738            ArmOp::I64RemS {
4739                rdlo: _,
4740                rdhi: _,
4741                rnlo: _,
4742                rnhi: _,
4743                rmlo: _,
4744                rmhi: _,
4745            } => {
4746                let mut bytes = Vec::new();
4747
4748                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4749                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4750                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4751
4752                // Save dividend sign in R9 (remainder sign = dividend sign)
4753                // MOV R9, R1 (just need the sign bit)
4754                bytes.extend_from_slice(&0x4689u16.to_le_bytes()); // MOV R9, R1
4755
4756                // If dividend negative (R1 MSB set), negate it
4757                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4758                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4759
4760                // Negate R0:R1
4761                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4762                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4763                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4764                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4765                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4766
4767                // If divisor negative (R3 MSB set), negate it
4768                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4769                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4770
4771                // Negate R2:R3
4772                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4773                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4774                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4775                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4776                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4777
4778                // === Unsigned division algorithm ===
4779                // Initialize quotient (R4:R5) = 0
4780                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4781                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4782                // Initialize remainder (R6:R7) = 0
4783                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4784                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4785                // Initialize loop counter R8 = 64
4786                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4787                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4788
4789                let loop_start = bytes.len();
4790
4791                // Shift quotient left
4792                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4793                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4794                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4795                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4796
4797                // Shift remainder left, OR in MSB of dividend
4798                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4799                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4800                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4801                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4802                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4803                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4804
4805                // Shift dividend left
4806                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4807                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4808                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4809                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4810
4811                // Compare and conditionally subtract
4812                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4813                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4814                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4815                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4816                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4817
4818                // Subtract and set quotient bit
4819                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4820                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4821                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4822                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4823                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4824
4825                // Decrement and loop
4826                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4827                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4828
4829                let branch_offset_bytes = bytes.len() - loop_start + 4;
4830                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4831                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4832                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4833
4834                // Move remainder to R0:R1
4835                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4836                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4837
4838                // If original dividend was negative (R9 MSB set), negate remainder
4839                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9
4840                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4841                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4842
4843                // Negate result R0:R1
4844                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4845                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4846                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4847                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4848                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4849
4850                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4851                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4852                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4853
4854                Ok(bytes)
4855            }
4856
4857            // === F32 VFP single-precision Thumb-2 encodings ===
4858            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4859            ArmOp::F32Add { sd, sn, sm } => {
4860                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A00, sd, sn, sm)?))
4861            }
4862            ArmOp::F32Sub { sd, sn, sm } => {
4863                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A40, sd, sn, sm)?))
4864            }
4865            ArmOp::F32Mul { sd, sn, sm } => {
4866                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE200A00, sd, sn, sm)?))
4867            }
4868            ArmOp::F32Div { sd, sn, sm } => {
4869                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE800A00, sd, sn, sm)?))
4870            }
4871            ArmOp::F32Abs { sd, sm } => {
4872                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB00AC0, sd, sm)?))
4873            }
4874            ArmOp::F32Neg { sd, sm } => {
4875                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10A40, sd, sm)?))
4876            }
4877            ArmOp::F32Sqrt { sd, sm } => {
4878                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10AC0, sd, sm)?))
4879            }
4880
4881            // f32 pseudo-ops — multi-instruction sequences
4882            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4883            ArmOp::F32Ceil { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b01),
4884            ArmOp::F32Floor { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b10),
4885            ArmOp::F32Trunc { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b11),
4886            ArmOp::F32Nearest { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b00),
4887            ArmOp::F32Min { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, true),
4888            ArmOp::F32Max { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, false),
4889            ArmOp::F32Copysign { sd, sn, sm } => self.encode_thumb_f32_copysign(sd, sn, sm),
4890
4891            // f32 comparisons — VCMP + VMRS + MOV #0 + IT + MOV #1
4892            ArmOp::F32Eq { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x0),
4893            ArmOp::F32Ne { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x1),
4894            ArmOp::F32Lt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x4),
4895            ArmOp::F32Le { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x9),
4896            ArmOp::F32Gt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xC),
4897            ArmOp::F32Ge { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xA),
4898
4899            ArmOp::F32Const { sd, value } => self.encode_thumb_f32_const(sd, *value),
4900
4901            ArmOp::F32Load { sd, addr } => {
4902                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED900A00, sd, addr)?))
4903            }
4904            ArmOp::F32Store { sd, addr } => {
4905                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED800A00, sd, addr)?))
4906            }
4907
4908            ArmOp::F32ConvertI32S { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, true),
4909            ArmOp::F32ConvertI32U { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, false),
4910            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
4911                Err(synth_core::Error::synthesis(
4912                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4913                ))
4914            }
4915            ArmOp::F32ReinterpretI32 { sd, rm } => {
4916                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(true, sd, rm)?))
4917            }
4918            ArmOp::I32ReinterpretF32 { rd, sm } => {
4919                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(false, sm, rd)?))
4920            }
4921            ArmOp::I32TruncF32S { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, true),
4922            ArmOp::I32TruncF32U { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, false),
4923
4924            // === F64 VFP double-precision Thumb-2 encodings ===
4925            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4926            ArmOp::F64Add { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4927                0xEE300B00, dd, dn, dm,
4928            )?)),
4929            ArmOp::F64Sub { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4930                0xEE300B40, dd, dn, dm,
4931            )?)),
4932            ArmOp::F64Mul { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4933                0xEE200B00, dd, dn, dm,
4934            )?)),
4935            ArmOp::F64Div { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4936                0xEE800B00, dd, dn, dm,
4937            )?)),
4938            ArmOp::F64Abs { dd, dm } => {
4939                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?))
4940            }
4941            ArmOp::F64Neg { dd, dm } => {
4942                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?))
4943            }
4944            ArmOp::F64Sqrt { dd, dm } => {
4945                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?))
4946            }
4947
4948            // f64 pseudo-ops
4949            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4950            ArmOp::F64Ceil { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b01),
4951            ArmOp::F64Floor { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b10),
4952            ArmOp::F64Trunc { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b11),
4953            ArmOp::F64Nearest { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b00),
4954            ArmOp::F64Min { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, true),
4955            ArmOp::F64Max { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, false),
4956            ArmOp::F64Copysign { dd, dn, dm } => self.encode_thumb_f64_copysign(dd, dn, dm),
4957
4958            // f64 comparisons
4959            ArmOp::F64Eq { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x0),
4960            ArmOp::F64Ne { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x1),
4961            ArmOp::F64Lt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x4),
4962            ArmOp::F64Le { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x9),
4963            ArmOp::F64Gt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xC),
4964            ArmOp::F64Ge { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xA),
4965
4966            ArmOp::F64Const { dd, value } => self.encode_thumb_f64_const(dd, *value),
4967
4968            ArmOp::F64Load { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4969                0xED900B00, dd, addr,
4970            )?)),
4971            ArmOp::F64Store { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4972                0xED800B00, dd, addr,
4973            )?)),
4974
4975            ArmOp::F64ConvertI32S { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, true),
4976            ArmOp::F64ConvertI32U { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, false),
4977            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
4978                Err(synth_core::Error::synthesis(
4979                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4980                ))
4981            }
4982            ArmOp::F64PromoteF32 { dd, sm } => self.encode_thumb_f64_promote_f32(dd, sm),
4983            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => Ok(vfp_to_thumb_bytes(
4984                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?,
4985            )),
4986            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => Ok(vfp_to_thumb_bytes(
4987                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?,
4988            )),
4989            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
4990                Err(synth_core::Error::synthesis(
4991                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
4992                ))
4993            }
4994            ArmOp::I32TruncF64S { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, true),
4995            ArmOp::I32TruncF64U { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, false),
4996
4997            // ===== i64 operations: encode as multi-instruction Thumb-2 sequences =====
4998
4999            // I64Add: ADDS rdlo, rnlo, rmlo; ADC.W rdhi, rnhi, rmhi
5000            ArmOp::I64Add {
5001                rdlo,
5002                rdhi,
5003                rnlo,
5004                rnhi,
5005                rmlo,
5006                rmhi,
5007            } => {
5008                let mut bytes = Vec::new();
5009                // ADDS rdlo, rnlo, rmlo (16-bit)
5010                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adds {
5011                    rd: *rdlo,
5012                    rn: *rnlo,
5013                    op2: Operand2::Reg(*rmlo),
5014                })?);
5015                // ADC.W rdhi, rnhi, rmhi (32-bit)
5016                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adc {
5017                    rd: *rdhi,
5018                    rn: *rnhi,
5019                    op2: Operand2::Reg(*rmhi),
5020                })?);
5021                Ok(bytes)
5022            }
5023
5024            // I64Sub: SUBS rdlo, rnlo, rmlo; SBC.W rdhi, rnhi, rmhi
5025            ArmOp::I64Sub {
5026                rdlo,
5027                rdhi,
5028                rnlo,
5029                rnhi,
5030                rmlo,
5031                rmhi,
5032            } => {
5033                let mut bytes = Vec::new();
5034                // SUBS rdlo, rnlo, rmlo (16-bit)
5035                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Subs {
5036                    rd: *rdlo,
5037                    rn: *rnlo,
5038                    op2: Operand2::Reg(*rmlo),
5039                })?);
5040                // SBC.W rdhi, rnhi, rmhi (32-bit)
5041                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Sbc {
5042                    rd: *rdhi,
5043                    rn: *rnhi,
5044                    op2: Operand2::Reg(*rmhi),
5045                })?);
5046                Ok(bytes)
5047            }
5048
5049            // I64And: AND rdlo, rnlo, rmlo; AND rdhi, rnhi, rmhi
5050            ArmOp::I64And {
5051                rdlo,
5052                rdhi,
5053                rnlo,
5054                rnhi,
5055                rmlo,
5056                rmhi,
5057            } => {
5058                let mut bytes = Vec::new();
5059                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
5060                    rd: *rdlo,
5061                    rn: *rnlo,
5062                    op2: Operand2::Reg(*rmlo),
5063                })?);
5064                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
5065                    rd: *rdhi,
5066                    rn: *rnhi,
5067                    op2: Operand2::Reg(*rmhi),
5068                })?);
5069                Ok(bytes)
5070            }
5071
5072            // I64Or: ORR rdlo, rnlo, rmlo; ORR rdhi, rnhi, rmhi
5073            ArmOp::I64Or {
5074                rdlo,
5075                rdhi,
5076                rnlo,
5077                rnhi,
5078                rmlo,
5079                rmhi,
5080            } => {
5081                let mut bytes = Vec::new();
5082                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
5083                    rd: *rdlo,
5084                    rn: *rnlo,
5085                    op2: Operand2::Reg(*rmlo),
5086                })?);
5087                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
5088                    rd: *rdhi,
5089                    rn: *rnhi,
5090                    op2: Operand2::Reg(*rmhi),
5091                })?);
5092                Ok(bytes)
5093            }
5094
5095            // I64Xor: EOR rdlo, rnlo, rmlo; EOR rdhi, rnhi, rmhi
5096            ArmOp::I64Xor {
5097                rdlo,
5098                rdhi,
5099                rnlo,
5100                rnhi,
5101                rmlo,
5102                rmhi,
5103            } => {
5104                let mut bytes = Vec::new();
5105                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
5106                    rd: *rdlo,
5107                    rn: *rnlo,
5108                    op2: Operand2::Reg(*rmlo),
5109                })?);
5110                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
5111                    rd: *rdhi,
5112                    rn: *rnhi,
5113                    op2: Operand2::Reg(*rmhi),
5114                })?);
5115                Ok(bytes)
5116            }
5117
5118            // I64Eqz: ORR scratch, lo, hi; ITE EQ; MOV rd, #1; MOV rd, #0
5119            ArmOp::I64Eqz { rd, rnlo, rnhi } => self.encode_thumb(&ArmOp::I64SetCondZ {
5120                rd: *rd,
5121                rn_lo: *rnlo,
5122                rn_hi: *rnhi,
5123            }),
5124
5125            // I64 comparisons: delegate to I64SetCond
5126            ArmOp::I64Eq {
5127                rd,
5128                rnlo,
5129                rnhi,
5130                rmlo,
5131                rmhi,
5132            } => self.encode_thumb(&ArmOp::I64SetCond {
5133                rd: *rd,
5134                rn_lo: *rnlo,
5135                rn_hi: *rnhi,
5136                rm_lo: *rmlo,
5137                rm_hi: *rmhi,
5138                cond: synth_synthesis::Condition::EQ,
5139            }),
5140
5141            ArmOp::I64Ne {
5142                rd,
5143                rnlo,
5144                rnhi,
5145                rmlo,
5146                rmhi,
5147            } => self.encode_thumb(&ArmOp::I64SetCond {
5148                rd: *rd,
5149                rn_lo: *rnlo,
5150                rn_hi: *rnhi,
5151                rm_lo: *rmlo,
5152                rm_hi: *rmhi,
5153                cond: synth_synthesis::Condition::NE,
5154            }),
5155
5156            ArmOp::I64LtS {
5157                rd,
5158                rnlo,
5159                rnhi,
5160                rmlo,
5161                rmhi,
5162            } => self.encode_thumb(&ArmOp::I64SetCond {
5163                rd: *rd,
5164                rn_lo: *rnlo,
5165                rn_hi: *rnhi,
5166                rm_lo: *rmlo,
5167                rm_hi: *rmhi,
5168                cond: synth_synthesis::Condition::LT,
5169            }),
5170
5171            ArmOp::I64LtU {
5172                rd,
5173                rnlo,
5174                rnhi,
5175                rmlo,
5176                rmhi,
5177            } => self.encode_thumb(&ArmOp::I64SetCond {
5178                rd: *rd,
5179                rn_lo: *rnlo,
5180                rn_hi: *rnhi,
5181                rm_lo: *rmlo,
5182                rm_hi: *rmhi,
5183                cond: synth_synthesis::Condition::LO,
5184            }),
5185
5186            ArmOp::I64LeS {
5187                rd,
5188                rnlo,
5189                rnhi,
5190                rmlo,
5191                rmhi,
5192            } => self.encode_thumb(&ArmOp::I64SetCond {
5193                rd: *rd,
5194                rn_lo: *rnlo,
5195                rn_hi: *rnhi,
5196                rm_lo: *rmlo,
5197                rm_hi: *rmhi,
5198                cond: synth_synthesis::Condition::LE,
5199            }),
5200
5201            ArmOp::I64LeU {
5202                rd,
5203                rnlo,
5204                rnhi,
5205                rmlo,
5206                rmhi,
5207            } => self.encode_thumb(&ArmOp::I64SetCond {
5208                rd: *rd,
5209                rn_lo: *rnlo,
5210                rn_hi: *rnhi,
5211                rm_lo: *rmlo,
5212                rm_hi: *rmhi,
5213                cond: synth_synthesis::Condition::LS,
5214            }),
5215
5216            ArmOp::I64GtS {
5217                rd,
5218                rnlo,
5219                rnhi,
5220                rmlo,
5221                rmhi,
5222            } => self.encode_thumb(&ArmOp::I64SetCond {
5223                rd: *rd,
5224                rn_lo: *rnlo,
5225                rn_hi: *rnhi,
5226                rm_lo: *rmlo,
5227                rm_hi: *rmhi,
5228                cond: synth_synthesis::Condition::GT,
5229            }),
5230
5231            ArmOp::I64GtU {
5232                rd,
5233                rnlo,
5234                rnhi,
5235                rmlo,
5236                rmhi,
5237            } => self.encode_thumb(&ArmOp::I64SetCond {
5238                rd: *rd,
5239                rn_lo: *rnlo,
5240                rn_hi: *rnhi,
5241                rm_lo: *rmlo,
5242                rm_hi: *rmhi,
5243                cond: synth_synthesis::Condition::HI,
5244            }),
5245
5246            ArmOp::I64GeS {
5247                rd,
5248                rnlo,
5249                rnhi,
5250                rmlo,
5251                rmhi,
5252            } => self.encode_thumb(&ArmOp::I64SetCond {
5253                rd: *rd,
5254                rn_lo: *rnlo,
5255                rn_hi: *rnhi,
5256                rm_lo: *rmlo,
5257                rm_hi: *rmhi,
5258                cond: synth_synthesis::Condition::GE,
5259            }),
5260
5261            ArmOp::I64GeU {
5262                rd,
5263                rnlo,
5264                rnhi,
5265                rmlo,
5266                rmhi,
5267            } => self.encode_thumb(&ArmOp::I64SetCond {
5268                rd: *rd,
5269                rn_lo: *rnlo,
5270                rn_hi: *rnhi,
5271                rm_lo: *rmlo,
5272                rm_hi: *rmhi,
5273                cond: synth_synthesis::Condition::HS,
5274            }),
5275
5276            // I64Const: MOVW rdlo, lo16; MOVT rdlo, hi16; MOVW rdhi, lo16_hi; MOVT rdhi, hi16_hi
5277            ArmOp::I64Const { rdlo, rdhi, value } => {
5278                let lo32 = *value as u32;
5279                let hi32 = (*value >> 32) as u32;
5280                let mut bytes = Vec::new();
5281                // Load low 32 bits into rdlo
5282                bytes.extend_from_slice(
5283                    &self.encode_thumb32_movw_raw(reg_to_bits(rdlo), lo32 & 0xFFFF)?,
5284                );
5285                if lo32 > 0xFFFF {
5286                    bytes.extend_from_slice(
5287                        &self.encode_thumb32_movt_raw(reg_to_bits(rdlo), lo32 >> 16)?,
5288                    );
5289                }
5290                // Load high 32 bits into rdhi
5291                bytes.extend_from_slice(
5292                    &self.encode_thumb32_movw_raw(reg_to_bits(rdhi), hi32 & 0xFFFF)?,
5293                );
5294                if hi32 > 0xFFFF {
5295                    bytes.extend_from_slice(
5296                        &self.encode_thumb32_movt_raw(reg_to_bits(rdhi), hi32 >> 16)?,
5297                    );
5298                }
5299                Ok(bytes)
5300            }
5301
5302            // I64Ldr: LDR rdlo, [base, offset]; LDR rdhi, [base, offset+4]
5303            ArmOp::I64Ldr { rdlo, rdhi, addr } => {
5304                let mut bytes = Vec::new();
5305                let offset = if addr.offset < 0 {
5306                    0u32
5307                } else {
5308                    addr.offset as u32
5309                };
5310                // #372: a memory `i64.load` carries an index register
5311                // (`reg_imm(R11, addr_reg, offset)` = R11 + addr + offset). The
5312                // immediate `encode_thumb32_ldr` below uses only base+offset and
5313                // would SILENTLY DROP `offset_reg` — the #206 defect, here for
5314                // i64. Materialize the effective base `ip = base + index` first
5315                // (ADD.W ip, base, index — byte-verified), then load with
5316                // immediate offsets. Frame i64 loads (no `offset_reg`, e.g. a
5317                // spilled local at `[SP, #off]`) keep the plain `[base,#off]`
5318                // form unchanged — so existing output is byte-identical.
5319                let base = self.i64_effective_base(&mut bytes, addr);
5320                bytes.extend_from_slice(&self.encode_thumb32_ldr(rdlo, &base, offset)?);
5321                bytes.extend_from_slice(&self.encode_thumb32_ldr(
5322                    rdhi,
5323                    &base,
5324                    offset.wrapping_add(4),
5325                )?);
5326                Ok(bytes)
5327            }
5328
5329            // I64Str: STR rdlo, [base, offset]; STR rdhi, [base, offset+4]
5330            ArmOp::I64Str { rdlo, rdhi, addr } => {
5331                let mut bytes = Vec::new();
5332                let offset = if addr.offset < 0 {
5333                    0u32
5334                } else {
5335                    addr.offset as u32
5336                };
5337                // #372: same index-materialization as I64Ldr (see above).
5338                let base = self.i64_effective_base(&mut bytes, addr);
5339                bytes.extend_from_slice(&self.encode_thumb32_str(rdlo, &base, offset)?);
5340                bytes.extend_from_slice(&self.encode_thumb32_str(
5341                    rdhi,
5342                    &base,
5343                    offset.wrapping_add(4),
5344                )?);
5345                Ok(bytes)
5346            }
5347
5348            // I64ExtendI32S: MOV rdlo, rn; ASR rdhi, rdlo, #31 (sign-extend)
5349            ArmOp::I64ExtendI32S { rdlo, rdhi, rn } => {
5350                let mut bytes = Vec::new();
5351                if rdlo != rn {
5352                    // MOV rdlo, rn (16-bit)
5353                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5354                        rd: *rdlo,
5355                        op2: Operand2::Reg(*rn),
5356                    })?);
5357                }
5358                // ASR rdhi, rdlo, #31 (sign-extend: fill high word with sign bit)
5359                bytes.extend_from_slice(
5360                    &self.encode_thumb32_shift(rdhi, rdlo, 31, 0b10)?, // ASR type
5361                );
5362                Ok(bytes)
5363            }
5364
5365            // I64ExtendI32U: MOV rdlo, rn; MOV rdhi, #0
5366            ArmOp::I64ExtendI32U { rdlo, rdhi, rn } => {
5367                let mut bytes = Vec::new();
5368                if rdlo != rn {
5369                    // MOV rdlo, rn
5370                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5371                        rd: *rdlo,
5372                        op2: Operand2::Reg(*rn),
5373                    })?);
5374                }
5375                // MOV rdhi, #0 (16-bit: MOVS Rd, #0)
5376                let rdhi_bits = reg_to_bits(rdhi) as u16;
5377                let instr: u16 = 0x2000 | (rdhi_bits << 8);
5378                bytes.extend_from_slice(&instr.to_le_bytes());
5379                Ok(bytes)
5380            }
5381
5382            // I32WrapI64: MOV rd, rnlo (just take low 32 bits)
5383            ArmOp::I32WrapI64 { rd, rnlo } => {
5384                if rd == rnlo {
5385                    // No-op: already in the right register
5386                    let instr: u16 = 0xBF00; // NOP
5387                    Ok(instr.to_le_bytes().to_vec())
5388                } else {
5389                    // MOV rd, rnlo
5390                    self.encode_thumb(&ArmOp::Mov {
5391                        rd: *rd,
5392                        op2: Operand2::Reg(*rnlo),
5393                    })
5394                }
5395            }
5396
5397            // ===== Helium MVE operations (Thumb-2 encoding) =====
5398            ArmOp::MveLoad { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vldrw(qd, addr))),
5399            ArmOp::MveStore { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vstrw(qd, addr))),
5400            ArmOp::MveConst { qd, bytes } => self.encode_thumb_mve_const(qd, bytes),
5401            ArmOp::MveAnd { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5402                0xEF000150, qd, qn, qm,
5403            ))),
5404            ArmOp::MveOrr { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5405                0xEF200150, qd, qn, qm,
5406            ))),
5407            ArmOp::MveEor { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5408                0xFF000150, qd, qn, qm,
5409            ))),
5410            ArmOp::MveMvn { qd, qm } => {
5411                // VMVN Qd, Qm: 0xFFB005C0 | Qd<<12 | Qm
5412                let qd_enc = qreg_to_num(qd);
5413                let qm_enc = qreg_to_num(qm);
5414                let instr: u32 = 0xFFB005C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5415                Ok(vfp_to_thumb_bytes(instr))
5416            }
5417            ArmOp::MveBic { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5418                0xEF100150, qd, qn, qm,
5419            ))),
5420            ArmOp::MveAddI { qd, qn, qm, size } => {
5421                let sz = mve_size_bits(size);
5422                let base: u32 = 0xEF000840 | (sz << 20);
5423                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5424            }
5425            ArmOp::MveSubI { qd, qn, qm, size } => {
5426                let sz = mve_size_bits(size);
5427                let base: u32 = 0xFF000840 | (sz << 20);
5428                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5429            }
5430            ArmOp::MveMulI { qd, qn, qm, size } => {
5431                let sz = mve_size_bits(size);
5432                let base: u32 = 0xEF000950 | (sz << 20);
5433                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5434            }
5435            ArmOp::MveNegI { qd, qm, size } => {
5436                let sz = mve_size_bits(size);
5437                // VNEG.Sx Qd, Qm
5438                let qd_enc = qreg_to_num(qd);
5439                let qm_enc = qreg_to_num(qm);
5440                let base: u32 = 0xFFB103C0 | (sz << 18);
5441                let instr = base | ((qd_enc * 2) << 12) | (qm_enc * 2);
5442                Ok(vfp_to_thumb_bytes(instr))
5443            }
5444            ArmOp::MveDup { qd, rn, size } => {
5445                let sz = mve_size_bits(size);
5446                let qd_enc = qreg_to_num(qd);
5447                let rn_bits = reg_to_bits(rn);
5448                // VDUP.sz Qd, Rn: EEA0 0B10 variant
5449                // size encoding: 00=32, 01=16, 10=8
5450                let be = match sz {
5451                    0 => 0b00u32, // 8-bit
5452                    1 => 0b01,    // 16-bit
5453                    _ => 0b00,    // 32-bit (default)
5454                };
5455                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12) | (be << 5);
5456                Ok(vfp_to_thumb_bytes(instr))
5457            }
5458            ArmOp::MveExtractLane { rd, qn, lane, size } => {
5459                let qn_enc = qreg_to_num(qn);
5460                let rd_bits = reg_to_bits(rd);
5461                // VMOV.sz Rd, Dn[x] — extract from Q-register lane
5462                // For 32-bit: VMOV Rd, Dn — where Dn is the appropriate D-register
5463                let d_reg = qn_enc * 2 + ((*lane as u32) >> 1);
5464                let lane_in_d = (*lane as u32) & 1;
5465                let _sz = mve_size_bits(size);
5466                // VMOV Rd, Dn[x]: EE10 0B10 for 32-bit
5467                let instr: u32 = 0xEE100B10 | (d_reg << 16) | (rd_bits << 12) | (lane_in_d << 21);
5468                Ok(vfp_to_thumb_bytes(instr))
5469            }
5470            ArmOp::MveInsertLane { qd, rn, lane, size } => {
5471                let qd_enc = qreg_to_num(qd);
5472                let rn_bits = reg_to_bits(rn);
5473                let d_reg = qd_enc * 2 + ((*lane as u32) >> 1);
5474                let lane_in_d = (*lane as u32) & 1;
5475                let _sz = mve_size_bits(size);
5476                // VMOV Dn[x], Rn: EE00 0B10 for 32-bit
5477                let instr: u32 = 0xEE000B10 | (d_reg << 16) | (rn_bits << 12) | (lane_in_d << 21);
5478                Ok(vfp_to_thumb_bytes(instr))
5479            }
5480
            // MVE integer comparisons — PLACEHOLDER: currently encoded as VADD.I<size>
            // (a real implementation needs a VCMP + VPSEL pair)
5482            ArmOp::MveCmpEqI { qd, qn, qm, size }
5483            | ArmOp::MveCmpNeI { qd, qn, qm, size }
5484            | ArmOp::MveCmpLtS { qd, qn, qm, size }
5485            | ArmOp::MveCmpLtU { qd, qn, qm, size }
5486            | ArmOp::MveCmpGtS { qd, qn, qm, size }
5487            | ArmOp::MveCmpGtU { qd, qn, qm, size }
5488            | ArmOp::MveCmpLeS { qd, qn, qm, size }
5489            | ArmOp::MveCmpLeU { qd, qn, qm, size }
5490            | ArmOp::MveCmpGeS { qd, qn, qm, size }
5491            | ArmOp::MveCmpGeU { qd, qn, qm, size } => {
5492                // Encode as VADD (placeholder encoding — real implementation
5493                // would use VCMP + VPSEL pair)
5494                let sz = mve_size_bits(size);
5495                let base: u32 = 0xEF000840 | (sz << 20);
5496                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5497            }
5498
5499            // f32x4 MVE arithmetic
5500            ArmOp::MveAddF32 { qd, qn, qm } => {
5501                // VADD.F32 Qd, Qn, Qm (MVE): 0xEF000D40
5502                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5503            }
5504            ArmOp::MveSubF32 { qd, qn, qm } => {
5505                // VSUB.F32 Qd, Qn, Qm (MVE): 0xEF200D40
5506                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF200D40, qd, qn, qm)))
5507            }
5508            ArmOp::MveMulF32 { qd, qn, qm } => {
5509                // VMUL.F32 Qd, Qn, Qm (MVE): 0xFF000D50
5510                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xFF000D50, qd, qn, qm)))
5511            }
5512            ArmOp::MveNegF32 { qd, qm } => {
5513                let qd_enc = qreg_to_num(qd);
5514                let qm_enc = qreg_to_num(qm);
5515                // VNEG.F32 Qd, Qm: FFB907C0
5516                let instr: u32 = 0xFFB907C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5517                Ok(vfp_to_thumb_bytes(instr))
5518            }
5519            ArmOp::MveAbsF32 { qd, qm } => {
5520                let qd_enc = qreg_to_num(qd);
5521                let qm_enc = qreg_to_num(qm);
5522                // VABS.F32 Qd, Qm: FFB90740
5523                let instr: u32 = 0xFFB90740 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5524                Ok(vfp_to_thumb_bytes(instr))
5525            }
5526            ArmOp::MveCmpEqF32 { qd, qn, qm }
5527            | ArmOp::MveCmpNeF32 { qd, qn, qm }
5528            | ArmOp::MveCmpLtF32 { qd, qn, qm }
5529            | ArmOp::MveCmpLeF32 { qd, qn, qm }
5530            | ArmOp::MveCmpGtF32 { qd, qn, qm }
5531            | ArmOp::MveCmpGeF32 { qd, qn, qm } => {
5532                // Placeholder: encode as VADD.F32 (real impl needs VCMP.F32 + VPSEL)
5533                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5534            }
5535            ArmOp::MveDupF32 { qd, rn } => {
5536                let qd_enc = qreg_to_num(qd);
5537                let rn_bits = reg_to_bits(rn);
5538                // VDUP.32 Qd, Rn (same encoding as integer VDUP.32)
5539                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12);
5540                Ok(vfp_to_thumb_bytes(instr))
5541            }
5542            ArmOp::MveExtractLaneF32 { rd, qn, lane } => {
5543                let qn_enc = qreg_to_num(qn);
5544                let rd_bits = reg_to_bits(rd);
5545                // VMOV Rd, Sn where Sn = Q*4 + lane
5546                let s_num = qn_enc * 4 + (*lane as u32);
5547                let (vn, n) = encode_sreg(s_num);
5548                let instr: u32 = 0xEE100A10 | (vn << 16) | (rd_bits << 12) | (n << 7);
5549                Ok(vfp_to_thumb_bytes(instr))
5550            }
5551            ArmOp::MveReplaceLaneF32 { qd, rn, lane } => {
5552                let qd_enc = qreg_to_num(qd);
5553                let rn_bits = reg_to_bits(rn);
5554                // VMOV Sn, Rn where Sn = Q*4 + lane
5555                let s_num = qd_enc * 4 + (*lane as u32);
5556                let (vn, n) = encode_sreg(s_num);
5557                let instr: u32 = 0xEE000A10 | (vn << 16) | (rn_bits << 12) | (n << 7);
5558                Ok(vfp_to_thumb_bytes(instr))
5559            }
5560            ArmOp::MveDivF32 { qd, qn, qm } => {
5561                // Lane-wise: extract 4 S-regs, VDIV, insert back
5562                self.encode_thumb_mve_lane_wise_f32_binop(qd, qn, qm, 0xEE800A00)
5563            }
5564            ArmOp::MveSqrtF32 { qd, qm } => {
5565                // Lane-wise: extract 4 S-regs, VSQRT, insert back
5566                self.encode_thumb_mve_lane_wise_f32_sqrt(qd, qm)
5567            }
5568
5569            // Catch-all for any remaining ops
5570            _ => {
5571                let instr: u16 = 0xBF00; // NOP
5572                Ok(instr.to_le_bytes().to_vec())
5573            }
5574        }
5575    }
5576
5577    // === Thumb-2 VFP multi-instruction helpers ===
5578
5579    /// Encode F32 comparison as Thumb-2: VCMP.F32 + VMRS + MOVS rd,#0 + IT + MOV rd,#1
5580    fn encode_thumb_f32_compare(
5581        &self,
5582        rd: &Reg,
5583        sn: &VfpReg,
5584        sm: &VfpReg,
5585        cond_code: u32,
5586    ) -> Result<Vec<u8>> {
5587        let mut bytes = Vec::new();
5588        let rd_bits = reg_to_bits(rd);
5589
5590        // VCMP.F32 Sn, Sm
5591        let sn_num = vfp_sreg_to_num(sn)?;
5592        let sm_num = vfp_sreg_to_num(sm)?;
5593        let (vd, d) = encode_sreg(sn_num);
5594        let (vm, m) = encode_sreg(sm_num);
5595        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5596        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5597
5598        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
5599        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5600
5601        // MOVS Rd, #0 (16-bit): 0010 0 Rd(3) 0000 0000
5602        if rd_bits < 8 {
5603            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5604            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5605        } else {
5606            // MOV.W Rd, #0 (32-bit Thumb-2)
5607            let hw1: u16 = 0xF04F;
5608            let hw2: u16 = (rd_bits as u16) << 8;
5609            bytes.extend_from_slice(&hw1.to_le_bytes());
5610            bytes.extend_from_slice(&hw2.to_le_bytes());
5611        }
5612
5613        // IT<cond> — If-Then for conditional MOV
5614        // IT encoding: 1011 1111 cond(4) mask(4)
5615        // mask = 0x8 for single "then" (IT)
5616        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5617        bytes.extend_from_slice(&it.to_le_bytes());
5618
5619        // MOV Rd, #1 (16-bit, conditional due to IT): 0010 0 Rd(3) 0000 0001
5620        if rd_bits < 8 {
5621            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5622            bytes.extend_from_slice(&mov_one.to_le_bytes());
5623        } else {
5624            // MOV.W Rd, #1 (32-bit)
5625            let hw1: u16 = 0xF04F;
5626            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5627            bytes.extend_from_slice(&hw1.to_le_bytes());
5628            bytes.extend_from_slice(&hw2.to_le_bytes());
5629        }
5630
5631        Ok(bytes)
5632    }
5633
5634    /// Encode F32 constant load as Thumb-2: MOVW + MOVT + VMOV
5635    fn encode_thumb_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
5636        let mut bytes = Vec::new();
5637        let bits = value.to_bits();
5638        let rt: u32 = 12; // R12/IP as temp
5639
5640        // MOVW R12, #lo16
5641        // Thumb-2 MOVW: 11110 i 10 0100 imm4 | 0 imm3 Rd imm8
5642        let lo16 = bits & 0xFFFF;
5643        let imm4 = (lo16 >> 12) & 0xF;
5644        let i_bit = (lo16 >> 11) & 1;
5645        let imm3 = (lo16 >> 8) & 0x7;
5646        let imm8 = lo16 & 0xFF;
5647        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
5648        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5649        bytes.extend_from_slice(&hw1.to_le_bytes());
5650        bytes.extend_from_slice(&hw2.to_le_bytes());
5651
5652        // MOVT R12, #hi16
5653        let hi16 = (bits >> 16) & 0xFFFF;
5654        let imm4 = (hi16 >> 12) & 0xF;
5655        let i_bit = (hi16 >> 11) & 1;
5656        let imm3 = (hi16 >> 8) & 0x7;
5657        let imm8 = hi16 & 0xFF;
5658        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
5659        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5660        bytes.extend_from_slice(&hw1.to_le_bytes());
5661        bytes.extend_from_slice(&hw2.to_le_bytes());
5662
5663        // VMOV Sd, R12
5664        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
5665        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5666
5667        Ok(bytes)
5668    }
5669
5670    /// Encode VMOV + VCVT.F32.xS32 as Thumb-2
5671    fn encode_thumb_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5672        let mut bytes = Vec::new();
5673
5674        // VMOV Sd, Rm
5675        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
5676        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5677
5678        // VCVT.F32.S32/U32 Sd, Sd
5679        let sd_num = vfp_sreg_to_num(sd)?;
5680        let (vd, d) = encode_sreg(sd_num);
5681        let (vm, m) = encode_sreg(sd_num);
5682        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
5683        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5684        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5685
5686        Ok(bytes)
5687    }
5688
5689    /// Encode F32 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5690    /// Encode F32 rounding as Thumb-2.
5691    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5692    ///
5693    /// For trunc: uses VCVTR.S32.F32 (always truncates).
5694    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant),
5695    /// then restores FPSCR.
5696    fn encode_thumb_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5697        let mut bytes = Vec::new();
5698        let sm_num = vfp_sreg_to_num(sm)?;
5699        let sd_num = vfp_sreg_to_num(sd)?;
5700        let (vd_s, d_s) = encode_sreg(sd_num);
5701        let (vm_s, m_s) = encode_sreg(sm_num);
5702
5703        if mode == 0b11 {
5704            // Trunc (toward zero): VCVTR.S32.F32 — bit[7]=1, always truncates
5705            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5706            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5707        } else {
5708            // ceil/floor/nearest: manipulate FPSCR rounding mode
5709            let rt: u32 = 12; // R12/IP as temp
5710
5711            // VMRS R12, FPSCR
5712            let vmrs = 0xEEF10A10 | (rt << 12);
5713            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5714
5715            // BIC.W R12, R12, #(3 << 22) — clear RMode bits [23:22]
5716            // Thumb-2 modified immediate for 3<<22 = 0x00C00000:
5717            // BIC.W encoding: 11110 i 0 0001 S Rn | 0 imm3 Rd imm8
5718            // 0x00C00000 = 0x03 shifted left by 22 => Thumb mod-imm: i=0, imm3=0b101, imm8=0x03
5719            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF); // BIC, Rn=R12
5720            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5721            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5722            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5723
5724            // ORR.W R12, R12, #(mode << 22)
5725            if mode != 0 {
5726                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF); // ORR, Rn=R12
5727                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5728                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5729                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5730            }
5731
5732            // VMSR FPSCR, R12
5733            let vmsr = 0xEEE10A10 | (rt << 12);
5734            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5735
5736            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rmode
5737            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5738            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5739
5740            // Restore FPSCR: clear rmode bits back to nearest (default)
5741            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5742            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5743            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5744            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5745        }
5746
5747        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
5748        let (vd2, d2) = encode_sreg(sd_num);
5749        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
5750        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5751
5752        Ok(bytes)
5753    }
5754
5755    /// Encode F32 min/max as Thumb-2: VMOV + VCMP + VMRS + IT + VMOV
5756    fn encode_thumb_f32_minmax(
5757        &self,
5758        sd: &VfpReg,
5759        sn: &VfpReg,
5760        sm: &VfpReg,
5761        is_min: bool,
5762    ) -> Result<Vec<u8>> {
5763        let mut bytes = Vec::new();
5764        let sn_num = vfp_sreg_to_num(sn)?;
5765        let sm_num = vfp_sreg_to_num(sm)?;
5766        let sd_num = vfp_sreg_to_num(sd)?;
5767
5768        // VMOV.F32 Sd, Sn
5769        let (vd, d) = encode_sreg(sd_num);
5770        let (vn, n) = encode_sreg(sn_num);
5771        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5772        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sn));
5773
5774        // VCMP.F32 Sn, Sm
5775        let (vm, m) = encode_sreg(sm_num);
5776        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5777        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5778
5779        // VMRS APSR_nzcv, FPSCR
5780        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5781
5782        // IT GT (for min) or IT MI (for max)
5783        let cond: u16 = if is_min { 0xC } else { 0x4 };
5784        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5785        bytes.extend_from_slice(&it.to_le_bytes());
5786
5787        // VMOV{cond}.F32 Sd, Sm — conditional VMOV in IT block
5788        let vmov_sm = 0xEEB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5789        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sm));
5790
5791        Ok(bytes)
5792    }
5793
5794    /// Encode F32 copysign as Thumb-2
5795    fn encode_thumb_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5796        let mut bytes = Vec::new();
5797
5798        // VMOV R12, Sm (get sign source bits)
5799        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5800            false,
5801            sm,
5802            &Reg::R12,
5803        )?));
5804
5805        // VMOV R0, Sn (get magnitude source bits)
5806        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5807            false,
5808            sn,
5809            &Reg::R0,
5810        )?));
5811
5812        // AND.W R12, R12, #0x80000000
5813        // Thumb-2 modified immediate: 0x80000000 = constant 0x80 with rotation
5814        // Using T1 encoding: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
5815        // 0x80000000: i=0, imm3=0b001, imm8=0x00 (rotation=4, value=0x80)
5816        // Actually encoding #0x80000000 as modified constant:
5817        // bit pattern 1 followed by 31 zeros: enc = 0b0100_00000000 = 0x0100? No.
5818        // ARM modified immediate: abcdefgh rotated. 0x80000000 = 0x80 ROR 2 = enc 0x0102
5819        // Actually: value = abcdefgh ROR (2*rot). 0x80 = 10000000, ROR 2 gives 0x20000000.
5820        // For 0x80000000: 0x02 ROR 2 = 0x80000000. So imm12 = (1<<8) | 0x02 = 0x102
5821        let hw1: u16 = 0xF000 | 12; // AND.W R12, R12, #modified_const (i=0, Rn=R12)
5822        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02; // imm3=1, Rd=R12, imm8=0x02
5823        bytes.extend_from_slice(&hw1.to_le_bytes());
5824        bytes.extend_from_slice(&hw2.to_le_bytes());
5825
5826        // BIC.W R0, R0, #0x80000000 (R0 = register 0, fields are zero)
5827        let hw1: u16 = 0xF020; // BIC.W R0, R0, #modified_const (i=0, Rn=R0)
5828        let hw2: u16 = (0x1 << 12) | 0x02; // imm3=1, Rd=R0, imm8=0x02
5829        bytes.extend_from_slice(&hw1.to_le_bytes());
5830        bytes.extend_from_slice(&hw2.to_le_bytes());
5831
5832        // ORR.W R0, R0, R12 (R0 = register 0)
5833        let hw1: u16 = 0xEA40; // ORR.W R0, R0, R12 (Rn=R0)
5834        let hw2: u16 = 12; // Rd=R0, Rm=R12
5835        bytes.extend_from_slice(&hw1.to_le_bytes());
5836        bytes.extend_from_slice(&hw2.to_le_bytes());
5837
5838        // VMOV Sd, R0
5839        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5840            true,
5841            sd,
5842            &Reg::R0,
5843        )?));
5844
5845        Ok(bytes)
5846    }
5847
5848    /// Encode F64 comparison as Thumb-2: VCMP.F64 + VMRS + MOV #0 + IT + MOV #1
5849    fn encode_thumb_f64_compare(
5850        &self,
5851        rd: &Reg,
5852        dn: &VfpReg,
5853        dm: &VfpReg,
5854        cond_code: u32,
5855    ) -> Result<Vec<u8>> {
5856        let mut bytes = Vec::new();
5857        let rd_bits = reg_to_bits(rd);
5858
5859        // VCMP.F64 Dn, Dm
5860        let dn_num = vfp_dreg_to_num(dn)?;
5861        let dm_num = vfp_dreg_to_num(dm)?;
5862        let (vd, d) = encode_dreg(dn_num);
5863        let (vm, m) = encode_dreg(dm_num);
5864        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5865        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5866
5867        // VMRS APSR_nzcv, FPSCR
5868        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5869
5870        // MOVS Rd, #0
5871        if rd_bits < 8 {
5872            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5873            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5874        } else {
5875            let hw1: u16 = 0xF04F;
5876            let hw2: u16 = (rd_bits as u16) << 8;
5877            bytes.extend_from_slice(&hw1.to_le_bytes());
5878            bytes.extend_from_slice(&hw2.to_le_bytes());
5879        }
5880
5881        // IT<cond>
5882        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5883        bytes.extend_from_slice(&it.to_le_bytes());
5884
5885        // MOV Rd, #1
5886        if rd_bits < 8 {
5887            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5888            bytes.extend_from_slice(&mov_one.to_le_bytes());
5889        } else {
5890            let hw1: u16 = 0xF04F;
5891            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5892            bytes.extend_from_slice(&hw1.to_le_bytes());
5893            bytes.extend_from_slice(&hw2.to_le_bytes());
5894        }
5895
5896        Ok(bytes)
5897    }
5898
5899    /// Encode F64 constant load as Thumb-2: MOVW+MOVT (lo32 into R0) + MOVW+MOVT (hi32 into R12) + VMOV Dd, R0, R12
5900    fn encode_thumb_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
5901        let mut bytes = Vec::new();
5902        let bits = value.to_bits();
5903        let lo32 = bits as u32;
5904        let hi32 = (bits >> 32) as u32;
5905
5906        // MOVW R0, #lo16(lo32)
5907        let lo16 = lo32 & 0xFFFF;
5908        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(0, lo16)?);
5909
5910        // MOVT R0, #hi16(lo32)
5911        let hi16 = (lo32 >> 16) & 0xFFFF;
5912        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(0, hi16)?);
5913
5914        // MOVW R12, #lo16(hi32)
5915        let lo16 = hi32 & 0xFFFF;
5916        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
5917
5918        // MOVT R12, #hi16(hi32)
5919        let hi16 = (hi32 >> 16) & 0xFFFF;
5920        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
5921
5922        // VMOV Dd, R0, R12
5923        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
5924        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5925
5926        Ok(bytes)
5927    }
5928
5929    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as Thumb-2
5930    fn encode_thumb_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5931        let mut bytes = Vec::new();
5932
5933        // VMOV S0, Rm
5934        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
5935        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5936
5937        // VCVT.F64.S32 Dd, S0 or VCVT.F64.U32 Dd, S0
5938        let dd_num = vfp_dreg_to_num(dd)?;
5939        let (vd, d) = encode_dreg(dd_num);
5940        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
5941        let vcvt = base | (d << 22) | (vd << 12);
5942        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5943
5944        Ok(bytes)
5945    }
5946
5947    /// Encode VCVT.F64.F32 Dd, Sm as Thumb-2
5948    fn encode_thumb_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5949        let dd_num = vfp_dreg_to_num(dd)?;
5950        let sm_num = vfp_sreg_to_num(sm)?;
5951        let (vd, d) = encode_dreg(dd_num);
5952        let (vm, m) = encode_sreg(sm_num);
5953
5954        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
5955        Ok(vfp_to_thumb_bytes(vcvt))
5956    }
5957
5958    /// Encode VCVT.S32/U32.F64 S0, Dm + VMOV Rd, S0 as Thumb-2
5959    fn encode_thumb_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5960        let mut bytes = Vec::new();
5961        let dm_num = vfp_dreg_to_num(dm)?;
5962        let (vm, m) = encode_dreg(dm_num);
5963
5964        // VCVT.S32.F64 S0, Dm or VCVT.U32.F64 S0, Dm
5965        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
5966        let vcvt = base | (m << 5) | vm;
5967        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5968
5969        // VMOV Rd, S0
5970        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
5971        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5972
5973        Ok(bytes)
5974    }
5975
5976    /// Encode F64 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5977    /// Encode F64 rounding as Thumb-2.
5978    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5979    fn encode_thumb_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5980        let mut bytes = Vec::new();
5981        let dm_num = vfp_dreg_to_num(dm)?;
5982        let dd_num = vfp_dreg_to_num(dd)?;
5983        let (vm, m) = encode_dreg(dm_num);
5984        let (vd, d) = encode_dreg(dd_num);
5985
5986        if mode == 0b11 {
5987            // Trunc: VCVTR.S32.F64 — bit[7]=1, always truncates
5988            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
5989            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5990        } else {
5991            let rt: u32 = 12;
5992
5993            // VMRS R12, FPSCR
5994            let vmrs = 0xEEF10A10 | (rt << 12);
5995            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5996
5997            // BIC.W R12, R12, #(3 << 22)
5998            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF);
5999            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
6000            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
6001            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
6002
6003            // ORR.W R12, R12, #(mode << 22)
6004            if mode != 0 {
6005                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF);
6006                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
6007                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
6008                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
6009            }
6010
6011            // VMSR FPSCR, R12
6012            let vmsr = 0xEEE10A10 | (rt << 12);
6013            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
6014
6015            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0)
6016            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
6017            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
6018
6019            // Restore FPSCR
6020            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
6021            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
6022            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
6023            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
6024        }
6025
6026        // VCVT.F64.S32 Dd, S0
6027        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
6028        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
6029
6030        Ok(bytes)
6031    }
6032
6033    /// Encode F64 min/max as Thumb-2
6034    fn encode_thumb_f64_minmax(
6035        &self,
6036        dd: &VfpReg,
6037        dn: &VfpReg,
6038        dm: &VfpReg,
6039        is_min: bool,
6040    ) -> Result<Vec<u8>> {
6041        let mut bytes = Vec::new();
6042        let dn_num = vfp_dreg_to_num(dn)?;
6043        let dm_num = vfp_dreg_to_num(dm)?;
6044        let dd_num = vfp_dreg_to_num(dd)?;
6045
6046        // VMOV.F64 Dd, Dn
6047        let (vd, d) = encode_dreg(dd_num);
6048        let (vn, n) = encode_dreg(dn_num);
6049        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
6050        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dn));
6051
6052        // VCMP.F64 Dn, Dm
6053        let (vm, m) = encode_dreg(dm_num);
6054        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
6055        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
6056
6057        // VMRS APSR_nzcv, FPSCR
6058        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
6059
6060        // IT GT (for min) or IT MI (for max)
6061        let cond: u16 = if is_min { 0xC } else { 0x4 };
6062        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
6063        bytes.extend_from_slice(&it.to_le_bytes());
6064
6065        // VMOV{cond}.F64 Dd, Dm
6066        let vmov_dm = 0xEEB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
6067        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dm));
6068
6069        Ok(bytes)
6070    }
6071
6072    /// Encode F64 copysign as Thumb-2
6073    fn encode_thumb_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
6074        let mut bytes = Vec::new();
6075
6076        // VMOV R0, R12, Dm (get sign source)
6077        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
6078            false,
6079            dm,
6080            &Reg::R0,
6081            &Reg::R12,
6082        )?));
6083
6084        // VMOV R1, R2, Dn (get magnitude source)
6085        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
6086            false,
6087            dn,
6088            &Reg::R1,
6089            &Reg::R2,
6090        )?));
6091
6092        // AND.W R12, R12, #0x80000000 (i=0, Rn=R12)
6093        let hw1: u16 = 0xF000 | 12;
6094        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02;
6095        bytes.extend_from_slice(&hw1.to_le_bytes());
6096        bytes.extend_from_slice(&hw2.to_le_bytes());
6097
6098        // BIC.W R2, R2, #0x80000000 (i=0, Rn=R2)
6099        let hw1: u16 = 0xF020 | 2;
6100        let hw2: u16 = (0x1 << 12) | (2 << 8) | 0x02;
6101        bytes.extend_from_slice(&hw1.to_le_bytes());
6102        bytes.extend_from_slice(&hw2.to_le_bytes());
6103
6104        // ORR.W R2, R2, R12
6105        let hw1: u16 = 0xEA40 | 2;
6106        let hw2: u16 = (2 << 8) | 12;
6107        bytes.extend_from_slice(&hw1.to_le_bytes());
6108        bytes.extend_from_slice(&hw2.to_le_bytes());
6109
6110        // VMOV Dd, R1, R2
6111        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
6112            true,
6113            dd,
6114            &Reg::R1,
6115            &Reg::R2,
6116        )?));
6117
6118        Ok(bytes)
6119    }
6120
6121    /// Encode VCVT.S32/U32.F32 + VMOV as Thumb-2
6122    fn encode_thumb_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
6123        let mut bytes = Vec::new();
6124
6125        let sm_num = vfp_sreg_to_num(sm)?;
6126        let (vd, d) = encode_sreg(sm_num);
6127        let (vm, m) = encode_sreg(sm_num);
6128        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
6129        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
6130        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
6131
6132        // VMOV Rd, Sm
6133        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
6134        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
6135
6136        Ok(bytes)
6137    }
6138
6139    // === Thumb-2 32-bit encoding helpers ===
6140
6141    /// Encode Thumb-2 32-bit ADD with immediate
6142    fn encode_thumb32_add(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6143        let rd_bits = reg_to_bits(rd);
6144        let rn_bits = reg_to_bits(rn);
6145
6146        // The `i:imm3:imm8` field is split the same way for both forms.
6147        let i_bit = (imm >> 11) & 1;
6148        let imm3 = (imm >> 8) & 0x7;
6149        let imm8 = imm & 0xFF;
6150
6151        let hw1_base = if imm <= 0xFF {
6152            // ADD.W (T3): the field is a ThumbExpandImm modified immediate. For
6153            // imm <= 0xFF (i:imm3 = 0000) it is the zero-extended byte, which is
6154            // correct — keep this form so existing encodings stay bit-identical.
6155            0xF100
6156        } else if imm <= 0xFFF {
6157            // ADDW (T4): a PLAIN 12-bit immediate (0..4095) — no ThumbExpandImm.
6158            // This is what makes `add sp, sp, #frame` correct for frame sizes
6159            // >= 256, which ADD.W (T3) would silently mis-encode (e.g. #256 -> #0).
6160            0xF200
6161        } else {
6162            return Err(synth_core::Error::synthesis(
6163                "ADD immediate > 0xFFF (4095) requires a multi-instruction sequence (not supported)",
6164            ));
6165        };
6166
6167        let hw1: u16 = (hw1_base | (i_bit << 10) | rn_bits) as u16;
6168        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6169
6170        let mut bytes = hw1.to_le_bytes().to_vec();
6171        bytes.extend_from_slice(&hw2.to_le_bytes());
6172        Ok(bytes)
6173    }
6174
6175    /// Encode Thumb-2 32-bit SUB with immediate
6176    fn encode_thumb32_sub(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6177        let rd_bits = reg_to_bits(rd);
6178        let rn_bits = reg_to_bits(rn);
6179
6180        let i_bit = (imm >> 11) & 1;
6181        let imm3 = (imm >> 8) & 0x7;
6182        let imm8 = imm & 0xFF;
6183
6184        let hw1_base = if imm <= 0xFF {
6185            // SUB.W (T3) modified immediate — correct for the zero-extended byte
6186            // (imm <= 0xFF). Kept bit-identical for existing encodings.
6187            0xF1A0
6188        } else if imm <= 0xFFF {
6189            // SUBW (T4): plain 12-bit immediate (0..4095). Makes
6190            // `sub sp, sp, #frame` correct for frame sizes >= 256.
6191            0xF2A0
6192        } else {
6193            return Err(synth_core::Error::synthesis(
6194                "SUB immediate > 0xFFF (4095) requires a multi-instruction sequence (not supported)",
6195            ));
6196        };
6197
6198        let hw1: u16 = (hw1_base | (i_bit << 10) | rn_bits) as u16;
6199        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6200
6201        let mut bytes = hw1.to_le_bytes().to_vec();
6202        bytes.extend_from_slice(&hw2.to_le_bytes());
6203        Ok(bytes)
6204    }
6205
6206    /// Encode Thumb-2 32-bit ADDS with immediate (sets flags)
6207    fn encode_thumb32_adds(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6208        let rd_bits = reg_to_bits(rd);
6209        let rn_bits = reg_to_bits(rn);
6210
6211        // ADDS.W (flag-setting) has only the modified-immediate form — error on
6212        // an un-encodable value rather than silently add the wrong constant.
6213        let field = try_thumb_expand_imm(imm).ok_or_else(|| {
6214            synth_core::Error::synthesis(
6215                "ADDS immediate is not a valid ThumbExpandImm — materialize into a register",
6216            )
6217        })?;
6218        let i_bit = (field >> 11) & 1;
6219        let imm3 = (field >> 8) & 0x7;
6220        let imm8 = field & 0xFF;
6221
6222        // ADDS.W Rd, Rn, #imm (with S=1)
6223        // First halfword: 1111 0 i 0 1000 1 Rn = F110 | i<<10 | Rn
6224        let hw1: u16 = (0xF110 | (i_bit << 10) | rn_bits) as u16;
6225        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6226
6227        let mut bytes = hw1.to_le_bytes().to_vec();
6228        bytes.extend_from_slice(&hw2.to_le_bytes());
6229        Ok(bytes)
6230    }
6231
6232    /// Encode Thumb-2 32-bit SUBS with immediate (sets flags)
6233    fn encode_thumb32_subs(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6234        let rd_bits = reg_to_bits(rd);
6235        let rn_bits = reg_to_bits(rn);
6236
6237        // SUBS.W (flag-setting) has only the modified-immediate form — error on
6238        // an un-encodable value rather than silently subtract the wrong constant.
6239        let field = try_thumb_expand_imm(imm).ok_or_else(|| {
6240            synth_core::Error::synthesis(
6241                "SUBS immediate is not a valid ThumbExpandImm — materialize into a register",
6242            )
6243        })?;
6244        let i_bit = (field >> 11) & 1;
6245        let imm3 = (field >> 8) & 0x7;
6246        let imm8 = field & 0xFF;
6247
6248        // SUBS.W Rd, Rn, #imm (with S=1)
6249        // First halfword: 1111 0 i 0 1101 1 Rn = F1B0 | i<<10 | Rn
6250        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6251        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6252
6253        let mut bytes = hw1.to_le_bytes().to_vec();
6254        bytes.extend_from_slice(&hw2.to_le_bytes());
6255        Ok(bytes)
6256    }
6257
6258    /// Encode Thumb-2 32-bit MOVW (16-bit immediate)
6259    ///
6260    /// # Contract (Verus-style)
6261    /// ```text
6262    /// requires rd <= R14
6263    /// ensures result.len() == 4
6264    /// ensures (imm & 0xFFFF) can be reconstructed from the encoding
6265    /// ```
6266    fn encode_thumb32_movw(&self, rd: &Reg, imm: u32) -> Result<Vec<u8>> {
6267        let rd_bits = reg_to_bits(rd);
6268        reg_bits_checked(rd_bits)?;
6269        let imm16 = imm & 0xFFFF;
6270
6271        // MOVW Rd, #imm16
6272        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6273        let imm4 = (imm16 >> 12) & 0xF;
6274        let i_bit = (imm16 >> 11) & 1;
6275        let imm3 = (imm16 >> 8) & 0x7;
6276        let imm8 = imm16 & 0xFF;
6277
6278        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6279        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6280
6281        let mut bytes = hw1.to_le_bytes().to_vec();
6282        bytes.extend_from_slice(&hw2.to_le_bytes());
6283        encoding_contracts::verify_thumb32(&bytes);
6284        Ok(bytes)
6285    }
6286
6287    /// Encode Thumb-2 32-bit shift with immediate
6288    ///
6289    /// # Contract (Verus-style)
6290    /// ```text
6291    /// requires rd <= R14, rm <= R14
6292    /// ensures result.len() == 4
6293    /// ```
6294    fn encode_thumb32_shift(
6295        &self,
6296        rd: &Reg,
6297        rm: &Reg,
6298        shift: u32,
6299        shift_type: u8,
6300    ) -> Result<Vec<u8>> {
6301        let rd_bits = reg_to_bits(rd);
6302        let rm_bits = reg_to_bits(rm);
6303        reg_bits_checked(rd_bits)?;
6304        reg_bits_checked(rm_bits)?;
6305        let imm5 = shift & 0x1F;
6306        let imm2 = imm5 & 0x3;
6307        let imm3 = (imm5 >> 2) & 0x7;
6308
6309        // MOV.W Rd, Rm, <shift> #imm
6310        // EA4F 0 imm3 Rd imm2 type Rm
6311        let hw1: u16 = 0xEA4F;
6312        let hw2: u16 =
6313            ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | ((shift_type as u32) << 4) | rm_bits)
6314                as u16;
6315
6316        let mut bytes = hw1.to_le_bytes().to_vec();
6317        bytes.extend_from_slice(&hw2.to_le_bytes());
6318        Ok(bytes)
6319    }
6320
6321    /// Encode Thumb-2 32-bit shift by register
6322    /// Encoding: 11111010 0xx0 Rn | 1111 Rd 0000 Rm
6323    /// shift_type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
6324    fn encode_thumb32_shift_reg(
6325        &self,
6326        rd: &Reg,
6327        rn: &Reg,
6328        rm: &Reg,
6329        shift_type: u8,
6330    ) -> Result<Vec<u8>> {
6331        let rd_bits = reg_to_bits(rd);
6332        let rn_bits = reg_to_bits(rn);
6333        let rm_bits = reg_to_bits(rm);
6334
6335        // hw1: 1111 1010 0xx0 Rn
6336        let hw1: u16 = (0xFA00 | ((shift_type as u32) << 5) | rn_bits) as u16;
6337        // hw2: 1111 Rd 0000 Rm
6338        let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
6339
6340        let mut bytes = hw1.to_le_bytes().to_vec();
6341        bytes.extend_from_slice(&hw2.to_le_bytes());
6342        Ok(bytes)
6343    }
6344
6345    /// Encode Thumb-2 32-bit CMP with immediate
6346    fn encode_thumb32_cmp_imm(&self, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6347        let rn_bits = reg_to_bits(rn);
6348
6349        // CMP.W has only the modified-immediate form (no plain-imm12 like ADDW),
6350        // so an un-encodable immediate MUST be materialized into a register by
6351        // the selector. Error rather than silently compare the wrong constant.
6352        let field = try_thumb_expand_imm(imm).ok_or_else(|| {
6353            synth_core::Error::synthesis(
6354                "CMP immediate is not a valid ThumbExpandImm — materialize into a register",
6355            )
6356        })?;
6357        let i_bit = (field >> 11) & 1;
6358        let imm3 = (field >> 8) & 0x7;
6359        let imm8 = field & 0xFF;
6360
6361        // CMP.W Rn, #imm
6362        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6363        let hw2: u16 = ((imm3 << 12) | 0x0F00 | imm8) as u16;
6364
6365        let mut bytes = hw1.to_le_bytes().to_vec();
6366        bytes.extend_from_slice(&hw2.to_le_bytes());
6367        Ok(bytes)
6368    }
6369
6370    /// #372: resolve the base register for an `I64Ldr`/`I64Str` whose address
6371    /// may carry an index register. If `addr.offset_reg` is set (a memory
6372    /// `i64.load`/`i64.store`: `R11 + addr + offset`), emit `ADD.W ip, base,
6373    /// index` and return `ip` (R12) as the base for the two immediate-offset
6374    /// halves. If unset (a frame access at `[base, #off]`), return `addr.base`
6375    /// unchanged — emitting nothing — so non-indexed i64 access is byte-identical.
6376    /// `ip = base + index` is computed BEFORE the halves load, so an `rdlo`
6377    /// aliasing the index register is safe (the address is already materialized).
6378    fn i64_effective_base(&self, bytes: &mut Vec<u8>, addr: &MemAddr) -> Reg {
6379        match addr.offset_reg {
6380            Some(idx) => {
6381                let ip = Reg::R12;
6382                // ADD.W ip, addr.base, idx  (Thumb-2, byte-verified vs as)
6383                let hw1: u16 = 0xEB00 | reg_to_bits(&addr.base) as u16;
6384                let hw2: u16 = 0x0C00 | reg_to_bits(&idx) as u16;
6385                bytes.extend_from_slice(&hw1.to_le_bytes());
6386                bytes.extend_from_slice(&hw2.to_le_bytes());
6387                ip
6388            }
6389            None => addr.base,
6390        }
6391    }
6392
6393    /// Encode Thumb-2 32-bit LDR
6394    fn encode_thumb32_ldr(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6395        let rd_bits = reg_to_bits(rd);
6396        let base_bits = reg_to_bits(base);
6397
6398        // LDR.W Rd, [Rn, #imm12]
6399        check_ldst_imm12(offset)?;
6400        let hw1: u16 = (0xF8D0 | base_bits) as u16;
6401        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6402
6403        let mut bytes = hw1.to_le_bytes().to_vec();
6404        bytes.extend_from_slice(&hw2.to_le_bytes());
6405        Ok(bytes)
6406    }
6407
6408    /// Encode Thumb-2 32-bit STR
6409    fn encode_thumb32_str(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6410        let rd_bits = reg_to_bits(rd);
6411        let base_bits = reg_to_bits(base);
6412
6413        // STR.W Rd, [Rn, #imm12]
6414        check_ldst_imm12(offset)?;
6415        let hw1: u16 = (0xF8C0 | base_bits) as u16;
6416        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6417
6418        let mut bytes = hw1.to_le_bytes().to_vec();
6419        bytes.extend_from_slice(&hw2.to_le_bytes());
6420        Ok(bytes)
6421    }
6422
6423    /// Encode Thumb-2 32-bit LDR with register offset: LDR.W Rd, [Rn, Rm]
6424    fn encode_thumb32_ldr_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6425        let rd_bits = reg_to_bits(rd);
6426        let base_bits = reg_to_bits(base);
6427        let rm_bits = reg_to_bits(offset_reg);
6428
6429        // LDR.W Rd, [Rn, Rm, LSL #0]
6430        // Encoding: 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
6431        // imm2 = 00 for no shift (LSL #0)
6432        let hw1: u16 = (0xF850 | base_bits) as u16;
6433        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6434
6435        let mut bytes = hw1.to_le_bytes().to_vec();
6436        bytes.extend_from_slice(&hw2.to_le_bytes());
6437        Ok(bytes)
6438    }
6439
6440    /// Encode Thumb-2 32-bit STR with register offset: STR.W Rd, [Rn, Rm]
6441    fn encode_thumb32_str_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6442        let rd_bits = reg_to_bits(rd);
6443        let base_bits = reg_to_bits(base);
6444        let rm_bits = reg_to_bits(offset_reg);
6445
6446        // STR.W Rd, [Rn, Rm, LSL #0]
6447        // Encoding: 1111 1000 0100 Rn | Rt 0000 00 imm2 Rm
6448        // imm2 = 00 for no shift (LSL #0)
6449        let hw1: u16 = (0xF840 | base_bits) as u16;
6450        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6451
6452        let mut bytes = hw1.to_le_bytes().to_vec();
6453        bytes.extend_from_slice(&hw2.to_le_bytes());
6454        Ok(bytes)
6455    }
6456
6457    // === Sub-word load/store Thumb-2 encoding helpers ===
6458
6459    /// Encode Thumb-2 32-bit LDRB with immediate: LDRB.W Rd, [Rn, #imm12]
6460    fn encode_thumb32_ldrb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6461        let rd_bits = reg_to_bits(rd);
6462        let base_bits = reg_to_bits(base);
6463        // LDRB.W Rd, [Rn, #imm12]: 1111 1000 1001 Rn | Rt imm12
6464        check_ldst_imm12(offset)?;
6465        let hw1: u16 = (0xF890 | base_bits) as u16;
6466        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6467        let mut bytes = hw1.to_le_bytes().to_vec();
6468        bytes.extend_from_slice(&hw2.to_le_bytes());
6469        Ok(bytes)
6470    }
6471
6472    /// Encode Thumb-2 32-bit LDRB with register: LDRB.W Rd, [Rn, Rm]
6473    fn encode_thumb32_ldrb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6474        let rd_bits = reg_to_bits(rd);
6475        let base_bits = reg_to_bits(base);
6476        let rm_bits = reg_to_bits(offset_reg);
6477        // LDRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0001 Rn | Rt 0000 00 imm2 Rm
6478        let hw1: u16 = (0xF810 | base_bits) as u16;
6479        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6480        let mut bytes = hw1.to_le_bytes().to_vec();
6481        bytes.extend_from_slice(&hw2.to_le_bytes());
6482        Ok(bytes)
6483    }
6484
6485    /// Encode Thumb-2 32-bit LDRSB with immediate: LDRSB.W Rd, [Rn, #imm12]
6486    fn encode_thumb32_ldrsb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6487        let rd_bits = reg_to_bits(rd);
6488        let base_bits = reg_to_bits(base);
6489        // LDRSB.W Rd, [Rn, #imm12]: 1111 1001 1001 Rn | Rt imm12
6490        check_ldst_imm12(offset)?;
6491        let hw1: u16 = (0xF990 | base_bits) as u16;
6492        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6493        let mut bytes = hw1.to_le_bytes().to_vec();
6494        bytes.extend_from_slice(&hw2.to_le_bytes());
6495        Ok(bytes)
6496    }
6497
6498    /// Encode Thumb-2 32-bit LDRSB with register: LDRSB.W Rd, [Rn, Rm]
6499    fn encode_thumb32_ldrsb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6500        let rd_bits = reg_to_bits(rd);
6501        let base_bits = reg_to_bits(base);
6502        let rm_bits = reg_to_bits(offset_reg);
6503        // LDRSB.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0001 Rn | Rt 0000 00 imm2 Rm
6504        let hw1: u16 = (0xF910 | base_bits) as u16;
6505        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6506        let mut bytes = hw1.to_le_bytes().to_vec();
6507        bytes.extend_from_slice(&hw2.to_le_bytes());
6508        Ok(bytes)
6509    }
6510
6511    /// Encode Thumb-2 32-bit LDRH with immediate: LDRH.W Rd, [Rn, #imm12]
6512    fn encode_thumb32_ldrh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6513        let rd_bits = reg_to_bits(rd);
6514        let base_bits = reg_to_bits(base);
6515        // LDRH.W Rd, [Rn, #imm12]: 1111 1000 1011 Rn | Rt imm12
6516        check_ldst_imm12(offset)?;
6517        let hw1: u16 = (0xF8B0 | base_bits) as u16;
6518        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6519        let mut bytes = hw1.to_le_bytes().to_vec();
6520        bytes.extend_from_slice(&hw2.to_le_bytes());
6521        Ok(bytes)
6522    }
6523
6524    /// Encode Thumb-2 32-bit LDRH with register: LDRH.W Rd, [Rn, Rm]
6525    fn encode_thumb32_ldrh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6526        let rd_bits = reg_to_bits(rd);
6527        let base_bits = reg_to_bits(base);
6528        let rm_bits = reg_to_bits(offset_reg);
6529        // LDRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0011 Rn | Rt 0000 00 imm2 Rm
6530        let hw1: u16 = (0xF830 | base_bits) as u16;
6531        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6532        let mut bytes = hw1.to_le_bytes().to_vec();
6533        bytes.extend_from_slice(&hw2.to_le_bytes());
6534        Ok(bytes)
6535    }
6536
6537    /// Encode Thumb-2 32-bit LDRSH with immediate: LDRSH.W Rd, [Rn, #imm12]
6538    fn encode_thumb32_ldrsh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6539        let rd_bits = reg_to_bits(rd);
6540        let base_bits = reg_to_bits(base);
6541        // LDRSH.W Rd, [Rn, #imm12]: 1111 1001 1011 Rn | Rt imm12
6542        check_ldst_imm12(offset)?;
6543        let hw1: u16 = (0xF9B0 | base_bits) as u16;
6544        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6545        let mut bytes = hw1.to_le_bytes().to_vec();
6546        bytes.extend_from_slice(&hw2.to_le_bytes());
6547        Ok(bytes)
6548    }
6549
6550    /// Encode Thumb-2 32-bit LDRSH with register: LDRSH.W Rd, [Rn, Rm]
6551    fn encode_thumb32_ldrsh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6552        let rd_bits = reg_to_bits(rd);
6553        let base_bits = reg_to_bits(base);
6554        let rm_bits = reg_to_bits(offset_reg);
6555        // LDRSH.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0011 Rn | Rt 0000 00 imm2 Rm
6556        let hw1: u16 = (0xF930 | base_bits) as u16;
6557        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6558        let mut bytes = hw1.to_le_bytes().to_vec();
6559        bytes.extend_from_slice(&hw2.to_le_bytes());
6560        Ok(bytes)
6561    }
6562
6563    /// Encode Thumb-2 32-bit STRB with immediate: STRB.W Rd, [Rn, #imm12]
6564    fn encode_thumb32_strb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6565        let rd_bits = reg_to_bits(rd);
6566        let base_bits = reg_to_bits(base);
6567        // STRB.W Rd, [Rn, #imm12]: 1111 1000 1000 Rn | Rt imm12
6568        check_ldst_imm12(offset)?;
6569        let hw1: u16 = (0xF880 | base_bits) as u16;
6570        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6571        let mut bytes = hw1.to_le_bytes().to_vec();
6572        bytes.extend_from_slice(&hw2.to_le_bytes());
6573        Ok(bytes)
6574    }
6575
6576    /// Encode Thumb-2 32-bit STRB with register: STRB.W Rd, [Rn, Rm]
6577    fn encode_thumb32_strb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6578        let rd_bits = reg_to_bits(rd);
6579        let base_bits = reg_to_bits(base);
6580        let rm_bits = reg_to_bits(offset_reg);
6581        // STRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0000 Rn | Rt 0000 00 imm2 Rm
6582        let hw1: u16 = (0xF800 | base_bits) as u16;
6583        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6584        let mut bytes = hw1.to_le_bytes().to_vec();
6585        bytes.extend_from_slice(&hw2.to_le_bytes());
6586        Ok(bytes)
6587    }
6588
6589    /// Encode Thumb-2 32-bit STRH with immediate: STRH.W Rd, [Rn, #imm12]
6590    fn encode_thumb32_strh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6591        let rd_bits = reg_to_bits(rd);
6592        let base_bits = reg_to_bits(base);
6593        // STRH.W Rd, [Rn, #imm12]: 1111 1000 1010 Rn | Rt imm12
6594        check_ldst_imm12(offset)?;
6595        let hw1: u16 = (0xF8A0 | base_bits) as u16;
6596        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6597        let mut bytes = hw1.to_le_bytes().to_vec();
6598        bytes.extend_from_slice(&hw2.to_le_bytes());
6599        Ok(bytes)
6600    }
6601
6602    /// Encode Thumb-2 32-bit STRH with register: STRH.W Rd, [Rn, Rm]
6603    fn encode_thumb32_strh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6604        let rd_bits = reg_to_bits(rd);
6605        let base_bits = reg_to_bits(base);
6606        let rm_bits = reg_to_bits(offset_reg);
6607        // STRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0010 Rn | Rt 0000 00 imm2 Rm
6608        let hw1: u16 = (0xF820 | base_bits) as u16;
6609        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6610        let mut bytes = hw1.to_le_bytes().to_vec();
6611        bytes.extend_from_slice(&hw2.to_le_bytes());
6612        Ok(bytes)
6613    }
6614
6615    /// Encode Thumb-2 32-bit ADD with immediate: ADD.W Rd, Rn, #imm
6616    fn encode_thumb32_add_imm(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6617        let rd_bits = reg_to_bits(rd);
6618        let rn_bits = reg_to_bits(rn);
6619
6620        // For small immediates, use ADD.W Rd, Rn, #imm12
6621        // Encoding: 1111 0 i 0 1 0 0 0 S Rn | 0 imm3 Rd imm8
6622        // S = 0 (don't update flags)
6623        // The 12-bit immediate is encoded as: i:imm3:imm8
6624        // For simplicity, we only support imm <= 0xFFF (direct encoding)
6625        if imm <= 0xFFF {
6626            let i_bit = (imm >> 11) & 1;
6627            let imm3 = (imm >> 8) & 0x7;
6628            let imm8 = imm & 0xFF;
6629
6630            let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
6631            let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6632
6633            let mut bytes = hw1.to_le_bytes().to_vec();
6634            bytes.extend_from_slice(&hw2.to_le_bytes());
6635            Ok(bytes)
6636        } else {
6637            // Out-of-range immediate (> 0xFFF): materialize it into a scratch
6638            // register, then ADD.W Rd, Rn, scratch. This is the #180/#185
6639            // "encoder must produce a legal sequence, not assert" class — see #350.
6640            //
6641            // Scratch choice (must NEVER equal Rn, or Rn would be clobbered before
6642            // the ADD reads it):
6643            //   - rd != rn  => use rd itself (rn is untouched, since rd != rn).
6644            //   - rd == rn  => use R12/IP (the reserved encoder scratch). rd/rn are
6645            //                  never R12 (R12 is non-allocatable), so it can't alias.
6646            //
6647            // The materialized value is the same whether or not MOVT is emitted, so
6648            // the byte length depends only on `imm` (and rd==rn) — the size probe and
6649            // the final emit therefore agree (mandatory: the function is encoded twice).
6650            let scratch: u32 = if rd_bits == rn_bits {
6651                12 // R12/IP — in-place add, can't use rd because rd == rn
6652            } else {
6653                rd_bits // rn is preserved because rd != rn
6654            };
6655            // Invariant: the scratch must never alias Rn (would clobber it before
6656            // the ADD reads it). Unreachable in real codegen (rd/rn are never R12,
6657            // which is reserved encoder scratch), but the encoder is also driven by
6658            // the `encoder_no_panic` fuzz harness with ARBITRARY registers — incl.
6659            // rd==rn==R12, which makes scratch (R12) alias Rn. The encoder contract
6660            // (#180/#185) is Ok-or-Err, never a panic, so return a typed error
6661            // instead of asserting. #350 follow-up.
6662            if scratch == rn_bits {
6663                return Err(synth_core::Error::synthesis(format!(
6664                    "ADD #imm: cannot lower #{imm:#x} for Rd==Rn==R12 — no free scratch \
6665                     register (R12 is the reserved encoder scratch and aliases Rn here)"
6666                )));
6667            }
6668
6669            let lo16 = imm & 0xFFFF;
6670            let hi16 = (imm >> 16) & 0xFFFF;
6671
6672            let mut bytes = self.encode_thumb32_movw_raw(scratch, lo16)?;
6673            if hi16 != 0 {
6674                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(scratch, hi16)?);
6675            }
6676            bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(rd_bits, rn_bits, scratch)?);
6677            Ok(bytes)
6678        }
6679    }
6680
6681    // === Raw encoding helpers for POPCNT (take register numbers directly) ===
6682
6683    /// Encode Thumb-2 32-bit MOVW (16-bit immediate) - raw version
6684    ///
6685    /// # Contract (Verus-style)
6686    /// ```text
6687    /// requires rd <= 14, imm16 <= 0xFFFF
6688    /// ensures result.len() == 4
6689    /// ```
6690    fn encode_thumb32_movw_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6691        reg_bits_checked(rd)?;
6692        encoding_contracts::verify_imm16(imm16);
6693        // MOVW Rd, #imm16
6694        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6695        let imm16 = imm16 & 0xFFFF;
6696        let imm4 = (imm16 >> 12) & 0xF;
6697        let i_bit = (imm16 >> 11) & 1;
6698        let imm3 = (imm16 >> 8) & 0x7;
6699        let imm8 = imm16 & 0xFF;
6700
6701        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6702        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6703
6704        let mut bytes = hw1.to_le_bytes().to_vec();
6705        bytes.extend_from_slice(&hw2.to_le_bytes());
6706        encoding_contracts::verify_thumb32(&bytes);
6707        Ok(bytes)
6708    }
6709
6710    /// Encode Thumb-2 32-bit MOVT (move top 16 bits) - raw version
6711    ///
6712    /// # Contract (Verus-style)
6713    /// ```text
6714    /// requires rd <= 14, imm16 <= 0xFFFF
6715    /// ensures result.len() == 4
6716    /// ```
6717    fn encode_thumb32_movt_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6718        reg_bits_checked(rd)?;
6719        encoding_contracts::verify_imm16(imm16);
6720        // MOVT Rd, #imm16
6721        // 1111 0 i 10 1 1 0 0 imm4 | 0 imm3 Rd imm8
6722        let imm16 = imm16 & 0xFFFF;
6723        let imm4 = (imm16 >> 12) & 0xF;
6724        let i_bit = (imm16 >> 11) & 1;
6725        let imm3 = (imm16 >> 8) & 0x7;
6726        let imm8 = imm16 & 0xFF;
6727
6728        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
6729        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6730
6731        let mut bytes = hw1.to_le_bytes().to_vec();
6732        bytes.extend_from_slice(&hw2.to_le_bytes());
6733        encoding_contracts::verify_thumb32(&bytes);
6734        Ok(bytes)
6735    }
6736
6737    /// Encode Thumb-2 32-bit LSR (logical shift right) with immediate - raw version
6738    fn encode_thumb32_lsr_raw(&self, rd: u32, rm: u32, shift: u32) -> Result<Vec<u8>> {
6739        // MOV.W Rd, Rm, LSR #imm
6740        // EA4F 0 imm3 Rd imm2 01 Rm
6741        let imm5 = shift & 0x1F;
6742        let imm2 = imm5 & 0x3;
6743        let imm3 = (imm5 >> 2) & 0x7;
6744
6745        let hw1: u16 = 0xEA4F;
6746        let hw2: u16 = ((imm3 << 12) | (rd << 8) | (imm2 << 6) | (0b01 << 4) | rm) as u16;
6747
6748        let mut bytes = hw1.to_le_bytes().to_vec();
6749        bytes.extend_from_slice(&hw2.to_le_bytes());
6750        Ok(bytes)
6751    }
6752
6753    /// Encode Thumb-2 32-bit AND (register) - raw version
6754    fn encode_thumb32_and_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6755        // AND.W Rd, Rn, Rm
6756        // EA00 Rn | 0 Rd 00 00 Rm
6757        let hw1: u16 = (0xEA00 | rn) as u16;
6758        let hw2: u16 = ((rd << 8) | rm) as u16;
6759
6760        let mut bytes = hw1.to_le_bytes().to_vec();
6761        bytes.extend_from_slice(&hw2.to_le_bytes());
6762        Ok(bytes)
6763    }
6764
6765    /// Encode Thumb-2 32-bit AND with immediate - raw version
6766    fn encode_thumb32_and_imm_raw(&self, rd: u32, rn: u32, imm: u32) -> Result<Vec<u8>> {
6767        // AND.W Rd, Rn, #<modified_immediate>
6768        // For small immediates (0-255), the encoding is simpler
6769        // F0 00 Rn | 0 imm3 Rd imm8
6770        let i_bit = (imm >> 11) & 1;
6771        let imm3 = (imm >> 8) & 0x7;
6772        let imm8 = imm & 0xFF;
6773
6774        let hw1: u16 = (0xF000 | (i_bit << 10) | rn) as u16;
6775        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6776
6777        let mut bytes = hw1.to_le_bytes().to_vec();
6778        bytes.extend_from_slice(&hw2.to_le_bytes());
6779        Ok(bytes)
6780    }
6781
6782    /// Encode Thumb-2 32-bit SUB (register) - raw version
6783    fn encode_thumb32_sub_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6784        // SUB.W Rd, Rn, Rm
6785        // EBA0 Rn | 0 Rd 00 00 Rm
6786        let hw1: u16 = (0xEBA0 | rn) as u16;
6787        let hw2: u16 = ((rd << 8) | rm) as u16;
6788
6789        let mut bytes = hw1.to_le_bytes().to_vec();
6790        bytes.extend_from_slice(&hw2.to_le_bytes());
6791        Ok(bytes)
6792    }
6793
6794    /// Encode Thumb-2 32-bit ADD (register) - raw version
6795    fn encode_thumb32_add_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6796        // ADD.W Rd, Rn, Rm
6797        // EB00 Rn | 0 Rd 00 00 Rm
6798        let hw1: u16 = (0xEB00 | rn) as u16;
6799        let hw2: u16 = ((rd << 8) | rm) as u16;
6800
6801        let mut bytes = hw1.to_le_bytes().to_vec();
6802        bytes.extend_from_slice(&hw2.to_le_bytes());
6803        Ok(bytes)
6804    }
6805
6806    /// Encode Thumb-2 32-bit ADDS (register, flag-setting) - raw version.
6807    /// Used as the high-register fallback for `ArmOp::Adds` (i64 low-word add)
6808    /// so R8-R11 pair operands don't overflow the 16-bit field — #178/#180.
6809    fn encode_thumb32_adds_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6810        // ADDS.W Rd, Rn, Rm (T3, S=1): EB10 Rn | 0 Rd 00 00 Rm
6811        let hw1: u16 = (0xEB10 | rn) as u16;
6812        let hw2: u16 = ((rd << 8) | rm) as u16;
6813        let mut bytes = hw1.to_le_bytes().to_vec();
6814        bytes.extend_from_slice(&hw2.to_le_bytes());
6815        Ok(bytes)
6816    }
6817
6818    /// Encode Thumb-2 32-bit SUBS (register, flag-setting) - raw version.
6819    /// High-register fallback for `ArmOp::Subs` (i64 low-word subtract) — #178/#180.
6820    fn encode_thumb32_subs_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6821        // SUBS.W Rd, Rn, Rm (T3, S=1): EBB0 Rn | 0 Rd 00 00 Rm
6822        let hw1: u16 = (0xEBB0 | rn) as u16;
6823        let hw2: u16 = ((rd << 8) | rm) as u16;
6824        let mut bytes = hw1.to_le_bytes().to_vec();
6825        bytes.extend_from_slice(&hw2.to_le_bytes());
6826        Ok(bytes)
6827    }
6828
6829    /// Encode a sequence of ARM instructions
6830    pub fn encode_sequence(&self, ops: &[ArmOp]) -> Result<Vec<u8>> {
6831        let mut code = Vec::new();
6832
6833        for op in ops {
6834            let encoded = self.encode(op)?;
6835            code.extend_from_slice(&encoded);
6836        }
6837
6838        Ok(code)
6839    }
6840}
6841
/// Inverse of the ARMv7-M `ThumbExpandImm` function.
///
/// Given a 32-bit constant, returns the 12-bit `i:imm3:imm8` field that
/// expands to it, or `None` when the constant is not a representable modified
/// immediate (the caller must then materialize the value into a register).
/// This is the shared correct path for the data-processing immediate
/// encoders: packing raw bits instead silently mis-encodes any value `> 0xFF`
/// that is not a modified immediate (the miscompile class behind
/// #251/#253/#255).
///
/// Forms are tried in this order:
/// 1. `0x000000XY` — field `0x0XY`
/// 2. `0x00XY00XY` — field `0x1XY`
/// 3. `0xXY00XY00` — field `0x2XY`
/// 4. `0xXYXYXYXY` — field `0x3XY`
/// 5. an 8-bit value with bit 7 set, rotated right by 8..=31 — field
///    `rot << 7 | low 7 bits`
fn try_thumb_expand_imm(value: u32) -> Option<u32> {
    if value <= 0xFF {
        return Some(value);
    }

    let low_byte = value & 0xFF;
    let second_byte = (value >> 8) & 0xFF;

    // Replicated-byte patterns. Multiplying a byte by the pattern constant
    // copies it into each selected byte lane and cannot overflow a u32.
    if value == low_byte * 0x0001_0001 {
        return Some(0x100 | low_byte);
    }
    if value == second_byte * 0x0100_0100 {
        return Some(0x200 | second_byte);
    }
    if value == low_byte * 0x0101_0101 {
        return Some(0x300 | low_byte);
    }

    // Rotated form: rotating left by `rot` undoes the encoded right rotation.
    // A hit must look like `1bbbbbbb`; the leading one is implicit in the
    // encoding, so only the low seven bits are stored next to the rotation.
    (8..=31u32).find_map(|rot| {
        let unrotated = value.rotate_left(rot);
        (0x80..=0xFF)
            .contains(&unrotated)
            .then(|| (rot << 7) | (unrotated & 0x7F))
    })
}
6879
6880/// Guard a Thumb-2 `LDR/STR Rd, [Rn, #imm12]` offset. The imm12 form supports
6881/// `0..=4095`; a larger offset must be materialized into a register by the
6882/// selector (register-offset addressing). Returning `Err` rather than silently
6883/// masking `offset & 0xFFF` closes the wrong-address miscompile class (#259,
6884/// the load/store sibling of #253/#255).
6885fn check_ldst_imm12(offset: u32) -> Result<()> {
6886    if offset > 0xFFF {
6887        Err(synth_core::Error::synthesis(
6888            "load/store immediate offset > 0xFFF (4095) — materialize the offset into a register",
6889        ))
6890    } else {
6891        Ok(())
6892    }
6893}
6894
6895fn reg_to_bits(reg: &Reg) -> u32 {
6896    match reg {
6897        Reg::R0 => 0,
6898        Reg::R1 => 1,
6899        Reg::R2 => 2,
6900        Reg::R3 => 3,
6901        Reg::R4 => 4,
6902        Reg::R5 => 5,
6903        Reg::R6 => 6,
6904        Reg::R7 => 7,
6905        Reg::R8 => 8,
6906        Reg::R9 => 9,
6907        Reg::R10 => 10,
6908        Reg::R11 => 11,
6909        Reg::R12 => 12,
6910        Reg::SP => 13,
6911        Reg::LR => 14,
6912        Reg::PC => 15,
6913    }
6914}
6915
6916/// Fallible form of the `verify_reg_bits` contract. PC (R15) is not a valid
6917/// data operand for the Thumb-2 encodings that use this guard (SDIV/UDIV/MLS/…
6918/// are UNPREDICTABLE with PC). Synth's own codegen never emits PC there, but
6919/// the encoder must stay *total* over arbitrary `ArmOp` inputs — the fuzz
6920/// harness (`encoder_no_panic`) requires Ok-or-Err, never a panic. Pre-fix, the
6921/// `debug_assert` in `verify_reg_bits` aborted under `-Cdebug-assertions`.
6922/// Returns a typed Err instead. See #185.
6923fn reg_bits_checked(bits: u32) -> Result<()> {
6924    if bits > 14 {
6925        return Err(synth_core::Error::synthesis(format!(
6926            "register bits {bits} (PC/R15) is not a valid operand for this Thumb-2 encoding"
6927        )));
6928    }
6929    Ok(())
6930}
6931
/// Try to express a 32-bit value as an ARM (A32) rotated immediate, i.e. an
/// 8-bit value rotated right by an even amount (`imm8 ROR 2*rot4`).
///
/// Returns `Some((rot4 << 8 | imm8, 1))` using the smallest rotation that
/// works — the second tuple element is always `1` — or `None` when no even
/// rotation brings the value into eight bits.
fn try_encode_rotated_imm(val: u32) -> Option<(u32, u32)> {
    (0..16u32).find_map(|rot| {
        // Rotating left by 2*rot undoes the encoded ROR; a representable
        // value then fits entirely in the low byte. Zero matches at rot = 0.
        let imm8 = val.rotate_left(rot * 2);
        if imm8 <= 0xFF {
            Some(((rot << 8) | imm8, 1))
        } else {
            None
        }
    })
}
6949
6950/// Encode operand2 field and return (bits, immediate_flag).
6951/// For ARM32 mode, immediates use the rotated-immediate encoding (imm8 ROR 2*rot4).
6952/// Panics if an immediate value cannot be represented. Callers that need large
6953/// immediates should use MOVW/MOVT instead of Operand2::Imm.
6954fn encode_operand2(op2: &Operand2) -> (u32, u32) {
6955    match op2 {
6956        Operand2::Imm(val) => {
6957            let uval = *val as u32;
6958            // Attempt rotated-immediate encoding (ARM32 Operand2)
6959            if let Some(encoded) = try_encode_rotated_imm(uval) {
6960                encoded
6961            } else {
6962                // Fallback: mask to 8 bits (legacy behavior for values that
6963                // cannot be represented). This should not be reached for
6964                // correctly-selected instructions; the instruction selector
6965                // must use MOVW/MOVT for large constants.
6966                let imm = uval & 0xFF;
6967                (imm, 1)
6968            }
6969        }
6970
6971        Operand2::Reg(reg) => {
6972            let reg_bits = reg_to_bits(reg);
6973            (reg_bits, 0) // I=0 for register
6974        }
6975
6976        Operand2::RegShift {
6977            rm,
6978            shift: _,
6979            amount,
6980        } => {
6981            // Simplified encoding with shift
6982            let rm_bits = reg_to_bits(rm);
6983            let shift_bits = (*amount & 0x1F) << 7;
6984            (shift_bits | rm_bits, 0)
6985        }
6986    }
6987}
6988
6989/// Encode memory address to (base_reg, offset)
6990fn encode_mem_addr(addr: &MemAddr) -> (u32, u32) {
6991    let base_bits = reg_to_bits(&addr.base);
6992    let offset_bits = (addr.offset as u32) & 0xFFF; // 12-bit offset
6993    (base_bits, offset_bits)
6994}
6995
6996/// S-register number: S0=0, S1=1, ..., S31=31
6997fn vfp_sreg_to_num(reg: &VfpReg) -> Result<u32> {
6998    match reg {
6999        VfpReg::S0 => Ok(0),
7000        VfpReg::S1 => Ok(1),
7001        VfpReg::S2 => Ok(2),
7002        VfpReg::S3 => Ok(3),
7003        VfpReg::S4 => Ok(4),
7004        VfpReg::S5 => Ok(5),
7005        VfpReg::S6 => Ok(6),
7006        VfpReg::S7 => Ok(7),
7007        VfpReg::S8 => Ok(8),
7008        VfpReg::S9 => Ok(9),
7009        VfpReg::S10 => Ok(10),
7010        VfpReg::S11 => Ok(11),
7011        VfpReg::S12 => Ok(12),
7012        VfpReg::S13 => Ok(13),
7013        VfpReg::S14 => Ok(14),
7014        VfpReg::S15 => Ok(15),
7015        VfpReg::S16 => Ok(16),
7016        VfpReg::S17 => Ok(17),
7017        VfpReg::S18 => Ok(18),
7018        VfpReg::S19 => Ok(19),
7019        VfpReg::S20 => Ok(20),
7020        VfpReg::S21 => Ok(21),
7021        VfpReg::S22 => Ok(22),
7022        VfpReg::S23 => Ok(23),
7023        VfpReg::S24 => Ok(24),
7024        VfpReg::S25 => Ok(25),
7025        VfpReg::S26 => Ok(26),
7026        VfpReg::S27 => Ok(27),
7027        VfpReg::S28 => Ok(28),
7028        VfpReg::S29 => Ok(29),
7029        VfpReg::S30 => Ok(30),
7030        VfpReg::S31 => Ok(31),
7031        // D-registers are not used in F32 single-precision encodings
7032        _ => Err(synth_core::Error::SynthesisError(
7033            "D-register not supported in single-precision VFP encoding".to_string(),
7034        )),
7035    }
7036}
7037
7038/// D-register number: D0=0, D1=1, ..., D15=15
7039fn vfp_dreg_to_num(reg: &VfpReg) -> Result<u32> {
7040    match reg {
7041        VfpReg::D0 => Ok(0),
7042        VfpReg::D1 => Ok(1),
7043        VfpReg::D2 => Ok(2),
7044        VfpReg::D3 => Ok(3),
7045        VfpReg::D4 => Ok(4),
7046        VfpReg::D5 => Ok(5),
7047        VfpReg::D6 => Ok(6),
7048        VfpReg::D7 => Ok(7),
7049        VfpReg::D8 => Ok(8),
7050        VfpReg::D9 => Ok(9),
7051        VfpReg::D10 => Ok(10),
7052        VfpReg::D11 => Ok(11),
7053        VfpReg::D12 => Ok(12),
7054        VfpReg::D13 => Ok(13),
7055        VfpReg::D14 => Ok(14),
7056        VfpReg::D15 => Ok(15),
7057        // S-registers are not used in F64 double-precision encodings
7058        _ => Err(synth_core::Error::SynthesisError(
7059            "S-register not supported in double-precision VFP encoding".to_string(),
7060        )),
7061    }
7062}
7063
/// Split an S-register number into `(Vx[3:0], qualifier_bit)` for VFP
/// encoding: `Vx = s >> 1`, `qualifier = s & 1`.
///
/// Depending on the operand's role, the qualifier bit is placed in D
/// (bit 22), N (bit 7) or M (bit 5) by the caller.
fn encode_sreg(s: u32) -> (u32, u32) {
    let field = s >> 1;
    let qualifier = s & 1;
    (field, qualifier)
}
7070
/// Split a D-register number into `(Vx[3:0], qualifier_bit)` for VFP
/// double-precision encoding: `Vx = d & 0xF`, `qualifier = (d >> 4) & 1`.
///
/// For `D0`-`D15` the qualifier is always 0.
fn encode_dreg(d: u32) -> (u32, u32) {
    let field = d & 0xF;
    let qualifier = (d >> 4) & 1;
    (field, qualifier)
}
7077
7078/// Encode a VFP 3-register arithmetic instruction (VADD.F32, VSUB.F32, VMUL.F32, VDIV.F32).
7079/// Returns the full 32-bit instruction word.
7080///
7081/// VFP encoding: [cond 1110] [D opc1 Vn] [Vd 101 sz] [N opc2 M 0 Vm]
7082/// For single-precision (sz=0), coprocessor = 0xA (bits[11:8]).
7083fn encode_vfp_3reg(base: u32, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<u32> {
7084    let sd_num = vfp_sreg_to_num(sd)?;
7085    let sn_num = vfp_sreg_to_num(sn)?;
7086    let sm_num = vfp_sreg_to_num(sm)?;
7087    let (vd, d) = encode_sreg(sd_num);
7088    let (vn, n) = encode_sreg(sn_num);
7089    let (vm, m) = encode_sreg(sm_num);
7090
7091    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
7092}
7093
7094/// Encode a VFP 2-register instruction (VNEG.F32, VABS.F32, VSQRT.F32).
7095/// Returns the full 32-bit instruction word.
7096fn encode_vfp_2reg(base: u32, sd: &VfpReg, sm: &VfpReg) -> Result<u32> {
7097    let sd_num = vfp_sreg_to_num(sd)?;
7098    let sm_num = vfp_sreg_to_num(sm)?;
7099    let (vd, d) = encode_sreg(sd_num);
7100    let (vm, m) = encode_sreg(sm_num);
7101
7102    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
7103}
7104
7105/// Encode a VFP load/store (VLDR.F32 / VSTR.F32).
7106/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
7107/// U bit (bit 23) controls add/subtract offset.
7108fn encode_vfp_ldst(base: u32, sd: &VfpReg, addr: &MemAddr) -> Result<u32> {
7109    let sd_num = vfp_sreg_to_num(sd)?;
7110    let (vd, d) = encode_sreg(sd_num);
7111    let rn = reg_to_bits(&addr.base);
7112
7113    let offset = addr.offset;
7114    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
7115    let abs_offset = offset.unsigned_abs();
7116    let imm8 = (abs_offset / 4) & 0xFF;
7117
7118    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
7119}
7120
7121/// Encode VMOV between core register and S-register.
7122/// VMOV Sn, Rt: 0xEE00_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
7123/// VMOV Rt, Sn: 0xEE10_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
7124fn encode_vmov_core_sreg(to_sreg: bool, sreg: &VfpReg, core: &Reg) -> Result<u32> {
7125    let s_num = vfp_sreg_to_num(sreg)?;
7126    let (vn, n) = encode_sreg(s_num);
7127    let rt = reg_to_bits(core);
7128
7129    let base = if to_sreg { 0xEE000A10 } else { 0xEE100A10 };
7130    Ok(base | (vn << 16) | (rt << 12) | (n << 7))
7131}
7132
7133/// Encode a VFP 3-register double-precision instruction (VADD.F64, VSUB.F64, etc.).
7134/// For double-precision (sz=1), coprocessor = 0xB (bits[11:8]).
7135/// The base should have bit 8 = 1 for F64 (0xB suffix instead of 0xA).
7136fn encode_vfp_3reg_f64(base: u32, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<u32> {
7137    let dd_num = vfp_dreg_to_num(dd)?;
7138    let dn_num = vfp_dreg_to_num(dn)?;
7139    let dm_num = vfp_dreg_to_num(dm)?;
7140    let (vd, d) = encode_dreg(dd_num);
7141    let (vn, n) = encode_dreg(dn_num);
7142    let (vm, m) = encode_dreg(dm_num);
7143
7144    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
7145}
7146
7147/// Encode a VFP 2-register double-precision instruction (VNEG.F64, VABS.F64, VSQRT.F64).
7148fn encode_vfp_2reg_f64(base: u32, dd: &VfpReg, dm: &VfpReg) -> Result<u32> {
7149    let dd_num = vfp_dreg_to_num(dd)?;
7150    let dm_num = vfp_dreg_to_num(dm)?;
7151    let (vd, d) = encode_dreg(dd_num);
7152    let (vm, m) = encode_dreg(dm_num);
7153
7154    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
7155}
7156
7157/// Encode a VFP load/store for double-precision (VLDR.64 / VSTR.64).
7158/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
7159fn encode_vfp_ldst_f64(base: u32, dd: &VfpReg, addr: &MemAddr) -> Result<u32> {
7160    let dd_num = vfp_dreg_to_num(dd)?;
7161    let (vd, d) = encode_dreg(dd_num);
7162    let rn = reg_to_bits(&addr.base);
7163
7164    let offset = addr.offset;
7165    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
7166    let abs_offset = offset.unsigned_abs();
7167    let imm8 = (abs_offset / 4) & 0xFF;
7168
7169    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
7170}
7171
7172/// Encode VMOV between two core registers and a D-register.
7173/// VMOV Dm, Rt, Rt2: 0xEC40_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
7174/// VMOV Rt, Rt2, Dm: 0xEC50_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
7175fn encode_vmov_core_dreg(
7176    to_dreg: bool,
7177    dreg: &VfpReg,
7178    core_lo: &Reg,
7179    core_hi: &Reg,
7180) -> Result<u32> {
7181    let d_num = vfp_dreg_to_num(dreg)?;
7182    let (vm, m) = encode_dreg(d_num);
7183    let rt = reg_to_bits(core_lo);
7184    let rt2 = reg_to_bits(core_hi);
7185
7186    let base = if to_dreg { 0xEC400B10 } else { 0xEC500B10 };
7187    Ok(base | (rt2 << 16) | (rt << 12) | (m << 5) | vm)
7188}
7189
/// Serialize a 32-bit VFP instruction word as Thumb-2 bytes: the upper
/// halfword first, then the lower halfword, each little-endian.
fn vfp_to_thumb_bytes(instr: u32) -> Vec<u8> {
    let upper = (instr >> 16) as u16;
    let lower = (instr & 0xFFFF) as u16;
    [upper.to_le_bytes(), lower.to_le_bytes()].concat()
}
7198
7199// ============================================================================
7200// Helium MVE encoding helpers
7201// ============================================================================
7202
7203/// Q-register number: Q0=0, Q1=1, ..., Q7=7
7204fn qreg_to_num(reg: &QReg) -> u32 {
7205    match reg {
7206        QReg::Q0 => 0,
7207        QReg::Q1 => 1,
7208        QReg::Q2 => 2,
7209        QReg::Q3 => 3,
7210        QReg::Q4 => 4,
7211        QReg::Q5 => 5,
7212        QReg::Q6 => 6,
7213        QReg::Q7 => 7,
7214    }
7215}
7216
7217/// MVE element size to encoding bits: S8=0b00, S16=0b01, S32=0b10
7218fn mve_size_bits(size: &MveSize) -> u32 {
7219    match size {
7220        MveSize::S8 => 0b00,
7221        MveSize::S16 => 0b01,
7222        MveSize::S32 => 0b10,
7223    }
7224}
7225
7226/// Encode MVE 3-register instruction.
7227/// Q-registers are encoded as D-register pairs: Q0=D0:D1, Q1=D2:D3, etc.
7228/// In NEON/MVE encoding, the Q-register uses D-register number = Qn * 2.
7229fn encode_mve_3reg(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
7230    let d = qreg_to_num(qd) * 2;
7231    let n = qreg_to_num(qn) * 2;
7232    let m = qreg_to_num(qm) * 2;
7233
7234    // Standard NEON/MVE 3-register encoding:
7235    // D bit (bit 22) = Vd[4], Vd[3:0] = bits [15:12]
7236    // N bit (bit 7)  = Vn[4], Vn[3:0] = bits [19:16]
7237    // M bit (bit 5)  = Vm[4], Vm[3:0] = bits [3:0]
7238    let vd = d & 0xF;
7239    let d_bit = (d >> 4) & 1;
7240    let vn = n & 0xF;
7241    let n_bit = (n >> 4) & 1;
7242    let vm = m & 0xF;
7243    let m_bit = (m >> 4) & 1;
7244
7245    base | (d_bit << 22) | (vn << 16) | (vd << 12) | (n_bit << 7) | (m_bit << 5) | vm
7246}
7247
7248/// Encode MVE 3-register bitwise instruction (VAND, VORR, VEOR, VBIC).
7249fn encode_mve_3reg_bitwise(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
7250    encode_mve_3reg(base, qd, qn, qm)
7251}
7252
7253/// Encode MVE VLDRW.32 Qd, [Rn, #offset]
7254/// Format: EC9x xxxx - contiguous load, word-sized elements
7255fn encode_mve_vldrw(qd: &QReg, addr: &MemAddr) -> u32 {
7256    let qd_enc = qreg_to_num(qd) * 2;
7257    let rn = reg_to_bits(&addr.base);
7258    let offset = addr.offset;
7259    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
7260    let abs_offset = offset.unsigned_abs();
7261    let imm7 = (abs_offset / 4) & 0x7F; // 7-bit word-aligned offset
7262
7263    // VLDRW.32 Qd, [Rn, #imm]: ED10 xx80 variant
7264    0xED100E80
7265        | (u_bit << 23)
7266        | ((qd_enc >> 4) << 22)
7267        | (rn << 16)
7268        | ((qd_enc & 0xF) << 12)
7269        | (imm7 & 0x7F)
7270}
7271
7272/// Encode MVE VSTRW.32 Qd, [Rn, #offset]
7273fn encode_mve_vstrw(qd: &QReg, addr: &MemAddr) -> u32 {
7274    let qd_enc = qreg_to_num(qd) * 2;
7275    let rn = reg_to_bits(&addr.base);
7276    let offset = addr.offset;
7277    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
7278    let abs_offset = offset.unsigned_abs();
7279    let imm7 = (abs_offset / 4) & 0x7F;
7280
7281    0xED000E80
7282        | (u_bit << 23)
7283        | ((qd_enc >> 4) << 22)
7284        | (rn << 16)
7285        | ((qd_enc & 0xF) << 12)
7286        | (imm7 & 0x7F)
7287}
7288
7289impl ArmEncoder {
7290    /// Encode MVE constant load: MOVW+MOVT+VMOV for each 32-bit word, then assemble Q-register
7291    fn encode_thumb_mve_const(&self, qd: &QReg, bytes: &[u8; 16]) -> Result<Vec<u8>> {
7292        let mut result = Vec::new();
7293        let qd_num = qreg_to_num(qd);
7294
7295        // Load each 32-bit word into R12 (temp) then VMOV into S-register
7296        for i in 0..4 {
7297            let word = u32::from_le_bytes([
7298                bytes[i * 4],
7299                bytes[i * 4 + 1],
7300                bytes[i * 4 + 2],
7301                bytes[i * 4 + 3],
7302            ]);
7303            let lo16 = word & 0xFFFF;
7304            let hi16 = (word >> 16) & 0xFFFF;
7305
7306            // MOVW R12, #lo16
7307            result.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
7308            // MOVT R12, #hi16
7309            if hi16 != 0 {
7310                result.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
7311            }
7312
7313            // VMOV Sn, R12 where Sn = Qd*4 + i
7314            let s_num = qd_num * 4 + i as u32;
7315            let (vn, n) = encode_sreg(s_num);
7316            let vmov: u32 = 0xEE000A10 | (vn << 16) | (12 << 12) | (n << 7);
7317            result.extend_from_slice(&vfp_to_thumb_bytes(vmov));
7318        }
7319
7320        Ok(result)
7321    }
7322
7323    /// Encode lane-wise f32 binary operation (VDIV, etc.) via S-register extraction
7324    fn encode_thumb_mve_lane_wise_f32_binop(
7325        &self,
7326        qd: &QReg,
7327        qn: &QReg,
7328        qm: &QReg,
7329        vfp_base: u32,
7330    ) -> Result<Vec<u8>> {
7331        let mut result = Vec::new();
7332        let qd_num = qreg_to_num(qd);
7333        let qn_num = qreg_to_num(qn);
7334        let qm_num = qreg_to_num(qm);
7335
7336        // For each lane 0..3: use S-registers directly (Q aliasing)
7337        for i in 0..4u32 {
7338            let sd = qd_num * 4 + i;
7339            let sn = qn_num * 4 + i;
7340            let sm = qm_num * 4 + i;
7341
7342            let (vd, d) = encode_sreg(sd);
7343            let (vn, n) = encode_sreg(sn);
7344            let (vm, m) = encode_sreg(sm);
7345
7346            let instr = vfp_base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm;
7347            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
7348        }
7349
7350        Ok(result)
7351    }
7352
7353    /// Encode lane-wise f32 VSQRT via S-register extraction
7354    fn encode_thumb_mve_lane_wise_f32_sqrt(&self, qd: &QReg, qm: &QReg) -> Result<Vec<u8>> {
7355        let mut result = Vec::new();
7356        let qd_num = qreg_to_num(qd);
7357        let qm_num = qreg_to_num(qm);
7358
7359        // VSQRT.F32 base: 0xEEB10AC0
7360        for i in 0..4u32 {
7361            let sd = qd_num * 4 + i;
7362            let sm = qm_num * 4 + i;
7363
7364            let (vd, d) = encode_sreg(sd);
7365            let (vm, m) = encode_sreg(sm);
7366
7367            let instr: u32 = 0xEEB10AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
7368            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
7369        }
7370
7371        Ok(result)
7372    }
7373}
7374
7375#[cfg(test)]
7376mod tests {
7377    use super::*;
7378
7379    #[test]
7380    fn test_encoder_creation() {
7381        let encoder_arm = ArmEncoder::new_arm32();
7382        assert!(!encoder_arm.thumb_mode);
7383
7384        let encoder_thumb = ArmEncoder::new_thumb2();
7385        assert!(encoder_thumb.thumb_mode);
7386    }
7387
7388    /// #204 WAKE-path regression: `SetCond` materialized 0/1 with the 16-bit
7389    /// `MOVS Rd,#imm` (T1), whose Rd field is 3 bits (R0–R7). For a high Rd
7390    /// (R8–R12) `rd_bits << 8` overflows bit 11, flipping the opcode MOVS→CMP
7391    /// (`0x2c00`), so the boolean was never written — gale's `has_waiter` kept a
7392    /// stale value and the binary-sem WAKE dispatch read garbage. High Rd must
7393    /// use the 32-bit `MOV.W` (T2). Verify the bytes, not the IR.
7394    /// #311: the SAME high-Rd MOVS→CMP transmutation as #204, but in the
7395    /// i64 comparison expansions (I64SetCond / I64SetCondZ) — missed by the
7396    /// #204 hardening. With rd=R8 the boolean died in the flags
7397    /// (`ite eq; cmpeq r0,#1; cmpne r0,#0`), so gale's packed-u64 select
7398    /// read a stale register on silicon. High Rd must take MOV.W / CMP.W.
7399    #[test]
7400    fn test_encode_i64setcond_high_reg_uses_mov_w_311() {
7401        use synth_synthesis::{ArmOp, Condition, Reg};
7402        let enc = ArmEncoder::new_thumb2();
7403        let bytes = enc
7404            .encode(&ArmOp::I64SetCond {
7405                rd: Reg::R8,
7406                rn_lo: Reg::R2,
7407                rn_hi: Reg::R3,
7408                rm_lo: Reg::R6,
7409                rm_hi: Reg::R7,
7410                cond: Condition::EQ,
7411            })
7412            .unwrap();
7413        // The 32-bit MOV.W immediate (T2) first halfword is 0xF04F; the
7414        // 16-bit transmuted forms would contain 0x2801/0x2800 (CMP r0,#1/#0).
7415        let halfwords: Vec<u16> = bytes
7416            .chunks(2)
7417            .map(|c| u16::from_le_bytes([c[0], c[1]]))
7418            .collect();
7419        assert!(
7420            halfwords.iter().filter(|&&h| h == 0xF04F).count() == 2,
7421            "high rd must use two MOV.W (T2) encodings, got {halfwords:04x?}"
7422        );
7423        assert!(
7424            !halfwords.contains(&0x2801) && !halfwords.contains(&0x2800),
7425            "no transmuted 16-bit CMP imm: {halfwords:04x?}"
7426        );
7427
7428        let bytes_z = enc
7429            .encode(&ArmOp::I64SetCondZ {
7430                rd: Reg::R8,
7431                rn_lo: Reg::R2,
7432                rn_hi: Reg::R3,
7433            })
7434            .unwrap();
7435        let hw_z: Vec<u16> = bytes_z
7436            .chunks(2)
7437            .map(|c| u16::from_le_bytes([c[0], c[1]]))
7438            .collect();
7439        assert!(
7440            hw_z.iter().filter(|&&h| h == 0xF04F).count() == 2,
7441            "SetCondZ high rd MOV.W: {hw_z:04x?}"
7442        );
7443        // CMP.W rd,#0 (T2) first halfword: 0xF1B0 | rd
7444        assert!(
7445            hw_z.contains(&(0xF1B0 | 8)),
7446            "SetCondZ high rd must use CMP.W: {hw_z:04x?}"
7447        );
7448    }
7449
7450    #[test]
7451    fn test_encode_setcond_high_reg_uses_mov_w_204() {
7452        use synth_synthesis::{ArmOp, Condition, Reg};
7453        let enc = ArmEncoder::new_thumb2();
7454        // R12 (high): must be ITE + MOV.W #1 + MOV.W #0, never a 16-bit MOVS/CMP.
7455        let hi = enc
7456            .encode(&ArmOp::SetCond {
7457                rd: Reg::R12,
7458                cond: Condition::NE,
7459            })
7460            .unwrap();
7461        assert_eq!(hi.len(), 10, "ITE(2) + MOV.W(4) + MOV.W(4): {hi:02x?}");
7462        // both value halfwords are MOV.W (0xF04F) — NOT the corrupt CMP (0x2c..).
7463        assert_eq!(&hi[2..4], &[0x4F, 0xF0], "then = MOV.W: {hi:02x?}");
7464        assert_eq!(&hi[6..8], &[0x4F, 0xF0], "else = MOV.W: {hi:02x?}");
7465        assert_eq!(hi[4] & 0x0F, 0x01, "then imm = #1");
7466        assert_eq!(hi[8] & 0x0F, 0x00, "else imm = #0");
7467        // Low Rd keeps the compact 16-bit MOVS form.
7468        let lo = enc
7469            .encode(&ArmOp::SetCond {
7470                rd: Reg::R0,
7471                cond: Condition::NE,
7472            })
7473            .unwrap();
7474        assert_eq!(lo.len(), 6, "ITE(2) + MOVS(2) + MOVS(2): {lo:02x?}");
7475        assert_eq!(lo[2..4], [0x01, 0x20], "then = MOVS R0,#1");
7476        assert_eq!(lo[4..6], [0x00, 0x20], "else = MOVS R0,#0");
7477    }
7478
7479    /// #209 Opt 1b: UMULL RdLo, RdHi, Rn, Rm encodes correctly on both ISAs.
7480    /// Thumb-2 T1: 1111 1011 1010 Rn | RdLo RdHi 0000 Rm.
7481    /// A32:        cond 0000 1000 RdHi RdLo Rm 1001 Rn.
7482    #[test]
7483    fn test_encode_umull_209b() {
7484        use synth_synthesis::{ArmOp, Reg};
7485        let op = ArmOp::Umull {
7486            rdlo: Reg::R4,
7487            rdhi: Reg::R5,
7488            rn: Reg::R0,
7489            rm: Reg::R3,
7490        };
7491        // Thumb-2: hw1 = 0xFBA0 | 0 = 0xFBA0; hw2 = (4<<12)|(5<<8)|3 = 0x4503.
7492        let t = ArmEncoder::new_thumb2().encode(&op).unwrap();
7493        assert_eq!(
7494            t,
7495            vec![0xA0, 0xFB, 0x03, 0x45],
7496            "umull r4,r5,r0,r3 (T2): {t:02x?}"
7497        );
7498        // A32: 0xE0800090 | (5<<16) | (4<<12) | (3<<8) | 0 = 0xE0854390.
7499        let a = ArmEncoder::new_arm32().encode(&op).unwrap();
7500        assert_eq!(
7501            a,
7502            0xE085_4390u32.to_le_bytes().to_vec(),
7503            "umull (A32): {a:02x?}"
7504        );
7505    }
7506
7507    /// #206 regression: the ARM32 (A32) `Ldr`/`Str` encoders fed `addr` through
7508    /// `encode_mem_addr`, which returns only the 12-bit immediate — so a register
7509    /// offset (`[rn, rm, #off]`) was silently dropped to `[rn, #off]`, sending
7510    /// the access to the wrong runtime address (silent miscompile on the default
7511    /// `--target arm`). A register offset must materialize `ip = rn + rm` and
7512    /// load from `[ip, #off]`. Verify the bytes.
7513    #[test]
7514    fn test_encode_arm32_indexed_load_keeps_index_206() {
7515        use synth_synthesis::{ArmOp, MemAddr, Reg};
7516        let enc = ArmEncoder::new_arm32();
7517        // ldr r0, [r11, r1, #8]  must NOT collapse to a single immediate ldr.
7518        let bytes = enc
7519            .encode(&ArmOp::Ldr {
7520                rd: Reg::R0,
7521                addr: MemAddr::reg_imm(Reg::R11, Reg::R1, 8),
7522            })
7523            .unwrap();
7524        assert_eq!(
7525            bytes.len(),
7526            8,
7527            "expected ADD ip + LDR (2 words): {bytes:02x?}"
7528        );
7529        let add = u32::from_le_bytes(bytes[0..4].try_into().unwrap());
7530        let ldr = u32::from_le_bytes(bytes[4..8].try_into().unwrap());
7531        // ADD ip, r11, r1  = 0xE08BC001
7532        assert_eq!(add, 0xE08B_C001, "ADD ip,r11,r1: {add:#010x}");
7533        // LDR r0, [ip, #8] = 0xE59C0008
7534        assert_eq!(ldr, 0xE59C_0008, "LDR r0,[ip,#8]: {ldr:#010x}");
7535        // A bare immediate ldr (the bug) would be 0xE59B0008 (base=r11) — reject.
7536        assert_ne!(ldr, 0xE59B_0008, "index must not be dropped");
7537    }
7538
7539    /// #178/#180 regression: the Thumb `Add`/`Adds`/`Subs` reg-forms used the
7540    /// 16-bit encoding unconditionally. For high registers (R12 base scratch,
7541    /// R8-R11 i64 pairs) the 3-bit register fields overflow and corrupt the
7542    /// operands — `add ip,ip,r0` came out as `adds r4,r5,r1` (0x186C), silently
7543    /// dropping the address operand and miscompiling every optimized memory
7544    /// access. High registers must use the 32-bit `.W` forms.
7545    #[test]
7546    fn test_encode_thumb_add_high_reg_uses_add_w_178_180() {
7547        let encoder = ArmEncoder::new_thumb2();
7548
7549        // add ip, ip, r0  — the exact MemLoad/MemStore base+addr op.
7550        let code = encoder
7551            .encode(&ArmOp::Add {
7552                rd: Reg::R12,
7553                rn: Reg::R12,
7554                op2: Operand2::Reg(Reg::R0),
7555            })
7556            .unwrap();
7557        // ADD.W ip, ip, r0 = EB0C 0C00 (little-endian halfwords).
7558        assert_eq!(
7559            code,
7560            vec![0x0C, 0xEB, 0x00, 0x0C],
7561            "high-reg Thumb ADD must be 32-bit ADD.W (EB0C 0C00), not corrupt 16-bit; got {code:02X?}"
7562        );
7563        // Must NOT be the buggy 16-bit 0x186C (`adds r4,r5,r1`).
7564        assert_ne!(code, vec![0x6C, 0x18], "regressed to corrupt 16-bit ADDS");
7565
7566        // Low-register add stays 16-bit (no regression for the common case).
7567        let lo = encoder
7568            .encode(&ArmOp::Add {
7569                rd: Reg::R1,
7570                rn: Reg::R2,
7571                op2: Operand2::Reg(Reg::R3),
7572            })
7573            .unwrap();
7574        assert_eq!(
7575            lo.len(),
7576            2,
7577            "low-reg ADD should remain 16-bit, got {lo:02X?}"
7578        );
7579    }
7580
7581    /// #178/#180 sibling: i64 low-word `Adds`/`Subs` can land in R8-R11 pairs;
7582    /// those must fall back to 32-bit ADDS.W/SUBS.W (flag-setting preserved).
7583    #[test]
7584    fn test_encode_thumb_adds_subs_high_reg_use_32bit_178_180() {
7585        let encoder = ArmEncoder::new_thumb2();
7586
7587        // adds r10, r10, r8  → ADDS.W = EB1A 0A08
7588        let adds = encoder
7589            .encode(&ArmOp::Adds {
7590                rd: Reg::R10,
7591                rn: Reg::R10,
7592                op2: Operand2::Reg(Reg::R8),
7593            })
7594            .unwrap();
7595        assert_eq!(
7596            adds,
7597            vec![0x1A, 0xEB, 0x08, 0x0A],
7598            "high-reg ADDS must be 32-bit ADDS.W (EB1A 0A08); got {adds:02X?}"
7599        );
7600
7601        // subs r10, r10, r8  → SUBS.W = EBBA 0A08
7602        let subs = encoder
7603            .encode(&ArmOp::Subs {
7604                rd: Reg::R10,
7605                rn: Reg::R10,
7606                op2: Operand2::Reg(Reg::R8),
7607            })
7608            .unwrap();
7609        assert_eq!(
7610            subs,
7611            vec![0xBA, 0xEB, 0x08, 0x0A],
7612            "high-reg SUBS must be 32-bit SUBS.W (EBBA 0A08); got {subs:02X?}"
7613        );
7614    }
7615
7616    /// #184 (sibling of #180): 16-bit CMN (T1) only encodes R0-R7. High registers
7617    /// must use 32-bit CMN.W, not the corrupt truncated 16-bit form.
7618    #[test]
7619    fn test_encode_thumb_cmn_high_reg_uses_cmn_w_184() {
7620        let encoder = ArmEncoder::new_thumb2();
7621
7622        // cmn r10, r8  → CMN.W = EB1A 0F08 (ADD.W S=1, Rd=PC discarded).
7623        let cmn = encoder
7624            .encode(&ArmOp::Cmn {
7625                rn: Reg::R10,
7626                op2: Operand2::Reg(Reg::R8),
7627            })
7628            .unwrap();
7629        assert_eq!(
7630            cmn,
7631            vec![0x1A, 0xEB, 0x08, 0x0F],
7632            "high-reg CMN must be 32-bit CMN.W (EB1A 0F08); got {cmn:02X?}"
7633        );
7634
7635        // Low registers stay 16-bit: cmn r1, r2 = 0x42D1.
7636        let lo = encoder
7637            .encode(&ArmOp::Cmn {
7638                rn: Reg::R1,
7639                op2: Operand2::Reg(Reg::R2),
7640            })
7641            .unwrap();
7642        assert_eq!(
7643            lo.len(),
7644            2,
7645            "low-reg CMN should remain 16-bit, got {lo:02X?}"
7646        );
7647        assert_eq!(lo, vec![0xD1, 0x42], "low-reg CMN bytes wrong: {lo:02X?}");
7648    }
7649
7650    /// #185 regression: feeding PC (R15) as a data operand to a Thumb-2 op that
7651    /// guards its registers must return Err, not panic under debug-assertions.
7652    /// (Synth never emits PC here; the fuzz harness requires encode() be total.)
7653    #[test]
7654    fn test_encode_pc_operand_returns_err_not_panic_185() {
7655        let encoder = ArmEncoder::new_thumb2();
7656        for op in [
7657            ArmOp::Sdiv {
7658                rd: Reg::PC,
7659                rn: Reg::R0,
7660                rm: Reg::R1,
7661            },
7662            ArmOp::Udiv {
7663                rd: Reg::R0,
7664                rn: Reg::PC,
7665                rm: Reg::R1,
7666            },
7667            ArmOp::Sdiv {
7668                rd: Reg::R0,
7669                rn: Reg::R1,
7670                rm: Reg::PC,
7671            },
7672        ] {
7673            let r = encoder.encode(&op);
7674            assert!(
7675                r.is_err(),
7676                "encode({op:?}) must return Err for a PC operand, got {r:?}"
7677            );
7678        }
7679        // Valid registers still encode fine (no false rejection).
7680        assert!(
7681            encoder
7682                .encode(&ArmOp::Sdiv {
7683                    rd: Reg::R0,
7684                    rn: Reg::R1,
7685                    rm: Reg::R2
7686                })
7687                .is_ok()
7688        );
7689    }
7690
7691    #[test]
7692    fn test_encode_nop_arm32() {
7693        let encoder = ArmEncoder::new_arm32();
7694        let code = encoder.encode(&ArmOp::Nop).unwrap();
7695
7696        assert_eq!(code.len(), 4); // ARM32 instructions are 4 bytes
7697        assert_eq!(code, vec![0x00, 0x00, 0xA0, 0xE1]); // MOV R0, R0
7698    }
7699
7700    #[test]
7701    fn test_encode_nop_thumb() {
7702        let encoder = ArmEncoder::new_thumb2();
7703        let code = encoder.encode(&ArmOp::Nop).unwrap();
7704
7705        assert_eq!(code.len(), 2); // Thumb instructions are 2 bytes
7706        assert_eq!(code, vec![0x00, 0xBF]); // NOP
7707    }
7708
7709    #[test]
7710    fn test_encode_mov_immediate_arm32() {
7711        let encoder = ArmEncoder::new_arm32();
7712        let op = ArmOp::Mov {
7713            rd: Reg::R0,
7714            op2: Operand2::Imm(42),
7715        };
7716
7717        let code = encoder.encode(&op).unwrap();
7718        assert_eq!(code.len(), 4);
7719
7720        // Verify it's a MOV instruction (bits should have immediate flag set)
7721        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7722        assert_eq!(instr & 0x0E000000, 0x02000000); // Check I bit is set
7723    }
7724
7725    #[test]
7726    fn test_encode_add_registers_arm32() {
7727        let encoder = ArmEncoder::new_arm32();
7728        let op = ArmOp::Add {
7729            rd: Reg::R0,
7730            rn: Reg::R1,
7731            op2: Operand2::Reg(Reg::R2),
7732        };
7733
7734        let code = encoder.encode(&op).unwrap();
7735        assert_eq!(code.len(), 4);
7736
7737        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7738        // Verify it's an ADD instruction with correct opcode
7739        assert_eq!(instr & 0x0FE00000, 0x00800000);
7740    }
7741
7742    /// #350 — `encode_thumb32_add_imm` must lower an out-of-range immediate
7743    /// (> 0xFFF) to a legal MOVW(/MOVT) + ADD.W-register sequence instead of
7744    /// erroring. The small-imm fast path (imm <= 0xFFF) stays byte-identical.
7745    #[test]
7746    fn test_encode_add_imm_large_350() {
7747        let enc = ArmEncoder::new_thumb2();
7748
7749        // --- Fast path unchanged: imm <= 0xFFF is a single 4-byte ADD.W ---
7750        let small = enc
7751            .encode_thumb32_add_imm(&Reg::R0, &Reg::R1, 0x123)
7752            .unwrap();
7753        assert_eq!(small.len(), 4, "small imm must stay a single instruction");
7754
7755        // helper: decode a Thumb-2 MOVW/MOVT halfword pair back to its imm16
7756        fn movx_imm16(b: &[u8]) -> u32 {
7757            let hw1 = u16::from_le_bytes([b[0], b[1]]) as u32;
7758            let hw2 = u16::from_le_bytes([b[2], b[3]]) as u32;
7759            let imm4 = hw1 & 0xF;
7760            let i = (hw1 >> 10) & 1;
7761            let imm3 = (hw2 >> 12) & 0x7;
7762            let imm8 = hw2 & 0xFF;
7763            (imm4 << 12) | (i << 11) | (imm3 << 8) | imm8
7764        }
7765        fn movx_rd(b: &[u8]) -> u32 {
7766            (u16::from_le_bytes([b[2], b[3]]) as u32 >> 8) & 0xF
7767        }
7768
7769        // --- rd != rn: scratch is rd. imm = 70000 = 0x11170 needs MOVW+MOVT. ---
7770        // 0x11170: lo16 = 0x1170, hi16 = 0x0001
7771        let seq = enc
7772            .encode_thumb32_add_imm(&Reg::R12, &Reg::R0, 70000)
7773            .unwrap();
7774        assert_eq!(seq.len(), 12, "MOVW + MOVT + ADD = 12 bytes");
7775        // MOVW r12, #0x1170
7776        assert_eq!(u16::from_le_bytes([seq[0], seq[1]]) & 0xFBF0, 0xF240);
7777        assert_eq!(movx_rd(&seq[0..4]), 12);
7778        assert_eq!(movx_imm16(&seq[0..4]), 0x1170);
7779        // MOVT r12, #0x0001
7780        assert_eq!(u16::from_le_bytes([seq[4], seq[5]]) & 0xFBF0, 0xF2C0);
7781        assert_eq!(movx_rd(&seq[4..8]), 12);
7782        assert_eq!(movx_imm16(&seq[4..8]), 0x0001);
7783        // ADD.W r12, r0, r12  (EB00 | rn=0 ; rd=12, rm=12)
7784        let add1 = u16::from_le_bytes([seq[8], seq[9]]) as u32;
7785        let add2 = u16::from_le_bytes([seq[10], seq[11]]) as u32;
7786        assert_eq!(add1 & 0xFFF0, 0xEB00);
7787        assert_eq!(add1 & 0xF, 0); // rn = r0
7788        assert_eq!((add2 >> 8) & 0xF, 12); // rd = r12
7789        assert_eq!(add2 & 0xF, 12); // rm = scratch = r12
7790        // The materialized scratch must reconstruct exactly 70000.
7791        assert_eq!(
7792            (movx_imm16(&seq[4..8]) << 16) | movx_imm16(&seq[0..4]),
7793            70000
7794        );
7795
7796        // --- imm <= 0xFFFF: MOVT is skipped (MOVW + ADD = 8 bytes). ---
7797        let seq16 = enc
7798            .encode_thumb32_add_imm(&Reg::R3, &Reg::R0, 0xABCD)
7799            .unwrap();
7800        assert_eq!(seq16.len(), 8, "imm <= 0xFFFF skips MOVT");
7801        assert_eq!(movx_imm16(&seq16[0..4]), 0xABCD);
7802        assert_eq!(movx_rd(&seq16[0..4]), 3); // scratch = rd = r3
7803
7804        // --- rd == rn (in-place add): scratch must be R12, not rd. ---
7805        // imm = 0x12345: lo16 = 0x2345, hi16 = 0x0001
7806        let inplace = enc
7807            .encode_thumb32_add_imm(&Reg::R5, &Reg::R5, 0x12345)
7808            .unwrap();
7809        assert_eq!(inplace.len(), 12);
7810        assert_eq!(movx_rd(&inplace[0..4]), 12, "rd==rn must use R12 scratch");
7811        assert_eq!(
7812            (movx_imm16(&inplace[4..8]) << 16) | movx_imm16(&inplace[0..4]),
7813            0x12345
7814        );
7815        // ADD.W r5, r5, r12 — rm must be the scratch (12), never rn.
7816        let ip_add2 = u16::from_le_bytes([inplace[10], inplace[11]]) as u32;
7817        assert_eq!(ip_add2 & 0xF, 12);
7818        assert_eq!((ip_add2 >> 8) & 0xF, 5);
7819    }
7820
7821    /// #350 follow-up — the `encoder_no_panic` fuzz harness drives the encoder
7822    /// with ARBITRARY registers, including the one case the in-place lowering
7823    /// cannot serve: rd==rn==R12. There the scratch (R12, the reserved encoder
7824    /// register) would alias Rn and clobber it before the ADD reads it. The
7825    /// encoder contract (#180/#185) is Ok-or-Err, never a panic — so this must
7826    /// return Err, not assert. (Real codegen never emits rd==rn==R12 because R12
7827    /// is non-allocatable; this guards only the fuzz/adversarial path.)
7828    #[test]
7829    fn test_encode_add_imm_large_rd_rn_r12_errs_not_panics_350() {
7830        let enc = ArmEncoder::new_thumb2();
7831        // Out-of-range imm with rd==rn==R12: no free scratch -> Err.
7832        let r = enc.encode_thumb32_add_imm(&Reg::R12, &Reg::R12, 70000);
7833        assert!(
7834            r.is_err(),
7835            "rd==rn==R12 with out-of-range imm must Err (no free scratch), got {r:?}"
7836        );
7837        // Small imm with rd==rn==R12 still takes the single-instruction fast path
7838        // (no scratch needed) and must succeed — the guard is scoped to the
7839        // out-of-range lowering only.
7840        let small = enc.encode_thumb32_add_imm(&Reg::R12, &Reg::R12, 0x10);
7841        assert!(small.is_ok(), "small imm needs no scratch, must stay Ok");
7842    }
7843
7844    #[test]
7845    fn test_encode_ldr_arm32() {
7846        let encoder = ArmEncoder::new_arm32();
7847        let op = ArmOp::Ldr {
7848            rd: Reg::R0,
7849            addr: MemAddr::imm(Reg::R1, 4),
7850        };
7851
7852        let code = encoder.encode(&op).unwrap();
7853        assert_eq!(code.len(), 4);
7854
7855        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7856        // Verify load bit is set
7857        assert_eq!(instr & 0x00100000, 0x00100000);
7858    }
7859
7860    #[test]
7861    fn test_encode_str_arm32() {
7862        let encoder = ArmEncoder::new_arm32();
7863        let op = ArmOp::Str {
7864            rd: Reg::R0,
7865            addr: MemAddr::imm(Reg::SP, 0),
7866        };
7867
7868        let code = encoder.encode(&op).unwrap();
7869        assert_eq!(code.len(), 4);
7870    }
7871
7872    #[test]
7873    fn test_encode_branch_arm32() {
7874        let encoder = ArmEncoder::new_arm32();
7875        let op = ArmOp::Bl {
7876            label: "main".to_string(),
7877        };
7878
7879        let code = encoder.encode(&op).unwrap();
7880        assert_eq!(code.len(), 4);
7881
7882        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7883        // Verify BL opcode
7884        assert_eq!(instr & 0x0F000000, 0x0B000000);
7885    }
7886
7887    /// Regression test for #167 + #174: the Thumb-2 BL relocatable placeholder
7888    /// must carry a -4 addend so an R_ARM_THM_CALL nets to exactly the symbol S.
7889    /// The correct encoding is what `gas` emits for `bl <extern>`: f7ff fffe
7890    /// (hw1=0xF7FF, hw2=0xFFFE), little-endian bytes FF F7 FE FF.
7891    ///   - 0xD000 (J1=J2=0) → ~+0x600000 garbage addend: `bl c0000c` / truncated
7892    ///     to fit (#167).
7893    ///   - 0xF800 (addend 0) → lands at S+4, one instruction past the callee
7894    ///     entry (#174).
7895    ///   - 0xFFFE (addend -4) → lands at S. Correct.
7896    #[test]
7897    fn test_encode_thumb_bl_placeholder_addend_167_174() {
7898        let encoder = ArmEncoder::new_thumb2();
7899        let op = ArmOp::Bl {
7900            label: "callee".to_string(),
7901        };
7902
7903        let code = encoder.encode(&op).unwrap();
7904        assert_eq!(code.len(), 4, "Thumb-2 BL is 32-bit");
7905
7906        let hw1 = u16::from_le_bytes([code[0], code[1]]);
7907        let hw2 = u16::from_le_bytes([code[2], code[3]]);
7908        assert_eq!(hw1, 0xF7FF, "BL first halfword (matches gas `bl <extern>`)");
7909        assert_eq!(
7910            hw2, 0xFFFE,
7911            "BL second halfword must be 0xFFFE (-4 addend → nets to S), not 0xF800 (→ S+4, #174) or 0xD000 (#167)"
7912        );
7913        assert_ne!(hw2, 0xF800, "0xF800 (addend 0) lands at S+4 (#174)");
7914        assert_ne!(hw2, 0xD000, "0xD000 bakes in a ~+0x600000 addend (#167)");
7915    }
7916
7917    #[test]
7918    fn test_encode_sequence() {
7919        let encoder = ArmEncoder::new_arm32();
7920        let ops = vec![
7921            ArmOp::Mov {
7922                rd: Reg::R0,
7923                op2: Operand2::Imm(42),
7924            },
7925            ArmOp::Mov {
7926                rd: Reg::R1,
7927                op2: Operand2::Imm(10),
7928            },
7929            ArmOp::Add {
7930                rd: Reg::R2,
7931                rn: Reg::R0,
7932                op2: Operand2::Reg(Reg::R1),
7933            },
7934        ];
7935
7936        let code = encoder.encode_sequence(&ops).unwrap();
7937        assert_eq!(code.len(), 12); // 3 instructions * 4 bytes
7938    }
7939
7940    #[test]
7941    fn test_reg_to_bits() {
7942        assert_eq!(reg_to_bits(&Reg::R0), 0);
7943        assert_eq!(reg_to_bits(&Reg::R7), 7);
7944        assert_eq!(reg_to_bits(&Reg::SP), 13);
7945        assert_eq!(reg_to_bits(&Reg::LR), 14);
7946        assert_eq!(reg_to_bits(&Reg::PC), 15);
7947    }
7948
7949    #[test]
7950    fn test_encode_bitwise_operations() {
7951        let encoder = ArmEncoder::new_arm32();
7952
7953        let and_op = ArmOp::And {
7954            rd: Reg::R0,
7955            rn: Reg::R1,
7956            op2: Operand2::Reg(Reg::R2),
7957        };
7958        let and_code = encoder.encode(&and_op).unwrap();
7959        assert_eq!(and_code.len(), 4);
7960
7961        let orr_op = ArmOp::Orr {
7962            rd: Reg::R0,
7963            rn: Reg::R1,
7964            op2: Operand2::Reg(Reg::R2),
7965        };
7966        let orr_code = encoder.encode(&orr_op).unwrap();
7967        assert_eq!(orr_code.len(), 4);
7968
7969        let eor_op = ArmOp::Eor {
7970            rd: Reg::R0,
7971            rn: Reg::R1,
7972            op2: Operand2::Reg(Reg::R2),
7973        };
7974        let eor_code = encoder.encode(&eor_op).unwrap();
7975        assert_eq!(eor_code.len(), 4);
7976    }
7977
7978    // === Thumb-2 32-bit encoding tests ===
7979
7980    #[test]
7981    fn test_encode_sdiv_thumb2() {
7982        let encoder = ArmEncoder::new_thumb2();
7983        let op = ArmOp::Sdiv {
7984            rd: Reg::R0,
7985            rn: Reg::R1,
7986            rm: Reg::R2,
7987        };
7988
7989        let code = encoder.encode(&op).unwrap();
7990        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7991
7992        // SDIV R0, R1, R2: 0xFB91 0xF0F2
7993        // First halfword: 0xFB90 | Rn(1) = 0xFB91
7994        // Second halfword: 0xF0F0 | Rd(0)<<8 | Rm(2) = 0xF0F2
7995        // Little-endian: [0x91, 0xFB, 0xF2, 0xF0]
7996        assert_eq!(code[0], 0x91);
7997        assert_eq!(code[1], 0xFB);
7998        assert_eq!(code[2], 0xF2);
7999        assert_eq!(code[3], 0xF0);
8000    }
8001
8002    #[test]
8003    fn test_encode_udiv_thumb2() {
8004        let encoder = ArmEncoder::new_thumb2();
8005        let op = ArmOp::Udiv {
8006            rd: Reg::R0,
8007            rn: Reg::R1,
8008            rm: Reg::R2,
8009        };
8010
8011        let code = encoder.encode(&op).unwrap();
8012        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
8013
8014        // UDIV R0, R1, R2: 0xFBB1 0xF0F2
8015        // Little-endian: [0xB1, 0xFB, 0xF2, 0xF0]
8016        assert_eq!(code[0], 0xB1);
8017        assert_eq!(code[1], 0xFB);
8018        assert_eq!(code[2], 0xF2);
8019        assert_eq!(code[3], 0xF0);
8020    }
8021
8022    #[test]
8023    fn test_encode_mul_thumb2() {
8024        let encoder = ArmEncoder::new_thumb2();
8025        let op = ArmOp::Mul {
8026            rd: Reg::R0,
8027            rn: Reg::R1,
8028            rm: Reg::R2,
8029        };
8030
8031        let code = encoder.encode(&op).unwrap();
8032        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
8033    }
8034
8035    #[test]
8036    fn test_encode_and_thumb2() {
8037        let encoder = ArmEncoder::new_thumb2();
8038        let op = ArmOp::And {
8039            rd: Reg::R0,
8040            rn: Reg::R1,
8041            op2: Operand2::Reg(Reg::R2),
8042        };
8043
8044        let code = encoder.encode(&op).unwrap();
8045        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
8046    }
8047
8048    #[test]
8049    fn test_encode_lsl_thumb2_low_regs() {
8050        let encoder = ArmEncoder::new_thumb2();
8051        let op = ArmOp::Lsl {
8052            rd: Reg::R0,
8053            rn: Reg::R1,
8054            shift: 5,
8055        };
8056
8057        let code = encoder.encode(&op).unwrap();
8058        assert_eq!(code.len(), 2); // 16-bit for low registers
8059    }
8060
8061    #[test]
8062    fn test_encode_clz_thumb2() {
8063        let encoder = ArmEncoder::new_thumb2();
8064        let op = ArmOp::Clz {
8065            rd: Reg::R0,
8066            rm: Reg::R1,
8067        };
8068
8069        let code = encoder.encode(&op).unwrap();
8070        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
8071    }
8072
8073    #[test]
8074    fn test_encode_bx_thumb2() {
8075        let encoder = ArmEncoder::new_thumb2();
8076        let op = ArmOp::Bx { rm: Reg::LR };
8077
8078        let code = encoder.encode(&op).unwrap();
8079        assert_eq!(code.len(), 2); // 16-bit instruction
8080
8081        // BX LR: 0x4770
8082        assert_eq!(code, vec![0x70, 0x47]);
8083    }
8084
8085    // ========================================================================
8086    // f32 pseudo-op encoding tests
8087    // ========================================================================
8088
8089    #[test]
8090    fn test_encode_f32_abs_arm32() {
8091        let encoder = ArmEncoder::new_arm32();
8092        let op = ArmOp::F32Abs {
8093            sd: VfpReg::S0,
8094            sm: VfpReg::S2,
8095        };
8096        let code = encoder.encode(&op).unwrap();
8097        assert_eq!(code.len(), 4); // Single VFP instruction
8098    }
8099
8100    #[test]
8101    fn test_encode_f32_neg_arm32() {
8102        let encoder = ArmEncoder::new_arm32();
8103        let op = ArmOp::F32Neg {
8104            sd: VfpReg::S0,
8105            sm: VfpReg::S2,
8106        };
8107        let code = encoder.encode(&op).unwrap();
8108        assert_eq!(code.len(), 4);
8109    }
8110
8111    #[test]
8112    fn test_encode_f32_sqrt_arm32() {
8113        let encoder = ArmEncoder::new_arm32();
8114        let op = ArmOp::F32Sqrt {
8115            sd: VfpReg::S0,
8116            sm: VfpReg::S2,
8117        };
8118        let code = encoder.encode(&op).unwrap();
8119        assert_eq!(code.len(), 4);
8120    }
8121
8122    #[test]
8123    fn test_encode_f32_ceil_arm32() {
8124        let encoder = ArmEncoder::new_arm32();
8125        let op = ArmOp::F32Ceil {
8126            sd: VfpReg::S0,
8127            sm: VfpReg::S2,
8128        };
8129        let code = encoder.encode(&op).unwrap();
8130        // VMRS + BIC + ORR + VMSR + VCVT.S32.F32 + VMRS + BIC + VMSR + VCVT.F32.S32
8131        assert_eq!(code.len(), 36);
8132    }
8133
8134    #[test]
8135    fn test_encode_f32_floor_thumb2() {
8136        let encoder = ArmEncoder::new_thumb2();
8137        let op = ArmOp::F32Floor {
8138            sd: VfpReg::S0,
8139            sm: VfpReg::S2,
8140        };
8141        let code = encoder.encode(&op).unwrap();
8142        // VMRS + BIC.W + ORR.W + VMSR + VCVT + VMRS + BIC.W + VMSR + VCVT.F32.S32
8143        assert_eq!(code.len(), 36);
8144    }
8145
8146    #[test]
8147    fn test_encode_f32_min_arm32() {
8148        let encoder = ArmEncoder::new_arm32();
8149        let op = ArmOp::F32Min {
8150            sd: VfpReg::S0,
8151            sn: VfpReg::S2,
8152            sm: VfpReg::S4,
8153        };
8154        let code = encoder.encode(&op).unwrap();
8155        assert_eq!(code.len(), 16); // VMOV + VCMP + VMRS + conditional VMOV
8156    }
8157
8158    #[test]
8159    fn test_encode_f32_max_thumb2() {
8160        let encoder = ArmEncoder::new_thumb2();
8161        let op = ArmOp::F32Max {
8162            sd: VfpReg::S0,
8163            sn: VfpReg::S2,
8164            sm: VfpReg::S4,
8165        };
8166        let code = encoder.encode(&op).unwrap();
8167        // VMOV(4) + VCMP(4) + VMRS(4) + IT(2) + VMOV(4) = 18
8168        assert_eq!(code.len(), 18);
8169    }
8170
8171    #[test]
8172    fn test_encode_f32_copysign_arm32() {
8173        let encoder = ArmEncoder::new_arm32();
8174        let op = ArmOp::F32Copysign {
8175            sd: VfpReg::S0,
8176            sn: VfpReg::S2,
8177            sm: VfpReg::S4,
8178        };
8179        let code = encoder.encode(&op).unwrap();
8180        // VMOV + VMOV + AND + BIC + ORR + VMOV = 6 * 4 = 24
8181        assert_eq!(code.len(), 24);
8182    }
8183
8184    // ========================================================================
8185    // f64 encoding tests
8186    // ========================================================================
8187
8188    #[test]
8189    fn test_encode_f64_add_arm32() {
8190        let encoder = ArmEncoder::new_arm32();
8191        let op = ArmOp::F64Add {
8192            dd: VfpReg::D0,
8193            dn: VfpReg::D1,
8194            dm: VfpReg::D2,
8195        };
8196        let code = encoder.encode(&op).unwrap();
8197        assert_eq!(code.len(), 4);
8198        // VADD.F64 D0, D1, D2: check coprocessor is cp11 (0xB)
8199        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8200        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
8201    }
8202
8203    #[test]
8204    fn test_encode_f64_sub_thumb2() {
8205        let encoder = ArmEncoder::new_thumb2();
8206        let op = ArmOp::F64Sub {
8207            dd: VfpReg::D0,
8208            dn: VfpReg::D1,
8209            dm: VfpReg::D2,
8210        };
8211        let code = encoder.encode(&op).unwrap();
8212        assert_eq!(code.len(), 4); // 32-bit VFP as two Thumb halfwords
8213    }
8214
8215    #[test]
8216    fn test_encode_f64_mul_arm32() {
8217        let encoder = ArmEncoder::new_arm32();
8218        let op = ArmOp::F64Mul {
8219            dd: VfpReg::D0,
8220            dn: VfpReg::D1,
8221            dm: VfpReg::D2,
8222        };
8223        let code = encoder.encode(&op).unwrap();
8224        assert_eq!(code.len(), 4);
8225    }
8226
8227    #[test]
8228    fn test_encode_f64_div_arm32() {
8229        let encoder = ArmEncoder::new_arm32();
8230        let op = ArmOp::F64Div {
8231            dd: VfpReg::D0,
8232            dn: VfpReg::D1,
8233            dm: VfpReg::D2,
8234        };
8235        let code = encoder.encode(&op).unwrap();
8236        assert_eq!(code.len(), 4);
8237    }
8238
8239    #[test]
8240    fn test_encode_f64_abs_arm32() {
8241        let encoder = ArmEncoder::new_arm32();
8242        let op = ArmOp::F64Abs {
8243            dd: VfpReg::D0,
8244            dm: VfpReg::D2,
8245        };
8246        let code = encoder.encode(&op).unwrap();
8247        assert_eq!(code.len(), 4);
8248    }
8249
8250    #[test]
8251    fn test_encode_f64_neg_arm32() {
8252        let encoder = ArmEncoder::new_arm32();
8253        let op = ArmOp::F64Neg {
8254            dd: VfpReg::D0,
8255            dm: VfpReg::D2,
8256        };
8257        let code = encoder.encode(&op).unwrap();
8258        assert_eq!(code.len(), 4);
8259    }
8260
8261    #[test]
8262    fn test_encode_f64_sqrt_arm32() {
8263        let encoder = ArmEncoder::new_arm32();
8264        let op = ArmOp::F64Sqrt {
8265            dd: VfpReg::D0,
8266            dm: VfpReg::D2,
8267        };
8268        let code = encoder.encode(&op).unwrap();
8269        assert_eq!(code.len(), 4);
8270    }
8271
8272    #[test]
8273    fn test_encode_f64_load_arm32() {
8274        let encoder = ArmEncoder::new_arm32();
8275        let op = ArmOp::F64Load {
8276            dd: VfpReg::D0,
8277            addr: MemAddr::imm(Reg::R0, 8),
8278        };
8279        let code = encoder.encode(&op).unwrap();
8280        assert_eq!(code.len(), 4);
8281        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8282        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11 for F64
8283        assert_eq!(instr & 0xFF, 2); // offset 8 / 4 = 2
8284    }
8285
8286    #[test]
8287    fn test_encode_f64_store_thumb2() {
8288        let encoder = ArmEncoder::new_thumb2();
8289        let op = ArmOp::F64Store {
8290            dd: VfpReg::D0,
8291            addr: MemAddr::imm(Reg::SP, 0),
8292        };
8293        let code = encoder.encode(&op).unwrap();
8294        assert_eq!(code.len(), 4);
8295    }
8296
8297    #[test]
8298    fn test_encode_f64_compare_arm32() {
8299        let encoder = ArmEncoder::new_arm32();
8300        let op = ArmOp::F64Eq {
8301            rd: Reg::R0,
8302            dn: VfpReg::D0,
8303            dm: VfpReg::D1,
8304        };
8305        let code = encoder.encode(&op).unwrap();
8306        assert_eq!(code.len(), 16); // VCMP + VMRS + MOV #0 + MOVcond #1
8307    }
8308
8309    #[test]
8310    fn test_encode_f64_compare_thumb2() {
8311        let encoder = ArmEncoder::new_thumb2();
8312        let op = ArmOp::F64Lt {
8313            rd: Reg::R0,
8314            dn: VfpReg::D0,
8315            dm: VfpReg::D1,
8316        };
8317        let code = encoder.encode(&op).unwrap();
8318        // VCMP(4) + VMRS(4) + MOVS(2) + IT(2) + MOV(2) = 14
8319        assert_eq!(code.len(), 14);
8320    }
8321
8322    #[test]
8323    fn test_encode_f64_const_arm32() {
8324        let encoder = ArmEncoder::new_arm32();
8325        let op = ArmOp::F64Const {
8326            dd: VfpReg::D0,
8327            value: 3.125,
8328        };
8329        let code = encoder.encode(&op).unwrap();
8330        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
8331        assert_eq!(code.len(), 20);
8332    }
8333
8334    #[test]
8335    fn test_encode_f64_const_thumb2() {
8336        let encoder = ArmEncoder::new_thumb2();
8337        let op = ArmOp::F64Const {
8338            dd: VfpReg::D0,
8339            value: 2.5,
8340        };
8341        let code = encoder.encode(&op).unwrap();
8342        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
8343        assert_eq!(code.len(), 20);
8344    }
8345
8346    #[test]
8347    fn test_encode_f64_convert_i32s_arm32() {
8348        let encoder = ArmEncoder::new_arm32();
8349        let op = ArmOp::F64ConvertI32S {
8350            dd: VfpReg::D0,
8351            rm: Reg::R0,
8352        };
8353        let code = encoder.encode(&op).unwrap();
8354        // VMOV(4) + VCVT(4) = 8
8355        assert_eq!(code.len(), 8);
8356    }
8357
8358    #[test]
8359    fn test_encode_f64_promote_f32_arm32() {
8360        let encoder = ArmEncoder::new_arm32();
8361        let op = ArmOp::F64PromoteF32 {
8362            dd: VfpReg::D0,
8363            sm: VfpReg::S0,
8364        };
8365        let code = encoder.encode(&op).unwrap();
8366        assert_eq!(code.len(), 4); // Single VCVT.F64.F32 instruction
8367    }
8368
8369    #[test]
8370    fn test_encode_f64_promote_f32_thumb2() {
8371        let encoder = ArmEncoder::new_thumb2();
8372        let op = ArmOp::F64PromoteF32 {
8373            dd: VfpReg::D0,
8374            sm: VfpReg::S0,
8375        };
8376        let code = encoder.encode(&op).unwrap();
8377        assert_eq!(code.len(), 4);
8378    }
8379
8380    #[test]
8381    fn test_encode_i32_trunc_f64s_arm32() {
8382        let encoder = ArmEncoder::new_arm32();
8383        let op = ArmOp::I32TruncF64S {
8384            rd: Reg::R0,
8385            dm: VfpReg::D0,
8386        };
8387        let code = encoder.encode(&op).unwrap();
8388        // VCVT(4) + VMOV(4) = 8
8389        assert_eq!(code.len(), 8);
8390    }
8391
8392    #[test]
8393    fn test_encode_f64_reinterpret_i64_arm32() {
8394        let encoder = ArmEncoder::new_arm32();
8395        let op = ArmOp::F64ReinterpretI64 {
8396            dd: VfpReg::D0,
8397            rmlo: Reg::R0,
8398            rmhi: Reg::R1,
8399        };
8400        let code = encoder.encode(&op).unwrap();
8401        assert_eq!(code.len(), 4); // Single VMOV instruction
8402    }
8403
8404    #[test]
8405    fn test_encode_i64_reinterpret_f64_thumb2() {
8406        let encoder = ArmEncoder::new_thumb2();
8407        let op = ArmOp::I64ReinterpretF64 {
8408            rdlo: Reg::R0,
8409            rdhi: Reg::R1,
8410            dm: VfpReg::D0,
8411        };
8412        let code = encoder.encode(&op).unwrap();
8413        assert_eq!(code.len(), 4);
8414    }
8415
8416    #[test]
8417    fn test_encode_f64_trunc_thumb2() {
8418        let encoder = ArmEncoder::new_thumb2();
8419        let op = ArmOp::F64Trunc {
8420            dd: VfpReg::D0,
8421            dm: VfpReg::D1,
8422        };
8423        let code = encoder.encode(&op).unwrap();
8424        // Two VFP instructions via Thumb encoding
8425        assert_eq!(code.len(), 8);
8426    }
8427
8428    #[test]
8429    fn test_encode_f64_min_arm32() {
8430        let encoder = ArmEncoder::new_arm32();
8431        let op = ArmOp::F64Min {
8432            dd: VfpReg::D0,
8433            dn: VfpReg::D1,
8434            dm: VfpReg::D2,
8435        };
8436        let code = encoder.encode(&op).unwrap();
8437        // VMOV + VCMP + VMRS + conditional VMOV = 16
8438        assert_eq!(code.len(), 16);
8439    }
8440
8441    #[test]
8442    fn test_f64_cp11_encoding() {
8443        // Verify that F64 instructions use coprocessor 11 (0xB), not 10 (0xA)
8444        let encoder = ArmEncoder::new_arm32();
8445
8446        // F64Add
8447        let code = encoder
8448            .encode(&ArmOp::F64Add {
8449                dd: VfpReg::D0,
8450                dn: VfpReg::D0,
8451                dm: VfpReg::D0,
8452            })
8453            .unwrap();
8454        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8455        assert_eq!((instr >> 8) & 0xF, 0xB, "F64 should use cp11");
8456
8457        // F32Add for comparison
8458        let code = encoder
8459            .encode(&ArmOp::F32Add {
8460                sd: VfpReg::S0,
8461                sn: VfpReg::S0,
8462                sm: VfpReg::S0,
8463            })
8464            .unwrap();
8465        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8466        assert_eq!((instr >> 8) & 0xF, 0xA, "F32 should use cp10");
8467    }
8468
8469    #[test]
8470    fn test_dreg_encoding_higher_registers() {
8471        let encoder = ArmEncoder::new_arm32();
8472
8473        // Test with D15 (highest register)
8474        let op = ArmOp::F64Add {
8475            dd: VfpReg::D15,
8476            dn: VfpReg::D14,
8477            dm: VfpReg::D13,
8478        };
8479        let code = encoder.encode(&op).unwrap();
8480        assert_eq!(code.len(), 4);
8481
8482        // Verify the register encoding worked (instruction is valid)
8483        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8484        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
8485    }
8486
8487    // ========================================================================
8488    // Control flow encoding tests
8489    // ========================================================================
8490
8491    #[test]
8492    fn test_encode_label_emits_no_bytes() {
8493        let encoder = ArmEncoder::new_thumb2();
8494        let op = ArmOp::Label {
8495            name: ".Lblock_end_0".to_string(),
8496        };
8497        let code = encoder.encode(&op).unwrap();
8498        assert!(code.is_empty(), "Label should emit zero bytes");
8499
8500        let encoder32 = ArmEncoder::new_arm32();
8501        let code32 = encoder32.encode(&op).unwrap();
8502        assert!(
8503            code32.is_empty(),
8504            "Label should emit zero bytes in ARM32 too"
8505        );
8506    }
8507
8508    #[test]
8509    fn test_encode_bcc_eq_thumb2() {
8510        use synth_synthesis::Condition;
8511        let encoder = ArmEncoder::new_thumb2();
8512        let op = ArmOp::Bcc {
8513            cond: Condition::EQ,
8514            label: "target".to_string(),
8515        };
8516        let code = encoder.encode(&op).unwrap();
8517        assert_eq!(code.len(), 2); // 16-bit conditional branch
8518
8519        // BEQ with offset 0: 0xD000 in little-endian
8520        assert_eq!(code, vec![0x00, 0xD0]);
8521    }
8522
8523    #[test]
8524    fn test_encode_bcc_ne_thumb2() {
8525        use synth_synthesis::Condition;
8526        let encoder = ArmEncoder::new_thumb2();
8527        let op = ArmOp::Bcc {
8528            cond: Condition::NE,
8529            label: "target".to_string(),
8530        };
8531        let code = encoder.encode(&op).unwrap();
8532        assert_eq!(code.len(), 2);
8533
8534        // BNE with offset 0: 0xD100 in little-endian
8535        assert_eq!(code, vec![0x00, 0xD1]);
8536    }
8537
8538    #[test]
8539    fn test_encode_bcc_arm32() {
8540        use synth_synthesis::Condition;
8541        let encoder = ArmEncoder::new_arm32();
8542        let op = ArmOp::Bcc {
8543            cond: Condition::EQ,
8544            label: "target".to_string(),
8545        };
8546        let code = encoder.encode(&op).unwrap();
8547        assert_eq!(code.len(), 4); // 32-bit ARM instruction
8548
8549        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8550        // BEQ: cond=0x0, opcode=0xA, offset=0
8551        assert_eq!(instr & 0xF0000000, 0x00000000); // EQ condition
8552        assert_eq!(instr & 0x0F000000, 0x0A000000); // Branch opcode
8553    }
8554
8555    #[test]
8556    fn test_encode_udf_thumb2() {
8557        let encoder = ArmEncoder::new_thumb2();
8558        let op = ArmOp::Udf { imm: 0 };
8559        let code = encoder.encode(&op).unwrap();
8560        assert_eq!(code.len(), 2); // 16-bit
8561
8562        // UDF #0: 0xDE00 in little-endian
8563        assert_eq!(code, vec![0x00, 0xDE]);
8564    }
8565
8566    #[test]
8567    fn test_encode_nop_thumb2() {
8568        let encoder = ArmEncoder::new_thumb2();
8569        let op = ArmOp::Nop;
8570        let code = encoder.encode(&op).unwrap();
8571        assert_eq!(code.len(), 2); // 16-bit
8572
8573        // NOP: 0xBF00 in little-endian
8574        assert_eq!(code, vec![0x00, 0xBF]);
8575    }
8576
8577    // =========================================================================
8578    // i64 Thumb-2 encoding tests
8579    // =========================================================================
8580
8581    #[test]
8582    fn test_encode_i64_add_thumb2() {
8583        let encoder = ArmEncoder::new_thumb2();
8584        let op = ArmOp::I64Add {
8585            rdlo: Reg::R0,
8586            rdhi: Reg::R1,
8587            rnlo: Reg::R0,
8588            rnhi: Reg::R1,
8589            rmlo: Reg::R2,
8590            rmhi: Reg::R3,
8591        };
8592        let code = encoder.encode(&op).unwrap();
8593        // Should emit ADDS (2 bytes) + ADC.W (4 bytes) = 6 bytes
8594        assert_eq!(code.len(), 6, "I64Add should be 6 bytes (ADDS + ADC.W)");
8595    }
8596
8597    #[test]
8598    fn test_encode_i64_sub_thumb2() {
8599        let encoder = ArmEncoder::new_thumb2();
8600        let op = ArmOp::I64Sub {
8601            rdlo: Reg::R0,
8602            rdhi: Reg::R1,
8603            rnlo: Reg::R0,
8604            rnhi: Reg::R1,
8605            rmlo: Reg::R2,
8606            rmhi: Reg::R3,
8607        };
8608        let code = encoder.encode(&op).unwrap();
8609        // Should emit SUBS (2 bytes) + SBC.W (4 bytes) = 6 bytes
8610        assert_eq!(code.len(), 6, "I64Sub should be 6 bytes (SUBS + SBC.W)");
8611    }
8612
8613    #[test]
8614    fn test_encode_i64_and_thumb2() {
8615        let encoder = ArmEncoder::new_thumb2();
8616        let op = ArmOp::I64And {
8617            rdlo: Reg::R0,
8618            rdhi: Reg::R1,
8619            rnlo: Reg::R0,
8620            rnhi: Reg::R1,
8621            rmlo: Reg::R2,
8622            rmhi: Reg::R3,
8623        };
8624        let code = encoder.encode(&op).unwrap();
8625        // AND.W (4 bytes) + AND.W (4 bytes) = 8 bytes
8626        assert!(code.len() >= 4, "I64And should emit at least 4 bytes");
8627    }
8628
8629    #[test]
8630    fn test_encode_i64_or_thumb2() {
8631        let encoder = ArmEncoder::new_thumb2();
8632        let op = ArmOp::I64Or {
8633            rdlo: Reg::R0,
8634            rdhi: Reg::R1,
8635            rnlo: Reg::R0,
8636            rnhi: Reg::R1,
8637            rmlo: Reg::R2,
8638            rmhi: Reg::R3,
8639        };
8640        let code = encoder.encode(&op).unwrap();
8641        assert!(code.len() >= 4, "I64Or should emit at least 4 bytes");
8642    }
8643
8644    #[test]
8645    fn test_encode_i64_xor_thumb2() {
8646        let encoder = ArmEncoder::new_thumb2();
8647        let op = ArmOp::I64Xor {
8648            rdlo: Reg::R0,
8649            rdhi: Reg::R1,
8650            rnlo: Reg::R0,
8651            rnhi: Reg::R1,
8652            rmlo: Reg::R2,
8653            rmhi: Reg::R3,
8654        };
8655        let code = encoder.encode(&op).unwrap();
8656        assert!(code.len() >= 4, "I64Xor should emit at least 4 bytes");
8657    }
8658
8659    #[test]
8660    fn test_encode_i64_const_small_thumb2() {
8661        let encoder = ArmEncoder::new_thumb2();
8662        // Small constant: only needs MOVW for each half
8663        let op = ArmOp::I64Const {
8664            rdlo: Reg::R0,
8665            rdhi: Reg::R1,
8666            value: 42,
8667        };
8668        let code = encoder.encode(&op).unwrap();
8669        // MOVW R0, #42 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes minimum
8670        assert!(code.len() >= 8, "I64Const should emit at least 8 bytes");
8671    }
8672
8673    #[test]
8674    fn test_encode_i64_const_large_thumb2() {
8675        let encoder = ArmEncoder::new_thumb2();
8676        // Large constant: needs MOVW+MOVT for each half
8677        let op = ArmOp::I64Const {
8678            rdlo: Reg::R0,
8679            rdhi: Reg::R1,
8680            value: 0x1234_5678_9ABC_DEF0_u64 as i64,
8681        };
8682        let code = encoder.encode(&op).unwrap();
8683        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8684        assert_eq!(
8685            code.len(),
8686            16,
8687            "I64Const with large value should be 16 bytes"
8688        );
8689    }
8690
8691    #[test]
8692    fn test_encode_i64_extend_i32_s_thumb2() {
8693        let encoder = ArmEncoder::new_thumb2();
8694        let op = ArmOp::I64ExtendI32S {
8695            rdlo: Reg::R0,
8696            rdhi: Reg::R1,
8697            rn: Reg::R0,
8698        };
8699        let code = encoder.encode(&op).unwrap();
8700        // When rdlo == rn, only ASR (4 bytes) is emitted
8701        assert_eq!(
8702            code.len(),
8703            4,
8704            "I64ExtendI32S (same reg) should be 4 bytes (ASR only)"
8705        );
8706    }
8707
8708    #[test]
8709    fn test_encode_i64_extend_i32_s_diff_reg_thumb2() {
8710        let encoder = ArmEncoder::new_thumb2();
8711        let op = ArmOp::I64ExtendI32S {
8712            rdlo: Reg::R0,
8713            rdhi: Reg::R1,
8714            rn: Reg::R2,
8715        };
8716        let code = encoder.encode(&op).unwrap();
8717        // MOV rdlo, rn (2 bytes for low regs) + ASR rdhi, rdlo, #31 (4 bytes) = 6 bytes
8718        assert!(
8719            code.len() >= 6,
8720            "I64ExtendI32S (diff reg) should be at least 6 bytes"
8721        );
8722    }
8723
8724    #[test]
8725    fn test_encode_i64_extend_i32_u_thumb2() {
8726        let encoder = ArmEncoder::new_thumb2();
8727        let op = ArmOp::I64ExtendI32U {
8728            rdlo: Reg::R0,
8729            rdhi: Reg::R1,
8730            rn: Reg::R0,
8731        };
8732        let code = encoder.encode(&op).unwrap();
8733        // When rdlo == rn, only MOV rdhi, #0 (2 bytes) is emitted
8734        assert_eq!(
8735            code.len(),
8736            2,
8737            "I64ExtendI32U (same reg) should be 2 bytes (MOV #0 only)"
8738        );
8739    }
8740
8741    #[test]
8742    fn test_encode_i32_wrap_i64_nop_thumb2() {
8743        let encoder = ArmEncoder::new_thumb2();
8744        // When rd == rnlo, should be a NOP
8745        let op = ArmOp::I32WrapI64 {
8746            rd: Reg::R0,
8747            rnlo: Reg::R0,
8748        };
8749        let code = encoder.encode(&op).unwrap();
8750        assert_eq!(code.len(), 2, "I32WrapI64 same reg should be NOP (2 bytes)");
8751        assert_eq!(code, vec![0x00, 0xBF]); // NOP
8752    }
8753
8754    #[test]
8755    fn test_encode_i32_wrap_i64_diff_reg_thumb2() {
8756        let encoder = ArmEncoder::new_thumb2();
8757        let op = ArmOp::I32WrapI64 {
8758            rd: Reg::R2,
8759            rnlo: Reg::R0,
8760        };
8761        let code = encoder.encode(&op).unwrap();
8762        // MOV R2, R0 (2 or 4 bytes)
8763        assert!(
8764            code.len() >= 2,
8765            "I32WrapI64 diff reg should emit at least 2 bytes"
8766        );
8767    }
8768
8769    #[test]
8770    fn test_encode_i64_eqz_thumb2() {
8771        let encoder = ArmEncoder::new_thumb2();
8772        let op = ArmOp::I64Eqz {
8773            rd: Reg::R0,
8774            rnlo: Reg::R0,
8775            rnhi: Reg::R1,
8776        };
8777        let code = encoder.encode(&op).unwrap();
8778        // Delegates to I64SetCondZ which is already encoded
8779        assert!(
8780            code.len() >= 6,
8781            "I64Eqz should emit at least 6 bytes for ORR+ITE+MOV+MOV"
8782        );
8783    }
8784
8785    #[test]
8786    fn test_encode_i64_eq_thumb2() {
8787        let encoder = ArmEncoder::new_thumb2();
8788        let op = ArmOp::I64Eq {
8789            rd: Reg::R0,
8790            rnlo: Reg::R0,
8791            rnhi: Reg::R1,
8792            rmlo: Reg::R2,
8793            rmhi: Reg::R3,
8794        };
8795        let code = encoder.encode(&op).unwrap();
8796        // Delegates to I64SetCond EQ: CMP lo + IT EQ + CMPEQ hi + ITE EQ + MOV 1 + MOV 0
8797        assert!(code.len() >= 10, "I64Eq should emit at least 10 bytes");
8798    }
8799
8800    #[test]
8801    fn test_encode_i64_ldr_thumb2() {
8802        let encoder = ArmEncoder::new_thumb2();
8803        let op = ArmOp::I64Ldr {
8804            rdlo: Reg::R0,
8805            rdhi: Reg::R1,
8806            addr: MemAddr::imm(Reg::SP, 0),
8807        };
8808        let code = encoder.encode(&op).unwrap();
8809        // Two LDR instructions (lo at offset, hi at offset+4)
8810        assert!(code.len() >= 4, "I64Ldr should emit at least 4 bytes");
8811    }
8812
8813    #[test]
8814    fn test_372_i64_ldr_indexed_materializes_address() {
8815        // #372: a memory i64.load carries an index register (R11 + addr + off).
8816        // The encoder must materialize `ip = base + index` (ADD.W) and load via
8817        // `[ip,#off]` — NOT drop the index. A frame (non-indexed) i64.load must
8818        // stay byte-identical (plain `[base,#off]`, no ADD).
8819        let encoder = ArmEncoder::new_thumb2();
8820        let indexed = encoder
8821            .encode(&ArmOp::I64Ldr {
8822                rdlo: Reg::R0,
8823                rdhi: Reg::R1,
8824                addr: MemAddr::reg_imm(Reg::R11, Reg::R0, 0),
8825            })
8826            .unwrap();
8827        // ADD.W ip, fp, r0 = eb0b 0c00 (byte-verified vs arm-none-eabi-as).
8828        assert_eq!(
8829            &indexed[0..4],
8830            &[0x0b, 0xeb, 0x00, 0x0c],
8831            "indexed I64Ldr must start with ADD.W ip, base, index"
8832        );
8833        let frame = encoder
8834            .encode(&ArmOp::I64Ldr {
8835                rdlo: Reg::R0,
8836                rdhi: Reg::R1,
8837                addr: MemAddr::imm(Reg::SP, 8),
8838            })
8839            .unwrap();
8840        // No index -> no ADD.W prefix (byte-identical frame access).
8841        assert_ne!(
8842            &frame[0..2],
8843            &[0x0b, 0xeb],
8844            "frame (non-indexed) I64Ldr must NOT emit an ADD.W"
8845        );
8846    }
8847
8848    #[test]
8849    fn test_encode_i64_str_thumb2() {
8850        let encoder = ArmEncoder::new_thumb2();
8851        let op = ArmOp::I64Str {
8852            rdlo: Reg::R0,
8853            rdhi: Reg::R1,
8854            addr: MemAddr::imm(Reg::SP, 0),
8855        };
8856        let code = encoder.encode(&op).unwrap();
8857        // Two STR instructions (lo at offset, hi at offset+4)
8858        assert!(code.len() >= 4, "I64Str should emit at least 4 bytes");
8859    }
8860
8861    #[test]
8862    fn test_encode_i64_all_comparisons_thumb2() {
8863        let encoder = ArmEncoder::new_thumb2();
8864
8865        let ops = vec![
8866            ArmOp::I64Ne {
8867                rd: Reg::R0,
8868                rnlo: Reg::R0,
8869                rnhi: Reg::R1,
8870                rmlo: Reg::R2,
8871                rmhi: Reg::R3,
8872            },
8873            ArmOp::I64LtS {
8874                rd: Reg::R0,
8875                rnlo: Reg::R0,
8876                rnhi: Reg::R1,
8877                rmlo: Reg::R2,
8878                rmhi: Reg::R3,
8879            },
8880            ArmOp::I64LtU {
8881                rd: Reg::R0,
8882                rnlo: Reg::R0,
8883                rnhi: Reg::R1,
8884                rmlo: Reg::R2,
8885                rmhi: Reg::R3,
8886            },
8887            ArmOp::I64LeS {
8888                rd: Reg::R0,
8889                rnlo: Reg::R0,
8890                rnhi: Reg::R1,
8891                rmlo: Reg::R2,
8892                rmhi: Reg::R3,
8893            },
8894            ArmOp::I64LeU {
8895                rd: Reg::R0,
8896                rnlo: Reg::R0,
8897                rnhi: Reg::R1,
8898                rmlo: Reg::R2,
8899                rmhi: Reg::R3,
8900            },
8901            ArmOp::I64GtS {
8902                rd: Reg::R0,
8903                rnlo: Reg::R0,
8904                rnhi: Reg::R1,
8905                rmlo: Reg::R2,
8906                rmhi: Reg::R3,
8907            },
8908            ArmOp::I64GtU {
8909                rd: Reg::R0,
8910                rnlo: Reg::R0,
8911                rnhi: Reg::R1,
8912                rmlo: Reg::R2,
8913                rmhi: Reg::R3,
8914            },
8915            ArmOp::I64GeS {
8916                rd: Reg::R0,
8917                rnlo: Reg::R0,
8918                rnhi: Reg::R1,
8919                rmlo: Reg::R2,
8920                rmhi: Reg::R3,
8921            },
8922            ArmOp::I64GeU {
8923                rd: Reg::R0,
8924                rnlo: Reg::R0,
8925                rnhi: Reg::R1,
8926                rmlo: Reg::R2,
8927                rmhi: Reg::R3,
8928            },
8929        ];
8930
8931        for op in &ops {
8932            let code = encoder.encode(op).unwrap();
8933            assert!(
8934                code.len() >= 8,
8935                "i64 comparison {:?} should emit at least 8 bytes, got {}",
8936                op,
8937                code.len()
8938            );
8939        }
8940    }
8941
8942    #[test]
8943    fn test_encode_i64_const_zero_thumb2() {
8944        let encoder = ArmEncoder::new_thumb2();
8945        let op = ArmOp::I64Const {
8946            rdlo: Reg::R0,
8947            rdhi: Reg::R1,
8948            value: 0,
8949        };
8950        let code = encoder.encode(&op).unwrap();
8951        // MOVW R0, #0 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes
8952        assert_eq!(code.len(), 8, "I64Const(0) should be 8 bytes");
8953    }
8954
8955    #[test]
8956    fn test_encode_i64_const_negative_one_thumb2() {
8957        let encoder = ArmEncoder::new_thumb2();
8958        let op = ArmOp::I64Const {
8959            rdlo: Reg::R0,
8960            rdhi: Reg::R1,
8961            value: -1, // 0xFFFF_FFFF_FFFF_FFFF
8962        };
8963        let code = encoder.encode(&op).unwrap();
8964        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8965        assert_eq!(code.len(), 16, "I64Const(-1) should be 16 bytes");
8966    }
8967
8968    // =========================================================================
8969    // Sub-word load/store encoding tests
8970    // =========================================================================
8971
8972    #[test]
8973    fn test_encode_ldrb_arm32() {
8974        let encoder = ArmEncoder::new_arm32();
8975        let op = ArmOp::Ldrb {
8976            rd: Reg::R0,
8977            addr: MemAddr::imm(Reg::R1, 4),
8978        };
8979        let code = encoder.encode(&op).unwrap();
8980        assert_eq!(code.len(), 4, "ARM32 LDRB should be 4 bytes");
8981        // LDRB R0, [R1, #4] = 0xE5D10004
8982        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8983        assert_eq!(encoded, 0xE5D10004, "Should encode LDRB R0, [R1, #4]");
8984    }
8985
8986    #[test]
8987    fn test_encode_strb_arm32() {
8988        let encoder = ArmEncoder::new_arm32();
8989        let op = ArmOp::Strb {
8990            rd: Reg::R0,
8991            addr: MemAddr::imm(Reg::R1, 0),
8992        };
8993        let code = encoder.encode(&op).unwrap();
8994        assert_eq!(code.len(), 4, "ARM32 STRB should be 4 bytes");
8995        // STRB R0, [R1, #0] = 0xE5C10000
8996        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8997        assert_eq!(encoded, 0xE5C10000, "Should encode STRB R0, [R1, #0]");
8998    }
8999
9000    #[test]
9001    fn test_encode_ldrh_arm32() {
9002        let encoder = ArmEncoder::new_arm32();
9003        let op = ArmOp::Ldrh {
9004            rd: Reg::R0,
9005            addr: MemAddr::imm(Reg::R1, 2),
9006        };
9007        let code = encoder.encode(&op).unwrap();
9008        assert_eq!(code.len(), 4, "ARM32 LDRH should be 4 bytes");
9009    }
9010
9011    #[test]
9012    fn test_encode_strh_arm32() {
9013        let encoder = ArmEncoder::new_arm32();
9014        let op = ArmOp::Strh {
9015            rd: Reg::R0,
9016            addr: MemAddr::imm(Reg::R1, 0),
9017        };
9018        let code = encoder.encode(&op).unwrap();
9019        assert_eq!(code.len(), 4, "ARM32 STRH should be 4 bytes");
9020    }
9021
9022    #[test]
9023    fn test_encode_ldrsb_arm32() {
9024        let encoder = ArmEncoder::new_arm32();
9025        let op = ArmOp::Ldrsb {
9026            rd: Reg::R0,
9027            addr: MemAddr::imm(Reg::R1, 0),
9028        };
9029        let code = encoder.encode(&op).unwrap();
9030        assert_eq!(code.len(), 4, "ARM32 LDRSB should be 4 bytes");
9031    }
9032
9033    #[test]
9034    fn test_encode_ldrsh_arm32() {
9035        let encoder = ArmEncoder::new_arm32();
9036        let op = ArmOp::Ldrsh {
9037            rd: Reg::R0,
9038            addr: MemAddr::imm(Reg::R1, 0),
9039        };
9040        let code = encoder.encode(&op).unwrap();
9041        assert_eq!(code.len(), 4, "ARM32 LDRSH should be 4 bytes");
9042    }
9043
9044    #[test]
9045    fn test_encode_ldrb_thumb2_16bit() {
9046        let encoder = ArmEncoder::new_thumb2();
9047        let op = ArmOp::Ldrb {
9048            rd: Reg::R0,
9049            addr: MemAddr::imm(Reg::R1, 4),
9050        };
9051        let code = encoder.encode(&op).unwrap();
9052        // Low registers + small offset -> 16-bit encoding
9053        assert_eq!(
9054            code.len(),
9055            2,
9056            "Thumb-2 LDRB with small offset should be 16-bit"
9057        );
9058    }
9059
9060    #[test]
9061    fn test_encode_ldrb_thumb2_32bit() {
9062        let encoder = ArmEncoder::new_thumb2();
9063        let op = ArmOp::Ldrb {
9064            rd: Reg::R0,
9065            addr: MemAddr::imm(Reg::R1, 100), // offset > 31 needs 32-bit
9066        };
9067        let code = encoder.encode(&op).unwrap();
9068        assert_eq!(
9069            code.len(),
9070            4,
9071            "Thumb-2 LDRB with large offset should be 32-bit"
9072        );
9073    }
9074
9075    #[test]
9076    fn test_encode_strb_thumb2_16bit() {
9077        let encoder = ArmEncoder::new_thumb2();
9078        let op = ArmOp::Strb {
9079            rd: Reg::R0,
9080            addr: MemAddr::imm(Reg::R1, 10),
9081        };
9082        let code = encoder.encode(&op).unwrap();
9083        assert_eq!(
9084            code.len(),
9085            2,
9086            "Thumb-2 STRB with small offset should be 16-bit"
9087        );
9088    }
9089
9090    #[test]
9091    fn test_encode_ldrh_thumb2_16bit() {
9092        let encoder = ArmEncoder::new_thumb2();
9093        let op = ArmOp::Ldrh {
9094            rd: Reg::R0,
9095            addr: MemAddr::imm(Reg::R1, 4), // offset aligned to 2, <= 62
9096        };
9097        let code = encoder.encode(&op).unwrap();
9098        assert_eq!(
9099            code.len(),
9100            2,
9101            "Thumb-2 LDRH with small aligned offset should be 16-bit"
9102        );
9103    }
9104
9105    #[test]
9106    fn test_encode_strh_thumb2_16bit() {
9107        let encoder = ArmEncoder::new_thumb2();
9108        let op = ArmOp::Strh {
9109            rd: Reg::R0,
9110            addr: MemAddr::imm(Reg::R1, 4),
9111        };
9112        let code = encoder.encode(&op).unwrap();
9113        assert_eq!(
9114            code.len(),
9115            2,
9116            "Thumb-2 STRH with small aligned offset should be 16-bit"
9117        );
9118    }
9119
9120    #[test]
9121    fn test_encode_ldrsb_thumb2() {
9122        let encoder = ArmEncoder::new_thumb2();
9123        let op = ArmOp::Ldrsb {
9124            rd: Reg::R0,
9125            addr: MemAddr::imm(Reg::R1, 0),
9126        };
9127        let code = encoder.encode(&op).unwrap();
9128        // LDRSB has no 16-bit immediate form, always 32-bit
9129        assert_eq!(code.len(), 4, "Thumb-2 LDRSB should be 32-bit");
9130    }
9131
9132    #[test]
9133    fn test_encode_ldrsh_thumb2() {
9134        let encoder = ArmEncoder::new_thumb2();
9135        let op = ArmOp::Ldrsh {
9136            rd: Reg::R0,
9137            addr: MemAddr::imm(Reg::R1, 0),
9138        };
9139        let code = encoder.encode(&op).unwrap();
9140        assert_eq!(code.len(), 4, "Thumb-2 LDRSH should be 32-bit");
9141    }
9142
9143    #[test]
9144    fn test_encode_memory_size_thumb2() {
9145        let encoder = ArmEncoder::new_thumb2();
9146        let op = ArmOp::MemorySize { rd: Reg::R0 };
9147        let code = encoder.encode(&op).unwrap();
9148        // R0 and R10 are not both low registers, so this needs careful handling
9149        assert!(!code.is_empty(), "MemorySize should produce code");
9150    }
9151
9152    #[test]
9153    fn test_encode_memory_grow_thumb2() {
9154        let encoder = ArmEncoder::new_thumb2();
9155        let op = ArmOp::MemoryGrow {
9156            rd: Reg::R0,
9157            rn: Reg::R0,
9158        };
9159        let code = encoder.encode(&op).unwrap();
9160        assert_eq!(code.len(), 4, "MemoryGrow (MVN) should be 32-bit Thumb-2");
9161    }
9162
9163    #[test]
9164    fn test_encode_subword_reg_offset_thumb2() {
9165        let encoder = ArmEncoder::new_thumb2();
9166
9167        // LDRB with register offset
9168        let op = ArmOp::Ldrb {
9169            rd: Reg::R0,
9170            addr: MemAddr::reg(Reg::R1, Reg::R2),
9171        };
9172        let code = encoder.encode(&op).unwrap();
9173        assert_eq!(
9174            code.len(),
9175            4,
9176            "Thumb-2 LDRB with reg offset should be 32-bit"
9177        );
9178
9179        // STRB with register offset
9180        let op = ArmOp::Strb {
9181            rd: Reg::R0,
9182            addr: MemAddr::reg(Reg::R1, Reg::R2),
9183        };
9184        let code = encoder.encode(&op).unwrap();
9185        assert_eq!(
9186            code.len(),
9187            4,
9188            "Thumb-2 STRB with reg offset should be 32-bit"
9189        );
9190
9191        // LDRH with register offset
9192        let op = ArmOp::Ldrh {
9193            rd: Reg::R0,
9194            addr: MemAddr::reg(Reg::R1, Reg::R2),
9195        };
9196        let code = encoder.encode(&op).unwrap();
9197        assert_eq!(
9198            code.len(),
9199            4,
9200            "Thumb-2 LDRH with reg offset should be 32-bit"
9201        );
9202
9203        // STRH with register offset
9204        let op = ArmOp::Strh {
9205            rd: Reg::R0,
9206            addr: MemAddr::reg(Reg::R1, Reg::R2),
9207        };
9208        let code = encoder.encode(&op).unwrap();
9209        assert_eq!(
9210            code.len(),
9211            4,
9212            "Thumb-2 STRH with reg offset should be 32-bit"
9213        );
9214    }
9215
9216    #[test]
9217    fn test_encode_subword_reg_imm_offset_thumb2() {
9218        let encoder = ArmEncoder::new_thumb2();
9219
9220        // LDRB with both register and immediate offset
9221        let op = ArmOp::Ldrb {
9222            rd: Reg::R0,
9223            addr: MemAddr::reg_imm(Reg::R1, Reg::R2, 4),
9224        };
9225        let code = encoder.encode(&op).unwrap();
9226        // ADD R12, R2, #4 (4 bytes) + LDRB R0, [R1, R12] (4 bytes) = 8 bytes
9227        assert_eq!(
9228            code.len(),
9229            8,
9230            "Thumb-2 LDRB with reg+imm offset should be 8 bytes"
9231        );
9232    }
9233
9234    // ========================================================================
9235    // Helium MVE encoding tests
9236    // ========================================================================
9237
9238    #[test]
9239    fn test_encode_mve_addi32_thumb2() {
9240        let encoder = ArmEncoder::new_thumb2();
9241        let op = ArmOp::MveAddI {
9242            qd: QReg::Q0,
9243            qn: QReg::Q1,
9244            qm: QReg::Q2,
9245            size: MveSize::S32,
9246        };
9247        let code = encoder.encode(&op).unwrap();
9248        assert_eq!(
9249            code.len(),
9250            4,
9251            "MVE VADD.I32 should be 4 bytes (Thumb-2 32-bit)"
9252        );
9253    }
9254
9255    #[test]
9256    fn test_encode_mve_subi16_thumb2() {
9257        let encoder = ArmEncoder::new_thumb2();
9258        let op = ArmOp::MveSubI {
9259            qd: QReg::Q0,
9260            qn: QReg::Q1,
9261            qm: QReg::Q2,
9262            size: MveSize::S16,
9263        };
9264        let code = encoder.encode(&op).unwrap();
9265        assert_eq!(code.len(), 4, "MVE VSUB.I16 should be 4 bytes");
9266    }
9267
9268    #[test]
9269    fn test_encode_mve_muli8_thumb2() {
9270        let encoder = ArmEncoder::new_thumb2();
9271        let op = ArmOp::MveMulI {
9272            qd: QReg::Q0,
9273            qn: QReg::Q1,
9274            qm: QReg::Q2,
9275            size: MveSize::S8,
9276        };
9277        let code = encoder.encode(&op).unwrap();
9278        assert_eq!(code.len(), 4, "MVE VMUL.I8 should be 4 bytes");
9279    }
9280
9281    #[test]
9282    fn test_encode_mve_bitwise_thumb2() {
9283        let encoder = ArmEncoder::new_thumb2();
9284
9285        let ops = vec![
9286            ArmOp::MveAnd {
9287                qd: QReg::Q0,
9288                qn: QReg::Q1,
9289                qm: QReg::Q2,
9290            },
9291            ArmOp::MveOrr {
9292                qd: QReg::Q0,
9293                qn: QReg::Q1,
9294                qm: QReg::Q2,
9295            },
9296            ArmOp::MveEor {
9297                qd: QReg::Q0,
9298                qn: QReg::Q1,
9299                qm: QReg::Q2,
9300            },
9301            ArmOp::MveBic {
9302                qd: QReg::Q0,
9303                qn: QReg::Q1,
9304                qm: QReg::Q2,
9305            },
9306        ];
9307        for op in ops {
9308            let code = encoder.encode(&op).unwrap();
9309            assert_eq!(code.len(), 4, "MVE bitwise op should be 4 bytes");
9310        }
9311    }
9312
9313    #[test]
9314    fn test_encode_mve_mvn_thumb2() {
9315        let encoder = ArmEncoder::new_thumb2();
9316        let op = ArmOp::MveMvn {
9317            qd: QReg::Q0,
9318            qm: QReg::Q1,
9319        };
9320        let code = encoder.encode(&op).unwrap();
9321        assert_eq!(code.len(), 4, "MVE VMVN should be 4 bytes");
9322    }
9323
9324    #[test]
9325    fn test_encode_mve_load_store_thumb2() {
9326        let encoder = ArmEncoder::new_thumb2();
9327
9328        let load = ArmOp::MveLoad {
9329            qd: QReg::Q0,
9330            addr: MemAddr::imm(Reg::R0, 16),
9331        };
9332        let code = encoder.encode(&load).unwrap();
9333        assert_eq!(code.len(), 4, "MVE VLDRW.32 should be 4 bytes");
9334
9335        let store = ArmOp::MveStore {
9336            qd: QReg::Q1,
9337            addr: MemAddr::imm(Reg::R1, 0),
9338        };
9339        let code = encoder.encode(&store).unwrap();
9340        assert_eq!(code.len(), 4, "MVE VSTRW.32 should be 4 bytes");
9341    }
9342
9343    #[test]
9344    fn test_encode_mve_const_thumb2() {
9345        let encoder = ArmEncoder::new_thumb2();
9346        let op = ArmOp::MveConst {
9347            qd: QReg::Q0,
9348            bytes: [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0],
9349        };
9350        let code = encoder.encode(&op).unwrap();
9351        // Should be 4 words of (MOVW R12 + VMOV Sn) = 4 * (4+4) = 32 bytes min
9352        // Some words with hi16=0 skip MOVT, so length varies
9353        assert!(
9354            code.len() >= 24,
9355            "MVE const should produce multiple instructions"
9356        );
9357    }
9358
9359    #[test]
9360    fn test_encode_mve_dup_thumb2() {
9361        let encoder = ArmEncoder::new_thumb2();
9362        let op = ArmOp::MveDup {
9363            qd: QReg::Q0,
9364            rn: Reg::R0,
9365            size: MveSize::S32,
9366        };
9367        let code = encoder.encode(&op).unwrap();
9368        assert_eq!(code.len(), 4, "MVE VDUP.32 should be 4 bytes");
9369    }
9370
9371    #[test]
9372    fn test_encode_mve_extract_lane_thumb2() {
9373        let encoder = ArmEncoder::new_thumb2();
9374        let op = ArmOp::MveExtractLane {
9375            rd: Reg::R0,
9376            qn: QReg::Q1,
9377            lane: 2,
9378            size: MveSize::S32,
9379        };
9380        let code = encoder.encode(&op).unwrap();
9381        assert_eq!(code.len(), 4, "MVE extract lane should be 4 bytes");
9382    }
9383
9384    #[test]
9385    fn test_encode_mve_insert_lane_thumb2() {
9386        let encoder = ArmEncoder::new_thumb2();
9387        let op = ArmOp::MveInsertLane {
9388            qd: QReg::Q0,
9389            rn: Reg::R1,
9390            lane: 3,
9391            size: MveSize::S32,
9392        };
9393        let code = encoder.encode(&op).unwrap();
9394        assert_eq!(code.len(), 4, "MVE insert lane should be 4 bytes");
9395    }
9396
9397    #[test]
9398    fn test_encode_mve_addf32_thumb2() {
9399        let encoder = ArmEncoder::new_thumb2();
9400        let op = ArmOp::MveAddF32 {
9401            qd: QReg::Q0,
9402            qn: QReg::Q1,
9403            qm: QReg::Q2,
9404        };
9405        let code = encoder.encode(&op).unwrap();
9406        assert_eq!(code.len(), 4, "MVE VADD.F32 should be 4 bytes");
9407    }
9408
9409    #[test]
9410    fn test_encode_mve_divf32_thumb2() {
9411        let encoder = ArmEncoder::new_thumb2();
9412        let op = ArmOp::MveDivF32 {
9413            qd: QReg::Q0,
9414            qn: QReg::Q1,
9415            qm: QReg::Q2,
9416        };
9417        let code = encoder.encode(&op).unwrap();
9418        // Lane-wise: 4 x VDIV.F32 = 4 x 4 = 16 bytes
9419        assert_eq!(
9420            code.len(),
9421            16,
9422            "MVE VDIV.F32 (lane-wise) should be 16 bytes"
9423        );
9424    }
9425
9426    #[test]
9427    fn test_encode_mve_sqrtf32_thumb2() {
9428        let encoder = ArmEncoder::new_thumb2();
9429        let op = ArmOp::MveSqrtF32 {
9430            qd: QReg::Q0,
9431            qm: QReg::Q1,
9432        };
9433        let code = encoder.encode(&op).unwrap();
9434        // Lane-wise: 4 x VSQRT.F32 = 4 x 4 = 16 bytes
9435        assert_eq!(
9436            code.len(),
9437            16,
9438            "MVE VSQRT.F32 (lane-wise) should be 16 bytes"
9439        );
9440    }
9441
9442    #[test]
9443    fn test_encode_mve_negf32_thumb2() {
9444        let encoder = ArmEncoder::new_thumb2();
9445        let op = ArmOp::MveNegF32 {
9446            qd: QReg::Q0,
9447            qm: QReg::Q1,
9448        };
9449        let code = encoder.encode(&op).unwrap();
9450        assert_eq!(code.len(), 4, "MVE VNEG.F32 should be 4 bytes");
9451    }
9452
9453    #[test]
9454    fn test_encode_mve_absf32_thumb2() {
9455        let encoder = ArmEncoder::new_thumb2();
9456        let op = ArmOp::MveAbsF32 {
9457            qd: QReg::Q0,
9458            qm: QReg::Q1,
9459        };
9460        let code = encoder.encode(&op).unwrap();
9461        assert_eq!(code.len(), 4, "MVE VABS.F32 should be 4 bytes");
9462    }
9463
9464    /// VCR-RA-001 / immediate-folding precondition: pins the Thumb-2 `AND`
9465    /// immediate encoding for the byte range and documents its bound.
9466    ///
9467    /// The `And { Operand2::Imm }` encoder packs the low 12 bits straight into
9468    /// the `i:imm3:imm8` field WITHOUT applying ThumbExpandImm (the modified-
9469    /// immediate expansion). For `imm <= 0xFF` (e.g. gale's int8 clamps
9470    /// `#0x7e` / `#0x7f`) that is correct — `i:imm3 = 0000` means "imm8
9471    /// zero-extended". So `and r2, r0, #0x7e` encodes to the canonical
9472    /// `00 f0 7e 02`. For `imm >= 0x100` the field would need a true
9473    /// ThumbExpandImm pattern (rotation / replication), which is NOT
9474    /// implemented here — so **immediate folding must gate on `imm <= 0xFF`**
9475    /// until the encoder is hardened to ThumbExpandImm/Ok-or-Err (the
9476    /// "encoder must be Ok-or-Err, never silently wrong" principle, #180/#185).
9477    /// This bound covers the measured `flat_flight` waste (#209).
9478    #[test]
9479    fn and_immediate_encodes_correctly_in_byte_range_documents_fold_bound() {
9480        let encoder = ArmEncoder::new_thumb2();
9481        let op = ArmOp::And {
9482            rd: Reg::R2,
9483            rn: Reg::R0,
9484            op2: Operand2::Imm(0x7e),
9485        };
9486        let code = encoder.encode(&op).unwrap();
9487        assert_eq!(
9488            code,
9489            vec![0x00, 0xf0, 0x7e, 0x02],
9490            "and r2, r0, #0x7e must encode to the canonical AND.W T1 (imm8=0x7e)"
9491        );
9492    }
9493
9494    /// #255: the shared ThumbExpandImm reverse-encoder underpinning the
9495    /// data-processing immediate fix. Encodable modified immediates round-trip to
9496    /// the expected `i:imm3:imm8` field; a genuinely non-modified value is `None`
9497    /// (caller must materialize into a register). Note `1000 = 0xFA ror 30` *is*
9498    /// representable (field 0xF7A) — the old encoder mis-encoded it (raw 0x3E8);
9499    /// this encodes it correctly.
9500    #[test]
9501    fn try_thumb_expand_imm_encodes_modified_immediates() {
9502        assert_eq!(try_thumb_expand_imm(0x7e), Some(0x07e)); // zero-extended byte
9503        assert_eq!(try_thumb_expand_imm(0xff), Some(0x0ff));
9504        assert_eq!(try_thumb_expand_imm(0x0001_0001), Some(0x101)); // 0x00XY00XY
9505        assert_eq!(try_thumb_expand_imm(0xff00_ff00), Some(0x2ff)); // 0xXY00XY00
9506        assert_eq!(try_thumb_expand_imm(0xffff_ffff), Some(0x3ff)); // 0xXYXYXYXY
9507        assert_eq!(try_thumb_expand_imm(0x100), Some(0xf80)); // 0x80 ror 31
9508        assert_eq!(try_thumb_expand_imm(0x8000_0000), Some(0x400)); // 0x80 ror 8
9509        assert_eq!(try_thumb_expand_imm(1000), Some(0xf7a)); // 0xFA ror 30
9510        // Genuinely unrepresentable (bits too far apart for an 8-bit window).
9511        assert_eq!(try_thumb_expand_imm(0x101), None);
9512        assert_eq!(try_thumb_expand_imm(0x12345), None);
9513    }
9514
9515    /// #255: CMP/ADDS/SUBS encode any valid modified immediate correctly, and
9516    /// ERROR (not silently mis-encode) on a genuinely unrepresentable one,
9517    /// forcing the selector to materialize into a register — closing the
9518    /// silent-miscompile class of #251/#253.
9519    #[test]
9520    fn cmp_adds_subs_immediate_error_on_non_modified_imm() {
9521        let encoder = ArmEncoder::new_thumb2();
9522        // cmp r0, #0xff → valid → Ok; cmp r0, #1000 → valid (0xFA ror 30) → Ok.
9523        assert!(encoder.encode_thumb32_cmp_imm(&Reg::R0, 0xff).is_ok());
9524        assert!(encoder.encode_thumb32_cmp_imm(&Reg::R0, 1000).is_ok());
9525        // cmp r0, #0x101 → NOT a modified immediate → Err (materialize-reg).
9526        assert!(
9527            encoder.encode_thumb32_cmp_imm(&Reg::R0, 0x101).is_err(),
9528            "cmp #0x101 must error, not compare the wrong constant"
9529        );
9530        assert!(
9531            encoder
9532                .encode_thumb32_adds(&Reg::R0, &Reg::R0, 0x101)
9533                .is_err()
9534        );
9535        assert!(
9536            encoder
9537                .encode_thumb32_subs(&Reg::R0, &Reg::R0, 0x101)
9538                .is_err()
9539        );
9540        // ...but a valid modified immediate still encodes.
9541        assert!(
9542            encoder
9543                .encode_thumb32_adds(&Reg::R0, &Reg::R0, 0x80)
9544                .is_ok()
9545        );
9546    }
9547
9548    /// #257: MLA (multiply-accumulate) encodes as MLS without the bit-4 op flag.
9549    /// `mla r2, r3, r4, r8` (rd=r2, rn=r3, rm=r4, ra=r8) → Thumb-2 `03 fb 04 82`.
9550    #[test]
9551    fn mla_thumb2_encodes_correctly() {
9552        let encoder = ArmEncoder::new_thumb2();
9553        let code = encoder
9554            .encode(&ArmOp::Mla {
9555                rd: Reg::R2,
9556                rn: Reg::R3,
9557                rm: Reg::R4,
9558                ra: Reg::R8,
9559            })
9560            .unwrap();
9561        // hw1 = 0xFB03, hw2 = (8<<12)|(2<<8)|4 = 0x8204
9562        assert_eq!(code, vec![0x03, 0xfb, 0x04, 0x82]);
9563    }
9564
9565    /// #259: LDR/STR (and sub-word) immediate-offset encoders truncated
9566    /// `offset & 0xFFF`, silently targeting the wrong address for offset >= 4096.
9567    /// They now error (the selector must use register-offset addressing) — the
9568    /// load/store sibling of the #253/#255 class. Offsets <= 4095 still encode.
9569    #[test]
9570    fn ldst_imm12_offset_errors_when_out_of_range() {
9571        let encoder = ArmEncoder::new_thumb2();
9572        // offset 0xFFF (4095): valid → Ok; ldr r0, [r1, #4095].
9573        assert!(
9574            encoder
9575                .encode_thumb32_ldr(&Reg::R0, &Reg::R1, 0xFFF)
9576                .is_ok()
9577        );
9578        // offset 0x1000 (4096): out of imm12 range → Err (not & 0xFFF → #0).
9579        assert!(
9580            encoder
9581                .encode_thumb32_ldr(&Reg::R0, &Reg::R1, 0x1000)
9582                .is_err(),
9583            "ldr offset 4096 must error, not wrap to 0"
9584        );
9585        assert!(
9586            encoder
9587                .encode_thumb32_str(&Reg::R0, &Reg::R1, 0x1000)
9588                .is_err()
9589        );
9590        assert!(
9591            encoder
9592                .encode_thumb32_ldrb_imm(&Reg::R0, &Reg::R1, 5000)
9593                .is_err()
9594        );
9595        assert!(
9596            encoder
9597                .encode_thumb32_strh_imm(&Reg::R0, &Reg::R1, 5000)
9598                .is_err()
9599        );
9600    }
9601
9602    /// Latent miscompile fix: ADD/SUB with a >0xFF immediate (e.g.
9603    /// `add sp, sp, #frame` for a >=256-byte frame) used ADD.W (T3), whose
9604    /// `i:imm3:imm8` is a ThumbExpandImm modified immediate — so `#256` silently
9605    /// encoded as `#0` (stack corruption). Use ADDW/SUBW (T4), a PLAIN 12-bit
9606    /// immediate, for 0x100..=0xFFF; keep T3 for <=0xFF (bit-identical); error
9607    /// beyond 4095.
9608    #[test]
9609    fn add_sub_large_immediate_use_addw_subw_not_misencoded() {
9610        let encoder = ArmEncoder::new_thumb2();
9611        // add sp, sp, #256  →  ADDW (T4) SP, SP, #256  =  0d f2 00 1d
9612        assert_eq!(
9613            encoder
9614                .encode(&ArmOp::Add {
9615                    rd: Reg::SP,
9616                    rn: Reg::SP,
9617                    op2: Operand2::Imm(256),
9618                })
9619                .unwrap(),
9620            vec![0x0d, 0xf2, 0x00, 0x1d],
9621            "add sp,sp,#256 must be ADDW (plain imm12), not a mis-encoded ADD.W"
9622        );
9623        // sub sp, sp, #256  →  SUBW (T4) SP, SP, #256  =  ad f2 00 1d
9624        assert_eq!(
9625            encoder
9626                .encode(&ArmOp::Sub {
9627                    rd: Reg::SP,
9628                    rn: Reg::SP,
9629                    op2: Operand2::Imm(256),
9630                })
9631                .unwrap(),
9632            vec![0xad, 0xf2, 0x00, 0x1d],
9633        );
9634        // > 4095 has no single-instruction encoding → error, not silent wrong.
9635        assert!(
9636            encoder
9637                .encode(&ArmOp::Add {
9638                    rd: Reg::SP,
9639                    rn: Reg::SP,
9640                    op2: Operand2::Imm(5000),
9641                })
9642                .is_err(),
9643            "add #5000 must error (no single ADDW), not mis-encode"
9644        );
9645    }
9646
9647    /// Closes the data-proc immediate class: AND and CMN now go through
9648    /// `try_thumb_expand_imm` like ORR/EOR/CMP — correct for any modified
9649    /// immediate, `Err` (not raw-pack / NOP) on an un-encodable one. The byte
9650    /// range stays bit-identical (`and r2,r0,#0x7e` is unchanged).
9651    #[test]
9652    fn and_cmn_immediate_thumb_expand_else_error() {
9653        let encoder = ArmEncoder::new_thumb2();
9654        // byte range unchanged (bit-identical with the pre-retrofit encoding)
9655        assert_eq!(
9656            encoder
9657                .encode(&ArmOp::And {
9658                    rd: Reg::R2,
9659                    rn: Reg::R0,
9660                    op2: Operand2::Imm(0x7e),
9661                })
9662                .unwrap(),
9663            vec![0x00, 0xf0, 0x7e, 0x02],
9664        );
9665        // a valid replicated modified immediate now encodes (was silently wrong)
9666        assert!(
9667            encoder
9668                .encode(&ArmOp::And {
9669                    rd: Reg::R2,
9670                    rn: Reg::R0,
9671                    op2: Operand2::Imm(0xff00ff00u32 as i32),
9672                })
9673                .is_ok()
9674        );
9675        // a genuinely un-encodable immediate errors (AND was raw-pack; CMN NOP)
9676        assert!(
9677            encoder
9678                .encode(&ArmOp::And {
9679                    rd: Reg::R2,
9680                    rn: Reg::R0,
9681                    op2: Operand2::Imm(0x101),
9682                })
9683                .is_err()
9684        );
9685        assert!(
9686            encoder
9687                .encode(&ArmOp::Cmn {
9688                    rn: Reg::R0,
9689                    op2: Operand2::Imm(0x101),
9690                })
9691                .is_err(),
9692            "CMN #0x101 must error, not emit a NOP"
9693        );
9694    }
9695
9696    /// VCR-RA-001: ORR/EOR with a small immediate must encode the real
9697    /// instruction (not a silent `0xBF00` NOP). Pins the byte range and the
9698    /// Ok-or-Err bound that makes future Or/Eor immediate folding safe.
9699    #[test]
9700    fn orr_eor_immediate_encode_in_byte_range_else_error() {
9701        let encoder = ArmEncoder::new_thumb2();
9702        // orr r2, r0, #0x7e  →  ORR.W T1, imm8=0x7e
9703        assert_eq!(
9704            encoder
9705                .encode(&ArmOp::Orr {
9706                    rd: Reg::R2,
9707                    rn: Reg::R0,
9708                    op2: Operand2::Imm(0x7e),
9709                })
9710                .unwrap(),
9711            vec![0x40, 0xf0, 0x7e, 0x02],
9712        );
9713        // eor r2, r0, #0x7e  →  EOR.W T1, imm8=0x7e
9714        assert_eq!(
9715            encoder
9716                .encode(&ArmOp::Eor {
9717                    rd: Reg::R2,
9718                    rn: Reg::R0,
9719                    op2: Operand2::Imm(0x7e),
9720                })
9721                .unwrap(),
9722            vec![0x80, 0xf0, 0x7e, 0x02],
9723        );
9724        // Out-of-range immediates error rather than silently mis-encode / NOP.
9725        assert!(
9726            encoder
9727                .encode(&ArmOp::Orr {
9728                    rd: Reg::R2,
9729                    rn: Reg::R0,
9730                    op2: Operand2::Imm(0x140),
9731                })
9732                .is_err(),
9733            "ORR #0x140 must error, not emit a NOP"
9734        );
9735    }
9736
9737    #[test]
9738    fn test_encode_mve_different_qregs() {
9739        let encoder = ArmEncoder::new_thumb2();
9740
9741        // Test that different Q-register numbers produce different encodings
9742        let op1 = ArmOp::MveAddI {
9743            qd: QReg::Q0,
9744            qn: QReg::Q0,
9745            qm: QReg::Q0,
9746            size: MveSize::S32,
9747        };
9748        let op2 = ArmOp::MveAddI {
9749            qd: QReg::Q3,
9750            qn: QReg::Q5,
9751            qm: QReg::Q7,
9752            size: MveSize::S32,
9753        };
9754        let code1 = encoder.encode(&op1).unwrap();
9755        let code2 = encoder.encode(&op2).unwrap();
9756        assert_ne!(
9757            code1, code2,
9758            "Different Q-registers should produce different encodings"
9759        );
9760    }
9761
9762    #[test]
9763    fn test_encode_mve_arm32_nop() {
9764        // MVE instructions on ARM32 encoder should produce NOP (only Thumb-2 supported)
9765        let encoder = ArmEncoder::new_arm32();
9766        let op = ArmOp::MveAddI {
9767            qd: QReg::Q0,
9768            qn: QReg::Q1,
9769            qm: QReg::Q2,
9770            size: MveSize::S32,
9771        };
9772        let code = encoder.encode(&op).unwrap();
9773        assert_eq!(code.len(), 4, "ARM32 MVE should be 4 bytes (NOP)");
9774        // NOP in ARM32 is 0xE1A00000 (MOV R0, R0)
9775        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
9776        assert_eq!(instr, 0xE1A00000, "ARM32 MVE should encode as NOP");
9777    }
9778}