Skip to main content

synth_backend/
arm_encoder.rs

//! ARM Code Encoder - Converts ARM instructions to binary machine code
//!
//! Generates ARM32 (A32) or Thumb-2 machine code from `ArmOp` instruction values.
4
5use synth_core::Result;
6use synth_core::target::FPUPrecision;
7use synth_synthesis::contracts::encoding as encoding_contracts;
8use synth_synthesis::{ArmOp, MemAddr, MveSize, Operand2, QReg, Reg, VfpReg};
9
/// ARM instruction encoding
///
/// Turns [`ArmOp`] values into machine-code bytes. The target instruction set
/// is fixed when the encoder is constructed: `encode` forwards to the Thumb
/// encoder when `thumb_mode` is set and to the ARM32 encoder otherwise.
pub struct ArmEncoder {
    /// Use Thumb mode (vs ARM mode)
    thumb_mode: bool,
    /// FPU capability for VFP instruction encoding
    // NOTE(review): the lint allowance suggests nothing reads this field yet;
    // confirm against the rest of the file before relying on it.
    #[allow(dead_code)]
    fpu: Option<FPUPrecision>,
}
18
19impl ArmEncoder {
20    /// Create a new ARM encoder in ARM32 mode
21    pub fn new_arm32() -> Self {
22        Self {
23            thumb_mode: false,
24            fpu: None,
25        }
26    }
27
28    /// Create a new ARM encoder in Thumb-2 mode
29    pub fn new_thumb2() -> Self {
30        Self {
31            thumb_mode: true,
32            fpu: None,
33        }
34    }
35
36    /// Create a new Thumb-2 encoder with FPU capability
37    pub fn new_thumb2_with_fpu(fpu: Option<FPUPrecision>) -> Self {
38        Self {
39            thumb_mode: true,
40            fpu,
41        }
42    }
43
44    /// Encode a single ARM instruction to bytes
45    pub fn encode(&self, op: &ArmOp) -> Result<Vec<u8>> {
46        if self.thumb_mode {
47            self.encode_thumb(op)
48        } else {
49            self.encode_arm(op)
50        }
51    }
52
53    /// Encode an ARM instruction in ARM32 mode (32-bit instructions)
54    /// #206: encode an ARM32 (A32) load/store whose address uses a register
55    /// offset (`[rn, rm{, #off}]`). Returns `None` for ops with no register
56    /// offset (the caller falls through to the immediate-form arms). Computes
57    /// `ip = base + rm` then re-encodes the op against `[ip, #off]`, which works
58    /// uniformly for word/byte/halfword/signed forms. IP (R12) is the scratch
59    /// register the selector already treats as clobberable across memory ops.
60    fn encode_arm_reg_offset_mem(&self, op: &ArmOp) -> Result<Option<Vec<u8>>> {
61        use synth_synthesis::Reg;
62        let addr = match op {
63            ArmOp::Ldr { addr, .. }
64            | ArmOp::Str { addr, .. }
65            | ArmOp::Ldrb { addr, .. }
66            | ArmOp::Strb { addr, .. }
67            | ArmOp::Ldrh { addr, .. }
68            | ArmOp::Strh { addr, .. }
69            | ArmOp::Ldrsb { addr, .. }
70            | ArmOp::Ldrsh { addr, .. } => addr,
71            _ => return Ok(None),
72        };
73        let Some(rm) = addr.offset_reg else {
74            return Ok(None);
75        };
76        let ip = Reg::R12;
77        // ADD ip, base, rm  (cond=AL, opcode=ADD, S=0, register operand2)
78        let add: u32 = 0xE0800000
79            | (reg_to_bits(&addr.base) << 16)
80            | (reg_to_bits(&ip) << 12)
81            | reg_to_bits(&rm);
82        let mut bytes = add.to_le_bytes().to_vec();
83        // Re-encode the op against [ip, #off] (immediate form → no offset_reg,
84        // so this recursion hits the immediate arms, not this helper again).
85        let imm_addr = MemAddr::imm(ip, addr.offset);
86        let imm_op = match op {
87            ArmOp::Ldr { rd, .. } => ArmOp::Ldr {
88                rd: *rd,
89                addr: imm_addr,
90            },
91            ArmOp::Str { rd, .. } => ArmOp::Str {
92                rd: *rd,
93                addr: imm_addr,
94            },
95            ArmOp::Ldrb { rd, .. } => ArmOp::Ldrb {
96                rd: *rd,
97                addr: imm_addr,
98            },
99            ArmOp::Strb { rd, .. } => ArmOp::Strb {
100                rd: *rd,
101                addr: imm_addr,
102            },
103            ArmOp::Ldrh { rd, .. } => ArmOp::Ldrh {
104                rd: *rd,
105                addr: imm_addr,
106            },
107            ArmOp::Strh { rd, .. } => ArmOp::Strh {
108                rd: *rd,
109                addr: imm_addr,
110            },
111            ArmOp::Ldrsb { rd, .. } => ArmOp::Ldrsb {
112                rd: *rd,
113                addr: imm_addr,
114            },
115            ArmOp::Ldrsh { rd, .. } => ArmOp::Ldrsh {
116                rd: *rd,
117                addr: imm_addr,
118            },
119            _ => unreachable!(),
120        };
121        bytes.extend(self.encode_arm(&imm_op)?);
122        Ok(Some(bytes))
123    }
124
125    fn encode_arm(&self, op: &ArmOp) -> Result<Vec<u8>> {
126        // #206: ARM32 register-offset loads/stores. `encode_mem_addr` only
127        // returns the 12-bit immediate, so the immediate-form arms below
128        // silently DROP `addr.offset_reg` — a runtime address index vanished,
129        // turning `ldr rd,[rn,rm,#off]` into `ldr rd,[rn,#off]` (the access went
130        // to the wrong address). Compute the effective base into IP and re-encode
131        // against `[ip, #off]`, which is uniform for word/byte/halfword/signed.
132        if let Some(bytes) = self.encode_arm_reg_offset_mem(op)? {
133            return Ok(bytes);
134        }
135        let instr: u32 = match op {
136            // Data processing instructions
137            ArmOp::Add { rd, rn, op2 } => {
138                let rd_bits = reg_to_bits(rd);
139                let rn_bits = reg_to_bits(rn);
140                let (op2_bits, i_flag) = encode_operand2(op2);
141
142                // ADD encoding: cond(4) | 00 | I(1) | 0100 | S(1) | Rn(4) | Rd(4) | operand2(12)
143                0xE0800000 // condition=always(E), opcode=ADD(0100), S=0
144                    | (i_flag << 25)
145                    | (rn_bits << 16)
146                    | (rd_bits << 12)
147                    | op2_bits
148            }
149
150            ArmOp::Sub { rd, rn, op2 } => {
151                let rd_bits = reg_to_bits(rd);
152                let rn_bits = reg_to_bits(rn);
153                let (op2_bits, i_flag) = encode_operand2(op2);
154
155                // SUB encoding: opcode=0010
156                0xE0400000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
157            }
158
159            // i64 support: ADDS, ADC, SUBS, SBC for ARM32
160            ArmOp::Adds { rd, rn, op2 } => {
161                let rd_bits = reg_to_bits(rd);
162                let rn_bits = reg_to_bits(rn);
163                let (op2_bits, i_flag) = encode_operand2(op2);
164
165                // ADDS encoding: opcode=0100, S=1
166                0xE0900000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
167            }
168
169            ArmOp::Adc { rd, rn, op2 } => {
170                let rd_bits = reg_to_bits(rd);
171                let rn_bits = reg_to_bits(rn);
172                let (op2_bits, i_flag) = encode_operand2(op2);
173
174                // ADC encoding: opcode=0101
175                0xE0A00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
176            }
177
178            ArmOp::Subs { rd, rn, op2 } => {
179                let rd_bits = reg_to_bits(rd);
180                let rn_bits = reg_to_bits(rn);
181                let (op2_bits, i_flag) = encode_operand2(op2);
182
183                // SUBS encoding: opcode=0010, S=1
184                0xE0500000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
185            }
186
187            ArmOp::Sbc { rd, rn, op2 } => {
188                let rd_bits = reg_to_bits(rd);
189                let rn_bits = reg_to_bits(rn);
190                let (op2_bits, i_flag) = encode_operand2(op2);
191
192                // SBC encoding: opcode=0110
193                0xE0C00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
194            }
195
196            ArmOp::Mul { rd, rn, rm } => {
197                let rd_bits = reg_to_bits(rd);
198                let rn_bits = reg_to_bits(rn);
199                let rm_bits = reg_to_bits(rm);
200
201                // MUL encoding: cond(4) | 000000 | A(1) | S(1) | Rd(4) | Rn(4) | Rs(4) | 1001 | Rm(4)
202                0xE0000090 | (rd_bits << 16) | (rn_bits << 8) | rm_bits
203            }
204
205            ArmOp::Umull { rdlo, rdhi, rn, rm } => {
206                let rdlo_bits = reg_to_bits(rdlo);
207                let rdhi_bits = reg_to_bits(rdhi);
208                let rn_bits = reg_to_bits(rn);
209                let rm_bits = reg_to_bits(rm);
210
211                // UMULL encoding: cond(4) | 0000 1000 | RdHi(4) | RdLo(4) | Rm(4) | 1001 | Rn(4)
212                0xE0800090 | (rdhi_bits << 16) | (rdlo_bits << 12) | (rm_bits << 8) | rn_bits
213            }
214
215            ArmOp::Sdiv { rd, rn, rm } => {
216                let rd_bits = reg_to_bits(rd);
217                let rn_bits = reg_to_bits(rn);
218                let rm_bits = reg_to_bits(rm);
219
220                // SDIV encoding: cond(4) | 01110001 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
221                // ARMv7-M and above
222                0xE710F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
223            }
224
225            ArmOp::Udiv { rd, rn, rm } => {
226                let rd_bits = reg_to_bits(rd);
227                let rn_bits = reg_to_bits(rn);
228                let rm_bits = reg_to_bits(rm);
229
230                // UDIV encoding: cond(4) | 01110011 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
231                // ARMv7-M and above
232                0xE730F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
233            }
234
235            ArmOp::Mls { rd, rn, rm, ra } => {
236                let rd_bits = reg_to_bits(rd);
237                let rn_bits = reg_to_bits(rn);
238                let rm_bits = reg_to_bits(rm);
239                let ra_bits = reg_to_bits(ra);
240
241                // MLS encoding: cond(4) | 00000110 | Rd(4) | Ra(4) | Rm(4) | 1001 | Rn(4)
242                // Rd = Ra - (Rn * Rm)
243                0xE0600090 | (rd_bits << 16) | (ra_bits << 12) | (rm_bits << 8) | rn_bits
244            }
245
246            ArmOp::And { rd, rn, op2 } => {
247                let rd_bits = reg_to_bits(rd);
248                let rn_bits = reg_to_bits(rn);
249                let (op2_bits, i_flag) = encode_operand2(op2);
250
251                // AND encoding: opcode=0000
252                0xE0000000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
253            }
254
255            ArmOp::Orr { rd, rn, op2 } => {
256                let rd_bits = reg_to_bits(rd);
257                let rn_bits = reg_to_bits(rn);
258                let (op2_bits, i_flag) = encode_operand2(op2);
259
260                // ORR encoding: opcode=1100
261                0xE1800000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
262            }
263
264            ArmOp::Eor { rd, rn, op2 } => {
265                let rd_bits = reg_to_bits(rd);
266                let rn_bits = reg_to_bits(rn);
267                let (op2_bits, i_flag) = encode_operand2(op2);
268
269                // EOR encoding: opcode=0001
270                0xE0200000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
271            }
272
273            // Shift instructions
274            ArmOp::Lsl { rd, rn, shift } => {
275                let rd_bits = reg_to_bits(rd);
276                let rn_bits = reg_to_bits(rn);
277                let shift_bits = *shift & 0x1F;
278
279                // LSL encoding: MOV with shift
280                0xE1A00000 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
281            }
282
283            ArmOp::Lsr { rd, rn, shift } => {
284                let rd_bits = reg_to_bits(rd);
285                let rn_bits = reg_to_bits(rn);
286                let shift_bits = *shift & 0x1F;
287
288                // LSR encoding
289                0xE1A00020 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
290            }
291
292            ArmOp::Asr { rd, rn, shift } => {
293                let rd_bits = reg_to_bits(rd);
294                let rn_bits = reg_to_bits(rn);
295                let shift_bits = *shift & 0x1F;
296
297                // ASR encoding
298                0xE1A00040 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
299            }
300
301            ArmOp::Ror { rd, rn, shift } => {
302                let rd_bits = reg_to_bits(rd);
303                let rn_bits = reg_to_bits(rn);
304                let shift_bits = *shift & 0x1F;
305
306                // ROR encoding: MOV with ROR shift
307                0xE1A00060 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
308            }
309
310            // Register-based shifts (ARM32)
311            // LSL Rd, Rn, Rm: cond 0001101S 0000 Rd Rs 0001 Rn
312            ArmOp::LslReg { rd, rn, rm } => {
313                let rd_bits = reg_to_bits(rd);
314                let rn_bits = reg_to_bits(rn);
315                let rm_bits = reg_to_bits(rm);
316                0xE1A00010 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
317            }
318            ArmOp::LsrReg { rd, rn, rm } => {
319                let rd_bits = reg_to_bits(rd);
320                let rn_bits = reg_to_bits(rn);
321                let rm_bits = reg_to_bits(rm);
322                0xE1A00030 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
323            }
324            ArmOp::AsrReg { rd, rn, rm } => {
325                let rd_bits = reg_to_bits(rd);
326                let rn_bits = reg_to_bits(rn);
327                let rm_bits = reg_to_bits(rm);
328                0xE1A00050 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
329            }
330            ArmOp::RorReg { rd, rn, rm } => {
331                let rd_bits = reg_to_bits(rd);
332                let rn_bits = reg_to_bits(rn);
333                let rm_bits = reg_to_bits(rm);
334                0xE1A00070 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
335            }
336
337            // RSB (Reverse Subtract): Rd = imm - Rn
338            ArmOp::Rsb { rd, rn, imm } => {
339                let rd_bits = reg_to_bits(rd);
340                let rn_bits = reg_to_bits(rn);
341                // RSB encoding: cond(4) | 00 1 0011 S | Rn(4) | Rd(4) | imm12
342                // Opcode for RSB = 0011, I=1 (immediate), S=0
343                0xE2600000 | (rn_bits << 16) | (rd_bits << 12) | (*imm & 0xFF)
344            }
345
346            // Bit manipulation instructions
347            ArmOp::Clz { rd, rm } => {
348                let rd_bits = reg_to_bits(rd);
349                let rm_bits = reg_to_bits(rm);
350
351                // CLZ encoding: cond(4) | 00010110 | 1111 | Rd(4) | 1111 | 0001 | Rm(4)
352                // ARMv5T and above
353                0xE16F0F10 | (rd_bits << 12) | rm_bits
354            }
355
356            ArmOp::Rbit { rd, rm } => {
357                let rd_bits = reg_to_bits(rd);
358                let rm_bits = reg_to_bits(rm);
359
360                // RBIT encoding: cond(4) | 01101111 | 1111 | Rd(4) | 1111 | 0011 | Rm(4)
361                // ARMv6T2 and above
362                0xE6FF0F30 | (rd_bits << 12) | rm_bits
363            }
364
365            ArmOp::Sxtb { rd, rm } => {
366                let rd_bits = reg_to_bits(rd);
367                let rm_bits = reg_to_bits(rm);
368
369                // SXTB encoding: cond(4) | 01101010 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
370                // ARMv6 and above. rotate=00 for no rotation
371                0xE6AF0070 | (rd_bits << 12) | rm_bits
372            }
373
374            ArmOp::Sxth { rd, rm } => {
375                let rd_bits = reg_to_bits(rd);
376                let rm_bits = reg_to_bits(rm);
377
378                // SXTH encoding: cond(4) | 01101011 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
379                // ARMv6 and above. rotate=00 for no rotation
380                0xE6BF0070 | (rd_bits << 12) | rm_bits
381            }
382
383            // Move instructions
384            ArmOp::Mov { rd, op2 } => {
385                let rd_bits = reg_to_bits(rd);
386                let (op2_bits, i_flag) = encode_operand2(op2);
387
388                // MOV encoding: opcode=1101
389                0xE1A00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
390            }
391
392            ArmOp::Mvn { rd, op2 } => {
393                let rd_bits = reg_to_bits(rd);
394                let (op2_bits, i_flag) = encode_operand2(op2);
395
396                // MVN encoding: opcode=1111
397                0xE1E00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
398            }
399
400            // MOVW - Move Wide (ARM32)
401            // Encoding: cond(4) | 0011 0000 | imm4(4) | Rd(4) | imm12(12)
402            ArmOp::Movw { rd, imm16 } => {
403                let rd_bits = reg_to_bits(rd);
404                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
405                let imm12 = (*imm16 as u32) & 0xFFF;
406                0xE3000000 | (imm4 << 16) | (rd_bits << 12) | imm12
407            }
408
409            // MOVT - Move Top (ARM32)
410            // Encoding: cond(4) | 0011 0100 | imm4(4) | Rd(4) | imm12(12)
411            ArmOp::Movt { rd, imm16 } => {
412                let rd_bits = reg_to_bits(rd);
413                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
414                let imm12 = (*imm16 as u32) & 0xFFF;
415                0xE3400000 | (imm4 << 16) | (rd_bits << 12) | imm12
416            }
417
418            // #237: symbol-relative MOVW/MOVT (ARM mode) — addend in place, the
419            // backend records the MOVW_ABS/MOVT_ABS relocation against `symbol`.
420            ArmOp::MovwSym { rd, addend, .. } => {
421                let rd_bits = reg_to_bits(rd);
422                let v = (*addend as u32) & 0xffff;
423                0xE3000000 | (((v >> 12) & 0xF) << 16) | (rd_bits << 12) | (v & 0xFFF)
424            }
425            ArmOp::MovtSym { rd, addend, .. } => {
426                let rd_bits = reg_to_bits(rd);
427                let v = ((*addend as u32) >> 16) & 0xffff;
428                0xE3400000 | (((v >> 12) & 0xF) << 16) | (rd_bits << 12) | (v & 0xFFF)
429            }
430
431            // Compare
432            ArmOp::Cmp { rn, op2 } => {
433                let rn_bits = reg_to_bits(rn);
434                let (op2_bits, i_flag) = encode_operand2(op2);
435
436                // CMP encoding: opcode=1010, S=1
437                0xE1500000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
438            }
439
440            // Compare Negative (CMN) - computes Rn + op2 and sets flags
441            ArmOp::Cmn { rn, op2 } => {
442                let rn_bits = reg_to_bits(rn);
443                let (op2_bits, i_flag) = encode_operand2(op2);
444
445                // CMN encoding: opcode=1011, S=1
446                0xE1700000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
447            }
448
449            // Load/Store
450            ArmOp::Ldr { rd, addr } => {
451                let rd_bits = reg_to_bits(rd);
452                let (base_bits, offset_bits) = encode_mem_addr(addr);
453
454                // LDR encoding: cond(4) | 01 | I(1) | P(1) | U(1) | B(1) | W(1) | L(1) | Rn(4) | Rd(4) | offset(12)
455                // P=1 (pre-indexed), U=1 (add offset), L=1 (load)
456                0xE5900000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
457            }
458
459            ArmOp::Str { rd, addr } => {
460                let rd_bits = reg_to_bits(rd);
461                let (base_bits, offset_bits) = encode_mem_addr(addr);
462
463                // STR encoding: L=0 (store)
464                0xE5800000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
465            }
466
467            // Sub-word loads (ARM32 encoding)
468            ArmOp::Ldrb { rd, addr } => {
469                let rd_bits = reg_to_bits(rd);
470                let (base_bits, offset_bits) = encode_mem_addr(addr);
471                // LDRB: LDR with B=1 (byte): cond|01|I|P|U|1|W|L|Rn|Rd|offset
472                0xE5D00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
473            }
474
475            ArmOp::Ldrsb { rd, addr } => {
476                let rd_bits = reg_to_bits(rd);
477                let (base_bits, offset_bits) = encode_mem_addr(addr);
478                // LDRSB (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1101|imm4L
479                // Simplified with immediate offset
480                let offset_val = offset_bits & 0xFF;
481                let imm4h = (offset_val >> 4) & 0xF;
482                let imm4l = offset_val & 0xF;
483                0xE1D000D0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
484            }
485
486            ArmOp::Ldrh { rd, addr } => {
487                let rd_bits = reg_to_bits(rd);
488                let (base_bits, offset_bits) = encode_mem_addr(addr);
489                // LDRH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1011|imm4L
490                let offset_val = offset_bits & 0xFF;
491                let imm4h = (offset_val >> 4) & 0xF;
492                let imm4l = offset_val & 0xF;
493                0xE1D000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
494            }
495
496            ArmOp::Ldrsh { rd, addr } => {
497                let rd_bits = reg_to_bits(rd);
498                let (base_bits, offset_bits) = encode_mem_addr(addr);
499                // LDRSH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1111|imm4L
500                let offset_val = offset_bits & 0xFF;
501                let imm4h = (offset_val >> 4) & 0xF;
502                let imm4l = offset_val & 0xF;
503                0xE1D000F0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
504            }
505
506            // Sub-word stores (ARM32 encoding)
507            ArmOp::Strb { rd, addr } => {
508                let rd_bits = reg_to_bits(rd);
509                let (base_bits, offset_bits) = encode_mem_addr(addr);
510                // STRB: STR with B=1 (byte): cond|01|I|P|U|1|W|0|Rn|Rd|offset
511                0xE5C00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
512            }
513
514            ArmOp::Strh { rd, addr } => {
515                let rd_bits = reg_to_bits(rd);
516                let (base_bits, offset_bits) = encode_mem_addr(addr);
517                // STRH (misc store): cond|000|P|U|1|W|0|Rn|Rd|imm4H|1011|imm4L
518                let offset_val = offset_bits & 0xFF;
519                let imm4h = (offset_val >> 4) & 0xF;
520                let imm4l = offset_val & 0xF;
521                0xE1C000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
522            }
523
524            // Memory management (ARM32 encoding)
525            ArmOp::MemorySize { rd } => {
526                let rd_bits = reg_to_bits(rd);
527                // MOV rd, R10, LSR #16  (memory size in bytes / 65536 = pages)
528                // cond|000|1101|S|0000|Rd|shift5|type|0|Rm
529                // LSR #16: shift5=10000, type=01
530                0xE1A00820 | (rd_bits << 12) | 0x0A // Rm=R10, shift=16, LSR
531            }
532
533            ArmOp::MemoryGrow { rd, .. } => {
534                let rd_bits = reg_to_bits(rd);
535                // On embedded, always fail: MOV rd, #-1
536                0xE3E00000 | (rd_bits << 12) // MVN rd, #0 = MOV rd, #-1
537            }
538
539            // Label pseudo-instruction: emits no machine code
540            ArmOp::Label { .. } => {
541                return Ok(Vec::new());
542            }
543
544            // Branch instructions
545            ArmOp::B { label: _ } => {
546                // B encoding: cond(4) | 1010 | offset(24)
547                // Simplified: branch to offset 0 (will be patched by linker/resolver)
548                0xEA000000
549            }
550
551            // Conditional branch to label (generic)
552            ArmOp::Bcc { cond, label: _ } => {
553                use synth_synthesis::Condition;
554                let cond_bits: u32 = match cond {
555                    Condition::EQ => 0x0,
556                    Condition::NE => 0x1,
557                    Condition::HS => 0x2,
558                    Condition::LO => 0x3,
559                    Condition::HI => 0x8,
560                    Condition::LS => 0x9,
561                    Condition::GE => 0xA,
562                    Condition::LT => 0xB,
563                    Condition::GT => 0xC,
564                    Condition::LE => 0xD,
565                };
566                // B<cond> with offset 0 (will be patched)
567                (cond_bits << 28) | 0x0A000000
568            }
569
570            // BHS (Branch if Higher or Same) - used for bounds checking
571            ArmOp::Bhs { label: _ } => {
572                // BHS encoding: cond(2=HS) | 1010 | offset(24)
573                0x2A000000 // BHS with offset 0
574            }
575
576            // BLO (Branch if Lower) - complementary to BHS
577            ArmOp::Blo { label: _ } => {
578                // BLO encoding: cond(3=LO) | 1010 | offset(24)
579                0x3A000000 // BLO with offset 0
580            }
581
582            // Branch with numeric offset (in instructions)
583            // ARM32 B instruction: offset is in instructions, stored as words
584            // The offset is relative to PC+8 (due to ARM pipeline)
585            ArmOp::BOffset { offset } => {
586                // B encoding: cond(4) | 1010 | offset(24)
587                // Offset is signed, in words (4-byte units)
588                // ARM adds PC+8 to the offset, so we need to adjust:
589                // target = PC + 8 + (offset * 4)
590                // For backward branch of N instructions: offset = -(N + 2)
591                // wrapping_sub keeps the encoder total under fuzzing (#186): an
592                // extreme i32::MIN offset would otherwise overflow-panic; for any
593                // real branch offset this is identical to `- 2`.
594                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
595                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
596                0xEA000000 | offset_bits
597            }
598
599            // Conditional branch with numeric offset
600            ArmOp::BCondOffset { cond, offset } => {
601                use synth_synthesis::Condition;
602                let cond_bits: u32 = match cond {
603                    Condition::EQ => 0x0,
604                    Condition::NE => 0x1,
605                    Condition::HS => 0x2,
606                    Condition::LO => 0x3,
607                    Condition::HI => 0x8,
608                    Condition::LS => 0x9,
609                    Condition::GE => 0xA,
610                    Condition::LT => 0xB,
611                    Condition::GT => 0xC,
612                    Condition::LE => 0xD,
613                };
614                // B<cond> encoding: cond(4) | 1010 | offset(24)
615                // wrapping_sub: total under fuzzing (#186), identical for real offsets.
616                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
617                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
618                (cond_bits << 28) | 0x0A000000 | offset_bits
619            }
620
621            ArmOp::Bl { label: _ } => {
622                // BL encoding: cond(4) | 1011 | offset(24)
623                0xEB000000
624            }
625
626            ArmOp::Bx { rm } => {
627                let rm_bits = reg_to_bits(rm);
628
629                // BX encoding: cond(4) | 000100101111111111110001 | Rm(4)
630                0xE12FFF10 | rm_bits
631            }
632
633            ArmOp::Blx { rm } => {
634                let rm_bits = reg_to_bits(rm);
635
636                // BLX (register) encoding: cond(4) | 000100101111111111110011 | Rm(4)
637                0xE12FFF30 | rm_bits
638            }
639
640            ArmOp::Push { regs } => {
641                // STMDB SP!, {regs} encoding: cond(4) | 100100 | 10 | 1101 | register_list(16)
642                let mut reg_list: u32 = 0;
643                for r in regs {
644                    reg_list |= 1 << reg_to_bits(r);
645                }
646                0xE92D0000 | reg_list
647            }
648
649            ArmOp::Pop { regs } => {
650                // LDMIA SP!, {regs} encoding: cond(4) | 100010 | 11 | 1101 | register_list(16)
651                let mut reg_list: u32 = 0;
652                for r in regs {
653                    reg_list |= 1 << reg_to_bits(r);
654                }
655                0xE8BD0000 | reg_list
656            }
657
658            ArmOp::Nop => {
659                // NOP encoding: MOV R0, R0
660                0xE1A00000
661            }
662
663            ArmOp::Udf { imm } => {
664                // UDF (Undefined) encoding in ARM: 0xE7F000F0 | (imm12_hi << 8) | imm4_lo
665                // We only use imm8, so split into imm4_hi and imm4_lo
666                let imm8 = *imm as u32;
667                0xE7F000F0 | ((imm8 & 0xF0) << 4) | (imm8 & 0x0F)
668            }
669
670            // Pseudo-instructions for verification - encode as NOP
671            // These are used in formal verification but not actual code generation
672            ArmOp::Popcnt { .. } => {
673                // Population count pseudo-instruction
674                // Not a real ARM instruction, would be expanded to actual code
675                0xE1A00000 // NOP for now
676            }
677
678            ArmOp::SetCond { .. } => {
679                // Condition evaluation pseudo-instruction
680                // Not a real ARM instruction, would be expanded to actual code
681                0xE1A00000 // NOP for now
682            }
683
684            ArmOp::SelectMove { .. } => {
685                // Conditional move pseudo-instruction for ARM32
686                // Would use MOV{cond} instruction
687                0xE1A00000 // NOP for now
688            }
689
690            ArmOp::Select { .. } => {
691                // Select pseudo-instruction
692                // Not a real ARM instruction, would be expanded to conditional moves
693                0xE1A00000 // NOP for now
694            }
695
696            ArmOp::LocalGet { .. } => {
697                // Local variable get pseudo-instruction
698                // Not a real ARM instruction, would be expanded to memory access
699                0xE1A00000 // NOP for now
700            }
701
702            ArmOp::LocalSet { .. } => {
703                // Local variable set pseudo-instruction
704                // Not a real ARM instruction, would be expanded to memory access
705                0xE1A00000 // NOP for now
706            }
707
708            ArmOp::LocalTee { .. } => {
709                // Local variable tee pseudo-instruction
710                // Not a real ARM instruction, would be expanded to memory access
711                0xE1A00000 // NOP for now
712            }
713
714            ArmOp::GlobalGet { .. } => {
715                // Global variable get pseudo-instruction
716                // Not a real ARM instruction, would be expanded to memory access
717                0xE1A00000 // NOP for now
718            }
719
720            ArmOp::GlobalSet { .. } => {
721                // Global variable set pseudo-instruction
722                // Not a real ARM instruction, would be expanded to memory access
723                0xE1A00000 // NOP for now
724            }
725
726            ArmOp::BrTable { .. } => {
727                // Branch table pseudo-instruction
728                // Not a real ARM instruction, would be expanded to jump table
729                0xE1A00000 // NOP for now
730            }
731
732            ArmOp::Call { .. } => {
733                // Function call pseudo-instruction
734                // Not a real ARM instruction, would be expanded to BL
735                0xE1A00000 // NOP for now
736            }
737
738            ArmOp::CallIndirect { .. } => {
739                // Indirect function call pseudo-instruction
740                // Not a real ARM instruction, would be expanded to indirect branch
741                0xE1A00000 // NOP for now
742            }
743
744            // i64 pseudo-instructions (Phase 2) - encode as NOP for now
745            // Real compiler would expand these to multi-instruction sequences
746            ArmOp::I64Add { .. } => 0xE1A00000,        // NOP
747            ArmOp::I64Sub { .. } => 0xE1A00000,        // NOP
748            ArmOp::I64DivS { .. } => 0xE1A00000,       // NOP
749            ArmOp::I64DivU { .. } => 0xE1A00000,       // NOP
750            ArmOp::I64RemS { .. } => 0xE1A00000,       // NOP
751            ArmOp::I64RemU { .. } => 0xE1A00000,       // NOP
752            ArmOp::I64Clz { .. } => 0xE1A00000,        // NOP
753            ArmOp::I64Ctz { .. } => 0xE1A00000,        // NOP
754            ArmOp::I64Popcnt { .. } => 0xE1A00000,     // NOP
755            ArmOp::I64And { .. } => 0xE1A00000,        // NOP
756            ArmOp::I64Or { .. } => 0xE1A00000,         // NOP
757            ArmOp::I64Xor { .. } => 0xE1A00000,        // NOP
758            ArmOp::I64Eqz { .. } => 0xE1A00000,        // NOP
759            ArmOp::I64Eq { .. } => 0xE1A00000,         // NOP
760            ArmOp::I64Ne { .. } => 0xE1A00000,         // NOP
761            ArmOp::I64LtS { .. } => 0xE1A00000,        // NOP
762            ArmOp::I64LtU { .. } => 0xE1A00000,        // NOP
763            ArmOp::I64LeS { .. } => 0xE1A00000,        // NOP
764            ArmOp::I64LeU { .. } => 0xE1A00000,        // NOP
765            ArmOp::I64GtS { .. } => 0xE1A00000,        // NOP
766            ArmOp::I64GtU { .. } => 0xE1A00000,        // NOP
767            ArmOp::I64GeS { .. } => 0xE1A00000,        // NOP
768            ArmOp::I64GeU { .. } => 0xE1A00000,        // NOP
769            ArmOp::I64Const { .. } => 0xE1A00000,      // NOP
770            ArmOp::I64Ldr { .. } => 0xE1A00000,        // NOP
771            ArmOp::I64Str { .. } => 0xE1A00000,        // NOP
772            ArmOp::I64ExtendI32S { .. } => 0xE1A00000, // NOP
773            ArmOp::I64ExtendI32U { .. } => 0xE1A00000, // NOP
774            ArmOp::I64Extend8S { .. } => 0xE1A00000,   // NOP (Thumb-2 only)
775            ArmOp::I64Extend16S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
776            ArmOp::I64Extend32S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
777            ArmOp::I32WrapI64 { .. } => 0xE1A00000,    // NOP
778
779            // f32 VFP single-precision instructions
780            ArmOp::F32Add { sd, sn, sm } => encode_vfp_3reg(0xEE300A00, sd, sn, sm)?,
781            ArmOp::F32Sub { sd, sn, sm } => encode_vfp_3reg(0xEE300A40, sd, sn, sm)?,
782            ArmOp::F32Mul { sd, sn, sm } => encode_vfp_3reg(0xEE200A00, sd, sn, sm)?,
783            ArmOp::F32Div { sd, sn, sm } => encode_vfp_3reg(0xEE800A00, sd, sn, sm)?,
784            ArmOp::F32Abs { sd, sm } => encode_vfp_2reg(0xEEB00AC0, sd, sm)?,
785            ArmOp::F32Neg { sd, sm } => encode_vfp_2reg(0xEEB10A40, sd, sm)?,
786            ArmOp::F32Sqrt { sd, sm } => encode_vfp_2reg(0xEEB10AC0, sd, sm)?,
787
788            // f32 pseudo-ops — multi-instruction sequences
789            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
790            ArmOp::F32Ceil { sd, sm } => {
791                return self.encode_arm_f32_rounding(sd, sm, 0b01); // Round toward +Inf
792            }
793            ArmOp::F32Floor { sd, sm } => {
794                return self.encode_arm_f32_rounding(sd, sm, 0b10); // Round toward -Inf
795            }
796            ArmOp::F32Trunc { sd, sm } => {
797                return self.encode_arm_f32_rounding(sd, sm, 0b11); // VCVT toward zero
798            }
799            ArmOp::F32Nearest { sd, sm } => {
800                return self.encode_arm_f32_rounding(sd, sm, 0b00); // VCVT to nearest
801            }
802            ArmOp::F32Min { sd, sn, sm } => {
803                return self.encode_arm_f32_minmax(sd, sn, sm, true);
804            }
805            ArmOp::F32Max { sd, sn, sm } => {
806                return self.encode_arm_f32_minmax(sd, sn, sm, false);
807            }
808            ArmOp::F32Copysign { sd, sn, sm } => {
809                return self.encode_arm_f32_copysign(sd, sn, sm);
810            }
811
812            // f32 comparisons — multi-instruction: VCMP + VMRS + conditional MOV
813            ArmOp::F32Eq { rd, sn, sm } => {
814                return self.encode_arm_f32_compare(rd, sn, sm, 0x0); // EQ
815            }
816            ArmOp::F32Ne { rd, sn, sm } => {
817                return self.encode_arm_f32_compare(rd, sn, sm, 0x1); // NE
818            }
819            ArmOp::F32Lt { rd, sn, sm } => {
820                return self.encode_arm_f32_compare(rd, sn, sm, 0x4); // MI (less than)
821            }
822            ArmOp::F32Le { rd, sn, sm } => {
823                return self.encode_arm_f32_compare(rd, sn, sm, 0x9); // LS (less or same)
824            }
825            ArmOp::F32Gt { rd, sn, sm } => {
826                return self.encode_arm_f32_compare(rd, sn, sm, 0xC); // GT
827            }
828            ArmOp::F32Ge { rd, sn, sm } => {
829                return self.encode_arm_f32_compare(rd, sn, sm, 0xA); // GE
830            }
831
832            // f32 const — multi-instruction: MOVW + MOVT + VMOV
833            ArmOp::F32Const { sd, value } => {
834                return self.encode_arm_f32_const(sd, *value);
835            }
836
837            ArmOp::F32Load { sd, addr } => encode_vfp_ldst(0xED900A00, sd, addr)?,
838            ArmOp::F32Store { sd, addr } => encode_vfp_ldst(0xED800A00, sd, addr)?,
839
840            // f32 conversions — multi-instruction sequences
841            ArmOp::F32ConvertI32S { sd, rm } => {
842                return self.encode_arm_f32_convert_i32(sd, rm, true);
843            }
844            ArmOp::F32ConvertI32U { sd, rm } => {
845                return self.encode_arm_f32_convert_i32(sd, rm, false);
846            }
847            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
848                return Err(synth_core::Error::synthesis(
849                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
850                ));
851            }
852            ArmOp::F32ReinterpretI32 { sd, rm } => encode_vmov_core_sreg(true, sd, rm)?,
853            ArmOp::I32ReinterpretF32 { rd, sm } => encode_vmov_core_sreg(false, sm, rd)?,
854            ArmOp::I32TruncF32S { rd, sm } => {
855                return self.encode_arm_i32_trunc_f32(rd, sm, true);
856            }
857            ArmOp::I32TruncF32U { rd, sm } => {
858                return self.encode_arm_i32_trunc_f32(rd, sm, false);
859            }
860
861            // f64 VFP double-precision instructions (ARM32)
862            // F64 arithmetic: same as F32 but with sz=1 (bit 8 = 1, cp11 = 0xB)
863            ArmOp::F64Add { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B00, dd, dn, dm)?,
864            ArmOp::F64Sub { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B40, dd, dn, dm)?,
865            ArmOp::F64Mul { dd, dn, dm } => encode_vfp_3reg_f64(0xEE200B00, dd, dn, dm)?,
866            ArmOp::F64Div { dd, dn, dm } => encode_vfp_3reg_f64(0xEE800B00, dd, dn, dm)?,
867            ArmOp::F64Abs { dd, dm } => encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?,
868            ArmOp::F64Neg { dd, dm } => encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?,
869            ArmOp::F64Sqrt { dd, dm } => encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?,
870
871            // f64 pseudo-ops
872            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
873            ArmOp::F64Ceil { dd, dm } => {
874                return self.encode_arm_f64_rounding(dd, dm, 0b01);
875            }
876            ArmOp::F64Floor { dd, dm } => {
877                return self.encode_arm_f64_rounding(dd, dm, 0b10);
878            }
879            ArmOp::F64Trunc { dd, dm } => {
880                return self.encode_arm_f64_rounding(dd, dm, 0b11);
881            }
882            ArmOp::F64Nearest { dd, dm } => {
883                return self.encode_arm_f64_rounding(dd, dm, 0b00);
884            }
885            ArmOp::F64Min { dd, dn, dm } => {
886                return self.encode_arm_f64_minmax(dd, dn, dm, true);
887            }
888            ArmOp::F64Max { dd, dn, dm } => {
889                return self.encode_arm_f64_minmax(dd, dn, dm, false);
890            }
891            ArmOp::F64Copysign { dd, dn, dm } => {
892                return self.encode_arm_f64_copysign(dd, dn, dm);
893            }
894
895            // f64 comparisons
896            ArmOp::F64Eq { rd, dn, dm } => {
897                return self.encode_arm_f64_compare(rd, dn, dm, 0x0);
898            }
899            ArmOp::F64Ne { rd, dn, dm } => {
900                return self.encode_arm_f64_compare(rd, dn, dm, 0x1);
901            }
902            ArmOp::F64Lt { rd, dn, dm } => {
903                return self.encode_arm_f64_compare(rd, dn, dm, 0x4);
904            }
905            ArmOp::F64Le { rd, dn, dm } => {
906                return self.encode_arm_f64_compare(rd, dn, dm, 0x9);
907            }
908            ArmOp::F64Gt { rd, dn, dm } => {
909                return self.encode_arm_f64_compare(rd, dn, dm, 0xC);
910            }
911            ArmOp::F64Ge { rd, dn, dm } => {
912                return self.encode_arm_f64_compare(rd, dn, dm, 0xA);
913            }
914
915            ArmOp::F64Const { dd, value } => {
916                return self.encode_arm_f64_const(dd, *value);
917            }
918
919            ArmOp::F64Load { dd, addr } => encode_vfp_ldst_f64(0xED900B00, dd, addr)?,
920            ArmOp::F64Store { dd, addr } => encode_vfp_ldst_f64(0xED800B00, dd, addr)?,
921
922            ArmOp::F64ConvertI32S { dd, rm } => {
923                return self.encode_arm_f64_convert_i32(dd, rm, true);
924            }
925            ArmOp::F64ConvertI32U { dd, rm } => {
926                return self.encode_arm_f64_convert_i32(dd, rm, false);
927            }
928            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
929                return Err(synth_core::Error::synthesis(
930                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
931                ));
932            }
933            ArmOp::F64PromoteF32 { dd, sm } => {
934                return self.encode_arm_f64_promote_f32(dd, sm);
935            }
936            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => {
937                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?
938            }
939            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => {
940                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?
941            }
942            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
943                return Err(synth_core::Error::synthesis(
944                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
945                ));
946            }
947            ArmOp::I32TruncF64S { rd, dm } => {
948                return self.encode_arm_i32_trunc_f64(rd, dm, true);
949            }
950            ArmOp::I32TruncF64U { rd, dm } => {
951                return self.encode_arm_i32_trunc_f64(rd, dm, false);
952            }
953            // Multi-instruction sequences - only meaningful in Thumb-2 mode
954            ArmOp::I64SetCond { .. }
955            | ArmOp::I64SetCondZ { .. }
956            | ArmOp::I64Mul { .. }
957            | ArmOp::I64Shl { .. }
958            | ArmOp::I64ShrS { .. }
959            | ArmOp::I64ShrU { .. }
960            | ArmOp::I64Rotl { .. }
961            | ArmOp::I64Rotr { .. } => 0xE1A00000, // NOP (Thumb-2 only)
962
963            // MVE instructions — Thumb-2 only (Cortex-M55 is always Thumb-2)
964            ArmOp::MveLoad { .. }
965            | ArmOp::MveStore { .. }
966            | ArmOp::MveConst { .. }
967            | ArmOp::MveAnd { .. }
968            | ArmOp::MveOrr { .. }
969            | ArmOp::MveEor { .. }
970            | ArmOp::MveMvn { .. }
971            | ArmOp::MveBic { .. }
972            | ArmOp::MveAddI { .. }
973            | ArmOp::MveSubI { .. }
974            | ArmOp::MveMulI { .. }
975            | ArmOp::MveNegI { .. }
976            | ArmOp::MveCmpEqI { .. }
977            | ArmOp::MveCmpNeI { .. }
978            | ArmOp::MveCmpLtS { .. }
979            | ArmOp::MveCmpLtU { .. }
980            | ArmOp::MveCmpGtS { .. }
981            | ArmOp::MveCmpGtU { .. }
982            | ArmOp::MveCmpLeS { .. }
983            | ArmOp::MveCmpLeU { .. }
984            | ArmOp::MveCmpGeS { .. }
985            | ArmOp::MveCmpGeU { .. }
986            | ArmOp::MveDup { .. }
987            | ArmOp::MveExtractLane { .. }
988            | ArmOp::MveInsertLane { .. }
989            | ArmOp::MveAddF32 { .. }
990            | ArmOp::MveSubF32 { .. }
991            | ArmOp::MveMulF32 { .. }
992            | ArmOp::MveNegF32 { .. }
993            | ArmOp::MveAbsF32 { .. }
994            | ArmOp::MveCmpEqF32 { .. }
995            | ArmOp::MveCmpNeF32 { .. }
996            | ArmOp::MveCmpLtF32 { .. }
997            | ArmOp::MveCmpLeF32 { .. }
998            | ArmOp::MveCmpGtF32 { .. }
999            | ArmOp::MveCmpGeF32 { .. }
1000            | ArmOp::MveDupF32 { .. }
1001            | ArmOp::MveExtractLaneF32 { .. }
1002            | ArmOp::MveReplaceLaneF32 { .. }
1003            | ArmOp::MveDivF32 { .. }
1004            | ArmOp::MveSqrtF32 { .. } => 0xE1A00000, // NOP (MVE = Thumb-2 only)
1005        };
1006
1007        // ARM32 instructions are little-endian
1008        Ok(instr.to_le_bytes().to_vec())
1009    }
1010
1011    // === ARM32 VFP multi-instruction helpers ===
1012
1013    /// Encode F32 comparison as ARM32: VCMP.F32 + VMRS + MOV rd,#0 + MOVcond rd,#1
1014    fn encode_arm_f32_compare(
1015        &self,
1016        rd: &Reg,
1017        sn: &VfpReg,
1018        sm: &VfpReg,
1019        cond_code: u32,
1020    ) -> Result<Vec<u8>> {
1021        let mut bytes = Vec::new();
1022
1023        // VCMP.F32 Sn, Sm: 0xEEB40A40 with Sn in Vd position, Sm in Vm position
1024        let sn_num = vfp_sreg_to_num(sn)?;
1025        let sm_num = vfp_sreg_to_num(sm)?;
1026        let (vd, d) = encode_sreg(sn_num);
1027        let (vm, m) = encode_sreg(sm_num);
1028        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1029        bytes.extend_from_slice(&vcmp.to_le_bytes());
1030
1031        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
1032        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1033
1034        // MOV rd, #0: 0xE3A0_0000 | (rd << 12)
1035        let rd_bits = reg_to_bits(rd);
1036        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1037        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1038
1039        // MOVcond rd, #1: cond(4) | 0011 1010 0000 rd(4) 0000 0000 0001
1040        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1041        bytes.extend_from_slice(&mov_one.to_le_bytes());
1042
1043        Ok(bytes)
1044    }
1045
1046    /// Encode F32 constant load as ARM32: MOVW Rt,#lo16 + MOVT Rt,#hi16 + VMOV Sd,Rt
1047    fn encode_arm_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
1048        let mut bytes = Vec::new();
1049        let bits = value.to_bits();
1050
1051        // Use R12 as temp register for constant loading
1052        let rt: u32 = 12; // R12/IP
1053
1054        // MOVW R12, #lo16: 0xE300_C000 | (imm4 << 16) | imm12
1055        let lo16 = bits & 0xFFFF;
1056        let movw = 0xE3000000 | (rt << 12) | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1057        bytes.extend_from_slice(&movw.to_le_bytes());
1058
1059        // MOVT R12, #hi16: 0xE340_C000 | (imm4 << 16) | imm12
1060        let hi16 = (bits >> 16) & 0xFFFF;
1061        let movt = 0xE3400000 | (rt << 12) | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1062        bytes.extend_from_slice(&movt.to_le_bytes());
1063
1064        // VMOV Sd, R12
1065        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
1066        bytes.extend_from_slice(&vmov.to_le_bytes());
1067
1068        Ok(bytes)
1069    }
1070
1071    /// Encode VMOV + VCVT.F32.S32/U32 as ARM32
1072    fn encode_arm_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1073        let mut bytes = Vec::new();
1074
1075        // VMOV Sd, Rm — move integer to VFP register
1076        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
1077        bytes.extend_from_slice(&vmov.to_le_bytes());
1078
1079        // VCVT.F32.S32 Sd, Sd (signed) or VCVT.F32.U32 Sd, Sd (unsigned)
1080        // Base: 0xEEB80A40 (signed) or 0xEEB80AC0 (unsigned)
1081        let sd_num = vfp_sreg_to_num(sd)?;
1082        let (vd, d) = encode_sreg(sd_num);
1083        let (vm, m) = encode_sreg(sd_num); // same register as source
1084        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
1085        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1086        bytes.extend_from_slice(&vcvt.to_le_bytes());
1087
1088        Ok(bytes)
1089    }
1090
1091    /// Encode F32 rounding pseudo-op as ARM32 via VCVT to integer and back.
1092    /// mode: 0b00=nearest, 0b01=floor(-Inf), 0b10=ceil(+Inf), 0b11=trunc(zero)
1093    /// Strategy: VCVT.S32.F32 Sd, Sm (toward zero), then VCVT.F32.S32 Sd, Sd
1094    /// For ceil/floor/nearest, we use VCVTR (round toward mode) + convert back.
1095    /// Simplified: convert to int (toward zero for trunc) then back to float.
1096    /// Encode F32 rounding as ARM32.
1097    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1098    ///
1099    /// For trunc (mode=0b11): uses VCVTR.S32.F32 (always rounds toward zero).
1100    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant
1101    /// which honours FPSCR rmode), then restores FPSCR.
1102    fn encode_arm_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1103        let mut bytes = Vec::new();
1104        let sm_num = vfp_sreg_to_num(sm)?;
1105        let sd_num = vfp_sreg_to_num(sd)?;
1106        let (vd_s, d_s) = encode_sreg(sd_num);
1107        let (vm_s, m_s) = encode_sreg(sm_num);
1108
1109        if mode == 0b11 {
1110            // Trunc (toward zero): VCVTR.S32.F32 — the "R" variant always truncates.
1111            // 0xEEBD0AC0: bit[7]=1 => round toward zero regardless of FPSCR
1112            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1113            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1114        } else {
1115            // ceil/floor/nearest: manipulate FPSCR rounding mode
1116            let rt: u32 = 12; // R12/IP as temp
1117
1118            // VMRS R12, FPSCR
1119            let vmrs = 0xEEF10A10 | (rt << 12);
1120            bytes.extend_from_slice(&vmrs.to_le_bytes());
1121
1122            // BIC R12, R12, #(3 << 22) — clear RMode bits [23:22]
1123            // 3<<22 = 0x00C00000. ARM rotated imm: 0x03 ror 10 (rotation=5, imm8=0x03)
1124            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1125            bytes.extend_from_slice(&bic.to_le_bytes());
1126
1127            // ORR R12, R12, #(mode << 22) — set desired rounding mode
1128            if mode != 0 {
1129                // mode<<22: rotation=5, imm8=mode
1130                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1131                bytes.extend_from_slice(&orr.to_le_bytes());
1132            }
1133
1134            // VMSR FPSCR, R12
1135            let vmsr = 0xEEE10A10 | (rt << 12);
1136            bytes.extend_from_slice(&vmsr.to_le_bytes());
1137
1138            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rounding mode
1139            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1140            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1141
1142            // Restore FPSCR: clear rmode bits back to nearest (default)
1143            bytes.extend_from_slice(&vmrs.to_le_bytes());
1144            bytes.extend_from_slice(&bic.to_le_bytes());
1145            bytes.extend_from_slice(&vmsr.to_le_bytes());
1146        }
1147
1148        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
1149        let (vd2, d2) = encode_sreg(sd_num);
1150        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
1151        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1152
1153        Ok(bytes)
1154    }
1155
1156    /// Encode F32 min/max as ARM32: VCMP + VMRS + conditional VMOV
1157    fn encode_arm_f32_minmax(
1158        &self,
1159        sd: &VfpReg,
1160        sn: &VfpReg,
1161        sm: &VfpReg,
1162        is_min: bool,
1163    ) -> Result<Vec<u8>> {
1164        let mut bytes = Vec::new();
1165        let sn_num = vfp_sreg_to_num(sn)?;
1166        let sm_num = vfp_sreg_to_num(sm)?;
1167        let sd_num = vfp_sreg_to_num(sd)?;
1168
1169        // VMOV Sd, Sn (start with first operand)
1170        let (vd, d) = encode_sreg(sd_num);
1171        let (vn, n) = encode_sreg(sn_num);
1172        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1173        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1174
1175        // VCMP.F32 Sn, Sm
1176        let (vm, m) = encode_sreg(sm_num);
1177        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1178        bytes.extend_from_slice(&vcmp.to_le_bytes());
1179
1180        // VMRS APSR_nzcv, FPSCR
1181        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1182
1183        // For min: if Sn > Sm (GT), use Sm. Condition = GT (0xC)
1184        // For max: if Sn < Sm (MI/LT), use Sm. Condition = MI (0x4)
1185        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1186
1187        // VMOV{cond} Sd, Sm — conditional VMOV
1188        let vmov_cond = (cond << 28) | 0x0EB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1189        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1190
1191        Ok(bytes)
1192    }
1193
1194    /// Encode F32 copysign as ARM32: extract sign from Sm, magnitude from Sn
1195    fn encode_arm_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1196        let mut bytes = Vec::new();
1197
1198        // VMOV R12, Sm (get sign source bits)
1199        let vmov_sm = encode_vmov_core_sreg(false, sm, &Reg::R12)?;
1200        bytes.extend_from_slice(&vmov_sm.to_le_bytes());
1201
1202        // VMOV R0, Sn (get magnitude source bits) — use R0 as temp
1203        let vmov_sn = encode_vmov_core_sreg(false, sn, &Reg::R0)?;
1204        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1205
1206        // AND R12, R12, #0x80000000 (keep only sign bit)
1207        // Thumb-2 constant 0x80000000 needs special encoding; in ARM32 use rotated imm
1208        // 0x80000000 = 0x02 rotated right by 2 (rotation=1, imm8=0x02)
1209        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1210        bytes.extend_from_slice(&and_sign.to_le_bytes());
1211
1212        // BIC R0, R0, #0x80000000 (clear sign bit from magnitude)
1213        // R0 = register 0, so Rn and Rd fields are 0
1214        let bic_sign = 0xE3C00000u32 | (1 << 8) | 0x02;
1215        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1216
1217        // ORR R0, R0, R12 (combine sign + magnitude)
1218        // R0 = register 0, so Rn and Rd fields are 0
1219        let orr = 0xE1800000u32 | 12;
1220        bytes.extend_from_slice(&orr.to_le_bytes());
1221
1222        // VMOV Sd, R0
1223        let vmov_result = encode_vmov_core_sreg(true, sd, &Reg::R0)?;
1224        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1225
1226        Ok(bytes)
1227    }
1228
1229    /// Encode F64 comparison as ARM32: VCMP.F64 + VMRS + MOV rd,#0 + MOVcond rd,#1
1230    fn encode_arm_f64_compare(
1231        &self,
1232        rd: &Reg,
1233        dn: &VfpReg,
1234        dm: &VfpReg,
1235        cond_code: u32,
1236    ) -> Result<Vec<u8>> {
1237        let mut bytes = Vec::new();
1238
1239        // VCMP.F64 Dn, Dm: 0xEEB40B40 with Dn in Vd position, Dm in Vm position
1240        let dn_num = vfp_dreg_to_num(dn)?;
1241        let dm_num = vfp_dreg_to_num(dm)?;
1242        let (vd, d) = encode_dreg(dn_num);
1243        let (vm, m) = encode_dreg(dm_num);
1244        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1245        bytes.extend_from_slice(&vcmp.to_le_bytes());
1246
1247        // VMRS APSR_nzcv, FPSCR
1248        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1249
1250        // MOV rd, #0
1251        let rd_bits = reg_to_bits(rd);
1252        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1253        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1254
1255        // MOVcond rd, #1
1256        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1257        bytes.extend_from_slice(&mov_one.to_le_bytes());
1258
1259        Ok(bytes)
1260    }
1261
1262    /// Encode F64 constant load as ARM32: MOVW + MOVT + MOVW + MOVT + VMOV
1263    fn encode_arm_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
1264        let mut bytes = Vec::new();
1265        let bits = value.to_bits();
1266        let lo32 = bits as u32;
1267        let hi32 = (bits >> 32) as u32;
1268
1269        // Load low 32 bits into R0 (Rd field = 0 for R0)
1270        let lo16 = lo32 & 0xFFFF;
1271        let movw_r0 = 0xE3000000 | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1272        bytes.extend_from_slice(&movw_r0.to_le_bytes());
1273        let hi16 = (lo32 >> 16) & 0xFFFF;
1274        let movt_r0 = 0xE3400000 | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1275        bytes.extend_from_slice(&movt_r0.to_le_bytes());
1276
1277        // Load high 32 bits into R12
1278        let lo16 = hi32 & 0xFFFF;
1279        let movw_r12 = 0xE3000000 | ((lo16 >> 12) << 16) | (12 << 12) | (lo16 & 0xFFF);
1280        bytes.extend_from_slice(&movw_r12.to_le_bytes());
1281        let hi16 = (hi32 >> 16) & 0xFFFF;
1282        let movt_r12 = 0xE3400000 | ((hi16 >> 12) << 16) | (12 << 12) | (hi16 & 0xFFF);
1283        bytes.extend_from_slice(&movt_r12.to_le_bytes());
1284
1285        // VMOV Dd, R0, R12
1286        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
1287        bytes.extend_from_slice(&vmov.to_le_bytes());
1288
1289        Ok(bytes)
1290    }
1291
1292    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as ARM32
1293    fn encode_arm_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1294        let mut bytes = Vec::new();
1295
1296        // Use S0 as intermediate: VMOV S0, Rm
1297        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
1298        bytes.extend_from_slice(&vmov.to_le_bytes());
1299
1300        // VCVT.F64.S32 Dd, S0 (signed) or VCVT.F64.U32 Dd, S0 (unsigned)
1301        // Base: 0xEEB80B40 (signed) or 0xEEB80BC0 (unsigned)
1302        let dd_num = vfp_dreg_to_num(dd)?;
1303        let (vd, d) = encode_dreg(dd_num);
1304        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
1305        // S0 is register 0: Vm=0, M=0
1306        let vcvt = base | (d << 22) | (vd << 12);
1307        bytes.extend_from_slice(&vcvt.to_le_bytes());
1308
1309        Ok(bytes)
1310    }
1311
1312    /// Encode VCVT.F64.F32 Dd, Sm as ARM32 (f32 to f64 promotion)
1313    fn encode_arm_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1314        let dd_num = vfp_dreg_to_num(dd)?;
1315        let sm_num = vfp_sreg_to_num(sm)?;
1316        let (vd, d) = encode_dreg(dd_num);
1317        let (vm, m) = encode_sreg(sm_num);
1318
1319        // VCVT.F64.F32 Dd, Sm: 0xEEB70AC0
1320        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
1321        Ok(vcvt.to_le_bytes().to_vec())
1322    }
1323
1324    /// Encode VCVT.S32/U32.F64 Sd, Dm + VMOV Rd, Sd as ARM32
1325    fn encode_arm_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1326        let mut bytes = Vec::new();
1327        let dm_num = vfp_dreg_to_num(dm)?;
1328        let (vm, m) = encode_dreg(dm_num);
1329
1330        // VCVT.S32.F64 S0, Dm (toward zero) or VCVT.U32.F64 S0, Dm
1331        // S0: Vd=0, D=0
1332        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
1333        let vcvt = base | (m << 5) | vm;
1334        bytes.extend_from_slice(&vcvt.to_le_bytes());
1335
1336        // VMOV Rd, S0
1337        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
1338        bytes.extend_from_slice(&vmov.to_le_bytes());
1339
1340        Ok(bytes)
1341    }
1342
1343    /// Encode F64 rounding pseudo-op as ARM32 via VCVT to integer and back.
1344    /// Encode F64 rounding as ARM32.
1345    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1346    ///
1347    /// For trunc: uses VCVTR.S32.F64 (always truncates).
1348    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F64 (non-R variant),
1349    /// then restores FPSCR.
1350    fn encode_arm_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1351        let mut bytes = Vec::new();
1352        let dm_num = vfp_dreg_to_num(dm)?;
1353        let dd_num = vfp_dreg_to_num(dd)?;
1354        let (vm, m) = encode_dreg(dm_num);
1355        let (vd, d) = encode_dreg(dd_num);
1356
1357        if mode == 0b11 {
1358            // Trunc (toward zero): VCVTR.S32.F64 — bit[7]=1, always truncates
1359            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
1360            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1361        } else {
1362            // ceil/floor/nearest: manipulate FPSCR rounding mode
1363            let rt: u32 = 12;
1364
1365            // VMRS R12, FPSCR
1366            let vmrs = 0xEEF10A10 | (rt << 12);
1367            bytes.extend_from_slice(&vmrs.to_le_bytes());
1368
1369            // BIC R12, R12, #(3 << 22)
1370            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1371            bytes.extend_from_slice(&bic.to_le_bytes());
1372
1373            // ORR R12, R12, #(mode << 22)
1374            if mode != 0 {
1375                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1376                bytes.extend_from_slice(&orr.to_le_bytes());
1377            }
1378
1379            // VMSR FPSCR, R12
1380            let vmsr = 0xEEE10A10 | (rt << 12);
1381            bytes.extend_from_slice(&vmsr.to_le_bytes());
1382
1383            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0), uses FPSCR rmode
1384            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
1385            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1386
1387            // Restore FPSCR
1388            bytes.extend_from_slice(&vmrs.to_le_bytes());
1389            bytes.extend_from_slice(&bic.to_le_bytes());
1390            bytes.extend_from_slice(&vmsr.to_le_bytes());
1391        }
1392
1393        // VCVT.F64.S32 Dd, S0 (convert back to double)
1394        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
1395        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1396
1397        Ok(bytes)
1398    }
1399
1400    /// Encode F64 min/max as ARM32: VMOV + VCMP + VMRS + conditional VMOV
1401    fn encode_arm_f64_minmax(
1402        &self,
1403        dd: &VfpReg,
1404        dn: &VfpReg,
1405        dm: &VfpReg,
1406        is_min: bool,
1407    ) -> Result<Vec<u8>> {
1408        let mut bytes = Vec::new();
1409        let dn_num = vfp_dreg_to_num(dn)?;
1410        let dm_num = vfp_dreg_to_num(dm)?;
1411        let dd_num = vfp_dreg_to_num(dd)?;
1412
1413        // VMOV.F64 Dd, Dn (start with first operand)
1414        let (vd, d) = encode_dreg(dd_num);
1415        let (vn, n) = encode_dreg(dn_num);
1416        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1417        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1418
1419        // VCMP.F64 Dn, Dm
1420        let (vm, m) = encode_dreg(dm_num);
1421        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1422        bytes.extend_from_slice(&vcmp.to_le_bytes());
1423
1424        // VMRS APSR_nzcv, FPSCR
1425        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1426
1427        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1428        let vmov_cond = (cond << 28) | 0x0EB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1429        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1430
1431        Ok(bytes)
1432    }
1433
1434    /// Encode F64 copysign as ARM32
1435    fn encode_arm_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
1436        let mut bytes = Vec::new();
1437
1438        // VMOV R0, R12, Dm (get sign source bits)
1439        let vmov_dm = encode_vmov_core_dreg(false, dm, &Reg::R0, &Reg::R12)?;
1440        bytes.extend_from_slice(&vmov_dm.to_le_bytes());
1441
1442        // VMOV R1, R2, Dn (get magnitude source bits)
1443        // We use R1 (lo) and R2 (hi) for the magnitude
1444        let vmov_dn = encode_vmov_core_dreg(false, dn, &Reg::R1, &Reg::R2)?;
1445        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1446
1447        // AND R12, R12, #0x80000000 (keep only sign bit from hi word)
1448        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1449        bytes.extend_from_slice(&and_sign.to_le_bytes());
1450
1451        // BIC R2, R2, #0x80000000 (clear sign bit from magnitude hi word)
1452        let bic_sign = 0xE3C00000u32 | (2 << 16) | (2 << 12) | (1 << 8) | 0x02;
1453        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1454
1455        // ORR R2, R2, R12 (combine sign + magnitude)
1456        let orr = 0xE1800000u32 | (2 << 16) | (2 << 12) | 12;
1457        bytes.extend_from_slice(&orr.to_le_bytes());
1458
1459        // VMOV Dd, R1, R2
1460        let vmov_result = encode_vmov_core_dreg(true, dd, &Reg::R1, &Reg::R2)?;
1461        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1462
1463        Ok(bytes)
1464    }
1465
1466    /// Encode VCVT.S32/U32.F32 + VMOV as ARM32
1467    fn encode_arm_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1468        let mut bytes = Vec::new();
1469
1470        // VCVT.S32.F32 Sd, Sm (toward zero) or VCVT.U32.F32 Sd, Sm
1471        // We use Sm as both source and destination for the intermediate result
1472        let sm_num = vfp_sreg_to_num(sm)?;
1473        let (vd, d) = encode_sreg(sm_num);
1474        let (vm, m) = encode_sreg(sm_num);
1475        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
1476        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1477        bytes.extend_from_slice(&vcvt.to_le_bytes());
1478
1479        // VMOV Rd, Sm — move result back to core register
1480        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
1481        bytes.extend_from_slice(&vmov.to_le_bytes());
1482
1483        Ok(bytes)
1484    }
1485
1486    /// Encode an ARM instruction in Thumb-2 mode (16-bit or 32-bit instructions)
1487    fn encode_thumb(&self, op: &ArmOp) -> Result<Vec<u8>> {
1488        // Thumb-2 supports both 16-bit and 32-bit instructions
1489        // 32-bit instructions are encoded as two 16-bit halfwords (big-endian order)
1490        match op {
1491            // === 16-bit Thumb encodings ===
1492            ArmOp::Add { rd, rn, op2 } => {
1493                let rd_bits = reg_to_bits(rd) as u16;
1494                let rn_bits = reg_to_bits(rn) as u16;
1495
1496                if let Operand2::Reg(rm) = op2 {
1497                    let rm_bits = reg_to_bits(rm) as u16;
1498                    // 16-bit ADDS only has 3-bit register fields (R0-R7). For
1499                    // high registers (e.g. R12, the MemLoad/MemStore base
1500                    // scratch) the bits overflow into adjacent fields, silently
1501                    // corrupting the operands — issue #178/#180: `add ip,ip,r0`
1502                    // was emitted as `adds r4,r5,r1`. Guard on all three regs
1503                    // being low and fall back to 32-bit ADD.W otherwise, exactly
1504                    // as the Sub handler below does.
1505                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1506                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1507                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1508                        Ok(instr.to_le_bytes().to_vec())
1509                    } else {
1510                        // ADD.W Rd, Rn, Rm (32-bit) for high registers
1511                        self.encode_thumb32_add_reg_raw(
1512                            rd_bits as u32,
1513                            rn_bits as u32,
1514                            rm_bits as u32,
1515                        )
1516                    }
1517                } else if let Operand2::Imm(imm) = op2 {
1518                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1519                        // ADDS Rd, Rn, #imm3 (16-bit): 0001 110 imm3 Rn Rd
1520                        let instr: u16 = 0x1C00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1521                        Ok(instr.to_le_bytes().to_vec())
1522                    } else {
1523                        // Use 32-bit ADD for larger immediates
1524                        self.encode_thumb32_add(rd, rn, *imm as u32)
1525                    }
1526                } else {
1527                    // Fallback to 32-bit encoding
1528                    self.encode_thumb32_add(rd, rn, 0)
1529                }
1530            }
1531
1532            ArmOp::Sub { rd, rn, op2 } => {
1533                let rd_bits = reg_to_bits(rd) as u16;
1534                let rn_bits = reg_to_bits(rn) as u16;
1535
1536                if let Operand2::Reg(rm) = op2 {
1537                    let rm_bits = reg_to_bits(rm) as u16;
1538                    // 16-bit SUBS can only use low registers (R0-R7)
1539                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1540                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1541                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1542                        Ok(instr.to_le_bytes().to_vec())
1543                    } else {
1544                        // Use 32-bit SUB.W for high registers
1545                        self.encode_thumb32_sub_reg_raw(
1546                            rd_bits as u32,
1547                            rn_bits as u32,
1548                            rm_bits as u32,
1549                        )
1550                    }
1551                } else if let Operand2::Imm(imm) = op2 {
1552                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1553                        // SUBS Rd, Rn, #imm3 (16-bit): 0001 111 imm3 Rn Rd
1554                        let instr: u16 = 0x1E00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1555                        Ok(instr.to_le_bytes().to_vec())
1556                    } else {
1557                        self.encode_thumb32_sub(rd, rn, *imm as u32)
1558                    }
1559                } else {
1560                    self.encode_thumb32_sub(rd, rn, 0)
1561                }
1562            }
1563
1564            ArmOp::Mov { rd, op2 } => {
1565                let rd_bits = reg_to_bits(rd) as u16;
1566
1567                if let Operand2::Imm(imm) = op2 {
1568                    if *imm <= 255 && rd_bits < 8 {
1569                        // MOVS Rd, #imm8 (16-bit): 0010 0 Rd imm8
1570                        let imm_bits = (*imm as u16) & 0xFF;
1571                        let instr: u16 = 0x2000 | (rd_bits << 8) | imm_bits;
1572                        Ok(instr.to_le_bytes().to_vec())
1573                    } else {
1574                        // Use 32-bit MOVW for larger immediates
1575                        self.encode_thumb32_movw(rd, *imm as u32)
1576                    }
1577                } else if let Operand2::Reg(rm) = op2 {
1578                    let rm_bits = reg_to_bits(rm) as u16;
1579                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
1580                    // D = Rd[3], Rd[2:0] in lower bits
1581                    let d_bit = (rd_bits >> 3) & 1;
1582                    let instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
1583                    Ok(instr.to_le_bytes().to_vec())
1584                } else {
1585                    let instr: u16 = 0xBF00; // NOP fallback
1586                    Ok(instr.to_le_bytes().to_vec())
1587                }
1588            }
1589
1590            ArmOp::Push { regs } => {
1591                // Thumb-2 PUSH encoding:
1592                // If all regs in R0-R7 + LR, use 16-bit: 1011 010 M rrrrrrrr
1593                // Otherwise use 32-bit: STMDB SP!, {regs} = 1110 1001 0010 1101 | 0M0 reglist(13)
1594                let mut reg_list: u16 = 0;
1595                let mut need_32bit = false;
1596                for r in regs {
1597                    let bit = reg_to_bits(r);
1598                    if bit >= 8 && *r != Reg::LR {
1599                        need_32bit = true;
1600                    }
1601                    reg_list |= 1 << bit;
1602                }
1603                if !need_32bit {
1604                    // 16-bit PUSH: 1011 010 M rrrrrrrr
1605                    let m_bit = if reg_list & (1 << 14) != 0 {
1606                        1u16
1607                    } else {
1608                        0u16
1609                    };
1610                    let low_regs = reg_list & 0xFF;
1611                    let instr: u16 = 0xB400 | (m_bit << 8) | low_regs;
1612                    Ok(instr.to_le_bytes().to_vec())
1613                } else {
1614                    // 32-bit STMDB SP!, {regs}: E92D | reglist(16)
1615                    let hw1: u16 = 0xE92D;
1616                    let hw2: u16 = reg_list;
1617                    let mut bytes = hw1.to_le_bytes().to_vec();
1618                    bytes.extend_from_slice(&hw2.to_le_bytes());
1619                    Ok(bytes)
1620                }
1621            }
1622
1623            ArmOp::Pop { regs } => {
1624                // Thumb-2 POP encoding:
1625                // If all regs in R0-R7 + PC, use 16-bit: 1011 110 P rrrrrrrr
1626                // Otherwise use 32-bit: LDMIA SP!, {regs} = 1110 1000 1011 1101 | PM0 reglist(13)
1627                let mut reg_list: u16 = 0;
1628                let mut need_32bit = false;
1629                for r in regs {
1630                    let bit = reg_to_bits(r);
1631                    if bit >= 8 && *r != Reg::PC {
1632                        need_32bit = true;
1633                    }
1634                    reg_list |= 1 << bit;
1635                }
1636                if !need_32bit {
1637                    // 16-bit POP: 1011 110 P rrrrrrrr
1638                    let p_bit = if reg_list & (1 << 15) != 0 {
1639                        1u16
1640                    } else {
1641                        0u16
1642                    };
1643                    let low_regs = reg_list & 0xFF;
1644                    let instr: u16 = 0xBC00 | (p_bit << 8) | low_regs;
1645                    Ok(instr.to_le_bytes().to_vec())
1646                } else {
1647                    // 32-bit LDMIA SP!, {regs}: E8BD | reglist(16)
1648                    let hw1: u16 = 0xE8BD;
1649                    let hw2: u16 = reg_list;
1650                    let mut bytes = hw1.to_le_bytes().to_vec();
1651                    bytes.extend_from_slice(&hw2.to_le_bytes());
1652                    Ok(bytes)
1653                }
1654            }
1655
1656            ArmOp::Nop => {
1657                let instr: u16 = 0xBF00; // NOP in Thumb-2
1658                Ok(instr.to_le_bytes().to_vec())
1659            }
1660
1661            ArmOp::Udf { imm } => {
1662                // UDF (Undefined) in Thumb-2: 16-bit encoding is 0xDE00 | imm8
1663                // This triggers UsageFault/HardFault, used for WASM traps
1664                let instr: u16 = 0xDE00 | (*imm as u16);
1665                let bytes = instr.to_le_bytes().to_vec();
1666                encoding_contracts::verify_thumb16(&bytes);
1667                Ok(bytes)
1668            }
1669
1670            // i64 support: ADDS, ADC, SUBS, SBC for register pair arithmetic
1671            // ADDS sets flags (carry), ADC uses carry from previous ADDS
1672            ArmOp::Adds { rd, rn, op2 } => {
1673                let rd_bits = reg_to_bits(rd) as u16;
1674                let rn_bits = reg_to_bits(rn) as u16;
1675
1676                if let Operand2::Reg(rm) = op2 {
1677                    let rm_bits = reg_to_bits(rm) as u16;
1678                    // 16-bit ADDS is R0-R7 only; i64 pair allocation can place
1679                    // operands in R8-R11, which would overflow the 3-bit fields
1680                    // and corrupt the operands (#178/#180 class). Guard and fall
1681                    // back to 32-bit ADDS.W for high registers.
1682                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1683                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1684                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1685                        Ok(instr.to_le_bytes().to_vec())
1686                    } else {
1687                        self.encode_thumb32_adds_reg_raw(
1688                            rd_bits as u32,
1689                            rn_bits as u32,
1690                            rm_bits as u32,
1691                        )
1692                    }
1693                } else {
1694                    // 32-bit Thumb-2 ADDS with immediate
1695                    self.encode_thumb32_adds(rd, rn, 0)
1696                }
1697            }
1698
1699            // ADC: Add with Carry (Thumb-2 32-bit)
1700            // ADC.W Rd, Rn, Rm: EB40 Rn | 00 Rd 00 Rm
1701            ArmOp::Adc { rd, rn, op2 } => {
1702                let rd_bits = reg_to_bits(rd);
1703                let rn_bits = reg_to_bits(rn);
1704
1705                if let Operand2::Reg(rm) = op2 {
1706                    let rm_bits = reg_to_bits(rm);
1707                    // ADC.W Rd, Rn, Rm (T2): 1110 1011 0100 Rn | 0 000 Rd 00 00 Rm
1708                    let hw1: u16 = (0xEB40 | rn_bits) as u16;
1709                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1710
1711                    let mut bytes = hw1.to_le_bytes().to_vec();
1712                    bytes.extend_from_slice(&hw2.to_le_bytes());
1713                    Ok(bytes)
1714                } else {
1715                    // ADC with immediate - use 32-bit encoding
1716                    let hw1: u16 = (0xF140 | rn_bits) as u16;
1717                    let hw2: u16 = (rd_bits << 8) as u16;
1718                    let mut bytes = hw1.to_le_bytes().to_vec();
1719                    bytes.extend_from_slice(&hw2.to_le_bytes());
1720                    Ok(bytes)
1721                }
1722            }
1723
1724            // SUBS sets flags (borrow), SBC uses borrow from previous SUBS
1725            ArmOp::Subs { rd, rn, op2 } => {
1726                let rd_bits = reg_to_bits(rd) as u16;
1727                let rn_bits = reg_to_bits(rn) as u16;
1728
1729                if let Operand2::Reg(rm) = op2 {
1730                    let rm_bits = reg_to_bits(rm) as u16;
1731                    // 16-bit SUBS is R0-R7 only; high-register i64 pair operands
1732                    // would overflow the 3-bit fields (#178/#180 class). Guard
1733                    // and fall back to 32-bit SUBS.W for high registers.
1734                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1735                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1736                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1737                        Ok(instr.to_le_bytes().to_vec())
1738                    } else {
1739                        self.encode_thumb32_subs_reg_raw(
1740                            rd_bits as u32,
1741                            rn_bits as u32,
1742                            rm_bits as u32,
1743                        )
1744                    }
1745                } else {
1746                    // 32-bit Thumb-2 SUBS with immediate
1747                    self.encode_thumb32_subs(rd, rn, 0)
1748                }
1749            }
1750
1751            // SBC: Subtract with Carry (Thumb-2 32-bit)
1752            // SBC.W Rd, Rn, Rm: EB60 Rn | 00 Rd 00 Rm
1753            ArmOp::Sbc { rd, rn, op2 } => {
1754                let rd_bits = reg_to_bits(rd);
1755                let rn_bits = reg_to_bits(rn);
1756
1757                if let Operand2::Reg(rm) = op2 {
1758                    let rm_bits = reg_to_bits(rm);
1759                    // SBC.W Rd, Rn, Rm (T2): 1110 1011 0110 Rn | 0 000 Rd 00 00 Rm
1760                    let hw1: u16 = (0xEB60 | rn_bits) as u16;
1761                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1762
1763                    let mut bytes = hw1.to_le_bytes().to_vec();
1764                    bytes.extend_from_slice(&hw2.to_le_bytes());
1765                    Ok(bytes)
1766                } else {
1767                    // SBC with immediate - use 32-bit encoding
1768                    let hw1: u16 = (0xF160 | rn_bits) as u16;
1769                    let hw2: u16 = (rd_bits << 8) as u16;
1770                    let mut bytes = hw1.to_le_bytes().to_vec();
1771                    bytes.extend_from_slice(&hw2.to_le_bytes());
1772                    Ok(bytes)
1773                }
1774            }
1775
1776            // === 32-bit Thumb-2 encodings ===
1777
1778            // SDIV: 11111011 1001 Rn 1111 Rd 1111 Rm
1779            ArmOp::Sdiv { rd, rn, rm } => {
1780                let rd_bits = reg_to_bits(rd);
1781                let rn_bits = reg_to_bits(rn);
1782                let rm_bits = reg_to_bits(rm);
1783                reg_bits_checked(rd_bits)?;
1784                reg_bits_checked(rn_bits)?;
1785                reg_bits_checked(rm_bits)?;
1786
1787                // Thumb-2 SDIV: FB90 F0F0 | Rn<<16 | Rd<<8 | Rm
1788                // First halfword: 1111 1011 1001 Rn = 0xFB90 | Rn
1789                // Second halfword: 1111 Rd 1111 Rm = 0xF0F0 | Rd<<8 | Rm
1790                let hw1: u16 = (0xFB90 | rn_bits) as u16;
1791                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1792
1793                // Thumb-2 32-bit instructions: first halfword, then second halfword (little-endian each)
1794                let mut bytes = hw1.to_le_bytes().to_vec();
1795                bytes.extend_from_slice(&hw2.to_le_bytes());
1796                encoding_contracts::verify_thumb32(&bytes);
1797                Ok(bytes)
1798            }
1799
1800            // UDIV: 11111011 1011 Rn 1111 Rd 1111 Rm
1801            ArmOp::Udiv { rd, rn, rm } => {
1802                let rd_bits = reg_to_bits(rd);
1803                let rn_bits = reg_to_bits(rn);
1804                let rm_bits = reg_to_bits(rm);
1805                reg_bits_checked(rd_bits)?;
1806                reg_bits_checked(rn_bits)?;
1807                reg_bits_checked(rm_bits)?;
1808
1809                // Thumb-2 UDIV: FBB0 F0F0 | Rn<<16 | Rd<<8 | Rm
1810                let hw1: u16 = (0xFBB0 | rn_bits) as u16;
1811                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1812
1813                let mut bytes = hw1.to_le_bytes().to_vec();
1814                bytes.extend_from_slice(&hw2.to_le_bytes());
1815                encoding_contracts::verify_thumb32(&bytes);
1816                Ok(bytes)
1817            }
1818
1819            ArmOp::Umull { rdlo, rdhi, rn, rm } => {
1820                let rdlo_bits = reg_to_bits(rdlo);
1821                let rdhi_bits = reg_to_bits(rdhi);
1822                let rn_bits = reg_to_bits(rn);
1823                let rm_bits = reg_to_bits(rm);
1824                reg_bits_checked(rdlo_bits)?;
1825                reg_bits_checked(rdhi_bits)?;
1826                reg_bits_checked(rn_bits)?;
1827                reg_bits_checked(rm_bits)?;
1828
1829                // Thumb-2 UMULL: 1111 1011 1010 Rn | RdLo RdHi 0000 Rm
1830                let hw1: u16 = (0xFBA0 | rn_bits) as u16;
1831                let hw2: u16 = ((rdlo_bits << 12) | (rdhi_bits << 8) | rm_bits) as u16;
1832
1833                let mut bytes = hw1.to_le_bytes().to_vec();
1834                bytes.extend_from_slice(&hw2.to_le_bytes());
1835                encoding_contracts::verify_thumb32(&bytes);
1836                Ok(bytes)
1837            }
1838
1839            // MUL (Thumb-2 32-bit): MUL Rd, Rn, Rm
1840            ArmOp::Mul { rd, rn, rm } => {
1841                let rd_bits = reg_to_bits(rd);
1842                let rn_bits = reg_to_bits(rn);
1843                let rm_bits = reg_to_bits(rm);
1844
1845                // Thumb-2 MUL: FB00 F000 | Rn | Rd<<8 | Rm
1846                // 11111011 0000 Rn | 1111 Rd 0000 Rm
1847                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1848                let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
1849
1850                let mut bytes = hw1.to_le_bytes().to_vec();
1851                bytes.extend_from_slice(&hw2.to_le_bytes());
1852                Ok(bytes)
1853            }
1854
1855            // MLS: Rd = Ra - Rn * Rm
1856            ArmOp::Mls { rd, rn, rm, ra } => {
1857                let rd_bits = reg_to_bits(rd);
1858                let rn_bits = reg_to_bits(rn);
1859                let rm_bits = reg_to_bits(rm);
1860                let ra_bits = reg_to_bits(ra);
1861
1862                // Thumb-2 MLS: FB00 Rn | Ra Rd 0001 Rm
1863                // 11111011 0000 Rn | Ra Rd 0001 Rm
1864                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1865                let hw2: u16 = ((ra_bits << 12) | (rd_bits << 8) | 0x10 | rm_bits) as u16;
1866
1867                let mut bytes = hw1.to_le_bytes().to_vec();
1868                bytes.extend_from_slice(&hw2.to_le_bytes());
1869                Ok(bytes)
1870            }
1871
1872            // AND (Thumb-2 32-bit)
1873            ArmOp::And { rd, rn, op2 } => {
1874                if let Operand2::Reg(rm) = op2 {
1875                    let rd_bits = reg_to_bits(rd);
1876                    let rn_bits = reg_to_bits(rn);
1877                    let rm_bits = reg_to_bits(rm);
1878
1879                    // Thumb-2 AND register: EA00 Rn | 0 Rd 00 00 Rm
1880                    let hw1: u16 = (0xEA00 | rn_bits) as u16;
1881                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1882
1883                    let mut bytes = hw1.to_le_bytes().to_vec();
1884                    bytes.extend_from_slice(&hw2.to_le_bytes());
1885                    Ok(bytes)
1886                } else if let Operand2::Imm(imm) = op2 {
1887                    let rd_bits = reg_to_bits(rd);
1888                    let rn_bits = reg_to_bits(rn);
1889                    let imm_val = *imm as u32;
1890
1891                    // Thumb-2 AND.W immediate T1: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
1892                    let i_bit = (imm_val >> 11) & 1;
1893                    let imm3 = (imm_val >> 8) & 0x7;
1894                    let imm8 = imm_val & 0xFF;
1895
1896                    let hw1: u16 = (0xF000 | (i_bit << 10) | rn_bits) as u16;
1897                    let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
1898
1899                    let mut bytes = hw1.to_le_bytes().to_vec();
1900                    bytes.extend_from_slice(&hw2.to_le_bytes());
1901                    Ok(bytes)
1902                } else {
1903                    // RegShift variant - fallback to NOP
1904                    let instr: u16 = 0xBF00;
1905                    Ok(instr.to_le_bytes().to_vec())
1906                }
1907            }
1908
1909            // ORR (Thumb-2 32-bit)
1910            ArmOp::Orr { rd, rn, op2 } => {
1911                if let Operand2::Reg(rm) = op2 {
1912                    let rd_bits = reg_to_bits(rd);
1913                    let rn_bits = reg_to_bits(rn);
1914                    let rm_bits = reg_to_bits(rm);
1915
1916                    // Thumb-2 ORR: EA40 Rn | 0 Rd 00 00 Rm
1917                    let hw1: u16 = (0xEA40 | rn_bits) as u16;
1918                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1919
1920                    let mut bytes = hw1.to_le_bytes().to_vec();
1921                    bytes.extend_from_slice(&hw2.to_le_bytes());
1922                    Ok(bytes)
1923                } else {
1924                    let instr: u16 = 0xBF00;
1925                    Ok(instr.to_le_bytes().to_vec())
1926                }
1927            }
1928
1929            // EOR (Thumb-2 32-bit)
1930            ArmOp::Eor { rd, rn, op2 } => {
1931                if let Operand2::Reg(rm) = op2 {
1932                    let rd_bits = reg_to_bits(rd);
1933                    let rn_bits = reg_to_bits(rn);
1934                    let rm_bits = reg_to_bits(rm);
1935
1936                    // Thumb-2 EOR: EA80 Rn | 0 Rd 00 00 Rm
1937                    let hw1: u16 = (0xEA80 | rn_bits) as u16;
1938                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1939
1940                    let mut bytes = hw1.to_le_bytes().to_vec();
1941                    bytes.extend_from_slice(&hw2.to_le_bytes());
1942                    Ok(bytes)
1943                } else {
1944                    let instr: u16 = 0xBF00;
1945                    Ok(instr.to_le_bytes().to_vec())
1946                }
1947            }
1948
1949            // Shift operations (16-bit for low registers)
1950            ArmOp::Lsl { rd, rn, shift } => {
1951                let rd_bits = reg_to_bits(rd) as u16;
1952                let rn_bits = reg_to_bits(rn) as u16;
1953                let shift_bits = (*shift as u16) & 0x1F;
1954
1955                if rd_bits < 8 && rn_bits < 8 {
1956                    // LSLS Rd, Rm, #imm5 (16-bit): 0000 0 imm5 Rm Rd
1957                    let instr: u16 = (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1958                    Ok(instr.to_le_bytes().to_vec())
1959                } else {
1960                    // Use 32-bit encoding for high registers
1961                    self.encode_thumb32_shift(rd, rn, *shift, 0b00) // LSL type
1962                }
1963            }
1964
1965            ArmOp::Lsr { rd, rn, shift } => {
1966                let rd_bits = reg_to_bits(rd) as u16;
1967                let rn_bits = reg_to_bits(rn) as u16;
1968                let shift_bits = (*shift as u16) & 0x1F;
1969
1970                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
1971                    // LSRS Rd, Rm, #imm5 (16-bit): 0000 1 imm5 Rm Rd
1972                    let instr: u16 = 0x0800 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1973                    Ok(instr.to_le_bytes().to_vec())
1974                } else {
1975                    self.encode_thumb32_shift(rd, rn, *shift, 0b01) // LSR type
1976                }
1977            }
1978
1979            ArmOp::Asr { rd, rn, shift } => {
1980                let rd_bits = reg_to_bits(rd) as u16;
1981                let rn_bits = reg_to_bits(rn) as u16;
1982                let shift_bits = (*shift as u16) & 0x1F;
1983
1984                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
1985                    // ASRS Rd, Rm, #imm5 (16-bit): 0001 0 imm5 Rm Rd
1986                    let instr: u16 = 0x1000 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1987                    Ok(instr.to_le_bytes().to_vec())
1988                } else {
1989                    self.encode_thumb32_shift(rd, rn, *shift, 0b10) // ASR type
1990                }
1991            }
1992
1993            ArmOp::Ror { rd, rn, shift } => {
1994                // ROR doesn't have a 16-bit immediate form, use 32-bit
1995                self.encode_thumb32_shift(rd, rn, *shift, 0b11) // ROR type
1996            }
1997
1998            // Register-based shifts (Thumb-2 32-bit)
1999            // Encoding: 11111010 0xxS Rn 1111 Rd 0000 Rm
2000            // xx = shift type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
2001            ArmOp::LslReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b00),
2002            ArmOp::LsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b01),
2003            ArmOp::AsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b10),
2004            ArmOp::RorReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b11),
2005
2006            // RSB (Reverse Subtract): Rd = imm - Rn
2007            // Thumb-2 T2 encoding: 11110 i 0 1110 S Rn | 0 imm3 Rd imm8
2008            ArmOp::Rsb { rd, rn, imm } => {
2009                let rd_bits = reg_to_bits(rd);
2010                let rn_bits = reg_to_bits(rn);
2011                let imm_val = *imm;
2012
2013                let i_bit = (imm_val >> 11) & 1;
2014                let imm3 = (imm_val >> 8) & 0x7;
2015                let imm8 = imm_val & 0xFF;
2016
2017                // hw1: 11110 i 01110 0 Rn  (S=0)
2018                let hw1: u16 = (0xF1C0 | (i_bit << 10) | rn_bits) as u16;
2019                // hw2: 0 imm3 Rd imm8
2020                let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
2021
2022                let mut bytes = hw1.to_le_bytes().to_vec();
2023                bytes.extend_from_slice(&hw2.to_le_bytes());
2024                Ok(bytes)
2025            }
2026
2027            // CLZ (Thumb-2 32-bit)
2028            ArmOp::Clz { rd, rm } => {
2029                let rd_bits = reg_to_bits(rd);
2030                let rm_bits = reg_to_bits(rm);
2031
2032                // Thumb-2 CLZ: FAB0 Rm | F8 Rd Rm
2033                // 11111010 1011 Rm | 1111 1000 Rd Rm
2034                let hw1: u16 = (0xFAB0 | rm_bits) as u16;
2035                let hw2: u16 = (0xF080 | (rd_bits << 8) | rm_bits) as u16;
2036
2037                let mut bytes = hw1.to_le_bytes().to_vec();
2038                bytes.extend_from_slice(&hw2.to_le_bytes());
2039                Ok(bytes)
2040            }
2041
2042            // RBIT (Thumb-2 32-bit)
2043            ArmOp::Rbit { rd, rm } => {
2044                let rd_bits = reg_to_bits(rd);
2045                let rm_bits = reg_to_bits(rm);
2046
2047                // Thumb-2 RBIT: FA90 Rm | F0 Rd A0 Rm
2048                // 11111010 1001 Rm | 1111 Rd 1010 Rm
2049                let hw1: u16 = (0xFA90 | rm_bits) as u16;
2050                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rm_bits) as u16;
2051
2052                let mut bytes = hw1.to_le_bytes().to_vec();
2053                bytes.extend_from_slice(&hw2.to_le_bytes());
2054                Ok(bytes)
2055            }
2056
2057            // SXTB (16-bit for low registers)
2058            ArmOp::Sxtb { rd, rm } => {
2059                let rd_bits = reg_to_bits(rd) as u16;
2060                let rm_bits = reg_to_bits(rm) as u16;
2061
2062                if rd_bits < 8 && rm_bits < 8 {
2063                    // SXTB Rd, Rm (16-bit): 1011 0010 01 Rm Rd
2064                    let instr: u16 = 0xB240 | (rm_bits << 3) | rd_bits;
2065                    Ok(instr.to_le_bytes().to_vec())
2066                } else {
2067                    // Thumb-2 SXTB.W: FA4F F(rd)80 (rm)
2068                    // 11111010 0100 1111 | 1111 Rd 10 rotate Rm
2069                    let rd_bits32 = rd_bits as u32;
2070                    let rm_bits32 = rm_bits as u32;
2071                    let hw1: u16 = 0xFA4F;
2072                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
2073                    let mut bytes = hw1.to_le_bytes().to_vec();
2074                    bytes.extend_from_slice(&hw2.to_le_bytes());
2075                    Ok(bytes)
2076                }
2077            }
2078
2079            // SXTH (16-bit for low registers)
2080            ArmOp::Sxth { rd, rm } => {
2081                let rd_bits = reg_to_bits(rd) as u16;
2082                let rm_bits = reg_to_bits(rm) as u16;
2083
2084                if rd_bits < 8 && rm_bits < 8 {
2085                    // SXTH Rd, Rm (16-bit): 1011 0010 00 Rm Rd
2086                    let instr: u16 = 0xB200 | (rm_bits << 3) | rd_bits;
2087                    Ok(instr.to_le_bytes().to_vec())
2088                } else {
2089                    // Thumb-2 SXTH.W: FA0F F(rd)80 (rm)
2090                    // 11111010 0000 1111 | 1111 Rd 10 rotate Rm
2091                    let rd_bits32 = rd_bits as u32;
2092                    let rm_bits32 = rm_bits as u32;
2093                    let hw1: u16 = 0xFA0F;
2094                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
2095                    let mut bytes = hw1.to_le_bytes().to_vec();
2096                    bytes.extend_from_slice(&hw2.to_le_bytes());
2097                    Ok(bytes)
2098                }
2099            }
2100
2101            // CMP (can be 16-bit for low registers)
2102            ArmOp::Cmp { rn, op2 } => {
2103                let rn_bits = reg_to_bits(rn) as u16;
2104
2105                if let Operand2::Imm(imm) = op2 {
2106                    // Only use 16-bit encoding for non-negative immediates 0-255
2107                    // Negative immediates must use 32-bit encoding
2108                    if *imm >= 0 && *imm <= 255 && rn_bits < 8 {
2109                        // CMP Rn, #imm8 (16-bit): 0010 1 Rn imm8
2110                        let instr: u16 = 0x2800 | (rn_bits << 8) | (*imm as u16 & 0xFF);
2111                        Ok(instr.to_le_bytes().to_vec())
2112                    } else {
2113                        self.encode_thumb32_cmp_imm(rn, *imm as u32)
2114                    }
2115                } else if let Operand2::Reg(rm) = op2 {
2116                    let rm_bits = reg_to_bits(rm) as u16;
2117                    if rn_bits < 8 && rm_bits < 8 {
2118                        // CMP Rn, Rm (16-bit low): 0100 0010 10 Rm Rn
2119                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2120                        Ok(instr.to_le_bytes().to_vec())
2121                    } else {
2122                        // CMP Rn, Rm (16-bit high): 0100 0101 N Rm Rn[2:0]
2123                        let n_bit = (rn_bits >> 3) & 1;
2124                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2125                        Ok(instr.to_le_bytes().to_vec())
2126                    }
2127                } else {
2128                    let instr: u16 = 0xBF00;
2129                    Ok(instr.to_le_bytes().to_vec())
2130                }
2131            }
2132
2133            // CMN (Compare Negative) - computes Rn + op2 and sets flags
2134            // CMN Rn, #1 sets Z flag if Rn == -1 (since -1 + 1 = 0)
2135            ArmOp::Cmn { rn, op2 } => {
2136                let rn_bits = reg_to_bits(rn) as u16;
2137
2138                if let Operand2::Imm(imm) = op2 {
2139                    // CMN.W Rn, #imm (32-bit encoding)
2140                    // Encoding: F110 Rn | 0F00 imm8 (for small immediates 0-255)
2141                    if *imm >= 0 && *imm <= 255 {
2142                        let imm8 = *imm as u16 & 0xFF;
2143                        let hw1: u16 = 0xF110 | rn_bits;
2144                        let hw2: u16 = 0x0F00 | imm8;
2145                        let mut bytes = hw1.to_le_bytes().to_vec();
2146                        bytes.extend_from_slice(&hw2.to_le_bytes());
2147                        Ok(bytes)
2148                    } else {
2149                        // For other immediates, fallback to NOP (should not happen in our use case)
2150                        Ok(vec![0xBF, 0x00])
2151                    }
2152                } else if let Operand2::Reg(rm) = op2 {
2153                    let rm_bits = reg_to_bits(rm) as u16;
2154                    // 16-bit CMN (T1) only encodes R0-R7; high registers overflow
2155                    // the 3-bit fields and corrupt the operands (#184, the #180
2156                    // class). CMN has no high-register 16-bit form, so fall back
2157                    // to 32-bit CMN.W (T2): EB10 Rn | 0F00 Rm (ADD.W with S=1 and
2158                    // Rd discarded as PC/1111).
2159                    if rn_bits < 8 && rm_bits < 8 {
2160                        // CMN Rn, Rm (16-bit): 0100 0010 11 Rm Rn
2161                        let instr: u16 = 0x42C0 | (rm_bits << 3) | rn_bits;
2162                        Ok(instr.to_le_bytes().to_vec())
2163                    } else {
2164                        let hw1: u16 = 0xEB10 | rn_bits;
2165                        let hw2: u16 = 0x0F00 | rm_bits;
2166                        let mut bytes = hw1.to_le_bytes().to_vec();
2167                        bytes.extend_from_slice(&hw2.to_le_bytes());
2168                        Ok(bytes)
2169                    }
2170                } else {
2171                    Ok(vec![0xBF, 0x00])
2172                }
2173            }
2174
2175            // LDR (can be 16-bit for simple cases)
2176            ArmOp::Ldr { rd, addr } => {
2177                let rd_bits = reg_to_bits(rd);
2178                let base_bits = reg_to_bits(&addr.base);
2179
2180                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2181                if let Some(offset_reg) = &addr.offset_reg {
2182                    let rm_bits = reg_to_bits(offset_reg);
2183
2184                    // If there's also an immediate offset, we need to ADD it first
2185                    if addr.offset != 0 {
2186                        // Use R12 (IP) as scratch to avoid clobbering the address register
2187                        // ADD R12, Rm, #offset; LDR Rd, [base, R12]
2188                        let scratch = Reg::R12;
2189                        let mut bytes =
2190                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2191                        bytes.extend(self.encode_thumb32_ldr_reg(rd, &addr.base, &scratch)?);
2192                        return Ok(bytes);
2193                    }
2194
2195                    // Simple register offset: LDR Rd, [Rn, Rm]
2196                    // 16-bit: only if Rd, Rn, Rm < R8
2197                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2198                        // LDR Rd, [Rn, Rm] (16-bit): 0101 100 Rm Rn Rd
2199                        let instr: u16 = 0x5800
2200                            | ((rm_bits as u16) << 6)
2201                            | ((base_bits as u16) << 3)
2202                            | (rd_bits as u16);
2203                        return Ok(instr.to_le_bytes().to_vec());
2204                    }
2205
2206                    // 32-bit register offset
2207                    return self.encode_thumb32_ldr_reg(rd, &addr.base, offset_reg);
2208                }
2209
2210                // Immediate offset mode [base, #imm]
2211                let offset = addr.offset as u32;
2212
2213                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2214                    // LDR Rd, [Rn, #imm5*4] (16-bit): 0110 1 imm5 Rn Rd
2215                    let imm5 = (offset >> 2) as u16;
2216                    let instr: u16 =
2217                        0x6800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2218                    Ok(instr.to_le_bytes().to_vec())
2219                } else {
2220                    self.encode_thumb32_ldr(rd, &addr.base, offset)
2221                }
2222            }
2223
2224            // STR (can be 16-bit for simple cases)
2225            ArmOp::Str { rd, addr } => {
2226                let rd_bits = reg_to_bits(rd);
2227                let base_bits = reg_to_bits(&addr.base);
2228
2229                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2230                if let Some(offset_reg) = &addr.offset_reg {
2231                    let rm_bits = reg_to_bits(offset_reg);
2232
2233                    // If there's also an immediate offset, we need to ADD it first
2234                    if addr.offset != 0 {
2235                        // Use R12 (IP) as scratch to avoid clobbering the address register
2236                        // ADD R12, Rm, #offset; STR Rd, [base, R12]
2237                        let scratch = Reg::R12;
2238                        let mut bytes =
2239                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2240                        bytes.extend(self.encode_thumb32_str_reg(rd, &addr.base, &scratch)?);
2241                        return Ok(bytes);
2242                    }
2243
2244                    // Simple register offset: STR Rd, [Rn, Rm]
2245                    // 16-bit: only if Rd, Rn, Rm < R8
2246                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2247                        // STR Rd, [Rn, Rm] (16-bit): 0101 000 Rm Rn Rd
2248                        let instr: u16 = 0x5000
2249                            | ((rm_bits as u16) << 6)
2250                            | ((base_bits as u16) << 3)
2251                            | (rd_bits as u16);
2252                        return Ok(instr.to_le_bytes().to_vec());
2253                    }
2254
2255                    // 32-bit register offset
2256                    return self.encode_thumb32_str_reg(rd, &addr.base, offset_reg);
2257                }
2258
2259                // Immediate offset mode [base, #imm]
2260                let offset = addr.offset as u32;
2261
2262                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2263                    // STR Rd, [Rn, #imm5*4] (16-bit): 0110 0 imm5 Rn Rd
2264                    let imm5 = (offset >> 2) as u16;
2265                    let instr: u16 =
2266                        0x6000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2267                    Ok(instr.to_le_bytes().to_vec())
2268                } else {
2269                    self.encode_thumb32_str(rd, &addr.base, offset)
2270                }
2271            }
2272
2273            // LDRB (Thumb-2)
2274            ArmOp::Ldrb { rd, addr } => {
2275                let rd_bits = reg_to_bits(rd);
2276                let base_bits = reg_to_bits(&addr.base);
2277
2278                if let Some(offset_reg) = &addr.offset_reg {
2279                    if addr.offset != 0 {
2280                        let scratch = Reg::R12;
2281                        let mut bytes =
2282                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2283                        bytes.extend(self.encode_thumb32_ldrb_reg(rd, &addr.base, &scratch)?);
2284                        return Ok(bytes);
2285                    }
2286                    return self.encode_thumb32_ldrb_reg(rd, &addr.base, offset_reg);
2287                }
2288
2289                let offset = addr.offset as u32;
2290                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2291                    // LDRB Rd, [Rn, #imm5] (16-bit): 0111 1 imm5 Rn Rd
2292                    let instr: u16 = 0x7800
2293                        | ((offset as u16) << 6)
2294                        | ((base_bits as u16) << 3)
2295                        | (rd_bits as u16);
2296                    Ok(instr.to_le_bytes().to_vec())
2297                } else {
2298                    self.encode_thumb32_ldrb_imm(rd, &addr.base, offset)
2299                }
2300            }
2301
2302            // LDRSB (Thumb-2)
2303            ArmOp::Ldrsb { rd, addr } => {
2304                let rd_bits = reg_to_bits(rd);
2305                let base_bits = reg_to_bits(&addr.base);
2306
2307                if let Some(offset_reg) = &addr.offset_reg {
2308                    if addr.offset != 0 {
2309                        let scratch = Reg::R12;
2310                        let mut bytes =
2311                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2312                        bytes.extend(self.encode_thumb32_ldrsb_reg(rd, &addr.base, &scratch)?);
2313                        return Ok(bytes);
2314                    }
2315                    return self.encode_thumb32_ldrsb_reg(rd, &addr.base, offset_reg);
2316                }
2317
2318                let offset = addr.offset as u32;
2319                // LDRSB has no 16-bit immediate form (only register)
2320                // For 16-bit reg form: only if Rd, Rn, Rm < R8
2321                if rd_bits < 8 && base_bits < 8 && offset == 0 {
2322                    // No immediate 16-bit encoding for LDRSB; use 32-bit
2323                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2324                } else {
2325                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2326                }
2327            }
2328
2329            // LDRH (Thumb-2)
2330            ArmOp::Ldrh { rd, addr } => {
2331                let rd_bits = reg_to_bits(rd);
2332                let base_bits = reg_to_bits(&addr.base);
2333
2334                if let Some(offset_reg) = &addr.offset_reg {
2335                    if addr.offset != 0 {
2336                        let scratch = Reg::R12;
2337                        let mut bytes =
2338                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2339                        bytes.extend(self.encode_thumb32_ldrh_reg(rd, &addr.base, &scratch)?);
2340                        return Ok(bytes);
2341                    }
2342                    return self.encode_thumb32_ldrh_reg(rd, &addr.base, offset_reg);
2343                }
2344
2345                let offset = addr.offset as u32;
2346                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2347                    // LDRH Rd, [Rn, #imm5*2] (16-bit): 1000 1 imm5 Rn Rd
2348                    let imm5 = (offset >> 1) as u16;
2349                    let instr: u16 =
2350                        0x8800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2351                    Ok(instr.to_le_bytes().to_vec())
2352                } else {
2353                    self.encode_thumb32_ldrh_imm(rd, &addr.base, offset)
2354                }
2355            }
2356
2357            // LDRSH (Thumb-2)
2358            ArmOp::Ldrsh { rd, addr } => {
2359                if let Some(offset_reg) = &addr.offset_reg {
2360                    if addr.offset != 0 {
2361                        let scratch = Reg::R12;
2362                        let mut bytes =
2363                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2364                        bytes.extend(self.encode_thumb32_ldrsh_reg(rd, &addr.base, &scratch)?);
2365                        return Ok(bytes);
2366                    }
2367                    return self.encode_thumb32_ldrsh_reg(rd, &addr.base, offset_reg);
2368                }
2369
2370                let offset = addr.offset as u32;
2371                self.encode_thumb32_ldrsh_imm(rd, &addr.base, offset)
2372            }
2373
2374            // STRB (Thumb-2)
2375            ArmOp::Strb { rd, addr } => {
2376                let rd_bits = reg_to_bits(rd);
2377                let base_bits = reg_to_bits(&addr.base);
2378
2379                if let Some(offset_reg) = &addr.offset_reg {
2380                    if addr.offset != 0 {
2381                        let scratch = Reg::R12;
2382                        let mut bytes =
2383                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2384                        bytes.extend(self.encode_thumb32_strb_reg(rd, &addr.base, &scratch)?);
2385                        return Ok(bytes);
2386                    }
2387                    return self.encode_thumb32_strb_reg(rd, &addr.base, offset_reg);
2388                }
2389
2390                let offset = addr.offset as u32;
2391                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2392                    // STRB Rd, [Rn, #imm5] (16-bit): 0111 0 imm5 Rn Rd
2393                    let instr: u16 = 0x7000
2394                        | ((offset as u16) << 6)
2395                        | ((base_bits as u16) << 3)
2396                        | (rd_bits as u16);
2397                    Ok(instr.to_le_bytes().to_vec())
2398                } else {
2399                    self.encode_thumb32_strb_imm(rd, &addr.base, offset)
2400                }
2401            }
2402
2403            // STRH (Thumb-2)
2404            ArmOp::Strh { rd, addr } => {
2405                let rd_bits = reg_to_bits(rd);
2406                let base_bits = reg_to_bits(&addr.base);
2407
2408                if let Some(offset_reg) = &addr.offset_reg {
2409                    if addr.offset != 0 {
2410                        let scratch = Reg::R12;
2411                        let mut bytes =
2412                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2413                        bytes.extend(self.encode_thumb32_strh_reg(rd, &addr.base, &scratch)?);
2414                        return Ok(bytes);
2415                    }
2416                    return self.encode_thumb32_strh_reg(rd, &addr.base, offset_reg);
2417                }
2418
2419                let offset = addr.offset as u32;
2420                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2421                    // STRH Rd, [Rn, #imm5*2] (16-bit): 1000 0 imm5 Rn Rd
2422                    let imm5 = (offset >> 1) as u16;
2423                    let instr: u16 =
2424                        0x8000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2425                    Ok(instr.to_le_bytes().to_vec())
2426                } else {
2427                    self.encode_thumb32_strh_imm(rd, &addr.base, offset)
2428                }
2429            }
2430
2431            // MemorySize (Thumb-2)
2432            ArmOp::MemorySize { rd } => {
2433                // LSR rd, R10, #16 — memory size in bytes / 65536 = pages
2434                // Thumb-2 16-bit: LSRS Rd, Rm, #imm5 — 0000 1 imm5 Rm Rd
2435                let rd_bits = reg_to_bits(rd);
2436                let r10_bits = reg_to_bits(&Reg::R10);
2437                if rd_bits < 8 && r10_bits < 8 {
2438                    let instr: u16 =
2439                        0x0800 | (16u16 << 6) | ((r10_bits as u16) << 3) | (rd_bits as u16);
2440                    Ok(instr.to_le_bytes().to_vec())
2441                } else {
2442                    // Thumb-2 32-bit LSR: 1110 1010 010 0 1111 | 0 imm3 Rd imm2 01 Rm
2443                    let imm5: u32 = 16;
2444                    let imm3 = (imm5 >> 2) & 0x7;
2445                    let imm2 = imm5 & 0x3;
2446                    let hw1: u16 = 0xEA4F;
2447                    let hw2: u16 =
2448                        ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | 0x10 | r10_bits) as u16;
2449                    let mut bytes = hw1.to_le_bytes().to_vec();
2450                    bytes.extend_from_slice(&hw2.to_le_bytes());
2451                    Ok(bytes)
2452                }
2453            }
2454
2455            // MemoryGrow (Thumb-2)
2456            ArmOp::MemoryGrow { rd, .. } => {
2457                // On embedded with fixed memory, always return -1 (failure)
2458                // MVN rd, #0 → MOV rd, #-1
2459                // Thumb-2 32-bit: MVN: 1111 0 i 0 0 0 1 1 0 1111 | 0 imm3 Rd imm8
2460                let rd_bits = reg_to_bits(rd);
2461                let hw1: u16 = 0xF06F; // MVN with i=0
2462                let hw2: u16 = (rd_bits << 8) as u16; // imm8=0 → ~0 = 0xFFFFFFFF = -1
2463                let mut bytes = hw1.to_le_bytes().to_vec();
2464                bytes.extend_from_slice(&hw2.to_le_bytes());
2465                Ok(bytes)
2466            }
2467
2468            // BX (16-bit)
2469            ArmOp::Bx { rm } => {
2470                let rm_bits = reg_to_bits(rm) as u16;
2471                // BX Rm (16-bit): 0100 0111 0 Rm 000
2472                let instr: u16 = 0x4700 | (rm_bits << 3);
2473                Ok(instr.to_le_bytes().to_vec())
2474            }
2475
2476            // BLX (16-bit) - Branch with Link and Exchange
2477            // BLX Rm: 0100 0111 1 Rm 000
2478            ArmOp::Blx { rm } => {
2479                let rm_bits = reg_to_bits(rm) as u16;
2480                let instr: u16 = 0x4780 | (rm_bits << 3);
2481                Ok(instr.to_le_bytes().to_vec())
2482            }
2483
2484            // CallIndirect - indirect function call via table lookup
2485            // table_index_reg contains the table index
2486            // Generates: LSL R12, idx, #2; LDR R12, [R12, table_base]; BLX R12
2487            ArmOp::CallIndirect {
2488                rd: _,
2489                type_idx: _,
2490                table_index_reg,
2491            } => {
2492                let idx_reg = reg_to_bits(table_index_reg);
2493                let mut bytes = Vec::new();
2494
2495                // For now, we generate code that:
2496                // 1. Multiplies index by 4 (function pointer size)
2497                // 2. Loads function pointer from table (assumes table base in R11)
2498                // 3. Calls the function via BLX
2499                //
2500                // Table base setup must be done by caller/runtime.
2501                // This is a simplified implementation - full support needs:
2502                // - Table base address resolution
2503                // - Type signature checking
2504                // - Bounds checking
2505
2506                // LSL R12, idx_reg, #2 (multiply index by 4)
2507                // Thumb-2 MOV with shift: 11101010 010 S 1111 | 0 imm3 Rd imm2 type Rm
2508                // LSL: type=00, imm5=2 -> imm3=0, imm2=10
2509                let hw1: u16 = 0xEA4F_u16; // MOV.W R12, Rm, LSL #2
2510                let hw2: u16 = ((0x0C00 | (0b10 << 4)) | idx_reg) as u16;
2511                bytes.extend_from_slice(&hw1.to_le_bytes());
2512                bytes.extend_from_slice(&hw2.to_le_bytes());
2513
2514                // LDR R12, [R11, R12] - load function pointer
2515                // Thumb-2 LDR (register): 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
2516                // Rn=R11, Rt=R12, Rm=R12, imm2=00 (no shift)
2517                let ldr_hw1: u16 = 0xF85B; // LDR.W Rt, [R11, Rm]
2518                let ldr_hw2: u16 = 0xC00C; // Rt=R12, imm2=00, Rm=R12
2519                bytes.extend_from_slice(&ldr_hw1.to_le_bytes());
2520                bytes.extend_from_slice(&ldr_hw2.to_le_bytes());
2521
2522                // BLX R12 (call function indirectly)
2523                // BLX Rm (16-bit): 0100 0111 1 Rm 000
2524                let blx: u16 = 0x47E0; // BLX R12
2525                bytes.extend_from_slice(&blx.to_le_bytes());
2526
2527                Ok(bytes)
2528            }
2529
2530            // Label pseudo-instruction: emits no machine code
2531            ArmOp::Label { .. } => Ok(Vec::new()),
2532
2533            // Conditional branch to label (generic) - offset 0, will be patched
2534            ArmOp::Bcc { cond, label: _ } => {
2535                use synth_synthesis::Condition;
2536                let cond_bits: u16 = match cond {
2537                    Condition::EQ => 0x0,
2538                    Condition::NE => 0x1,
2539                    Condition::HS => 0x2,
2540                    Condition::LO => 0x3,
2541                    Condition::HI => 0x8,
2542                    Condition::LS => 0x9,
2543                    Condition::GE => 0xA,
2544                    Condition::LT => 0xB,
2545                    Condition::GT => 0xC,
2546                    Condition::LE => 0xD,
2547                };
2548                // 16-bit B<cond> with offset 0: 1101 cond imm8
2549                let instr: u16 = 0xD000 | (cond_bits << 8);
2550                Ok(instr.to_le_bytes().to_vec())
2551            }
2552
2553            // Branch instructions
2554            ArmOp::B { label: _ } => {
2555                // Simplified: B.N with offset 0
2556                // For real usage, would need label resolution
2557                let instr: u16 = 0xE000; // B.N #0
2558                Ok(instr.to_le_bytes().to_vec())
2559            }
2560
2561            // BHS (Branch if Higher or Same) - used for bounds checking
2562            // Condition code: 0x2 (C set)
2563            ArmOp::Bhs { label: _ } => {
2564                // 16-bit B<cond> with offset 0: 1101 cond imm8
2565                // cond = 0x2 (HS)
2566                let instr: u16 = 0xD200; // BHS.N #0
2567                Ok(instr.to_le_bytes().to_vec())
2568            }
2569
2570            // BLO (Branch if Lower) - complementary to BHS
2571            // Condition code: 0x3 (C clear)
2572            ArmOp::Blo { label: _ } => {
2573                // 16-bit B<cond> with offset 0: 1101 cond imm8
2574                // cond = 0x3 (LO)
2575                let instr: u16 = 0xD300; // BLO.N #0
2576                Ok(instr.to_le_bytes().to_vec())
2577            }
2578
2579            // Branch with numeric offset (Thumb-2)
2580            // Thumb-2 B.W instruction: 32-bit with +-16MB range
2581            ArmOp::BOffset { offset } => {
2582                // offset is already the halfword displacement: (target - branch - 4) / 2
2583                // This is the raw encoded value, accounting for variable-length instructions
2584                let halfword_offset = *offset;
2585
2586                // 16-bit B.N encoding: 1110 0 imm11 (11-bit signed halfword offset)
2587                // Range: -1024 to +1022 halfwords
2588                if (-1024..=1022).contains(&halfword_offset) {
2589                    // 16-bit B.N encoding: 1110 0 imm11
2590                    let imm11 = (halfword_offset as u16) & 0x7FF;
2591                    let instr: u16 = 0xE000 | imm11;
2592                    Ok(instr.to_le_bytes().to_vec())
2593                } else {
2594                    // 32-bit B.W encoding for larger offsets
2595                    // First halfword: 1111 0 S imm10
2596                    // Second halfword: 10 J1 0 J2 imm11
2597                    // Total offset = SignExtend(S:I1:I2:imm10:imm11:0)
2598                    // where I1 = NOT(J1 XOR S), I2 = NOT(J2 XOR S)
2599
2600                    // The B.W (T4) encoding packs the signed offset as:
2601                    //   S:I1:I2:imm10:imm11:0  (25-bit signed, halfword-aligned)
2602                    // where J1 = NOT(I1 XOR S), J2 = NOT(I2 XOR S)
2603                    // Input halfword_offset already equals (target - PC - 4) / 2,
2604                    // so the full byte offset = halfword_offset << 1.
2605                    // The encoding fields split that 25-bit signed value (including the
2606                    // implicit trailing zero) as: S | imm10 | imm11
2607                    // with I1 = bit 23 and I2 = bit 22 of the signed offset.
2608                    let signed_offset = halfword_offset << 1; // byte offset
2609                    let s = if signed_offset < 0 { 1u32 } else { 0u32 };
2610                    let uoffset = signed_offset as u32;
2611                    let imm10 = (uoffset >> 12) & 0x3FF; // bits [21:12]
2612                    let imm11 = (uoffset >> 1) & 0x7FF; // bits [11:1]
2613                    let i1 = (uoffset >> 23) & 1; // bit 23
2614                    let i2 = (uoffset >> 22) & 1; // bit 22
2615                    let j1 = (!(i1 ^ s)) & 1; // J1 = NOT(I1 XOR S)
2616                    let j2 = (!(i2 ^ s)) & 1; // J2 = NOT(I2 XOR S)
2617
2618                    let hw1: u16 = (0xF000 | (s << 10) | imm10) as u16;
2619                    let hw2: u16 = (0x9000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2620
2621                    let mut bytes = hw1.to_le_bytes().to_vec();
2622                    bytes.extend_from_slice(&hw2.to_le_bytes());
2623                    Ok(bytes)
2624                }
2625            }
2626
2627            // Conditional branch with numeric offset (Thumb-2)
2628            ArmOp::BCondOffset { cond, offset } => {
2629                use synth_synthesis::Condition;
2630                let cond_bits: u16 = match cond {
2631                    Condition::EQ => 0x0,
2632                    Condition::NE => 0x1,
2633                    Condition::HS => 0x2,
2634                    Condition::LO => 0x3,
2635                    Condition::HI => 0x8,
2636                    Condition::LS => 0x9,
2637                    Condition::GE => 0xA,
2638                    Condition::LT => 0xB,
2639                    Condition::GT => 0xC,
2640                    Condition::LE => 0xD,
2641                };
2642
2643                // offset is already the halfword displacement: (target - branch - 4) / 2
2644                // This is the raw imm8 value for 16-bit B<cond> encoding
2645                let halfword_offset = *offset;
2646
2647                // 16-bit B<cond> encoding: 1101 cond imm8
2648                // Range: -256 to +254 halfwords (imm8 is sign-extended and shifted left 1)
2649                if (-128..=127).contains(&halfword_offset) {
2650                    let imm8 = (halfword_offset as u16) & 0xFF;
2651                    let instr: u16 = 0xD000 | (cond_bits << 8) | imm8;
2652                    Ok(instr.to_le_bytes().to_vec())
2653                } else {
2654                    // 32-bit B<cond>.W for larger offsets
2655                    // First halfword: 1111 0 S cond imm6
2656                    // Second halfword: 10 J1 0 J2 imm11
2657                    let offset = halfword_offset >> 1;
2658                    let s = if offset < 0 { 1u32 } else { 0u32 };
2659                    let imm6 = ((offset >> 11) as u32) & 0x3F;
2660                    let imm11 = (offset as u32) & 0x7FF;
2661                    let j1 = if s == 1 { 1 } else { 0 };
2662                    let j2 = if s == 1 { 1 } else { 0 };
2663
2664                    let hw1: u16 = (0xF000 | (s << 10) | ((cond_bits as u32) << 6) | imm6) as u16;
2665                    let hw2: u16 = (0x8000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2666
2667                    let mut bytes = hw1.to_le_bytes().to_vec();
2668                    bytes.extend_from_slice(&hw2.to_le_bytes());
2669                    Ok(bytes)
2670                }
2671            }
2672
2673            ArmOp::Bl { label: _ } => {
2674                // BL is always 32-bit in Thumb-2, encoded here as a relocatable
2675                // placeholder; an R_ARM_THM_CALL relocation patches the target
2676                // (see arm_backend.rs). The placeholder must carry an embedded
2677                // addend of -4 so the relocation nets to exactly the symbol S.
2678                //
2679                // Thumb BL computes `target = (P + 4) + signed_offset`. Under
2680                // R_ARM_THM_CALL the linker resolves using the in-place addend;
2681                // a 0xF800 placeholder (addend 0) lands at S+4 — every call one
2682                // instruction past the callee entry (#174). The correct
2683                // placeholder is what `gas` emits for `bl <extern>`:
2684                //   f7ff fffe  ->  `bl <self>`  (S=1, J1=J2=1, imm = -4 addend),
2685                // i.e. hw1=0xF7FF, hw2=0xFFFE. This nets to S, not S+4.
2686                // (The earlier 0xD000 was worse still — a ~+0x600000 addend,
2687                // the garbage `bl c0000c` and "truncated to fit" of #167.)
2688                let hw1: u16 = 0xF7FF;
2689                let hw2: u16 = 0xFFFE;
2690                let mut bytes = hw1.to_le_bytes().to_vec();
2691                bytes.extend_from_slice(&hw2.to_le_bytes());
2692                Ok(bytes)
2693            }
2694
2695            // MVN
2696            ArmOp::Mvn { rd, op2 } => {
2697                if let Operand2::Reg(rm) = op2 {
2698                    let rd_bits = reg_to_bits(rd) as u16;
2699                    let rm_bits = reg_to_bits(rm) as u16;
2700
2701                    if rd_bits < 8 && rm_bits < 8 {
2702                        // MVNS Rd, Rm (16-bit): 0100 0011 11 Rm Rd
2703                        let instr: u16 = 0x43C0 | (rm_bits << 3) | rd_bits;
2704                        Ok(instr.to_le_bytes().to_vec())
2705                    } else {
2706                        // 32-bit MVN
2707                        let hw1: u16 = 0xEA6F_u16;
2708                        let hw2: u16 = ((reg_to_bits(rd) << 8) | reg_to_bits(rm)) as u16;
2709                        let mut bytes = hw1.to_le_bytes().to_vec();
2710                        bytes.extend_from_slice(&hw2.to_le_bytes());
2711                        Ok(bytes)
2712                    }
2713                } else {
2714                    let instr: u16 = 0xBF00;
2715                    Ok(instr.to_le_bytes().to_vec())
2716                }
2717            }
2718
2719            // MOVW - Move Wide (Thumb-2 32-bit)
2720            ArmOp::Movw { rd, imm16 } => {
2721                self.encode_thumb32_movw_raw(reg_to_bits(rd), *imm16 as u32)
2722            }
2723
2724            // MOVT - Move Top (Thumb-2 32-bit)
2725            ArmOp::Movt { rd, imm16 } => {
2726                self.encode_thumb32_movt_raw(reg_to_bits(rd), *imm16 as u32)
2727            }
2728
2729            // #237: symbol-relative MOVW/MOVT. Encode the addend's low/high 16
2730            // bits in place; the backend records an R_ARM_MOVW_ABS_NC /
2731            // R_ARM_MOVT_ABS relocation against `symbol`, so the linker adds the
2732            // symbol's final address to the in-place addend (REL semantics).
2733            ArmOp::MovwSym { rd, addend, .. } => {
2734                self.encode_thumb32_movw_raw(reg_to_bits(rd), (*addend as u32) & 0xffff)
2735            }
2736            ArmOp::MovtSym { rd, addend, .. } => {
2737                self.encode_thumb32_movt_raw(reg_to_bits(rd), ((*addend as u32) >> 16) & 0xffff)
2738            }
2739
2740            // SetCond: Materialize condition flag into register (0 or 1)
2741            // Strategy: ITE <cond>; MOV Rd, #1; MOV Rd, #0
2742            // IMPORTANT: Must use ITE (If-Then-Else) because 16-bit Thumb MOV
2743            // always sets flags (MOVS). We need to evaluate the condition BEFORE
2744            // any MOV instruction clobbers the flags from CMP.
2745            ArmOp::SetCond { rd, cond } => {
2746                let rd_bits = reg_to_bits(rd) as u16;
2747
2748                // Condition code encoding for IT block
2749                use synth_synthesis::Condition;
2750                let cond_bits: u16 = match cond {
2751                    Condition::EQ => 0x0,
2752                    Condition::NE => 0x1,
2753                    Condition::LT => 0xB,
2754                    Condition::LE => 0xD,
2755                    Condition::GT => 0xC,
2756                    Condition::GE => 0xA,
2757                    Condition::LO => 0x3, // CC/LO (unsigned <)
2758                    Condition::LS => 0x9, // LS (unsigned <=)
2759                    Condition::HI => 0x8, // HI (unsigned >)
2760                    Condition::HS => 0x2, // CS/HS (unsigned >=)
2761                };
2762
2763                // ITE <cond>: encodes If-Then-Else block
2764                // The mask field depends on firstcond[0]:
2765                // - If firstcond[0] = 0: mask = 0xC for TE pattern (ITE EQ = BF0C)
2766                // - If firstcond[0] = 1: mask = 0x4 for TE pattern (ITE NE = BF14)
2767                let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2768                let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2769
2770                // Materialize 0/1 into Rd. The 16-bit MOVS (T1) encodes Rd in a
2771                // 3-bit field (bits[10:8]) — only R0–R7. For a high register
2772                // (R8–R12) `rd_bits << 8` overflows into bit 11 and silently
2773                // turns MOVS into CMP (00100 → 00101), corrupting the result
2774                // (this mis-materialized gale's `has_waiter`, so its `local.set`
2775                // stored a stale register → the binary-sem WAKE dispatch read
2776                // garbage). Use the 32-bit MOV.W (T2) for high registers, which
2777                // has a 4-bit Rd field. MOV.W with S=0 doesn't set flags, which
2778                // is fine inside the ITE (the materialized value is the result;
2779                // the flags are not consumed afterwards).
2780                let mut bytes = ite_instr.to_le_bytes().to_vec();
2781                let push_mov = |bytes: &mut Vec<u8>, imm: u16| {
2782                    if rd_bits <= 7 {
2783                        let m: u16 = 0x2000 | (rd_bits << 8) | imm; // 16-bit MOVS Rd,#imm
2784                        bytes.extend_from_slice(&m.to_le_bytes());
2785                    } else {
2786                        // 32-bit MOV.W Rd, #imm (T2): F04F | (Rd<<8) | imm8
2787                        let hw1: u16 = 0xF04F;
2788                        let hw2: u16 = (rd_bits << 8) | imm;
2789                        bytes.extend_from_slice(&hw1.to_le_bytes());
2790                        bytes.extend_from_slice(&hw2.to_le_bytes());
2791                    }
2792                };
2793                push_mov(&mut bytes, 1); // Then branch (condition true)  → 1
2794                push_mov(&mut bytes, 0); // Else branch (condition false) → 0
2795                Ok(bytes)
2796            }
2797
2798            // I64SetCond: Compare two i64 register pairs, result 0/1 in rd
2799            // EQ/NE: CMP lo,lo; IT EQ; CMPEQ hi,hi; ITE <cond>; MOV 1; MOV 0
2800            // LT: CMP lo,lo; SBCS rd,hi,hi; ITE LT; MOV 1; MOV 0
2801            // GT: CMP lo,lo (swapped); SBCS rd,hi,hi (swapped); ITE LT; MOV 1; MOV 0
2802            ArmOp::I64SetCond {
2803                rd,
2804                rn_lo,
2805                rn_hi,
2806                rm_lo,
2807                rm_hi,
2808                cond,
2809            } => {
2810                use synth_synthesis::Condition;
2811                let rd_bits = reg_to_bits(rd) as u16;
2812                let mut bytes = Vec::new();
2813
2814                // Helper: encode CMP Rn, Rm (16-bit)
2815                let encode_cmp_reg = |rn: &synth_synthesis::Reg,
2816                                      rm: &synth_synthesis::Reg|
2817                 -> Vec<u8> {
2818                    let rn_bits = reg_to_bits(rn) as u16;
2819                    let rm_bits = reg_to_bits(rm) as u16;
2820                    if rn_bits < 8 && rm_bits < 8 {
2821                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2822                        instr.to_le_bytes().to_vec()
2823                    } else {
2824                        let n_bit = (rn_bits >> 3) & 1;
2825                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2826                        instr.to_le_bytes().to_vec()
2827                    }
2828                };
2829
2830                // Helper: encode ITE <cond> (2 bytes)
2831                let encode_ite = |cond_bits: u16| -> Vec<u8> {
2832                    let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2833                    let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2834                    ite_instr.to_le_bytes().to_vec()
2835                };
2836
2837                // Helper: encode SetCond (ITE + MOV #1 + MOV #0) for given condition
2838                let encode_setcond = |cond_bits: u16, rd_bits: u16| -> Vec<u8> {
2839                    let mut b = encode_ite(cond_bits);
2840                    let mov_one: u16 = 0x2001 | (rd_bits << 8);
2841                    let mov_zero: u16 = 0x2000 | (rd_bits << 8);
2842                    b.extend_from_slice(&mov_one.to_le_bytes());
2843                    b.extend_from_slice(&mov_zero.to_le_bytes());
2844                    b
2845                };
2846
2847                match cond {
2848                    Condition::EQ | Condition::NE => {
2849                        // CMP rn_lo, rm_lo (compare low words)
2850                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2851
2852                        // IT EQ (execute next instruction only if Z=1)
2853                        let it_eq: u16 = 0xBF08; // IT EQ: cond=0000, mask=1000
2854                        bytes.extend_from_slice(&it_eq.to_le_bytes());
2855
2856                        // CMPEQ rn_hi, rm_hi (compare high words, only if low equal)
2857                        bytes.extend_from_slice(&encode_cmp_reg(rn_hi, rm_hi));
2858
2859                        // ITE <cond>; MOV rd, #1; MOV rd, #0
2860                        let cond_bits: u16 = match cond {
2861                            Condition::EQ => 0x0,
2862                            Condition::NE => 0x1,
2863                            _ => unreachable!(),
2864                        };
2865                        bytes.extend_from_slice(&encode_setcond(cond_bits, rd_bits));
2866                    }
2867
2868                    Condition::LT => {
2869                        // CMP rn_lo, rm_lo (sets C flag for borrow)
2870                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2871
2872                        // SBCS rd, rn_hi, rm_hi (subtract with carry, sets N,V flags)
2873                        // SBCS.W Rd, Rn, Rm: EB70 Rn | 0000 Rd 0000 Rm
2874                        let rn_hi_bits = reg_to_bits(rn_hi);
2875                        let rm_hi_bits = reg_to_bits(rm_hi);
2876                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2877                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2878                        bytes.extend_from_slice(&hw1.to_le_bytes());
2879                        bytes.extend_from_slice(&hw2.to_le_bytes());
2880
2881                        // ITE LT; MOV rd, #1; MOV rd, #0
2882                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
2883                    }
2884
2885                    Condition::GT => {
2886                        // GT(a,b) = LT(b,a): swap operands
2887                        // CMP rm_lo, rn_lo (swapped)
2888                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2889
2890                        // SBCS rd, rm_hi, rn_hi (swapped)
2891                        let rm_hi_bits = reg_to_bits(rm_hi);
2892                        let rn_hi_bits = reg_to_bits(rn_hi);
2893                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2894                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2895                        bytes.extend_from_slice(&hw1.to_le_bytes());
2896                        bytes.extend_from_slice(&hw2.to_le_bytes());
2897
2898                        // ITE LT; MOV rd, #1; MOV rd, #0
2899                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
2900                    }
2901
2902                    Condition::LE => {
2903                        // LE(a,b) = !GT(a,b): use GT logic but invert result
2904                        // GT(a,b) = LT(b,a): so we do CMP(b,a) and check LT, then invert
2905                        // CMP rm_lo, rn_lo (swapped, same as GT)
2906                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2907
2908                        // SBCS rd, rm_hi, rn_hi (swapped)
2909                        let rm_hi_bits = reg_to_bits(rm_hi);
2910                        let rn_hi_bits = reg_to_bits(rn_hi);
2911                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2912                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2913                        bytes.extend_from_slice(&hw1.to_le_bytes());
2914                        bytes.extend_from_slice(&hw2.to_le_bytes());
2915
2916                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT, so inverting GT result)
2917                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
2918                    }
2919
2920                    Condition::GE => {
2921                        // GE(a,b) = !LT(a,b): use LT logic but invert result
2922                        // CMP rn_lo, rm_lo (same as LT)
2923                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2924
2925                        // SBCS rd, rn_hi, rm_hi (same as LT)
2926                        let rn_hi_bits = reg_to_bits(rn_hi);
2927                        let rm_hi_bits = reg_to_bits(rm_hi);
2928                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2929                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2930                        bytes.extend_from_slice(&hw1.to_le_bytes());
2931                        bytes.extend_from_slice(&hw2.to_le_bytes());
2932
2933                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT)
2934                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
2935                    }
2936
2937                    // Unsigned comparisons - same instruction sequence, different conditions
2938                    Condition::LO => {
2939                        // LO (unsigned LT): CMP lo, SBCS hi, check C=0
2940                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2941                        let rn_hi_bits = reg_to_bits(rn_hi);
2942                        let rm_hi_bits = reg_to_bits(rm_hi);
2943                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2944                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2945                        bytes.extend_from_slice(&hw1.to_le_bytes());
2946                        bytes.extend_from_slice(&hw2.to_le_bytes());
2947                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
2948                    }
2949
2950                    Condition::HI => {
2951                        // HI (unsigned GT): swap operands and check LO
2952                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2953                        let rm_hi_bits = reg_to_bits(rm_hi);
2954                        let rn_hi_bits = reg_to_bits(rn_hi);
2955                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2956                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2957                        bytes.extend_from_slice(&hw1.to_le_bytes());
2958                        bytes.extend_from_slice(&hw2.to_le_bytes());
2959                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
2960                    }
2961
2962                    Condition::LS => {
2963                        // LS (unsigned LE): !(a > b) = !(HI), so do HI and invert
2964                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2965                        let rm_hi_bits = reg_to_bits(rm_hi);
2966                        let rn_hi_bits = reg_to_bits(rn_hi);
2967                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2968                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2969                        bytes.extend_from_slice(&hw1.to_le_bytes());
2970                        bytes.extend_from_slice(&hw2.to_le_bytes());
2971                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
2972                    }
2973
2974                    Condition::HS => {
2975                        // HS (unsigned GE): !(a < b) = !(LO)
2976                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2977                        let rn_hi_bits = reg_to_bits(rn_hi);
2978                        let rm_hi_bits = reg_to_bits(rm_hi);
2979                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2980                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2981                        bytes.extend_from_slice(&hw1.to_le_bytes());
2982                        bytes.extend_from_slice(&hw2.to_le_bytes());
2983                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
2984                    }
2985                }
2986
2987                Ok(bytes)
2988            }
2989
2990            // I64SetCondZ: Test if i64 register pair is zero, result 0/1 in rd
2991            // ORR.W rd, rn_lo, rn_hi; CMP rd, #0; ITE EQ; MOV 1; MOV 0
2992            ArmOp::I64SetCondZ { rd, rn_lo, rn_hi } => {
2993                let rd_bits = reg_to_bits(rd);
2994                let rn_lo_bits = reg_to_bits(rn_lo);
2995                let rn_hi_bits = reg_to_bits(rn_hi);
2996                let mut bytes = Vec::new();
2997
2998                // ORR.W rd, rn_lo, rn_hi: EA40 rn_lo | 0000 rd 0000 rn_hi
2999                let hw1: u16 = (0xEA40 | rn_lo_bits) as u16;
3000                let hw2: u16 = ((rd_bits << 8) | rn_hi_bits) as u16;
3001                bytes.extend_from_slice(&hw1.to_le_bytes());
3002                bytes.extend_from_slice(&hw2.to_le_bytes());
3003
3004                // CMP rd, #0 (16-bit): 0010 1 Rd 0000 0000
3005                let cmp_instr: u16 = 0x2800 | ((rd_bits as u16) << 8);
3006                bytes.extend_from_slice(&cmp_instr.to_le_bytes());
3007
3008                // ITE EQ; MOV rd, #1; MOV rd, #0
3009                let mask = 0xC_u16; // ITE EQ mask: firstcond[0]=0, mask=0xC
3010                let ite_instr: u16 = 0xBF00 | mask;
3011                bytes.extend_from_slice(&ite_instr.to_le_bytes());
3012                let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
3013                let mov_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
3014                bytes.extend_from_slice(&mov_one.to_le_bytes());
3015                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3016
3017                Ok(bytes)
3018            }
3019
3020            // I64Mul: 64-bit multiply using UMULL + MLA cross products
3021            // Formula: result = (a_lo * b_lo) + ((a_lo * b_hi + a_hi * b_lo) << 32)
3022            // Uses R12 as scratch register
3023            ArmOp::I64Mul {
3024                rd_lo,
3025                rd_hi,
3026                rn_lo,
3027                rn_hi,
3028                rm_lo,
3029                rm_hi,
3030            } => {
3031                let rd_lo_bits = reg_to_bits(rd_lo);
3032                let rd_hi_bits = reg_to_bits(rd_hi);
3033                let rn_lo_bits = reg_to_bits(rn_lo);
3034                let rn_hi_bits = reg_to_bits(rn_hi);
3035                let rm_lo_bits = reg_to_bits(rm_lo);
3036                let rm_hi_bits = reg_to_bits(rm_hi);
3037                let r12: u32 = 12; // IP scratch register
3038                let mut bytes = Vec::new();
3039
3040                // 1. MUL R12, rn_lo, rm_hi  (R12 = a_lo * b_hi)
3041                // Thumb-2 MUL: hw1=0xFB00|Rn, hw2=0xF000|(Rd<<8)|Rm
3042                let hw1: u16 = (0xFB00 | rn_lo_bits) as u16;
3043                let hw2: u16 = (0xF000 | (r12 << 8) | rm_hi_bits) as u16;
3044                bytes.extend_from_slice(&hw1.to_le_bytes());
3045                bytes.extend_from_slice(&hw2.to_le_bytes());
3046
3047                // 2. MLA R12, rn_hi, rm_lo, R12  (R12 += a_hi * b_lo)
3048                // Thumb-2 MLA: hw1=0xFB00|Rn, hw2=(Ra<<12)|(Rd<<8)|Rm
3049                let hw1: u16 = (0xFB00 | rn_hi_bits) as u16;
3050                let hw2: u16 = ((r12 << 12) | (r12 << 8) | rm_lo_bits) as u16;
3051                bytes.extend_from_slice(&hw1.to_le_bytes());
3052                bytes.extend_from_slice(&hw2.to_le_bytes());
3053
3054                // 3. UMULL rd_lo, rd_hi, rn_lo, rm_lo  (rd_lo:rd_hi = a_lo * b_lo)
3055                // Thumb-2 UMULL: hw1=0xFBA0|Rn, hw2=(RdLo<<12)|(RdHi<<8)|Rm
3056                let hw1: u16 = (0xFBA0 | rn_lo_bits) as u16;
3057                let hw2: u16 = ((rd_lo_bits << 12) | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3058                bytes.extend_from_slice(&hw1.to_le_bytes());
3059                bytes.extend_from_slice(&hw2.to_le_bytes());
3060
3061                // 4. ADD rd_hi, R12  (rd_hi += cross products)
3062                // 16-bit high reg ADD: 01000100 D Rm Rdn[2:0]
3063                let d_bit = (rd_hi_bits >> 3) & 1;
3064                let add_instr: u16 =
3065                    (0x4400 | (d_bit << 7) | (r12 << 3) | (rd_hi_bits & 0x7)) as u16;
3066                bytes.extend_from_slice(&add_instr.to_le_bytes());
3067
3068                Ok(bytes)
3069            }
3070
3071            // I64Shl: 64-bit shift left with branch for n<32 vs n>=32
3072            // rm_hi (R3) is used as temp register
3073            ArmOp::I64Shl {
3074                rd_lo,
3075                rd_hi,
3076                rn_lo,
3077                rn_hi,
3078                rm_lo,
3079                rm_hi,
3080            } => {
3081                let rd_lo_bits = reg_to_bits(rd_lo);
3082                let rd_hi_bits = reg_to_bits(rd_hi);
3083                let rn_lo_bits = reg_to_bits(rn_lo);
3084                let rn_hi_bits = reg_to_bits(rn_hi);
3085                let rm_lo_bits = reg_to_bits(rm_lo);
3086                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3087                let mut bytes = Vec::new();
3088
3089                // AND.W rm_lo, rm_lo, #63  (mask shift amount to 6 bits)
3090                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3091                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3092                bytes.extend_from_slice(&hw1.to_le_bytes());
3093                bytes.extend_from_slice(&hw2.to_le_bytes());
3094
3095                // SUBS.W rm_hi, rm_lo, #32  (rm_hi = n-32, sets flags)
3096                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3097                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3098                bytes.extend_from_slice(&hw1.to_le_bytes());
3099                bytes.extend_from_slice(&hw2.to_le_bytes());
3100
3101                // BPL .large (branch if n >= 32, offset = +10 halfwords)
3102                let bpl: u16 = 0xD50A;
3103                bytes.extend_from_slice(&bpl.to_le_bytes());
3104
3105                // --- Small shift (n < 32) ---
3106                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3107                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3108                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3109                bytes.extend_from_slice(&hw1.to_le_bytes());
3110                bytes.extend_from_slice(&hw2.to_le_bytes());
3111
3112                // LSR.W rm_hi, rn_lo, rm_hi  (rm_hi = lo >> (32-n), overflow bits)
3113                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3114                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3115                bytes.extend_from_slice(&hw1.to_le_bytes());
3116                bytes.extend_from_slice(&hw2.to_le_bytes());
3117
3118                // LSL.W rd_hi, rn_hi, rm_lo  (hi <<= n)
3119                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3120                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3121                bytes.extend_from_slice(&hw1.to_le_bytes());
3122                bytes.extend_from_slice(&hw2.to_le_bytes());
3123
3124                // ORR.W rd_hi, rd_hi, rm_hi  (hi |= overflow bits from lo)
3125                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3126                let hw2: u16 = ((rd_hi_bits << 8) | rm_hi_bits) as u16;
3127                bytes.extend_from_slice(&hw1.to_le_bytes());
3128                bytes.extend_from_slice(&hw2.to_le_bytes());
3129
3130                // LSL.W rd_lo, rn_lo, rm_lo  (lo <<= n)
3131                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3132                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3133                bytes.extend_from_slice(&hw1.to_le_bytes());
3134                bytes.extend_from_slice(&hw2.to_le_bytes());
3135
3136                // B .done (skip large shift: +2 halfwords)
3137                let b_done: u16 = 0xE002;
3138                bytes.extend_from_slice(&b_done.to_le_bytes());
3139
3140                // --- Large shift (n >= 32) ---
3141                // LSL.W rd_hi, rn_lo, rm_hi  (hi = lo << (n-32))
3142                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3143                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_hi_bits) as u16;
3144                bytes.extend_from_slice(&hw1.to_le_bytes());
3145                bytes.extend_from_slice(&hw2.to_le_bytes());
3146
3147                // MOV rd_lo, #0
3148                let mov_zero: u16 = 0x2000 | ((rd_lo_bits as u16) << 8);
3149                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3150
3151                Ok(bytes) // Total: 38 bytes
3152            }
3153
3154            // I64ShrU: 64-bit logical shift right with branch for n<32 vs n>=32
3155            ArmOp::I64ShrU {
3156                rd_lo,
3157                rd_hi,
3158                rn_lo,
3159                rn_hi,
3160                rm_lo,
3161                rm_hi,
3162            } => {
3163                let rd_lo_bits = reg_to_bits(rd_lo);
3164                let rd_hi_bits = reg_to_bits(rd_hi);
3165                let rn_lo_bits = reg_to_bits(rn_lo);
3166                let rn_hi_bits = reg_to_bits(rn_hi);
3167                let rm_lo_bits = reg_to_bits(rm_lo);
3168                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3169                let mut bytes = Vec::new();
3170
3171                // AND.W rm_lo, rm_lo, #63
3172                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3173                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3174                bytes.extend_from_slice(&hw1.to_le_bytes());
3175                bytes.extend_from_slice(&hw2.to_le_bytes());
3176
3177                // SUBS.W rm_hi, rm_lo, #32
3178                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3179                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3180                bytes.extend_from_slice(&hw1.to_le_bytes());
3181                bytes.extend_from_slice(&hw2.to_le_bytes());
3182
3183                // BPL .large (+10 halfwords)
3184                let bpl: u16 = 0xD50A;
3185                bytes.extend_from_slice(&bpl.to_le_bytes());
3186
3187                // --- Small shift (n < 32) ---
3188                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3189                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3190                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3191                bytes.extend_from_slice(&hw1.to_le_bytes());
3192                bytes.extend_from_slice(&hw2.to_le_bytes());
3193
3194                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3195                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3196                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3197                bytes.extend_from_slice(&hw1.to_le_bytes());
3198                bytes.extend_from_slice(&hw2.to_le_bytes());
3199
3200                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n)
3201                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3202                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3203                bytes.extend_from_slice(&hw1.to_le_bytes());
3204                bytes.extend_from_slice(&hw2.to_le_bytes());
3205
3206                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3207                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3208                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3209                bytes.extend_from_slice(&hw1.to_le_bytes());
3210                bytes.extend_from_slice(&hw2.to_le_bytes());
3211
3212                // LSR.W rd_hi, rn_hi, rm_lo  (hi >>= n, logical)
3213                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3214                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3215                bytes.extend_from_slice(&hw1.to_le_bytes());
3216                bytes.extend_from_slice(&hw2.to_le_bytes());
3217
3218                // B .done (+2 halfwords)
3219                let b_done: u16 = 0xE002;
3220                bytes.extend_from_slice(&b_done.to_le_bytes());
3221
3222                // --- Large shift (n >= 32) ---
3223                // LSR.W rd_lo, rn_hi, rm_hi  (lo = hi >> (n-32))
3224                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3225                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3226                bytes.extend_from_slice(&hw1.to_le_bytes());
3227                bytes.extend_from_slice(&hw2.to_le_bytes());
3228
3229                // MOV rd_hi, #0
3230                let mov_zero: u16 = 0x2000 | ((rd_hi_bits as u16) << 8);
3231                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3232
3233                Ok(bytes) // Total: 38 bytes
3234            }
3235
3236            // I64ShrS: 64-bit arithmetic shift right with branch for n<32 vs n>=32
3237            ArmOp::I64ShrS {
3238                rd_lo,
3239                rd_hi,
3240                rn_lo,
3241                rn_hi,
3242                rm_lo,
3243                rm_hi,
3244            } => {
3245                let rd_lo_bits = reg_to_bits(rd_lo);
3246                let rd_hi_bits = reg_to_bits(rd_hi);
3247                let rn_lo_bits = reg_to_bits(rn_lo);
3248                let rn_hi_bits = reg_to_bits(rn_hi);
3249                let rm_lo_bits = reg_to_bits(rm_lo);
3250                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3251                let mut bytes = Vec::new();
3252
3253                // AND.W rm_lo, rm_lo, #63
3254                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3255                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3256                bytes.extend_from_slice(&hw1.to_le_bytes());
3257                bytes.extend_from_slice(&hw2.to_le_bytes());
3258
3259                // SUBS.W rm_hi, rm_lo, #32
3260                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3261                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3262                bytes.extend_from_slice(&hw1.to_le_bytes());
3263                bytes.extend_from_slice(&hw2.to_le_bytes());
3264
3265                // BPL .large (+10 halfwords)
3266                let bpl: u16 = 0xD50A;
3267                bytes.extend_from_slice(&bpl.to_le_bytes());
3268
3269                // --- Small shift (n < 32) ---
3270                // RSB.W rm_hi, rm_lo, #32
3271                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3272                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3273                bytes.extend_from_slice(&hw1.to_le_bytes());
3274                bytes.extend_from_slice(&hw2.to_le_bytes());
3275
3276                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3277                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3278                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3279                bytes.extend_from_slice(&hw1.to_le_bytes());
3280                bytes.extend_from_slice(&hw2.to_le_bytes());
3281
3282                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n, logical for lo word)
3283                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3284                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3285                bytes.extend_from_slice(&hw1.to_le_bytes());
3286                bytes.extend_from_slice(&hw2.to_le_bytes());
3287
3288                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3289                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3290                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3291                bytes.extend_from_slice(&hw1.to_le_bytes());
3292                bytes.extend_from_slice(&hw2.to_le_bytes());
3293
3294                // ASR.W rd_hi, rn_hi, rm_lo  (hi >>= n, arithmetic/sign-extending)
3295                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3296                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3297                bytes.extend_from_slice(&hw1.to_le_bytes());
3298                bytes.extend_from_slice(&hw2.to_le_bytes());
3299
3300                // B .done (+3 halfwords, large shift is 8 bytes)
3301                let b_done: u16 = 0xE003;
3302                bytes.extend_from_slice(&b_done.to_le_bytes());
3303
3304                // --- Large shift (n >= 32) ---
3305                // ASR.W rd_lo, rn_hi, rm_hi  (lo = hi >>> (n-32))
3306                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3307                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3308                bytes.extend_from_slice(&hw1.to_le_bytes());
3309                bytes.extend_from_slice(&hw2.to_le_bytes());
3310
3311                // ASR.W rd_hi, rn_hi, #31  (hi = sign extension, all 0s or all 1s)
3312                // Thumb-2 ASR immediate: hw1=0xEA4F, hw2=imm3:Rd:imm2:10:Rm
3313                // imm5=31=11111 → imm3=111, imm2=11
3314                let hw1: u16 = 0xEA4F;
3315                let hw2: u16 = (0x7000 | (rd_hi_bits << 8) | 0x00E0 | rn_hi_bits) as u16;
3316                bytes.extend_from_slice(&hw1.to_le_bytes());
3317                bytes.extend_from_slice(&hw2.to_le_bytes());
3318
3319                Ok(bytes) // Total: 40 bytes
3320            }
3321
3322            // I64Rotl: 64-bit rotate left
3323            // For n < 32: new_hi = (hi << n) | (lo >> (32-n)), new_lo = (lo << n) | (hi >> (32-n))
3324            // For n >= 32: same formula but with lo/hi conceptually swapped, shift by (n-32)
3325            // Uses R4 (saved/restored) and R12 as scratch
3326            ArmOp::I64Rotl {
3327                rdlo,
3328                rdhi,
3329                rnlo,
3330                rnhi,
3331                shift,
3332            } => {
3333                let rd_lo_bits = reg_to_bits(rdlo);
3334                let rd_hi_bits = reg_to_bits(rdhi);
3335                let rn_lo_bits = reg_to_bits(rnlo);
3336                let rn_hi_bits = reg_to_bits(rnhi);
3337                let shift_bits = reg_to_bits(shift);
3338                let r12: u32 = 12; // IP scratch
3339                let r3: u32 = 3; // Scratch (high word of shift amount, unused)
3340                let r4: u32 = 4; // Scratch (saved/restored)
3341                let mut bytes = Vec::new();
3342
3343                // PUSH {R4}
3344                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3345
3346                // AND.W shift, shift, #63 (mask to 6 bits)
3347                let hw1: u16 = (0xF000 | shift_bits) as u16;
3348                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3349                bytes.extend_from_slice(&hw1.to_le_bytes());
3350                bytes.extend_from_slice(&hw2.to_le_bytes());
3351
3352                // SUBS.W R3, shift, #32 (R3 = n-32, sets flags)
3353                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3354                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3355                bytes.extend_from_slice(&hw1.to_le_bytes());
3356                bytes.extend_from_slice(&hw2.to_le_bytes());
3357
3358                // BPL .large (branch if n >= 32, offset = +14 halfwords)
3359                let bpl: u16 = 0xD50E;
3360                bytes.extend_from_slice(&bpl.to_le_bytes());
3361
3362                // === Small rotation (n < 32) ===
3363                // RSB.W R3, shift, #32 (R3 = 32-n)
3364                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3365                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3366                bytes.extend_from_slice(&hw1.to_le_bytes());
3367                bytes.extend_from_slice(&hw2.to_le_bytes());
3368
3369                // LSR.W R4, rn_lo, R3 (R4 = lo >> (32-n), will go to new_hi)
3370                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3371                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3372                bytes.extend_from_slice(&hw1.to_le_bytes());
3373                bytes.extend_from_slice(&hw2.to_le_bytes());
3374
3375                // LSR.W R12, rn_hi, R3 (R12 = hi >> (32-n), will go to new_lo)
3376                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3377                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3378                bytes.extend_from_slice(&hw1.to_le_bytes());
3379                bytes.extend_from_slice(&hw2.to_le_bytes());
3380
3381                // LSL.W rd_hi, rn_hi, shift (rd_hi = hi << n)
3382                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3383                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3384                bytes.extend_from_slice(&hw1.to_le_bytes());
3385                bytes.extend_from_slice(&hw2.to_le_bytes());
3386
3387                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (hi << n) | (lo >> (32-n)))
3388                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3389                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3390                bytes.extend_from_slice(&hw1.to_le_bytes());
3391                bytes.extend_from_slice(&hw2.to_le_bytes());
3392
3393                // LSL.W rd_lo, rn_lo, shift (rd_lo = lo << n)
3394                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3395                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3396                bytes.extend_from_slice(&hw1.to_le_bytes());
3397                bytes.extend_from_slice(&hw2.to_le_bytes());
3398
3399                // ORR.W rd_lo, rd_lo, R12 (rd_lo = (lo << n) | (hi >> (32-n)))
3400                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3401                let hw2: u16 = ((rd_lo_bits << 8) | r12) as u16;
3402                bytes.extend_from_slice(&hw1.to_le_bytes());
3403                bytes.extend_from_slice(&hw2.to_le_bytes());
3404
3405                // B .done (skip large block, offset = +14 halfwords)
3406                let b_done: u16 = 0xE00E;
3407                bytes.extend_from_slice(&b_done.to_le_bytes());
3408
3409                // === Large rotation (n >= 32) ===
3410                // R3 already has n-32 from the SUBS
3411                // RSB.W R4, R3, #32 (R4 = 32-(n-32) = 64-n)
3412                let hw1: u16 = (0xF1C0 | r3) as u16;
3413                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3414                bytes.extend_from_slice(&hw1.to_le_bytes());
3415                bytes.extend_from_slice(&hw2.to_le_bytes());
3416
3417                // LSR.W R12, rn_hi, R4 (R12 = hi >> (64-n), goes to new_hi low bits)
3418                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3419                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3420                bytes.extend_from_slice(&hw1.to_le_bytes());
3421                bytes.extend_from_slice(&hw2.to_le_bytes());
3422
3423                // LSR.W R4, rn_lo, R4 (R4 = lo >> (64-n), goes to new_lo low bits)
3424                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3425                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3426                bytes.extend_from_slice(&hw1.to_le_bytes());
3427                bytes.extend_from_slice(&hw2.to_le_bytes());
3428
3429                // LSL.W shift, rn_lo, R3 (shift = lo << (n-32), new_hi high bits)
3430                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3431                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3432                bytes.extend_from_slice(&hw1.to_le_bytes());
3433                bytes.extend_from_slice(&hw2.to_le_bytes());
3434
3435                // ORR.W shift, shift, R12 (shift = (lo << (n-32)) | (hi >> (64-n)) = new_hi)
3436                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3437                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3438                bytes.extend_from_slice(&hw1.to_le_bytes());
3439                bytes.extend_from_slice(&hw2.to_le_bytes());
3440
3441                // LSL.W rd_lo, rn_hi, R3 (rd_lo = hi << (n-32), new_lo high bits)
3442                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3443                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | r3) as u16;
3444                bytes.extend_from_slice(&hw1.to_le_bytes());
3445                bytes.extend_from_slice(&hw2.to_le_bytes());
3446
3447                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (hi << (n-32)) | (lo >> (64-n)) = new_lo)
3448                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3449                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3450                bytes.extend_from_slice(&hw1.to_le_bytes());
3451                bytes.extend_from_slice(&hw2.to_le_bytes());
3452
3453                // MOV rd_hi, shift (rd_hi = new_hi)
3454                let d_bit = (rd_hi_bits >> 3) & 1;
3455                let mov_instr: u16 =
3456                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_hi_bits & 0x7)) as u16;
3457                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3458
3459                // POP {R4}
3460                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3461
3462                Ok(bytes) // Total: 74 bytes
3463            }
3464
3465            // I64Rotr: 64-bit rotate right
3466            // rotr(x, n) = rotl(x, 64-n)
3467            // For n < 32: new_lo = (lo >> n) | (hi << (32-n)), new_hi = (hi >> n) | (lo << (32-n))
3468            // For n >= 32: same formula but with lo/hi swapped, shift by (n-32)
3469            ArmOp::I64Rotr {
3470                rdlo,
3471                rdhi,
3472                rnlo,
3473                rnhi,
3474                shift,
3475            } => {
3476                let rd_lo_bits = reg_to_bits(rdlo);
3477                let rd_hi_bits = reg_to_bits(rdhi);
3478                let rn_lo_bits = reg_to_bits(rnlo);
3479                let rn_hi_bits = reg_to_bits(rnhi);
3480                let shift_bits = reg_to_bits(shift);
3481                let r12: u32 = 12;
3482                let r3: u32 = 3;
3483                let r4: u32 = 4;
3484                let mut bytes = Vec::new();
3485
3486                // PUSH {R4}
3487                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3488
3489                // AND.W shift, shift, #63
3490                let hw1: u16 = (0xF000 | shift_bits) as u16;
3491                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3492                bytes.extend_from_slice(&hw1.to_le_bytes());
3493                bytes.extend_from_slice(&hw2.to_le_bytes());
3494
3495                // SUBS.W R3, shift, #32
3496                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3497                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3498                bytes.extend_from_slice(&hw1.to_le_bytes());
3499                bytes.extend_from_slice(&hw2.to_le_bytes());
3500
3501                // BPL .large (+14 halfwords)
3502                let bpl: u16 = 0xD50E;
3503                bytes.extend_from_slice(&bpl.to_le_bytes());
3504
3505                // === Small rotation (n < 32) ===
3506                // RSB.W R3, shift, #32 (R3 = 32-n)
3507                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3508                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3509                bytes.extend_from_slice(&hw1.to_le_bytes());
3510                bytes.extend_from_slice(&hw2.to_le_bytes());
3511
3512                // LSL.W R4, rn_hi, R3 (R4 = hi << (32-n), will go to new_lo)
3513                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3514                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3515                bytes.extend_from_slice(&hw1.to_le_bytes());
3516                bytes.extend_from_slice(&hw2.to_le_bytes());
3517
3518                // LSL.W R12, rn_lo, R3 (R12 = lo << (32-n), will go to new_hi)
3519                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3520                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3521                bytes.extend_from_slice(&hw1.to_le_bytes());
3522                bytes.extend_from_slice(&hw2.to_le_bytes());
3523
3524                // LSR.W rd_lo, rn_lo, shift (rd_lo = lo >> n)
3525                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3526                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3527                bytes.extend_from_slice(&hw1.to_le_bytes());
3528                bytes.extend_from_slice(&hw2.to_le_bytes());
3529
3530                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (lo >> n) | (hi << (32-n)))
3531                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3532                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3533                bytes.extend_from_slice(&hw1.to_le_bytes());
3534                bytes.extend_from_slice(&hw2.to_le_bytes());
3535
3536                // LSR.W rd_hi, rn_hi, shift (rd_hi = hi >> n)
3537                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3538                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3539                bytes.extend_from_slice(&hw1.to_le_bytes());
3540                bytes.extend_from_slice(&hw2.to_le_bytes());
3541
3542                // ORR.W rd_hi, rd_hi, R12 (rd_hi = (hi >> n) | (lo << (32-n)))
3543                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3544                let hw2: u16 = ((rd_hi_bits << 8) | r12) as u16;
3545                bytes.extend_from_slice(&hw1.to_le_bytes());
3546                bytes.extend_from_slice(&hw2.to_le_bytes());
3547
3548                // B .done (+14 halfwords)
3549                let b_done: u16 = 0xE00E;
3550                bytes.extend_from_slice(&b_done.to_le_bytes());
3551
3552                // === Large rotation (n >= 32) ===
3553                // RSB.W R4, R3, #32 (R4 = 64-n)
3554                let hw1: u16 = (0xF1C0 | r3) as u16;
3555                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3556                bytes.extend_from_slice(&hw1.to_le_bytes());
3557                bytes.extend_from_slice(&hw2.to_le_bytes());
3558
3559                // LSL.W R12, rn_lo, R4 (R12 = lo << (64-n), goes to new_lo low bits)
3560                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3561                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3562                bytes.extend_from_slice(&hw1.to_le_bytes());
3563                bytes.extend_from_slice(&hw2.to_le_bytes());
3564
3565                // LSL.W R4, rn_hi, R4 (R4 = hi << (64-n), goes to new_hi low bits)
3566                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3567                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3568                bytes.extend_from_slice(&hw1.to_le_bytes());
3569                bytes.extend_from_slice(&hw2.to_le_bytes());
3570
3571                // LSR.W shift, rn_hi, R3 (shift = hi >> (n-32), new_lo high bits)
3572                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3573                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3574                bytes.extend_from_slice(&hw1.to_le_bytes());
3575                bytes.extend_from_slice(&hw2.to_le_bytes());
3576
3577                // ORR.W shift, shift, R12 (shift = (hi >> (n-32)) | (lo << (64-n)) = new_lo)
3578                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3579                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3580                bytes.extend_from_slice(&hw1.to_le_bytes());
3581                bytes.extend_from_slice(&hw2.to_le_bytes());
3582
3583                // LSR.W rd_hi, rn_lo, R3 (rd_hi = lo >> (n-32), new_hi high bits)
3584                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3585                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | r3) as u16;
3586                bytes.extend_from_slice(&hw1.to_le_bytes());
3587                bytes.extend_from_slice(&hw2.to_le_bytes());
3588
3589                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (lo >> (n-32)) | (hi << (64-n)) = new_hi)
3590                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3591                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3592                bytes.extend_from_slice(&hw1.to_le_bytes());
3593                bytes.extend_from_slice(&hw2.to_le_bytes());
3594
3595                // MOV rd_lo, shift (rd_lo = new_lo)
3596                let d_bit = (rd_lo_bits >> 3) & 1;
3597                let mov_instr: u16 =
3598                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_lo_bits & 0x7)) as u16;
3599                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3600
3601                // POP {R4}
3602                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3603
3604                Ok(bytes) // Total: 74 bytes
3605            }
3606
3607            // I64Clz: Count leading zeros in 64-bit value
3608            // If hi != 0: result = CLZ(hi)
3609            // If hi == 0: result = 32 + CLZ(lo)
3610            //
3611            // Layout (using CMP+BNE approach for consistency):
3612            // 0: CMP.W rnhi, #0 (4 bytes)
3613            // 4: BEQ .hi_zero (2 bytes) - branch forward to offset 14
3614            // 6: CLZ.W rd, rnhi (4 bytes)
3615            // 10: B .done (2 bytes) - branch forward to offset 22
3616            // 12: NOP (2 bytes) - padding for alignment
3617            // 14: .hi_zero: CLZ.W rd, rnlo (4 bytes)
3618            // 18: ADD.W rd, rd, #32 (4 bytes)
3619            // 22: .done
3620            ArmOp::I64Clz { rd, rnlo, rnhi } => {
3621                let rd_bits = reg_to_bits(rd);
3622                let rn_lo_bits = reg_to_bits(rnlo);
3623                let rn_hi_bits = reg_to_bits(rnhi);
3624                let mut bytes = Vec::new();
3625
3626                // CMP.W rnhi, #0 (4 bytes at offset 0)
3627                let hw1: u16 = (0xF1B0 | rn_hi_bits) as u16;
3628                let hw2: u16 = 0x0F00;
3629                bytes.extend_from_slice(&hw1.to_le_bytes());
3630                bytes.extend_from_slice(&hw2.to_le_bytes());
3631
3632                // BEQ .hi_zero (2 bytes at offset 4)
3633                // PC = 4 + 4 = 8, target = 14, offset = 6, imm8 = 3
3634                let beq: u16 = 0xD003;
3635                bytes.extend_from_slice(&beq.to_le_bytes());
3636
3637                // CLZ.W rd, rnhi (4 bytes at offset 6)
3638                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3639                let hw1: u16 = (0xFAB0 | rn_hi_bits) as u16;
3640                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_hi_bits) as u16;
3641                bytes.extend_from_slice(&hw1.to_le_bytes());
3642                bytes.extend_from_slice(&hw2.to_le_bytes());
3643
3644                // B .done (2 bytes at offset 10)
3645                // PC = 10 + 4 = 14, target = 22, offset = 8, imm11 = 4
3646                let b_done: u16 = 0xE004;
3647                bytes.extend_from_slice(&b_done.to_le_bytes());
3648
3649                // NOP (2 bytes at offset 12) - padding
3650                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3651
3652                // .hi_zero: (offset 14)
3653                // CLZ.W rd, rnlo (4 bytes)
3654                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3655                let hw1: u16 = (0xFAB0 | rn_lo_bits) as u16;
3656                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_lo_bits) as u16;
3657                bytes.extend_from_slice(&hw1.to_le_bytes());
3658                bytes.extend_from_slice(&hw2.to_le_bytes());
3659
3660                // ADD.W rd, rd, #32 (4 bytes at offset 18)
3661                let hw1: u16 = (0xF100 | rd_bits) as u16;
3662                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3663                bytes.extend_from_slice(&hw1.to_le_bytes());
3664                bytes.extend_from_slice(&hw2.to_le_bytes());
3665
3666                // .done: (offset 22)
3667                // i64.clz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3668                // MOVS Rn, #0: 0010 0 Rn 00000000
3669                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3670                bytes.extend_from_slice(&mov0.to_le_bytes());
3671
3672                Ok(bytes)
3673            }
3674
3675            // I64Ctz: Count trailing zeros in 64-bit value
3676            // If lo != 0: result = CTZ(lo) = CLZ(RBIT(lo))
3677            // If lo == 0: result = 32 + CTZ(hi) = 32 + CLZ(RBIT(hi))
3678            //
3679            // Layout:
3680            // 0: CMP.W rnlo, #0 (4 bytes)
3681            // 4: BEQ .lo_zero (2 bytes) - branch to offset 18
3682            // 6: RBIT.W rd, rnlo (4 bytes)
3683            // 10: CLZ.W rd, rd (4 bytes)
3684            // 14: B .done (2 bytes) - branch to offset 30
3685            // 16: NOP (2 bytes) - padding
3686            // 18: .lo_zero: RBIT.W rd, rnhi (4 bytes)
3687            // 22: CLZ.W rd, rd (4 bytes)
3688            // 26: ADD.W rd, rd, #32 (4 bytes)
3689            // 30: .done
3690            ArmOp::I64Ctz { rd, rnlo, rnhi } => {
3691                let rd_bits = reg_to_bits(rd);
3692                let rn_lo_bits = reg_to_bits(rnlo);
3693                let rn_hi_bits = reg_to_bits(rnhi);
3694                let mut bytes = Vec::new();
3695
3696                // CMP.W rnlo, #0 (4 bytes at offset 0)
3697                let hw1: u16 = (0xF1B0 | rn_lo_bits) as u16;
3698                let hw2: u16 = 0x0F00;
3699                bytes.extend_from_slice(&hw1.to_le_bytes());
3700                bytes.extend_from_slice(&hw2.to_le_bytes());
3701
3702                // BEQ .lo_zero (2 bytes at offset 4)
3703                // PC = 4 + 4 = 8, target = 18, offset = 10, imm8 = 5
3704                let beq: u16 = 0xD005;
3705                bytes.extend_from_slice(&beq.to_le_bytes());
3706
3707                // RBIT.W rd, rnlo (4 bytes at offset 6)
3708                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3709                let hw1: u16 = (0xFA90 | rn_lo_bits) as u16;
3710                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_lo_bits) as u16;
3711                bytes.extend_from_slice(&hw1.to_le_bytes());
3712                bytes.extend_from_slice(&hw2.to_le_bytes());
3713
3714                // CLZ.W rd, rd (4 bytes at offset 10)
3715                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3716                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3717                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3718                bytes.extend_from_slice(&hw1.to_le_bytes());
3719                bytes.extend_from_slice(&hw2.to_le_bytes());
3720
3721                // B .done (2 bytes at offset 14)
3722                // PC = 14 + 4 = 18, target = 30, offset = 12, imm11 = 6
3723                let b_done: u16 = 0xE006;
3724                bytes.extend_from_slice(&b_done.to_le_bytes());
3725
3726                // NOP (2 bytes at offset 16) - padding
3727                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3728
3729                // .lo_zero: (offset 18)
3730                // RBIT.W rd, rnhi (4 bytes)
3731                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3732                let hw1: u16 = (0xFA90 | rn_hi_bits) as u16;
3733                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_hi_bits) as u16;
3734                bytes.extend_from_slice(&hw1.to_le_bytes());
3735                bytes.extend_from_slice(&hw2.to_le_bytes());
3736
3737                // CLZ.W rd, rd (4 bytes at offset 22)
3738                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3739                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3740                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3741                bytes.extend_from_slice(&hw1.to_le_bytes());
3742                bytes.extend_from_slice(&hw2.to_le_bytes());
3743
3744                // ADD.W rd, rd, #32 (4 bytes at offset 26)
3745                let hw1: u16 = (0xF100 | rd_bits) as u16;
3746                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3747                bytes.extend_from_slice(&hw1.to_le_bytes());
3748                bytes.extend_from_slice(&hw2.to_le_bytes());
3749
3750                // .done: (offset 30)
3751                // i64.ctz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3752                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3753                bytes.extend_from_slice(&mov0.to_le_bytes());
3754
3755                Ok(bytes)
3756            }
3757
3758            // I64Popcnt: Population count of 64-bit value
3759            // result = POPCNT(lo) + POPCNT(hi)
3760            // Using SIMD-style parallel bit counting algorithm
3761            ArmOp::I64Popcnt { rd, rnlo, rnhi } => {
3762                let rd_bits = reg_to_bits(rd);
3763                let rn_lo_bits = reg_to_bits(rnlo);
3764                let rn_hi_bits = reg_to_bits(rnhi);
3765                let r12: u32 = 12; // IP scratch
3766                let r3: u32 = 3; // Scratch for hi popcnt result
3767                let mut bytes = Vec::new();
3768
3769                // PUSH {R3, R4, R5} - save scratch registers
3770                bytes.extend_from_slice(&0xB438u16.to_le_bytes());
3771
3772                // Strategy: compute popcnt(lo) -> R4, popcnt(hi) -> R5, add them -> rd
3773                // Using lookup table approach for each byte would be too large
3774                // Using shift-and-add approach instead
3775
3776                // For simplicity and correctness, use the efficient parallel algorithm
3777                // but implement it as a series of inline operations
3778
3779                // MOV R4, rnlo
3780                let d_bit: u32 = 0; // R4 < 8, so high bit is 0
3781                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_lo_bits << 3) | (4 & 0x7)) as u16;
3782                bytes.extend_from_slice(&mov.to_le_bytes());
3783
3784                // MOV R5, rnhi
3785                let d_bit: u32 = 0; // R5 < 8, so high bit is 0
3786                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_hi_bits << 3) | (5 & 0x7)) as u16;
3787                bytes.extend_from_slice(&mov.to_le_bytes());
3788
3789                // --- POPCNT for R4 (lo word) ---
3790                // Step 1: x = x - ((x >> 1) & 0x55555555)
3791                // LSR.W R12, R4, #1
3792                let hw1: u16 = 0xEA4F;
3793                let hw2: u16 = ((r12 << 8) | 0x50 | 4) as u16;
3794                bytes.extend_from_slice(&hw1.to_le_bytes());
3795                bytes.extend_from_slice(&hw2.to_le_bytes());
3796
3797                // Load 0x55555555 into R3 using MOVW/MOVT
3798                // MOVW R3, #0x5555
3799                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3800                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3801                // MOVT R3, #0x5555
3802                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3803                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3804
3805                // AND.W R12, R12, R3
3806                let hw1: u16 = (0xEA00 | r12) as u16;
3807                let hw2: u16 = ((r12 << 8) | r3) as u16;
3808                bytes.extend_from_slice(&hw1.to_le_bytes());
3809                bytes.extend_from_slice(&hw2.to_le_bytes());
3810
3811                // SUB.W R4, R4, R12
3812                let hw1: u16 = (0xEBA0 | 4) as u16;
3813                let hw2: u16 = ((4 << 8) | r12) as u16;
3814                bytes.extend_from_slice(&hw1.to_le_bytes());
3815                bytes.extend_from_slice(&hw2.to_le_bytes());
3816
3817                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
3818                // Load 0x33333333 into R3
3819                // MOVW R3, #0x3333
3820                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3821                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3822                // MOVT R3, #0x3333
3823                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3824                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3825
3826                // AND.W R12, R4, R3
3827                let hw1: u16 = (0xEA00 | 4) as u16;
3828                let hw2: u16 = ((r12 << 8) | r3) as u16;
3829                bytes.extend_from_slice(&hw1.to_le_bytes());
3830                bytes.extend_from_slice(&hw2.to_le_bytes());
3831
3832                // LSR.W R4, R4, #2
3833                let hw1: u16 = 0xEA4F;
3834                let hw2: u16 = ((4 << 8) | 0x90 | 4) as u16;
3835                bytes.extend_from_slice(&hw1.to_le_bytes());
3836                bytes.extend_from_slice(&hw2.to_le_bytes());
3837
3838                // AND.W R4, R4, R3
3839                let hw1: u16 = (0xEA00 | 4) as u16;
3840                let hw2: u16 = ((4 << 8) | r3) as u16;
3841                bytes.extend_from_slice(&hw1.to_le_bytes());
3842                bytes.extend_from_slice(&hw2.to_le_bytes());
3843
3844                // ADD.W R4, R4, R12
3845                let hw1: u16 = (0xEB00 | 4) as u16;
3846                let hw2: u16 = ((4 << 8) | r12) as u16;
3847                bytes.extend_from_slice(&hw1.to_le_bytes());
3848                bytes.extend_from_slice(&hw2.to_le_bytes());
3849
3850                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
3851                // LSR.W R12, R4, #4
3852                // hw2 = (imm3 << 12) | (Rd << 8) | (imm2 << 6) | (type << 4) | Rm
3853                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3854                let hw1: u16 = 0xEA4F;
3855                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 4) as u16;
3856                bytes.extend_from_slice(&hw1.to_le_bytes());
3857                bytes.extend_from_slice(&hw2.to_le_bytes());
3858
3859                // ADD.W R4, R4, R12
3860                let hw1: u16 = (0xEB00 | 4) as u16;
3861                let hw2: u16 = ((4 << 8) | r12) as u16;
3862                bytes.extend_from_slice(&hw1.to_le_bytes());
3863                bytes.extend_from_slice(&hw2.to_le_bytes());
3864
3865                // Load 0x0F0F0F0F into R3
3866                // MOVW R3, #0x0F0F (imm4=0, i=1, imm3=7, imm8=0x0F)
3867                // hw1 = 11110 1 10 0100 0000 = 0xF640
3868                // hw2 = 0 111 0011 00001111 = 0x730F
3869                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
3870                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3871                // MOVT R3, #0x0F0F
3872                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
3873                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3874
3875                // AND.W R4, R4, R3
3876                let hw1: u16 = (0xEA00 | 4) as u16;
3877                let hw2: u16 = ((4 << 8) | r3) as u16;
3878                bytes.extend_from_slice(&hw1.to_le_bytes());
3879                bytes.extend_from_slice(&hw2.to_le_bytes());
3880
3881                // Step 4: x = x * 0x01010101 >> 24
3882                // Load 0x01010101 into R3
3883                // MOVW R3, #0x0101
3884                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
3885                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3886                // MOVT R3, #0x0101
3887                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
3888                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3889
3890                // MUL R4, R4, R3
3891                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
3892                let hw1: u16 = (0xFB00 | 4) as u16;
3893                let hw2: u16 = (0xF000 | (4 << 8) | r3) as u16;
3894                bytes.extend_from_slice(&hw1.to_le_bytes());
3895                bytes.extend_from_slice(&hw2.to_le_bytes());
3896
3897                // LSR.W R4, R4, #24
3898                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
3899                let hw1: u16 = 0xEA4F;
3900                let hw2: u16 = (0x6000 | (4 << 8) | 0x10 | 4) as u16;
3901                bytes.extend_from_slice(&hw1.to_le_bytes());
3902                bytes.extend_from_slice(&hw2.to_le_bytes());
3903
3904                // --- POPCNT for R5 (hi word) - same algorithm ---
3905                // Step 1
3906                let hw1: u16 = 0xEA4F;
3907                let hw2: u16 = ((r12 << 8) | 0x50 | 5) as u16;
3908                bytes.extend_from_slice(&hw1.to_le_bytes());
3909                bytes.extend_from_slice(&hw2.to_le_bytes());
3910
3911                // Load 0x55555555 into R3
3912                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3913                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3914                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3915                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3916
3917                let hw1: u16 = (0xEA00 | r12) as u16;
3918                let hw2: u16 = ((r12 << 8) | r3) as u16;
3919                bytes.extend_from_slice(&hw1.to_le_bytes());
3920                bytes.extend_from_slice(&hw2.to_le_bytes());
3921
3922                let hw1: u16 = (0xEBA0 | 5) as u16;
3923                let hw2: u16 = ((5 << 8) | r12) as u16;
3924                bytes.extend_from_slice(&hw1.to_le_bytes());
3925                bytes.extend_from_slice(&hw2.to_le_bytes());
3926
3927                // Step 2
3928                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3929                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3930                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3931                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3932
3933                let hw1: u16 = (0xEA00 | 5) as u16;
3934                let hw2: u16 = ((r12 << 8) | r3) as u16;
3935                bytes.extend_from_slice(&hw1.to_le_bytes());
3936                bytes.extend_from_slice(&hw2.to_le_bytes());
3937
3938                let hw1: u16 = 0xEA4F;
3939                let hw2: u16 = ((5 << 8) | 0x90 | 5) as u16;
3940                bytes.extend_from_slice(&hw1.to_le_bytes());
3941                bytes.extend_from_slice(&hw2.to_le_bytes());
3942
3943                let hw1: u16 = (0xEA00 | 5) as u16;
3944                let hw2: u16 = ((5 << 8) | r3) as u16;
3945                bytes.extend_from_slice(&hw1.to_le_bytes());
3946                bytes.extend_from_slice(&hw2.to_le_bytes());
3947
3948                let hw1: u16 = (0xEB00 | 5) as u16;
3949                let hw2: u16 = ((5 << 8) | r12) as u16;
3950                bytes.extend_from_slice(&hw1.to_le_bytes());
3951                bytes.extend_from_slice(&hw2.to_le_bytes());
3952
3953                // Step 3: LSR.W R12, R5, #4
3954                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3955                let hw1: u16 = 0xEA4F;
3956                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 5) as u16;
3957                bytes.extend_from_slice(&hw1.to_le_bytes());
3958                bytes.extend_from_slice(&hw2.to_le_bytes());
3959
3960                let hw1: u16 = (0xEB00 | 5) as u16;
3961                let hw2: u16 = ((5 << 8) | r12) as u16;
3962                bytes.extend_from_slice(&hw1.to_le_bytes());
3963                bytes.extend_from_slice(&hw2.to_le_bytes());
3964
3965                // Load 0x0F0F0F0F into R3 (for hi-word)
3966                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
3967                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3968                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
3969                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3970
3971                let hw1: u16 = (0xEA00 | 5) as u16;
3972                let hw2: u16 = ((5 << 8) | r3) as u16;
3973                bytes.extend_from_slice(&hw1.to_le_bytes());
3974                bytes.extend_from_slice(&hw2.to_le_bytes());
3975
3976                // Step 4
3977                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
3978                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3979                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
3980                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3981
3982                // MUL R5, R5, R3
3983                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
3984                let hw1: u16 = (0xFB00 | 5) as u16;
3985                let hw2: u16 = (0xF000 | (5 << 8) | r3) as u16;
3986                bytes.extend_from_slice(&hw1.to_le_bytes());
3987                bytes.extend_from_slice(&hw2.to_le_bytes());
3988
3989                // LSR.W R5, R5, #24
3990                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
3991                let hw1: u16 = 0xEA4F;
3992                let hw2: u16 = (0x6000 | (5 << 8) | 0x10 | 5) as u16;
3993                bytes.extend_from_slice(&hw1.to_le_bytes());
3994                bytes.extend_from_slice(&hw2.to_le_bytes());
3995
3996                // ADD rd, R4, R5 (combine lo and hi counts)
3997                // ADDS Rd, Rn, Rm (T1): 0001 100 Rm Rn Rd = 0x1800 | (Rm<<6) | (Rn<<3) | Rd
3998                let rd_bits_u16 = rd_bits as u16;
3999                let instr: u16 = 0x1800 | (5 << 6) | (4 << 3) | rd_bits_u16;
4000                bytes.extend_from_slice(&instr.to_le_bytes());
4001
4002                // POP {R3, R4, R5}
4003                bytes.extend_from_slice(&0xBC38u16.to_le_bytes());
4004
4005                // i64.popcnt returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
4006                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
4007                bytes.extend_from_slice(&mov0.to_le_bytes());
4008
4009                Ok(bytes)
4010            }
4011
4012            // I64Extend8S: Sign-extend low 8 bits to 64 bits
4013            // Result: rdlo = sign_extend_8(rnlo), rdhi = rdlo >> 31
4014            ArmOp::I64Extend8S { rdlo, rdhi, rnlo } => {
4015                let rdlo_bits = reg_to_bits(rdlo);
4016                let rdhi_bits = reg_to_bits(rdhi);
4017                let rnlo_bits = reg_to_bits(rnlo);
4018                let mut bytes = Vec::new();
4019
4020                // SXTB.W rdlo, rnlo (sign-extend byte to 32-bit)
4021                // SXTB T2: hw1 = 0xFA4F, hw2 = 0xF0<Rd><Rm>
4022                let hw1: u16 = 0xFA4F_u16;
4023                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
4024                bytes.extend_from_slice(&hw1.to_le_bytes());
4025                bytes.extend_from_slice(&hw2.to_le_bytes());
4026
4027                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
4028                // ASR (immediate): hw1 = 0xEA4F, hw2 = imm3:Rd:imm2:type:Rm
4029                // For imm5=31: imm3=111, imm2=11, type=10 (ASR)
4030                // hw2 = (7 << 12) | (rdhi << 8) | (3 << 6) | (2 << 4) | rdlo
4031                let hw1: u16 = 0xEA4F;
4032                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
4033                bytes.extend_from_slice(&hw1.to_le_bytes());
4034                bytes.extend_from_slice(&hw2.to_le_bytes());
4035
4036                Ok(bytes)
4037            }
4038
4039            // I64Extend16S: Sign-extend low 16 bits to 64 bits
4040            // Result: rdlo = sign_extend_16(rnlo), rdhi = rdlo >> 31
4041            ArmOp::I64Extend16S { rdlo, rdhi, rnlo } => {
4042                let rdlo_bits = reg_to_bits(rdlo);
4043                let rdhi_bits = reg_to_bits(rdhi);
4044                let rnlo_bits = reg_to_bits(rnlo);
4045                let mut bytes = Vec::new();
4046
4047                // SXTH.W rdlo, rnlo (sign-extend halfword to 32-bit)
4048                // SXTH T2: hw1 = 0xFA0F, hw2 = 0xF0<Rd><Rm>
4049                let hw1: u16 = 0xFA0F_u16;
4050                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
4051                bytes.extend_from_slice(&hw1.to_le_bytes());
4052                bytes.extend_from_slice(&hw2.to_le_bytes());
4053
4054                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
4055                let hw1: u16 = 0xEA4F;
4056                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
4057                bytes.extend_from_slice(&hw1.to_le_bytes());
4058                bytes.extend_from_slice(&hw2.to_le_bytes());
4059
4060                Ok(bytes)
4061            }
4062
4063            // I64Extend32S: Sign-extend low 32 bits to 64 bits
4064            // Result: rdlo = rnlo, rdhi = rnlo >> 31
4065            ArmOp::I64Extend32S { rdlo, rdhi, rnlo } => {
4066                let rdlo_bits = reg_to_bits(rdlo);
4067                let rdhi_bits = reg_to_bits(rdhi);
4068                let rnlo_bits = reg_to_bits(rnlo);
4069                let mut bytes = Vec::new();
4070
4071                // MOV rdlo, rnlo (if different)
4072                if rdlo_bits != rnlo_bits {
4073                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4074                    let d_bit = ((rdlo_bits >> 3) & 1) as u16;
4075                    let mov: u16 = 0x4600
4076                        | (d_bit << 7)
4077                        | ((rnlo_bits as u16) << 3)
4078                        | ((rdlo_bits & 0x7) as u16);
4079                    bytes.extend_from_slice(&mov.to_le_bytes());
4080                }
4081
4082                // ASR.W rdhi, rnlo, #31 (sign-extend to high word)
4083                let hw1: u16 = 0xEA4F;
4084                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rnlo_bits) as u16;
4085                bytes.extend_from_slice(&hw1.to_le_bytes());
4086                bytes.extend_from_slice(&hw2.to_le_bytes());
4087
4088                Ok(bytes)
4089            }
4090
4091            // SelectMove: IT <cond>; MOV{cond} rd, rm
4092            // Conditional move: only execute MOV if condition is true
4093            ArmOp::SelectMove { rd, rm, cond } => {
4094                let rd_bits = reg_to_bits(rd) as u16;
4095                let rm_bits = reg_to_bits(rm) as u16;
4096
4097                // Condition code encoding for IT block
4098                use synth_synthesis::Condition;
4099                let cond_bits: u16 = match cond {
4100                    Condition::EQ => 0x0, // Equal
4101                    Condition::NE => 0x1, // Not equal
4102                    Condition::HS => 0x2, // Higher or same (unsigned >=)
4103                    Condition::LO => 0x3, // Lower (unsigned <)
4104                    Condition::HI => 0x8, // Higher (unsigned >)
4105                    Condition::LS => 0x9, // Lower or same (unsigned <=)
4106                    Condition::GE => 0xA, // Greater or equal (signed)
4107                    Condition::LT => 0xB, // Less than (signed)
4108                    Condition::GT => 0xC, // Greater than (signed)
4109                    Condition::LE => 0xD, // Less or equal (signed)
4110                };
4111
4112                // IT <cond>: single Then block (mask = 0x8 for T only)
4113                // IT instruction: 1011 1111 firstcond mask
4114                let it_instr: u16 = 0xBF00 | (cond_bits << 4) | 0x8;
4115
4116                // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4117                // This MOV will only execute if condition is true due to IT block
4118                let d_bit = (rd_bits >> 3) & 1;
4119                let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4120
4121                // Emit: IT <cond>, MOV rd, rm
4122                let mut bytes = it_instr.to_le_bytes().to_vec();
4123                bytes.extend_from_slice(&mov_instr.to_le_bytes());
4124                Ok(bytes)
4125            }
4126
4127            // Popcnt: Population count (count set bits)
4128            // ARM Cortex-M has no native POPCNT, so we implement the bit manipulation algorithm:
4129            // x = x - ((x >> 1) & 0x55555555);
4130            // x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
4131            // x = (x + (x >> 4)) & 0x0F0F0F0F;
4132            // x = x + (x >> 8);
4133            // x = x + (x >> 16);
4134            // return x & 0x3F;
4135            //
4136            // Uses rd as working register and R12 as scratch for constants
4137            ArmOp::Popcnt { rd, rm } => {
4138                let mut bytes = Vec::new();
4139
4140                // First, move rm to rd if they're different
4141                if rd != rm {
4142                    let rd_bits = reg_to_bits(rd) as u16;
4143                    let rm_bits = reg_to_bits(rm) as u16;
4144                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4145                    let d_bit = (rd_bits >> 3) & 1;
4146                    let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4147                    bytes.extend_from_slice(&mov_instr.to_le_bytes());
4148                }
4149
4150                // Step 1: x = x - ((x >> 1) & 0x55555555)
4151                // Load 0x55555555 into R12
4152                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x5555)?);
4153                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x5555)?);
4154
4155                // R12_temp = rd >> 1
4156                // We need a second scratch register. Use R11.
4157                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 1)?);
4158
4159                // R11 = R11 & R12 (R11 = (x >> 1) & 0x55555555)
4160                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(11, 11, 12)?);
4161
4162                // rd = rd - R11
4163                bytes.extend_from_slice(&self.encode_thumb32_sub_reg_raw(
4164                    reg_to_bits(rd),
4165                    reg_to_bits(rd),
4166                    11,
4167                )?);
4168
4169                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
4170                // Load 0x33333333 into R12
4171                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x3333)?);
4172                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x3333)?);
4173
4174                // R11 = rd & R12
4175                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4176                    11,
4177                    reg_to_bits(rd),
4178                    12,
4179                )?);
4180
4181                // rd = rd >> 2
4182                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(
4183                    reg_to_bits(rd),
4184                    reg_to_bits(rd),
4185                    2,
4186                )?);
4187
4188                // rd = rd & R12
4189                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4190                    reg_to_bits(rd),
4191                    reg_to_bits(rd),
4192                    12,
4193                )?);
4194
4195                // rd = rd + R11
4196                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4197                    reg_to_bits(rd),
4198                    reg_to_bits(rd),
4199                    11,
4200                )?);
4201
4202                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
4203                // R11 = rd >> 4
4204                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 4)?);
4205
4206                // rd = rd + R11
4207                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4208                    reg_to_bits(rd),
4209                    reg_to_bits(rd),
4210                    11,
4211                )?);
4212
4213                // Load 0x0F0F0F0F into R12
4214                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x0F0F)?);
4215                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x0F0F)?);
4216
4217                // rd = rd & R12
4218                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4219                    reg_to_bits(rd),
4220                    reg_to_bits(rd),
4221                    12,
4222                )?);
4223
4224                // Step 4: x = x + (x >> 8)
4225                // R11 = rd >> 8
4226                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 8)?);
4227
4228                // rd = rd + R11
4229                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4230                    reg_to_bits(rd),
4231                    reg_to_bits(rd),
4232                    11,
4233                )?);
4234
4235                // Step 5: x = x + (x >> 16)
4236                // R11 = rd >> 16
4237                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 16)?);
4238
4239                // rd = rd + R11
4240                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4241                    reg_to_bits(rd),
4242                    reg_to_bits(rd),
4243                    11,
4244                )?);
4245
4246                // Step 6: return x & 0x3F
4247                // AND with 0x3F (small immediate, can use BIC or AND with immediate)
4248                bytes.extend_from_slice(&self.encode_thumb32_and_imm_raw(
4249                    reg_to_bits(rd),
4250                    reg_to_bits(rd),
4251                    0x3F,
4252                )?);
4253
4254                Ok(bytes)
4255            }
4256
4257            // I64DivU: 64-bit unsigned division using binary long division
4258            // Input: R0:R1 = dividend, R2:R3 = divisor
4259            // Output: R0:R1 = quotient
4260            // Uses: R4-R7, R12 as loop counter (avoid R8 for Renode compatibility)
4261            ArmOp::I64DivU {
4262                rdlo: _,
4263                rdhi: _,
4264                rnlo: _,
4265                rnhi: _,
4266                rmlo: _,
4267                rmhi: _,
4268            } => {
4269                let mut bytes = Vec::new();
4270
4271                // PUSH {R4-R7} - save scratch registers (NO LR — this is inline code)
4272                // 16-bit PUSH: 1011 010 M rrrrrrrr where M=0 (no LR), r=R4-R7 = 0xF0
4273                // Encoding: 1011 0100 1111 0000 = 0xB4F0
4274                bytes.extend_from_slice(&0xB4F0u16.to_le_bytes());
4275
4276                // Initialize quotient (R4:R5) = 0
4277                bytes.extend_from_slice(&0x2400u16.to_le_bytes()); // MOV R4, #0
4278                bytes.extend_from_slice(&0x2500u16.to_le_bytes()); // MOV R5, #0
4279
4280                // Initialize remainder (R6:R7) = 0
4281                bytes.extend_from_slice(&0x2600u16.to_le_bytes()); // MOV R6, #0
4282                bytes.extend_from_slice(&0x2700u16.to_le_bytes()); // MOV R7, #0
4283
4284                // Initialize loop counter R12 = 64 (use R12 scratch instead of R8)
4285                // MOV.W R12, #64: F04F 0C40
4286                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4287                bytes.extend_from_slice(&0x0C40u16.to_le_bytes());
4288
4289                // Loop start
4290                let loop_start = bytes.len();
4291
4292                // === Loop body: process one bit ===
4293
4294                // 1. Shift quotient R4:R5 left by 1
4295                // LSLS R5, R5, #1 (16-bit: 0000 0010 1010 1101 = 0x006D -> actually 0x002D for LSL R5,R5,#1)
4296                // LSL Rd, Rm, #imm5: 000 00 imm5 Rm Rd = 000 00 00001 101 101 = 0x006D
4297                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4298                // Get carry from R4 into R5: ORR R5, R5, R4 LSR #31
4299                // Thumb-2 ORR with shifted register: EA45 75D4 = ORR.W R5, R5, R4, LSR #31
4300                // 11101010 010 S Rn | 0 imm3 Rd imm2 type Rm
4301                // type=01 (LSR), imm5=31 (imm3=111, imm2=11)
4302                bytes.extend_from_slice(&0xEA45u16.to_le_bytes());
4303                bytes.extend_from_slice(&0x75D4u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4304                // LSLS R4, R4, #1: 000 00 00001 100 100 = 0x0064
4305                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4306
4307                // 2. Shift remainder R6:R7 left by 1, OR in MSB of dividend R1
4308                // LSLS R7, R7, #1
4309                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4310                // ORR.W R7, R7, R6, LSR #31
4311                bytes.extend_from_slice(&0xEA47u16.to_le_bytes());
4312                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4313                // LSLS R6, R6, #1
4314                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4315                // ORR.W R6, R6, R1, LSR #31 (bring in MSB of dividend high)
4316                bytes.extend_from_slice(&0xEA46u16.to_le_bytes());
4317                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4318
4319                // 3. Shift dividend R0:R1 left by 1
4320                // LSLS R1, R1, #1
4321                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4322                // ORR.W R1, R1, R0, LSR #31
4323                bytes.extend_from_slice(&0xEA41u16.to_le_bytes());
4324                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4325                // LSLS R0, R0, #1
4326                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4327
4328                // 4. Compare remainder >= divisor (64-bit unsigned comparison)
4329                // Compare high words first: CMP R7, R3
4330                // CMP Rn, Rm encoding: 0x4280 | (Rm << 3) | Rn
4331                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3 (16-bit)
4332                // BHI means R7 > R3 (unsigned) - definitely subtract
4333                // BLO means R7 < R3 - definitely don't subtract
4334                // BEQ means need to check low words
4335
4336                // If high > divisor high: branch to subtract (forward +offset)
4337                // BHI.N +6 (skip CMP, skip BLO, do subtract)
4338                // BHI: 1101 1000 offset8 where cond=1000 (HI)
4339                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4 (to subtract block)
4340
4341                // If high < divisor high: branch past subtract
4342                // BLO.N +10 (skip to decrement)
4343                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BLO/BCC +12 (past subtract)
4344
4345                // High words equal, compare low: CMP R6, R2
4346                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2 (16-bit)
4347                // BLO/BCC past subtract (skip SUBS+SBC.W+ORR.W = 10 bytes = 4 halfwords from PC+4)
4348                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords (past subtract)
4349
4350                // === Subtract block: remainder -= divisor, quotient |= 1 ===
4351                // SUBS R6, R6, R2
4352                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2 (16-bit)
4353                // SBC R7, R7, R3 (with borrow)
4354                // Thumb-2 SBC.W: EB67 0703 = SBC.W R7, R7, R3
4355                bytes.extend_from_slice(&0xEB67u16.to_le_bytes());
4356                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4357                // ORR R4, R4, #1 (set bit 0 of quotient low)
4358                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4359                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4360
4361                // === Decrement counter and loop ===
4362                // SUBS.W R12, R12, #1 (decrement loop counter)
4363                // SUBS.W R12, R12, #1: F1BC 0C01
4364                bytes.extend_from_slice(&0xF1BCu16.to_le_bytes());
4365                bytes.extend_from_slice(&0x0C01u16.to_le_bytes());
4366
4367                // BNE back to loop_start
4368                let branch_offset_bytes = bytes.len() - loop_start + 4; // +4 for pipeline
4369                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4370                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4371                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4372
4373                // === Loop done, move quotient to R0:R1 ===
4374                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4375                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4376
4377                // POP {R4-R7} - restore scratch registers (NO PC — inline code continues)
4378                // 16-bit POP: 1011 110 P rrrrrrrr where P=0 (no PC), r=R4-R7 = 0xF0
4379                // Encoding: 1011 1100 1111 0000 = 0xBCF0
4380                bytes.extend_from_slice(&0xBCF0u16.to_le_bytes());
4381
4382                Ok(bytes)
4383            }
4384
4385            // I64DivS: 64-bit signed division
4386            // Converts to unsigned, divides, then applies sign
4387            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4388            // Output: R0:R1 = quotient (signed)
4389            ArmOp::I64DivS {
4390                rdlo: _,
4391                rdhi: _,
4392                rnlo: _,
4393                rnhi: _,
4394                rmlo: _,
4395                rmhi: _,
4396            } => {
4397                let mut bytes = Vec::new();
4398
4399                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4400                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4401                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4402
4403                // Save result sign in R9: R9 = R1 XOR R3 (sign bit = MSB)
4404                // EOR.W R9, R1, R3
4405                bytes.extend_from_slice(&0xEA81u16.to_le_bytes());
4406                bytes.extend_from_slice(&0x0903u16.to_le_bytes());
4407
4408                // If dividend negative (R1 MSB set), negate it
4409                // TST R1, R1 (check sign)
4410                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4411                // BPL skip_neg_dividend (+10 bytes = 5 halfwords)
4412                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4413
4414                // Negate R0:R1 (64-bit): RSBS R0, R0, #0; SBC R1, R1, R1 LSL #1
4415                // Actually: MVN R0, R0; MVN R1, R1; ADDS R0, R0, #1; ADC R1, R1, #0
4416                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4417                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4418                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4419                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4420                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4421
4422                // If divisor negative (R3 MSB set), negate it
4423                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4424                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4425
4426                // Negate R2:R3
4427                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4428                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4429                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4430                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4431                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4432
4433                // === Now do unsigned division (same as I64DivU) ===
4434                // Initialize quotient (R4:R5) = 0
4435                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4436                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4437                // Initialize remainder (R6:R7) = 0
4438                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4439                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4440                // Initialize loop counter R8 = 64
4441                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4442                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4443
4444                let loop_start = bytes.len();
4445
4446                // Shift quotient left
4447                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4448                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4449                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4450                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4451
4452                // Shift remainder left, OR in MSB of dividend
4453                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4454                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4455                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4456                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4457                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4458                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4459
4460                // Shift dividend left
4461                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4462                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4463                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4464                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4465
4466                // Compare and conditionally subtract
4467                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4468                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4469                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4470                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4471                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4472
4473                // Subtract and set quotient bit
4474                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4475                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4476                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4477                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4478                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4479
4480                // Decrement and loop
4481                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4482                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4483
4484                let branch_offset_bytes = bytes.len() - loop_start + 4;
4485                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4486                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4487                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4488
4489                // Move quotient to R0:R1
4490                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4491                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4492
4493                // If result should be negative (R9 MSB set), negate R0:R1
4494                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9 (check MSB)
4495                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4496                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8 (skip negation)
4497
4498                // Negate result R0:R1
4499                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4500                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4501                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4502                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4503                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4504
4505                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4506                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4507                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4508
4509                Ok(bytes)
4510            }
4511
4512            // I64RemU: 64-bit unsigned remainder using binary long division
4513            // Same algorithm as I64DivU but returns remainder instead of quotient
4514            // Input: R0:R1 = dividend, R2:R3 = divisor
4515            // Output: R0:R1 = remainder
4516            ArmOp::I64RemU {
4517                rdlo: _,
4518                rdhi: _,
4519                rnlo: _,
4520                rnhi: _,
4521                rmlo: _,
4522                rmhi: _,
4523            } => {
4524                let mut bytes = Vec::new();
4525
4526                // PUSH {R4-R8} - save scratch registers (NO LR — inline code)
4527                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4528                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4529
4530                // Initialize quotient (R4:R5) = 0 (computed but not returned)
4531                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4532                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4533                // Initialize remainder (R6:R7) = 0
4534                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4535                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4536                // Initialize loop counter R8 = 64
4537                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4538                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4539
4540                let loop_start = bytes.len();
4541
4542                // Shift quotient left (not needed for result, but keeps algorithm same)
4543                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4544                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4545                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4546                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4547
4548                // Shift remainder left, OR in MSB of dividend
4549                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4550                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4551                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4552                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4553                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4554                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4555
4556                // Shift dividend left
4557                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4558                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4559                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4560                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4561
4562                // Compare and conditionally subtract
4563                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4564                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4565                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4566                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4567                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4568
4569                // Subtract and set quotient bit
4570                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4571                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4572                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4573                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4574                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4575
4576                // Decrement and loop
4577                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4578                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4579
4580                let branch_offset_bytes = bytes.len() - loop_start + 4;
4581                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4582                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4583                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4584
4585                // Move REMAINDER to R0:R1 (difference from I64DivU)
4586                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4587                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4588
4589                // POP {R4-R8} - restore scratch registers (NO PC — inline code continues)
4590                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4591                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4592
4593                Ok(bytes)
4594            }
4595
4596            // I64RemS: 64-bit signed remainder
4597            // Remainder sign follows dividend sign (not quotient rule)
4598            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4599            // Output: R0:R1 = remainder (signed, same sign as dividend)
4600            ArmOp::I64RemS {
4601                rdlo: _,
4602                rdhi: _,
4603                rnlo: _,
4604                rnhi: _,
4605                rmlo: _,
4606                rmhi: _,
4607            } => {
4608                let mut bytes = Vec::new();
4609
4610                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4611                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4612                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4613
4614                // Save dividend sign in R9 (remainder sign = dividend sign)
4615                // MOV R9, R1 (just need the sign bit)
4616                bytes.extend_from_slice(&0x4689u16.to_le_bytes()); // MOV R9, R1
4617
4618                // If dividend negative (R1 MSB set), negate it
4619                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4620                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4621
4622                // Negate R0:R1
4623                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4624                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4625                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4626                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4627                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4628
4629                // If divisor negative (R3 MSB set), negate it
4630                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4631                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4632
4633                // Negate R2:R3
4634                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4635                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4636                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4637                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4638                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4639
4640                // === Unsigned division algorithm ===
4641                // Initialize quotient (R4:R5) = 0
4642                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4643                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4644                // Initialize remainder (R6:R7) = 0
4645                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4646                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4647                // Initialize loop counter R8 = 64
4648                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4649                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4650
4651                let loop_start = bytes.len();
4652
4653                // Shift quotient left
4654                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4655                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4656                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4657                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4658
4659                // Shift remainder left, OR in MSB of dividend
4660                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4661                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4662                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4663                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4664                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4665                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4666
4667                // Shift dividend left
4668                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4669                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4670                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4671                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4672
4673                // Compare and conditionally subtract
4674                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4675                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4676                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4677                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4678                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4679
4680                // Subtract and set quotient bit
4681                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4682                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4683                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4684                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4685                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4686
4687                // Decrement and loop
4688                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4689                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4690
4691                let branch_offset_bytes = bytes.len() - loop_start + 4;
4692                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4693                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4694                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4695
4696                // Move remainder to R0:R1
4697                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4698                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4699
4700                // If original dividend was negative (R9 MSB set), negate remainder
4701                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9
4702                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4703                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4704
4705                // Negate result R0:R1
4706                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4707                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4708                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4709                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4710                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4711
4712                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4713                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4714                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4715
4716                Ok(bytes)
4717            }
4718
4719            // === F32 VFP single-precision Thumb-2 encodings ===
4720            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4721            ArmOp::F32Add { sd, sn, sm } => {
4722                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A00, sd, sn, sm)?))
4723            }
4724            ArmOp::F32Sub { sd, sn, sm } => {
4725                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A40, sd, sn, sm)?))
4726            }
4727            ArmOp::F32Mul { sd, sn, sm } => {
4728                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE200A00, sd, sn, sm)?))
4729            }
4730            ArmOp::F32Div { sd, sn, sm } => {
4731                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE800A00, sd, sn, sm)?))
4732            }
4733            ArmOp::F32Abs { sd, sm } => {
4734                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB00AC0, sd, sm)?))
4735            }
4736            ArmOp::F32Neg { sd, sm } => {
4737                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10A40, sd, sm)?))
4738            }
4739            ArmOp::F32Sqrt { sd, sm } => {
4740                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10AC0, sd, sm)?))
4741            }
4742
4743            // f32 pseudo-ops — multi-instruction sequences
4744            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4745            ArmOp::F32Ceil { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b01),
4746            ArmOp::F32Floor { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b10),
4747            ArmOp::F32Trunc { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b11),
4748            ArmOp::F32Nearest { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b00),
4749            ArmOp::F32Min { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, true),
4750            ArmOp::F32Max { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, false),
4751            ArmOp::F32Copysign { sd, sn, sm } => self.encode_thumb_f32_copysign(sd, sn, sm),
4752
4753            // f32 comparisons — VCMP + VMRS + MOV #0 + IT + MOV #1
4754            ArmOp::F32Eq { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x0),
4755            ArmOp::F32Ne { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x1),
4756            ArmOp::F32Lt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x4),
4757            ArmOp::F32Le { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x9),
4758            ArmOp::F32Gt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xC),
4759            ArmOp::F32Ge { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xA),
4760
4761            ArmOp::F32Const { sd, value } => self.encode_thumb_f32_const(sd, *value),
4762
4763            ArmOp::F32Load { sd, addr } => {
4764                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED900A00, sd, addr)?))
4765            }
4766            ArmOp::F32Store { sd, addr } => {
4767                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED800A00, sd, addr)?))
4768            }
4769
4770            ArmOp::F32ConvertI32S { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, true),
4771            ArmOp::F32ConvertI32U { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, false),
4772            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
4773                Err(synth_core::Error::synthesis(
4774                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4775                ))
4776            }
4777            ArmOp::F32ReinterpretI32 { sd, rm } => {
4778                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(true, sd, rm)?))
4779            }
4780            ArmOp::I32ReinterpretF32 { rd, sm } => {
4781                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(false, sm, rd)?))
4782            }
4783            ArmOp::I32TruncF32S { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, true),
4784            ArmOp::I32TruncF32U { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, false),
4785
4786            // === F64 VFP double-precision Thumb-2 encodings ===
4787            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4788            ArmOp::F64Add { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4789                0xEE300B00, dd, dn, dm,
4790            )?)),
4791            ArmOp::F64Sub { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4792                0xEE300B40, dd, dn, dm,
4793            )?)),
4794            ArmOp::F64Mul { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4795                0xEE200B00, dd, dn, dm,
4796            )?)),
4797            ArmOp::F64Div { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4798                0xEE800B00, dd, dn, dm,
4799            )?)),
4800            ArmOp::F64Abs { dd, dm } => {
4801                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?))
4802            }
4803            ArmOp::F64Neg { dd, dm } => {
4804                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?))
4805            }
4806            ArmOp::F64Sqrt { dd, dm } => {
4807                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?))
4808            }
4809
4810            // f64 pseudo-ops
4811            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4812            ArmOp::F64Ceil { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b01),
4813            ArmOp::F64Floor { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b10),
4814            ArmOp::F64Trunc { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b11),
4815            ArmOp::F64Nearest { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b00),
4816            ArmOp::F64Min { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, true),
4817            ArmOp::F64Max { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, false),
4818            ArmOp::F64Copysign { dd, dn, dm } => self.encode_thumb_f64_copysign(dd, dn, dm),
4819
4820            // f64 comparisons
4821            ArmOp::F64Eq { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x0),
4822            ArmOp::F64Ne { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x1),
4823            ArmOp::F64Lt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x4),
4824            ArmOp::F64Le { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x9),
4825            ArmOp::F64Gt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xC),
4826            ArmOp::F64Ge { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xA),
4827
4828            ArmOp::F64Const { dd, value } => self.encode_thumb_f64_const(dd, *value),
4829
4830            ArmOp::F64Load { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4831                0xED900B00, dd, addr,
4832            )?)),
4833            ArmOp::F64Store { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4834                0xED800B00, dd, addr,
4835            )?)),
4836
4837            ArmOp::F64ConvertI32S { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, true),
4838            ArmOp::F64ConvertI32U { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, false),
4839            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
4840                Err(synth_core::Error::synthesis(
4841                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4842                ))
4843            }
4844            ArmOp::F64PromoteF32 { dd, sm } => self.encode_thumb_f64_promote_f32(dd, sm),
4845            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => Ok(vfp_to_thumb_bytes(
4846                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?,
4847            )),
4848            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => Ok(vfp_to_thumb_bytes(
4849                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?,
4850            )),
4851            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
4852                Err(synth_core::Error::synthesis(
4853                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
4854                ))
4855            }
4856            ArmOp::I32TruncF64S { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, true),
4857            ArmOp::I32TruncF64U { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, false),
4858
4859            // ===== i64 operations: encode as multi-instruction Thumb-2 sequences =====
4860
4861            // I64Add: ADDS rdlo, rnlo, rmlo; ADC.W rdhi, rnhi, rmhi
4862            ArmOp::I64Add {
4863                rdlo,
4864                rdhi,
4865                rnlo,
4866                rnhi,
4867                rmlo,
4868                rmhi,
4869            } => {
4870                let mut bytes = Vec::new();
4871                // ADDS rdlo, rnlo, rmlo (16-bit)
4872                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adds {
4873                    rd: *rdlo,
4874                    rn: *rnlo,
4875                    op2: Operand2::Reg(*rmlo),
4876                })?);
4877                // ADC.W rdhi, rnhi, rmhi (32-bit)
4878                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adc {
4879                    rd: *rdhi,
4880                    rn: *rnhi,
4881                    op2: Operand2::Reg(*rmhi),
4882                })?);
4883                Ok(bytes)
4884            }
4885
4886            // I64Sub: SUBS rdlo, rnlo, rmlo; SBC.W rdhi, rnhi, rmhi
4887            ArmOp::I64Sub {
4888                rdlo,
4889                rdhi,
4890                rnlo,
4891                rnhi,
4892                rmlo,
4893                rmhi,
4894            } => {
4895                let mut bytes = Vec::new();
4896                // SUBS rdlo, rnlo, rmlo (16-bit)
4897                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Subs {
4898                    rd: *rdlo,
4899                    rn: *rnlo,
4900                    op2: Operand2::Reg(*rmlo),
4901                })?);
4902                // SBC.W rdhi, rnhi, rmhi (32-bit)
4903                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Sbc {
4904                    rd: *rdhi,
4905                    rn: *rnhi,
4906                    op2: Operand2::Reg(*rmhi),
4907                })?);
4908                Ok(bytes)
4909            }
4910
4911            // I64And: AND rdlo, rnlo, rmlo; AND rdhi, rnhi, rmhi
4912            ArmOp::I64And {
4913                rdlo,
4914                rdhi,
4915                rnlo,
4916                rnhi,
4917                rmlo,
4918                rmhi,
4919            } => {
4920                let mut bytes = Vec::new();
4921                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
4922                    rd: *rdlo,
4923                    rn: *rnlo,
4924                    op2: Operand2::Reg(*rmlo),
4925                })?);
4926                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
4927                    rd: *rdhi,
4928                    rn: *rnhi,
4929                    op2: Operand2::Reg(*rmhi),
4930                })?);
4931                Ok(bytes)
4932            }
4933
4934            // I64Or: ORR rdlo, rnlo, rmlo; ORR rdhi, rnhi, rmhi
4935            ArmOp::I64Or {
4936                rdlo,
4937                rdhi,
4938                rnlo,
4939                rnhi,
4940                rmlo,
4941                rmhi,
4942            } => {
4943                let mut bytes = Vec::new();
4944                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
4945                    rd: *rdlo,
4946                    rn: *rnlo,
4947                    op2: Operand2::Reg(*rmlo),
4948                })?);
4949                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
4950                    rd: *rdhi,
4951                    rn: *rnhi,
4952                    op2: Operand2::Reg(*rmhi),
4953                })?);
4954                Ok(bytes)
4955            }
4956
4957            // I64Xor: EOR rdlo, rnlo, rmlo; EOR rdhi, rnhi, rmhi
4958            ArmOp::I64Xor {
4959                rdlo,
4960                rdhi,
4961                rnlo,
4962                rnhi,
4963                rmlo,
4964                rmhi,
4965            } => {
4966                let mut bytes = Vec::new();
4967                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
4968                    rd: *rdlo,
4969                    rn: *rnlo,
4970                    op2: Operand2::Reg(*rmlo),
4971                })?);
4972                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
4973                    rd: *rdhi,
4974                    rn: *rnhi,
4975                    op2: Operand2::Reg(*rmhi),
4976                })?);
4977                Ok(bytes)
4978            }
4979
4980            // I64Eqz: ORR scratch, lo, hi; ITE EQ; MOV rd, #1; MOV rd, #0
4981            ArmOp::I64Eqz { rd, rnlo, rnhi } => self.encode_thumb(&ArmOp::I64SetCondZ {
4982                rd: *rd,
4983                rn_lo: *rnlo,
4984                rn_hi: *rnhi,
4985            }),
4986
4987            // I64 comparisons: delegate to I64SetCond
4988            ArmOp::I64Eq {
4989                rd,
4990                rnlo,
4991                rnhi,
4992                rmlo,
4993                rmhi,
4994            } => self.encode_thumb(&ArmOp::I64SetCond {
4995                rd: *rd,
4996                rn_lo: *rnlo,
4997                rn_hi: *rnhi,
4998                rm_lo: *rmlo,
4999                rm_hi: *rmhi,
5000                cond: synth_synthesis::Condition::EQ,
5001            }),
5002
5003            ArmOp::I64Ne {
5004                rd,
5005                rnlo,
5006                rnhi,
5007                rmlo,
5008                rmhi,
5009            } => self.encode_thumb(&ArmOp::I64SetCond {
5010                rd: *rd,
5011                rn_lo: *rnlo,
5012                rn_hi: *rnhi,
5013                rm_lo: *rmlo,
5014                rm_hi: *rmhi,
5015                cond: synth_synthesis::Condition::NE,
5016            }),
5017
5018            ArmOp::I64LtS {
5019                rd,
5020                rnlo,
5021                rnhi,
5022                rmlo,
5023                rmhi,
5024            } => self.encode_thumb(&ArmOp::I64SetCond {
5025                rd: *rd,
5026                rn_lo: *rnlo,
5027                rn_hi: *rnhi,
5028                rm_lo: *rmlo,
5029                rm_hi: *rmhi,
5030                cond: synth_synthesis::Condition::LT,
5031            }),
5032
5033            ArmOp::I64LtU {
5034                rd,
5035                rnlo,
5036                rnhi,
5037                rmlo,
5038                rmhi,
5039            } => self.encode_thumb(&ArmOp::I64SetCond {
5040                rd: *rd,
5041                rn_lo: *rnlo,
5042                rn_hi: *rnhi,
5043                rm_lo: *rmlo,
5044                rm_hi: *rmhi,
5045                cond: synth_synthesis::Condition::LO,
5046            }),
5047
5048            ArmOp::I64LeS {
5049                rd,
5050                rnlo,
5051                rnhi,
5052                rmlo,
5053                rmhi,
5054            } => self.encode_thumb(&ArmOp::I64SetCond {
5055                rd: *rd,
5056                rn_lo: *rnlo,
5057                rn_hi: *rnhi,
5058                rm_lo: *rmlo,
5059                rm_hi: *rmhi,
5060                cond: synth_synthesis::Condition::LE,
5061            }),
5062
5063            ArmOp::I64LeU {
5064                rd,
5065                rnlo,
5066                rnhi,
5067                rmlo,
5068                rmhi,
5069            } => self.encode_thumb(&ArmOp::I64SetCond {
5070                rd: *rd,
5071                rn_lo: *rnlo,
5072                rn_hi: *rnhi,
5073                rm_lo: *rmlo,
5074                rm_hi: *rmhi,
5075                cond: synth_synthesis::Condition::LS,
5076            }),
5077
5078            ArmOp::I64GtS {
5079                rd,
5080                rnlo,
5081                rnhi,
5082                rmlo,
5083                rmhi,
5084            } => self.encode_thumb(&ArmOp::I64SetCond {
5085                rd: *rd,
5086                rn_lo: *rnlo,
5087                rn_hi: *rnhi,
5088                rm_lo: *rmlo,
5089                rm_hi: *rmhi,
5090                cond: synth_synthesis::Condition::GT,
5091            }),
5092
5093            ArmOp::I64GtU {
5094                rd,
5095                rnlo,
5096                rnhi,
5097                rmlo,
5098                rmhi,
5099            } => self.encode_thumb(&ArmOp::I64SetCond {
5100                rd: *rd,
5101                rn_lo: *rnlo,
5102                rn_hi: *rnhi,
5103                rm_lo: *rmlo,
5104                rm_hi: *rmhi,
5105                cond: synth_synthesis::Condition::HI,
5106            }),
5107
5108            ArmOp::I64GeS {
5109                rd,
5110                rnlo,
5111                rnhi,
5112                rmlo,
5113                rmhi,
5114            } => self.encode_thumb(&ArmOp::I64SetCond {
5115                rd: *rd,
5116                rn_lo: *rnlo,
5117                rn_hi: *rnhi,
5118                rm_lo: *rmlo,
5119                rm_hi: *rmhi,
5120                cond: synth_synthesis::Condition::GE,
5121            }),
5122
5123            ArmOp::I64GeU {
5124                rd,
5125                rnlo,
5126                rnhi,
5127                rmlo,
5128                rmhi,
5129            } => self.encode_thumb(&ArmOp::I64SetCond {
5130                rd: *rd,
5131                rn_lo: *rnlo,
5132                rn_hi: *rnhi,
5133                rm_lo: *rmlo,
5134                rm_hi: *rmhi,
5135                cond: synth_synthesis::Condition::HS,
5136            }),
5137
5138            // I64Const: MOVW rdlo, lo16; MOVT rdlo, hi16; MOVW rdhi, lo16_hi; MOVT rdhi, hi16_hi
5139            ArmOp::I64Const { rdlo, rdhi, value } => {
5140                let lo32 = *value as u32;
5141                let hi32 = (*value >> 32) as u32;
5142                let mut bytes = Vec::new();
5143                // Load low 32 bits into rdlo
5144                bytes.extend_from_slice(
5145                    &self.encode_thumb32_movw_raw(reg_to_bits(rdlo), lo32 & 0xFFFF)?,
5146                );
5147                if lo32 > 0xFFFF {
5148                    bytes.extend_from_slice(
5149                        &self.encode_thumb32_movt_raw(reg_to_bits(rdlo), lo32 >> 16)?,
5150                    );
5151                }
5152                // Load high 32 bits into rdhi
5153                bytes.extend_from_slice(
5154                    &self.encode_thumb32_movw_raw(reg_to_bits(rdhi), hi32 & 0xFFFF)?,
5155                );
5156                if hi32 > 0xFFFF {
5157                    bytes.extend_from_slice(
5158                        &self.encode_thumb32_movt_raw(reg_to_bits(rdhi), hi32 >> 16)?,
5159                    );
5160                }
5161                Ok(bytes)
5162            }
5163
5164            // I64Ldr: LDR rdlo, [base, offset]; LDR rdhi, [base, offset+4]
5165            ArmOp::I64Ldr { rdlo, rdhi, addr } => {
5166                let mut bytes = Vec::new();
5167                let offset = if addr.offset < 0 {
5168                    0u32
5169                } else {
5170                    addr.offset as u32
5171                };
5172                bytes.extend_from_slice(&self.encode_thumb32_ldr(rdlo, &addr.base, offset)?);
5173                bytes.extend_from_slice(&self.encode_thumb32_ldr(
5174                    rdhi,
5175                    &addr.base,
5176                    offset.wrapping_add(4),
5177                )?);
5178                Ok(bytes)
5179            }
5180
5181            // I64Str: STR rdlo, [base, offset]; STR rdhi, [base, offset+4]
5182            ArmOp::I64Str { rdlo, rdhi, addr } => {
5183                let mut bytes = Vec::new();
5184                let offset = if addr.offset < 0 {
5185                    0u32
5186                } else {
5187                    addr.offset as u32
5188                };
5189                bytes.extend_from_slice(&self.encode_thumb32_str(rdlo, &addr.base, offset)?);
5190                bytes.extend_from_slice(&self.encode_thumb32_str(
5191                    rdhi,
5192                    &addr.base,
5193                    offset.wrapping_add(4),
5194                )?);
5195                Ok(bytes)
5196            }
5197
5198            // I64ExtendI32S: MOV rdlo, rn; ASR rdhi, rdlo, #31 (sign-extend)
5199            ArmOp::I64ExtendI32S { rdlo, rdhi, rn } => {
5200                let mut bytes = Vec::new();
5201                if rdlo != rn {
5202                    // MOV rdlo, rn (16-bit)
5203                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5204                        rd: *rdlo,
5205                        op2: Operand2::Reg(*rn),
5206                    })?);
5207                }
5208                // ASR rdhi, rdlo, #31 (sign-extend: fill high word with sign bit)
5209                bytes.extend_from_slice(
5210                    &self.encode_thumb32_shift(rdhi, rdlo, 31, 0b10)?, // ASR type
5211                );
5212                Ok(bytes)
5213            }
5214
5215            // I64ExtendI32U: MOV rdlo, rn; MOV rdhi, #0
5216            ArmOp::I64ExtendI32U { rdlo, rdhi, rn } => {
5217                let mut bytes = Vec::new();
5218                if rdlo != rn {
5219                    // MOV rdlo, rn
5220                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5221                        rd: *rdlo,
5222                        op2: Operand2::Reg(*rn),
5223                    })?);
5224                }
5225                // MOV rdhi, #0 (16-bit: MOVS Rd, #0)
5226                let rdhi_bits = reg_to_bits(rdhi) as u16;
5227                let instr: u16 = 0x2000 | (rdhi_bits << 8);
5228                bytes.extend_from_slice(&instr.to_le_bytes());
5229                Ok(bytes)
5230            }
5231
5232            // I32WrapI64: MOV rd, rnlo (just take low 32 bits)
5233            ArmOp::I32WrapI64 { rd, rnlo } => {
5234                if rd == rnlo {
5235                    // No-op: already in the right register
5236                    let instr: u16 = 0xBF00; // NOP
5237                    Ok(instr.to_le_bytes().to_vec())
5238                } else {
5239                    // MOV rd, rnlo
5240                    self.encode_thumb(&ArmOp::Mov {
5241                        rd: *rd,
5242                        op2: Operand2::Reg(*rnlo),
5243                    })
5244                }
5245            }
5246
5247            // ===== Helium MVE operations (Thumb-2 encoding) =====
5248            ArmOp::MveLoad { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vldrw(qd, addr))),
5249            ArmOp::MveStore { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vstrw(qd, addr))),
5250            ArmOp::MveConst { qd, bytes } => self.encode_thumb_mve_const(qd, bytes),
5251            ArmOp::MveAnd { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5252                0xEF000150, qd, qn, qm,
5253            ))),
5254            ArmOp::MveOrr { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5255                0xEF200150, qd, qn, qm,
5256            ))),
5257            ArmOp::MveEor { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5258                0xFF000150, qd, qn, qm,
5259            ))),
5260            ArmOp::MveMvn { qd, qm } => {
5261                // VMVN Qd, Qm: 0xFFB005C0 | Qd<<12 | Qm
5262                let qd_enc = qreg_to_num(qd);
5263                let qm_enc = qreg_to_num(qm);
5264                let instr: u32 = 0xFFB005C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5265                Ok(vfp_to_thumb_bytes(instr))
5266            }
5267            ArmOp::MveBic { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5268                0xEF100150, qd, qn, qm,
5269            ))),
5270            ArmOp::MveAddI { qd, qn, qm, size } => {
5271                let sz = mve_size_bits(size);
5272                let base: u32 = 0xEF000840 | (sz << 20);
5273                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5274            }
5275            ArmOp::MveSubI { qd, qn, qm, size } => {
5276                let sz = mve_size_bits(size);
5277                let base: u32 = 0xFF000840 | (sz << 20);
5278                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5279            }
5280            ArmOp::MveMulI { qd, qn, qm, size } => {
5281                let sz = mve_size_bits(size);
5282                let base: u32 = 0xEF000950 | (sz << 20);
5283                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5284            }
5285            ArmOp::MveNegI { qd, qm, size } => {
5286                let sz = mve_size_bits(size);
5287                // VNEG.Sx Qd, Qm
5288                let qd_enc = qreg_to_num(qd);
5289                let qm_enc = qreg_to_num(qm);
5290                let base: u32 = 0xFFB103C0 | (sz << 18);
5291                let instr = base | ((qd_enc * 2) << 12) | (qm_enc * 2);
5292                Ok(vfp_to_thumb_bytes(instr))
5293            }
5294            ArmOp::MveDup { qd, rn, size } => {
5295                let sz = mve_size_bits(size);
5296                let qd_enc = qreg_to_num(qd);
5297                let rn_bits = reg_to_bits(rn);
5298                // VDUP.sz Qd, Rn: EEA0 0B10 variant
5299                // size encoding: 00=32, 01=16, 10=8
5300                let be = match sz {
5301                    0 => 0b00u32, // 8-bit
5302                    1 => 0b01,    // 16-bit
5303                    _ => 0b00,    // 32-bit (default)
5304                };
5305                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12) | (be << 5);
5306                Ok(vfp_to_thumb_bytes(instr))
5307            }
5308            ArmOp::MveExtractLane { rd, qn, lane, size } => {
5309                let qn_enc = qreg_to_num(qn);
5310                let rd_bits = reg_to_bits(rd);
5311                // VMOV.sz Rd, Dn[x] — extract from Q-register lane
5312                // For 32-bit: VMOV Rd, Dn — where Dn is the appropriate D-register
5313                let d_reg = qn_enc * 2 + ((*lane as u32) >> 1);
5314                let lane_in_d = (*lane as u32) & 1;
5315                let _sz = mve_size_bits(size);
5316                // VMOV Rd, Dn[x]: EE10 0B10 for 32-bit
5317                let instr: u32 = 0xEE100B10 | (d_reg << 16) | (rd_bits << 12) | (lane_in_d << 21);
5318                Ok(vfp_to_thumb_bytes(instr))
5319            }
5320            ArmOp::MveInsertLane { qd, rn, lane, size } => {
5321                let qd_enc = qreg_to_num(qd);
5322                let rn_bits = reg_to_bits(rn);
5323                let d_reg = qd_enc * 2 + ((*lane as u32) >> 1);
5324                let lane_in_d = (*lane as u32) & 1;
5325                let _sz = mve_size_bits(size);
5326                // VMOV Dn[x], Rn: EE00 0B10 for 32-bit
5327                let instr: u32 = 0xEE000B10 | (d_reg << 16) | (rn_bits << 12) | (lane_in_d << 21);
5328                Ok(vfp_to_thumb_bytes(instr))
5329            }
5330
5331            // MVE float comparisons — emit VCMP + VPSEL sequence (simplified: just VCMP)
5332            ArmOp::MveCmpEqI { qd, qn, qm, size }
5333            | ArmOp::MveCmpNeI { qd, qn, qm, size }
5334            | ArmOp::MveCmpLtS { qd, qn, qm, size }
5335            | ArmOp::MveCmpLtU { qd, qn, qm, size }
5336            | ArmOp::MveCmpGtS { qd, qn, qm, size }
5337            | ArmOp::MveCmpGtU { qd, qn, qm, size }
5338            | ArmOp::MveCmpLeS { qd, qn, qm, size }
5339            | ArmOp::MveCmpLeU { qd, qn, qm, size }
5340            | ArmOp::MveCmpGeS { qd, qn, qm, size }
5341            | ArmOp::MveCmpGeU { qd, qn, qm, size } => {
5342                // Encode as VADD (placeholder encoding — real implementation
5343                // would use VCMP + VPSEL pair)
5344                let sz = mve_size_bits(size);
5345                let base: u32 = 0xEF000840 | (sz << 20);
5346                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5347            }
5348
5349            // f32x4 MVE arithmetic
5350            ArmOp::MveAddF32 { qd, qn, qm } => {
5351                // VADD.F32 Qd, Qn, Qm (MVE): 0xEF000D40
5352                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5353            }
5354            ArmOp::MveSubF32 { qd, qn, qm } => {
5355                // VSUB.F32 Qd, Qn, Qm (MVE): 0xEF200D40
5356                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF200D40, qd, qn, qm)))
5357            }
5358            ArmOp::MveMulF32 { qd, qn, qm } => {
5359                // VMUL.F32 Qd, Qn, Qm (MVE): 0xFF000D50
5360                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xFF000D50, qd, qn, qm)))
5361            }
5362            ArmOp::MveNegF32 { qd, qm } => {
5363                let qd_enc = qreg_to_num(qd);
5364                let qm_enc = qreg_to_num(qm);
5365                // VNEG.F32 Qd, Qm: FFB907C0
5366                let instr: u32 = 0xFFB907C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5367                Ok(vfp_to_thumb_bytes(instr))
5368            }
5369            ArmOp::MveAbsF32 { qd, qm } => {
5370                let qd_enc = qreg_to_num(qd);
5371                let qm_enc = qreg_to_num(qm);
5372                // VABS.F32 Qd, Qm: FFB90740
5373                let instr: u32 = 0xFFB90740 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5374                Ok(vfp_to_thumb_bytes(instr))
5375            }
5376            ArmOp::MveCmpEqF32 { qd, qn, qm }
5377            | ArmOp::MveCmpNeF32 { qd, qn, qm }
5378            | ArmOp::MveCmpLtF32 { qd, qn, qm }
5379            | ArmOp::MveCmpLeF32 { qd, qn, qm }
5380            | ArmOp::MveCmpGtF32 { qd, qn, qm }
5381            | ArmOp::MveCmpGeF32 { qd, qn, qm } => {
5382                // Placeholder: encode as VADD.F32 (real impl needs VCMP.F32 + VPSEL)
5383                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5384            }
5385            ArmOp::MveDupF32 { qd, rn } => {
5386                let qd_enc = qreg_to_num(qd);
5387                let rn_bits = reg_to_bits(rn);
5388                // VDUP.32 Qd, Rn (same encoding as integer VDUP.32)
5389                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12);
5390                Ok(vfp_to_thumb_bytes(instr))
5391            }
5392            ArmOp::MveExtractLaneF32 { rd, qn, lane } => {
5393                let qn_enc = qreg_to_num(qn);
5394                let rd_bits = reg_to_bits(rd);
5395                // VMOV Rd, Sn where Sn = Q*4 + lane
5396                let s_num = qn_enc * 4 + (*lane as u32);
5397                let (vn, n) = encode_sreg(s_num);
5398                let instr: u32 = 0xEE100A10 | (vn << 16) | (rd_bits << 12) | (n << 7);
5399                Ok(vfp_to_thumb_bytes(instr))
5400            }
5401            ArmOp::MveReplaceLaneF32 { qd, rn, lane } => {
5402                let qd_enc = qreg_to_num(qd);
5403                let rn_bits = reg_to_bits(rn);
5404                // VMOV Sn, Rn where Sn = Q*4 + lane
5405                let s_num = qd_enc * 4 + (*lane as u32);
5406                let (vn, n) = encode_sreg(s_num);
5407                let instr: u32 = 0xEE000A10 | (vn << 16) | (rn_bits << 12) | (n << 7);
5408                Ok(vfp_to_thumb_bytes(instr))
5409            }
5410            ArmOp::MveDivF32 { qd, qn, qm } => {
5411                // Lane-wise: extract 4 S-regs, VDIV, insert back
5412                self.encode_thumb_mve_lane_wise_f32_binop(qd, qn, qm, 0xEE800A00)
5413            }
5414            ArmOp::MveSqrtF32 { qd, qm } => {
5415                // Lane-wise: extract 4 S-regs, VSQRT, insert back
5416                self.encode_thumb_mve_lane_wise_f32_sqrt(qd, qm)
5417            }
5418
5419            // Catch-all for any remaining ops
5420            _ => {
5421                let instr: u16 = 0xBF00; // NOP
5422                Ok(instr.to_le_bytes().to_vec())
5423            }
5424        }
5425    }
5426
5427    // === Thumb-2 VFP multi-instruction helpers ===
5428
5429    /// Encode F32 comparison as Thumb-2: VCMP.F32 + VMRS + MOVS rd,#0 + IT + MOV rd,#1
5430    fn encode_thumb_f32_compare(
5431        &self,
5432        rd: &Reg,
5433        sn: &VfpReg,
5434        sm: &VfpReg,
5435        cond_code: u32,
5436    ) -> Result<Vec<u8>> {
5437        let mut bytes = Vec::new();
5438        let rd_bits = reg_to_bits(rd);
5439
5440        // VCMP.F32 Sn, Sm
5441        let sn_num = vfp_sreg_to_num(sn)?;
5442        let sm_num = vfp_sreg_to_num(sm)?;
5443        let (vd, d) = encode_sreg(sn_num);
5444        let (vm, m) = encode_sreg(sm_num);
5445        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5446        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5447
5448        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
5449        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5450
5451        // MOVS Rd, #0 (16-bit): 0010 0 Rd(3) 0000 0000
5452        if rd_bits < 8 {
5453            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5454            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5455        } else {
5456            // MOV.W Rd, #0 (32-bit Thumb-2)
5457            let hw1: u16 = 0xF04F;
5458            let hw2: u16 = (rd_bits as u16) << 8;
5459            bytes.extend_from_slice(&hw1.to_le_bytes());
5460            bytes.extend_from_slice(&hw2.to_le_bytes());
5461        }
5462
5463        // IT<cond> — If-Then for conditional MOV
5464        // IT encoding: 1011 1111 cond(4) mask(4)
5465        // mask = 0x8 for single "then" (IT)
5466        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5467        bytes.extend_from_slice(&it.to_le_bytes());
5468
5469        // MOV Rd, #1 (16-bit, conditional due to IT): 0010 0 Rd(3) 0000 0001
5470        if rd_bits < 8 {
5471            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5472            bytes.extend_from_slice(&mov_one.to_le_bytes());
5473        } else {
5474            // MOV.W Rd, #1 (32-bit)
5475            let hw1: u16 = 0xF04F;
5476            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5477            bytes.extend_from_slice(&hw1.to_le_bytes());
5478            bytes.extend_from_slice(&hw2.to_le_bytes());
5479        }
5480
5481        Ok(bytes)
5482    }
5483
5484    /// Encode F32 constant load as Thumb-2: MOVW + MOVT + VMOV
5485    fn encode_thumb_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
5486        let mut bytes = Vec::new();
5487        let bits = value.to_bits();
5488        let rt: u32 = 12; // R12/IP as temp
5489
5490        // MOVW R12, #lo16
5491        // Thumb-2 MOVW: 11110 i 10 0100 imm4 | 0 imm3 Rd imm8
5492        let lo16 = bits & 0xFFFF;
5493        let imm4 = (lo16 >> 12) & 0xF;
5494        let i_bit = (lo16 >> 11) & 1;
5495        let imm3 = (lo16 >> 8) & 0x7;
5496        let imm8 = lo16 & 0xFF;
5497        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
5498        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5499        bytes.extend_from_slice(&hw1.to_le_bytes());
5500        bytes.extend_from_slice(&hw2.to_le_bytes());
5501
5502        // MOVT R12, #hi16
5503        let hi16 = (bits >> 16) & 0xFFFF;
5504        let imm4 = (hi16 >> 12) & 0xF;
5505        let i_bit = (hi16 >> 11) & 1;
5506        let imm3 = (hi16 >> 8) & 0x7;
5507        let imm8 = hi16 & 0xFF;
5508        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
5509        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5510        bytes.extend_from_slice(&hw1.to_le_bytes());
5511        bytes.extend_from_slice(&hw2.to_le_bytes());
5512
5513        // VMOV Sd, R12
5514        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
5515        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5516
5517        Ok(bytes)
5518    }
5519
5520    /// Encode VMOV + VCVT.F32.xS32 as Thumb-2
5521    fn encode_thumb_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5522        let mut bytes = Vec::new();
5523
5524        // VMOV Sd, Rm
5525        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
5526        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5527
5528        // VCVT.F32.S32/U32 Sd, Sd
5529        let sd_num = vfp_sreg_to_num(sd)?;
5530        let (vd, d) = encode_sreg(sd_num);
5531        let (vm, m) = encode_sreg(sd_num);
5532        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
5533        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5534        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5535
5536        Ok(bytes)
5537    }
5538
5539    /// Encode F32 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5540    /// Encode F32 rounding as Thumb-2.
5541    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5542    ///
5543    /// For trunc: uses VCVTR.S32.F32 (always truncates).
5544    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant),
5545    /// then restores FPSCR.
5546    fn encode_thumb_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5547        let mut bytes = Vec::new();
5548        let sm_num = vfp_sreg_to_num(sm)?;
5549        let sd_num = vfp_sreg_to_num(sd)?;
5550        let (vd_s, d_s) = encode_sreg(sd_num);
5551        let (vm_s, m_s) = encode_sreg(sm_num);
5552
5553        if mode == 0b11 {
5554            // Trunc (toward zero): VCVTR.S32.F32 — bit[7]=1, always truncates
5555            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5556            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5557        } else {
5558            // ceil/floor/nearest: manipulate FPSCR rounding mode
5559            let rt: u32 = 12; // R12/IP as temp
5560
5561            // VMRS R12, FPSCR
5562            let vmrs = 0xEEF10A10 | (rt << 12);
5563            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5564
5565            // BIC.W R12, R12, #(3 << 22) — clear RMode bits [23:22]
5566            // Thumb-2 modified immediate for 3<<22 = 0x00C00000:
5567            // BIC.W encoding: 11110 i 0 0001 S Rn | 0 imm3 Rd imm8
5568            // 0x00C00000 = 0x03 shifted left by 22 => Thumb mod-imm: i=0, imm3=0b101, imm8=0x03
5569            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF); // BIC, Rn=R12
5570            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5571            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5572            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5573
5574            // ORR.W R12, R12, #(mode << 22)
5575            if mode != 0 {
5576                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF); // ORR, Rn=R12
5577                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5578                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5579                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5580            }
5581
5582            // VMSR FPSCR, R12
5583            let vmsr = 0xEEE10A10 | (rt << 12);
5584            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5585
5586            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rmode
5587            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5588            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5589
5590            // Restore FPSCR: clear rmode bits back to nearest (default)
5591            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5592            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5593            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5594            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5595        }
5596
5597        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
5598        let (vd2, d2) = encode_sreg(sd_num);
5599        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
5600        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5601
5602        Ok(bytes)
5603    }
5604
5605    /// Encode F32 min/max as Thumb-2: VMOV + VCMP + VMRS + IT + VMOV
5606    fn encode_thumb_f32_minmax(
5607        &self,
5608        sd: &VfpReg,
5609        sn: &VfpReg,
5610        sm: &VfpReg,
5611        is_min: bool,
5612    ) -> Result<Vec<u8>> {
5613        let mut bytes = Vec::new();
5614        let sn_num = vfp_sreg_to_num(sn)?;
5615        let sm_num = vfp_sreg_to_num(sm)?;
5616        let sd_num = vfp_sreg_to_num(sd)?;
5617
5618        // VMOV.F32 Sd, Sn
5619        let (vd, d) = encode_sreg(sd_num);
5620        let (vn, n) = encode_sreg(sn_num);
5621        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5622        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sn));
5623
5624        // VCMP.F32 Sn, Sm
5625        let (vm, m) = encode_sreg(sm_num);
5626        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5627        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5628
5629        // VMRS APSR_nzcv, FPSCR
5630        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5631
5632        // IT GT (for min) or IT MI (for max)
5633        let cond: u16 = if is_min { 0xC } else { 0x4 };
5634        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5635        bytes.extend_from_slice(&it.to_le_bytes());
5636
5637        // VMOV{cond}.F32 Sd, Sm — conditional VMOV in IT block
5638        let vmov_sm = 0xEEB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5639        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sm));
5640
5641        Ok(bytes)
5642    }
5643
5644    /// Encode F32 copysign as Thumb-2
5645    fn encode_thumb_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5646        let mut bytes = Vec::new();
5647
5648        // VMOV R12, Sm (get sign source bits)
5649        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5650            false,
5651            sm,
5652            &Reg::R12,
5653        )?));
5654
5655        // VMOV R0, Sn (get magnitude source bits)
5656        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5657            false,
5658            sn,
5659            &Reg::R0,
5660        )?));
5661
5662        // AND.W R12, R12, #0x80000000
5663        // Thumb-2 modified immediate: 0x80000000 = constant 0x80 with rotation
5664        // Using T1 encoding: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
5665        // 0x80000000: i=0, imm3=0b001, imm8=0x00 (rotation=4, value=0x80)
5666        // Actually encoding #0x80000000 as modified constant:
5667        // bit pattern 1 followed by 31 zeros: enc = 0b0100_00000000 = 0x0100? No.
5668        // ARM modified immediate: abcdefgh rotated. 0x80000000 = 0x80 ROR 2 = enc 0x0102
5669        // Actually: value = abcdefgh ROR (2*rot). 0x80 = 10000000, ROR 2 gives 0x20000000.
5670        // For 0x80000000: 0x02 ROR 2 = 0x80000000. So imm12 = (1<<8) | 0x02 = 0x102
5671        let hw1: u16 = 0xF000 | 12; // AND.W R12, R12, #modified_const (i=0, Rn=R12)
5672        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02; // imm3=1, Rd=R12, imm8=0x02
5673        bytes.extend_from_slice(&hw1.to_le_bytes());
5674        bytes.extend_from_slice(&hw2.to_le_bytes());
5675
5676        // BIC.W R0, R0, #0x80000000 (R0 = register 0, fields are zero)
5677        let hw1: u16 = 0xF020; // BIC.W R0, R0, #modified_const (i=0, Rn=R0)
5678        let hw2: u16 = (0x1 << 12) | 0x02; // imm3=1, Rd=R0, imm8=0x02
5679        bytes.extend_from_slice(&hw1.to_le_bytes());
5680        bytes.extend_from_slice(&hw2.to_le_bytes());
5681
5682        // ORR.W R0, R0, R12 (R0 = register 0)
5683        let hw1: u16 = 0xEA40; // ORR.W R0, R0, R12 (Rn=R0)
5684        let hw2: u16 = 12; // Rd=R0, Rm=R12
5685        bytes.extend_from_slice(&hw1.to_le_bytes());
5686        bytes.extend_from_slice(&hw2.to_le_bytes());
5687
5688        // VMOV Sd, R0
5689        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5690            true,
5691            sd,
5692            &Reg::R0,
5693        )?));
5694
5695        Ok(bytes)
5696    }
5697
5698    /// Encode F64 comparison as Thumb-2: VCMP.F64 + VMRS + MOV #0 + IT + MOV #1
5699    fn encode_thumb_f64_compare(
5700        &self,
5701        rd: &Reg,
5702        dn: &VfpReg,
5703        dm: &VfpReg,
5704        cond_code: u32,
5705    ) -> Result<Vec<u8>> {
5706        let mut bytes = Vec::new();
5707        let rd_bits = reg_to_bits(rd);
5708
5709        // VCMP.F64 Dn, Dm
5710        let dn_num = vfp_dreg_to_num(dn)?;
5711        let dm_num = vfp_dreg_to_num(dm)?;
5712        let (vd, d) = encode_dreg(dn_num);
5713        let (vm, m) = encode_dreg(dm_num);
5714        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5715        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5716
5717        // VMRS APSR_nzcv, FPSCR
5718        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5719
5720        // MOVS Rd, #0
5721        if rd_bits < 8 {
5722            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5723            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5724        } else {
5725            let hw1: u16 = 0xF04F;
5726            let hw2: u16 = (rd_bits as u16) << 8;
5727            bytes.extend_from_slice(&hw1.to_le_bytes());
5728            bytes.extend_from_slice(&hw2.to_le_bytes());
5729        }
5730
5731        // IT<cond>
5732        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5733        bytes.extend_from_slice(&it.to_le_bytes());
5734
5735        // MOV Rd, #1
5736        if rd_bits < 8 {
5737            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5738            bytes.extend_from_slice(&mov_one.to_le_bytes());
5739        } else {
5740            let hw1: u16 = 0xF04F;
5741            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5742            bytes.extend_from_slice(&hw1.to_le_bytes());
5743            bytes.extend_from_slice(&hw2.to_le_bytes());
5744        }
5745
5746        Ok(bytes)
5747    }
5748
5749    /// Encode F64 constant load as Thumb-2: MOVW+MOVT (lo32 into R0) + MOVW+MOVT (hi32 into R12) + VMOV Dd, R0, R12
5750    fn encode_thumb_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
5751        let mut bytes = Vec::new();
5752        let bits = value.to_bits();
5753        let lo32 = bits as u32;
5754        let hi32 = (bits >> 32) as u32;
5755
5756        // MOVW R0, #lo16(lo32)
5757        let lo16 = lo32 & 0xFFFF;
5758        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(0, lo16)?);
5759
5760        // MOVT R0, #hi16(lo32)
5761        let hi16 = (lo32 >> 16) & 0xFFFF;
5762        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(0, hi16)?);
5763
5764        // MOVW R12, #lo16(hi32)
5765        let lo16 = hi32 & 0xFFFF;
5766        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
5767
5768        // MOVT R12, #hi16(hi32)
5769        let hi16 = (hi32 >> 16) & 0xFFFF;
5770        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
5771
5772        // VMOV Dd, R0, R12
5773        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
5774        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5775
5776        Ok(bytes)
5777    }
5778
5779    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as Thumb-2
5780    fn encode_thumb_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5781        let mut bytes = Vec::new();
5782
5783        // VMOV S0, Rm
5784        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
5785        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5786
5787        // VCVT.F64.S32 Dd, S0 or VCVT.F64.U32 Dd, S0
5788        let dd_num = vfp_dreg_to_num(dd)?;
5789        let (vd, d) = encode_dreg(dd_num);
5790        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
5791        let vcvt = base | (d << 22) | (vd << 12);
5792        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5793
5794        Ok(bytes)
5795    }
5796
5797    /// Encode VCVT.F64.F32 Dd, Sm as Thumb-2
5798    fn encode_thumb_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5799        let dd_num = vfp_dreg_to_num(dd)?;
5800        let sm_num = vfp_sreg_to_num(sm)?;
5801        let (vd, d) = encode_dreg(dd_num);
5802        let (vm, m) = encode_sreg(sm_num);
5803
5804        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
5805        Ok(vfp_to_thumb_bytes(vcvt))
5806    }
5807
5808    /// Encode VCVT.S32/U32.F64 S0, Dm + VMOV Rd, S0 as Thumb-2
5809    fn encode_thumb_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5810        let mut bytes = Vec::new();
5811        let dm_num = vfp_dreg_to_num(dm)?;
5812        let (vm, m) = encode_dreg(dm_num);
5813
5814        // VCVT.S32.F64 S0, Dm or VCVT.U32.F64 S0, Dm
5815        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
5816        let vcvt = base | (m << 5) | vm;
5817        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5818
5819        // VMOV Rd, S0
5820        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
5821        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5822
5823        Ok(bytes)
5824    }
5825
5826    /// Encode F64 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5827    /// Encode F64 rounding as Thumb-2.
5828    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5829    fn encode_thumb_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5830        let mut bytes = Vec::new();
5831        let dm_num = vfp_dreg_to_num(dm)?;
5832        let dd_num = vfp_dreg_to_num(dd)?;
5833        let (vm, m) = encode_dreg(dm_num);
5834        let (vd, d) = encode_dreg(dd_num);
5835
5836        if mode == 0b11 {
5837            // Trunc: VCVTR.S32.F64 — bit[7]=1, always truncates
5838            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
5839            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5840        } else {
5841            let rt: u32 = 12;
5842
5843            // VMRS R12, FPSCR
5844            let vmrs = 0xEEF10A10 | (rt << 12);
5845            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5846
5847            // BIC.W R12, R12, #(3 << 22)
5848            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF);
5849            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5850            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5851            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5852
5853            // ORR.W R12, R12, #(mode << 22)
5854            if mode != 0 {
5855                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF);
5856                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5857                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5858                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5859            }
5860
5861            // VMSR FPSCR, R12
5862            let vmsr = 0xEEE10A10 | (rt << 12);
5863            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5864
5865            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0)
5866            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
5867            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5868
5869            // Restore FPSCR
5870            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5871            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5872            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5873            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5874        }
5875
5876        // VCVT.F64.S32 Dd, S0
5877        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
5878        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5879
5880        Ok(bytes)
5881    }
5882
5883    /// Encode F64 min/max as Thumb-2
5884    fn encode_thumb_f64_minmax(
5885        &self,
5886        dd: &VfpReg,
5887        dn: &VfpReg,
5888        dm: &VfpReg,
5889        is_min: bool,
5890    ) -> Result<Vec<u8>> {
5891        let mut bytes = Vec::new();
5892        let dn_num = vfp_dreg_to_num(dn)?;
5893        let dm_num = vfp_dreg_to_num(dm)?;
5894        let dd_num = vfp_dreg_to_num(dd)?;
5895
5896        // VMOV.F64 Dd, Dn
5897        let (vd, d) = encode_dreg(dd_num);
5898        let (vn, n) = encode_dreg(dn_num);
5899        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5900        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dn));
5901
5902        // VCMP.F64 Dn, Dm
5903        let (vm, m) = encode_dreg(dm_num);
5904        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5905        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5906
5907        // VMRS APSR_nzcv, FPSCR
5908        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5909
5910        // IT GT (for min) or IT MI (for max)
5911        let cond: u16 = if is_min { 0xC } else { 0x4 };
5912        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5913        bytes.extend_from_slice(&it.to_le_bytes());
5914
5915        // VMOV{cond}.F64 Dd, Dm
5916        let vmov_dm = 0xEEB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5917        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dm));
5918
5919        Ok(bytes)
5920    }
5921
5922    /// Encode F64 copysign as Thumb-2
5923    fn encode_thumb_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
5924        let mut bytes = Vec::new();
5925
5926        // VMOV R0, R12, Dm (get sign source)
5927        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5928            false,
5929            dm,
5930            &Reg::R0,
5931            &Reg::R12,
5932        )?));
5933
5934        // VMOV R1, R2, Dn (get magnitude source)
5935        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5936            false,
5937            dn,
5938            &Reg::R1,
5939            &Reg::R2,
5940        )?));
5941
5942        // AND.W R12, R12, #0x80000000 (i=0, Rn=R12)
5943        let hw1: u16 = 0xF000 | 12;
5944        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02;
5945        bytes.extend_from_slice(&hw1.to_le_bytes());
5946        bytes.extend_from_slice(&hw2.to_le_bytes());
5947
5948        // BIC.W R2, R2, #0x80000000 (i=0, Rn=R2)
5949        let hw1: u16 = 0xF020 | 2;
5950        let hw2: u16 = (0x1 << 12) | (2 << 8) | 0x02;
5951        bytes.extend_from_slice(&hw1.to_le_bytes());
5952        bytes.extend_from_slice(&hw2.to_le_bytes());
5953
5954        // ORR.W R2, R2, R12
5955        let hw1: u16 = 0xEA40 | 2;
5956        let hw2: u16 = (2 << 8) | 12;
5957        bytes.extend_from_slice(&hw1.to_le_bytes());
5958        bytes.extend_from_slice(&hw2.to_le_bytes());
5959
5960        // VMOV Dd, R1, R2
5961        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5962            true,
5963            dd,
5964            &Reg::R1,
5965            &Reg::R2,
5966        )?));
5967
5968        Ok(bytes)
5969    }
5970
5971    /// Encode VCVT.S32/U32.F32 + VMOV as Thumb-2
5972    fn encode_thumb_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5973        let mut bytes = Vec::new();
5974
5975        let sm_num = vfp_sreg_to_num(sm)?;
5976        let (vd, d) = encode_sreg(sm_num);
5977        let (vm, m) = encode_sreg(sm_num);
5978        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
5979        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5980        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5981
5982        // VMOV Rd, Sm
5983        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
5984        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5985
5986        Ok(bytes)
5987    }
5988
5989    // === Thumb-2 32-bit encoding helpers ===
5990
5991    /// Encode Thumb-2 32-bit ADD with immediate
5992    fn encode_thumb32_add(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5993        let rd_bits = reg_to_bits(rd);
5994        let rn_bits = reg_to_bits(rn);
5995
5996        // ADD.W Rd, Rn, #imm12
5997        // First halfword: 1111 0 i 0 1000 S Rn
5998        // Second halfword: 0 imm3 Rd imm8
5999        let i_bit = (imm >> 11) & 1;
6000        let imm3 = (imm >> 8) & 0x7;
6001        let imm8 = imm & 0xFF;
6002
6003        let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
6004        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6005
6006        let mut bytes = hw1.to_le_bytes().to_vec();
6007        bytes.extend_from_slice(&hw2.to_le_bytes());
6008        Ok(bytes)
6009    }
6010
6011    /// Encode Thumb-2 32-bit SUB with immediate
6012    fn encode_thumb32_sub(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6013        let rd_bits = reg_to_bits(rd);
6014        let rn_bits = reg_to_bits(rn);
6015
6016        let i_bit = (imm >> 11) & 1;
6017        let imm3 = (imm >> 8) & 0x7;
6018        let imm8 = imm & 0xFF;
6019
6020        let hw1: u16 = (0xF1A0 | (i_bit << 10) | rn_bits) as u16;
6021        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6022
6023        let mut bytes = hw1.to_le_bytes().to_vec();
6024        bytes.extend_from_slice(&hw2.to_le_bytes());
6025        Ok(bytes)
6026    }
6027
6028    /// Encode Thumb-2 32-bit ADDS with immediate (sets flags)
6029    fn encode_thumb32_adds(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6030        let rd_bits = reg_to_bits(rd);
6031        let rn_bits = reg_to_bits(rn);
6032
6033        let i_bit = (imm >> 11) & 1;
6034        let imm3 = (imm >> 8) & 0x7;
6035        let imm8 = imm & 0xFF;
6036
6037        // ADDS.W Rd, Rn, #imm (with S=1)
6038        // First halfword: 1111 0 i 0 1000 1 Rn = F110 | i<<10 | Rn
6039        let hw1: u16 = (0xF110 | (i_bit << 10) | rn_bits) as u16;
6040        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6041
6042        let mut bytes = hw1.to_le_bytes().to_vec();
6043        bytes.extend_from_slice(&hw2.to_le_bytes());
6044        Ok(bytes)
6045    }
6046
6047    /// Encode Thumb-2 32-bit SUBS with immediate (sets flags)
6048    fn encode_thumb32_subs(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6049        let rd_bits = reg_to_bits(rd);
6050        let rn_bits = reg_to_bits(rn);
6051
6052        let i_bit = (imm >> 11) & 1;
6053        let imm3 = (imm >> 8) & 0x7;
6054        let imm8 = imm & 0xFF;
6055
6056        // SUBS.W Rd, Rn, #imm (with S=1)
6057        // First halfword: 1111 0 i 0 1101 1 Rn = F1B0 | i<<10 | Rn
6058        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6059        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6060
6061        let mut bytes = hw1.to_le_bytes().to_vec();
6062        bytes.extend_from_slice(&hw2.to_le_bytes());
6063        Ok(bytes)
6064    }
6065
6066    /// Encode Thumb-2 32-bit MOVW (16-bit immediate)
6067    ///
6068    /// # Contract (Verus-style)
6069    /// ```text
6070    /// requires rd <= R14
6071    /// ensures result.len() == 4
6072    /// ensures (imm & 0xFFFF) can be reconstructed from the encoding
6073    /// ```
6074    fn encode_thumb32_movw(&self, rd: &Reg, imm: u32) -> Result<Vec<u8>> {
6075        let rd_bits = reg_to_bits(rd);
6076        reg_bits_checked(rd_bits)?;
6077        let imm16 = imm & 0xFFFF;
6078
6079        // MOVW Rd, #imm16
6080        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6081        let imm4 = (imm16 >> 12) & 0xF;
6082        let i_bit = (imm16 >> 11) & 1;
6083        let imm3 = (imm16 >> 8) & 0x7;
6084        let imm8 = imm16 & 0xFF;
6085
6086        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6087        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6088
6089        let mut bytes = hw1.to_le_bytes().to_vec();
6090        bytes.extend_from_slice(&hw2.to_le_bytes());
6091        encoding_contracts::verify_thumb32(&bytes);
6092        Ok(bytes)
6093    }
6094
6095    /// Encode Thumb-2 32-bit shift with immediate
6096    ///
6097    /// # Contract (Verus-style)
6098    /// ```text
6099    /// requires rd <= R14, rm <= R14
6100    /// ensures result.len() == 4
6101    /// ```
6102    fn encode_thumb32_shift(
6103        &self,
6104        rd: &Reg,
6105        rm: &Reg,
6106        shift: u32,
6107        shift_type: u8,
6108    ) -> Result<Vec<u8>> {
6109        let rd_bits = reg_to_bits(rd);
6110        let rm_bits = reg_to_bits(rm);
6111        reg_bits_checked(rd_bits)?;
6112        reg_bits_checked(rm_bits)?;
6113        let imm5 = shift & 0x1F;
6114        let imm2 = imm5 & 0x3;
6115        let imm3 = (imm5 >> 2) & 0x7;
6116
6117        // MOV.W Rd, Rm, <shift> #imm
6118        // EA4F 0 imm3 Rd imm2 type Rm
6119        let hw1: u16 = 0xEA4F;
6120        let hw2: u16 =
6121            ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | ((shift_type as u32) << 4) | rm_bits)
6122                as u16;
6123
6124        let mut bytes = hw1.to_le_bytes().to_vec();
6125        bytes.extend_from_slice(&hw2.to_le_bytes());
6126        Ok(bytes)
6127    }
6128
6129    /// Encode Thumb-2 32-bit shift by register
6130    /// Encoding: 11111010 0xx0 Rn | 1111 Rd 0000 Rm
6131    /// shift_type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
6132    fn encode_thumb32_shift_reg(
6133        &self,
6134        rd: &Reg,
6135        rn: &Reg,
6136        rm: &Reg,
6137        shift_type: u8,
6138    ) -> Result<Vec<u8>> {
6139        let rd_bits = reg_to_bits(rd);
6140        let rn_bits = reg_to_bits(rn);
6141        let rm_bits = reg_to_bits(rm);
6142
6143        // hw1: 1111 1010 0xx0 Rn
6144        let hw1: u16 = (0xFA00 | ((shift_type as u32) << 5) | rn_bits) as u16;
6145        // hw2: 1111 Rd 0000 Rm
6146        let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
6147
6148        let mut bytes = hw1.to_le_bytes().to_vec();
6149        bytes.extend_from_slice(&hw2.to_le_bytes());
6150        Ok(bytes)
6151    }
6152
6153    /// Encode Thumb-2 32-bit CMP with immediate
6154    fn encode_thumb32_cmp_imm(&self, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6155        let rn_bits = reg_to_bits(rn);
6156
6157        let i_bit = (imm >> 11) & 1;
6158        let imm3 = (imm >> 8) & 0x7;
6159        let imm8 = imm & 0xFF;
6160
6161        // CMP.W Rn, #imm
6162        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6163        let hw2: u16 = ((imm3 << 12) | 0x0F00 | imm8) as u16;
6164
6165        let mut bytes = hw1.to_le_bytes().to_vec();
6166        bytes.extend_from_slice(&hw2.to_le_bytes());
6167        Ok(bytes)
6168    }
6169
6170    /// Encode Thumb-2 32-bit LDR
6171    fn encode_thumb32_ldr(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6172        let rd_bits = reg_to_bits(rd);
6173        let base_bits = reg_to_bits(base);
6174
6175        // LDR.W Rd, [Rn, #imm12]
6176        let hw1: u16 = (0xF8D0 | base_bits) as u16;
6177        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6178
6179        let mut bytes = hw1.to_le_bytes().to_vec();
6180        bytes.extend_from_slice(&hw2.to_le_bytes());
6181        Ok(bytes)
6182    }
6183
6184    /// Encode Thumb-2 32-bit STR
6185    fn encode_thumb32_str(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6186        let rd_bits = reg_to_bits(rd);
6187        let base_bits = reg_to_bits(base);
6188
6189        // STR.W Rd, [Rn, #imm12]
6190        let hw1: u16 = (0xF8C0 | base_bits) as u16;
6191        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6192
6193        let mut bytes = hw1.to_le_bytes().to_vec();
6194        bytes.extend_from_slice(&hw2.to_le_bytes());
6195        Ok(bytes)
6196    }
6197
6198    /// Encode Thumb-2 32-bit LDR with register offset: LDR.W Rd, [Rn, Rm]
6199    fn encode_thumb32_ldr_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6200        let rd_bits = reg_to_bits(rd);
6201        let base_bits = reg_to_bits(base);
6202        let rm_bits = reg_to_bits(offset_reg);
6203
6204        // LDR.W Rd, [Rn, Rm, LSL #0]
6205        // Encoding: 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
6206        // imm2 = 00 for no shift (LSL #0)
6207        let hw1: u16 = (0xF850 | base_bits) as u16;
6208        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6209
6210        let mut bytes = hw1.to_le_bytes().to_vec();
6211        bytes.extend_from_slice(&hw2.to_le_bytes());
6212        Ok(bytes)
6213    }
6214
6215    /// Encode Thumb-2 32-bit STR with register offset: STR.W Rd, [Rn, Rm]
6216    fn encode_thumb32_str_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6217        let rd_bits = reg_to_bits(rd);
6218        let base_bits = reg_to_bits(base);
6219        let rm_bits = reg_to_bits(offset_reg);
6220
6221        // STR.W Rd, [Rn, Rm, LSL #0]
6222        // Encoding: 1111 1000 0100 Rn | Rt 0000 00 imm2 Rm
6223        // imm2 = 00 for no shift (LSL #0)
6224        let hw1: u16 = (0xF840 | base_bits) as u16;
6225        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6226
6227        let mut bytes = hw1.to_le_bytes().to_vec();
6228        bytes.extend_from_slice(&hw2.to_le_bytes());
6229        Ok(bytes)
6230    }
6231
6232    // === Sub-word load/store Thumb-2 encoding helpers ===
6233
6234    /// Encode Thumb-2 32-bit LDRB with immediate: LDRB.W Rd, [Rn, #imm12]
6235    fn encode_thumb32_ldrb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6236        let rd_bits = reg_to_bits(rd);
6237        let base_bits = reg_to_bits(base);
6238        // LDRB.W Rd, [Rn, #imm12]: 1111 1000 1001 Rn | Rt imm12
6239        let hw1: u16 = (0xF890 | base_bits) as u16;
6240        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6241        let mut bytes = hw1.to_le_bytes().to_vec();
6242        bytes.extend_from_slice(&hw2.to_le_bytes());
6243        Ok(bytes)
6244    }
6245
6246    /// Encode Thumb-2 32-bit LDRB with register: LDRB.W Rd, [Rn, Rm]
6247    fn encode_thumb32_ldrb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6248        let rd_bits = reg_to_bits(rd);
6249        let base_bits = reg_to_bits(base);
6250        let rm_bits = reg_to_bits(offset_reg);
6251        // LDRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0001 Rn | Rt 0000 00 imm2 Rm
6252        let hw1: u16 = (0xF810 | base_bits) as u16;
6253        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6254        let mut bytes = hw1.to_le_bytes().to_vec();
6255        bytes.extend_from_slice(&hw2.to_le_bytes());
6256        Ok(bytes)
6257    }
6258
6259    /// Encode Thumb-2 32-bit LDRSB with immediate: LDRSB.W Rd, [Rn, #imm12]
6260    fn encode_thumb32_ldrsb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6261        let rd_bits = reg_to_bits(rd);
6262        let base_bits = reg_to_bits(base);
6263        // LDRSB.W Rd, [Rn, #imm12]: 1111 1001 1001 Rn | Rt imm12
6264        let hw1: u16 = (0xF990 | base_bits) as u16;
6265        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6266        let mut bytes = hw1.to_le_bytes().to_vec();
6267        bytes.extend_from_slice(&hw2.to_le_bytes());
6268        Ok(bytes)
6269    }
6270
6271    /// Encode Thumb-2 32-bit LDRSB with register: LDRSB.W Rd, [Rn, Rm]
6272    fn encode_thumb32_ldrsb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6273        let rd_bits = reg_to_bits(rd);
6274        let base_bits = reg_to_bits(base);
6275        let rm_bits = reg_to_bits(offset_reg);
6276        // LDRSB.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0001 Rn | Rt 0000 00 imm2 Rm
6277        let hw1: u16 = (0xF910 | base_bits) as u16;
6278        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6279        let mut bytes = hw1.to_le_bytes().to_vec();
6280        bytes.extend_from_slice(&hw2.to_le_bytes());
6281        Ok(bytes)
6282    }
6283
6284    /// Encode Thumb-2 32-bit LDRH with immediate: LDRH.W Rd, [Rn, #imm12]
6285    fn encode_thumb32_ldrh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6286        let rd_bits = reg_to_bits(rd);
6287        let base_bits = reg_to_bits(base);
6288        // LDRH.W Rd, [Rn, #imm12]: 1111 1000 1011 Rn | Rt imm12
6289        let hw1: u16 = (0xF8B0 | base_bits) as u16;
6290        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6291        let mut bytes = hw1.to_le_bytes().to_vec();
6292        bytes.extend_from_slice(&hw2.to_le_bytes());
6293        Ok(bytes)
6294    }
6295
6296    /// Encode Thumb-2 32-bit LDRH with register: LDRH.W Rd, [Rn, Rm]
6297    fn encode_thumb32_ldrh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6298        let rd_bits = reg_to_bits(rd);
6299        let base_bits = reg_to_bits(base);
6300        let rm_bits = reg_to_bits(offset_reg);
6301        // LDRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0011 Rn | Rt 0000 00 imm2 Rm
6302        let hw1: u16 = (0xF830 | base_bits) as u16;
6303        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6304        let mut bytes = hw1.to_le_bytes().to_vec();
6305        bytes.extend_from_slice(&hw2.to_le_bytes());
6306        Ok(bytes)
6307    }
6308
6309    /// Encode Thumb-2 32-bit LDRSH with immediate: LDRSH.W Rd, [Rn, #imm12]
6310    fn encode_thumb32_ldrsh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6311        let rd_bits = reg_to_bits(rd);
6312        let base_bits = reg_to_bits(base);
6313        // LDRSH.W Rd, [Rn, #imm12]: 1111 1001 1011 Rn | Rt imm12
6314        let hw1: u16 = (0xF9B0 | base_bits) as u16;
6315        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6316        let mut bytes = hw1.to_le_bytes().to_vec();
6317        bytes.extend_from_slice(&hw2.to_le_bytes());
6318        Ok(bytes)
6319    }
6320
6321    /// Encode Thumb-2 32-bit LDRSH with register: LDRSH.W Rd, [Rn, Rm]
6322    fn encode_thumb32_ldrsh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6323        let rd_bits = reg_to_bits(rd);
6324        let base_bits = reg_to_bits(base);
6325        let rm_bits = reg_to_bits(offset_reg);
6326        // LDRSH.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0011 Rn | Rt 0000 00 imm2 Rm
6327        let hw1: u16 = (0xF930 | base_bits) as u16;
6328        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6329        let mut bytes = hw1.to_le_bytes().to_vec();
6330        bytes.extend_from_slice(&hw2.to_le_bytes());
6331        Ok(bytes)
6332    }
6333
6334    /// Encode Thumb-2 32-bit STRB with immediate: STRB.W Rd, [Rn, #imm12]
6335    fn encode_thumb32_strb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6336        let rd_bits = reg_to_bits(rd);
6337        let base_bits = reg_to_bits(base);
6338        // STRB.W Rd, [Rn, #imm12]: 1111 1000 1000 Rn | Rt imm12
6339        let hw1: u16 = (0xF880 | base_bits) as u16;
6340        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6341        let mut bytes = hw1.to_le_bytes().to_vec();
6342        bytes.extend_from_slice(&hw2.to_le_bytes());
6343        Ok(bytes)
6344    }
6345
6346    /// Encode Thumb-2 32-bit STRB with register: STRB.W Rd, [Rn, Rm]
6347    fn encode_thumb32_strb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6348        let rd_bits = reg_to_bits(rd);
6349        let base_bits = reg_to_bits(base);
6350        let rm_bits = reg_to_bits(offset_reg);
6351        // STRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0000 Rn | Rt 0000 00 imm2 Rm
6352        let hw1: u16 = (0xF800 | base_bits) as u16;
6353        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6354        let mut bytes = hw1.to_le_bytes().to_vec();
6355        bytes.extend_from_slice(&hw2.to_le_bytes());
6356        Ok(bytes)
6357    }
6358
6359    /// Encode Thumb-2 32-bit STRH with immediate: STRH.W Rd, [Rn, #imm12]
6360    fn encode_thumb32_strh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6361        let rd_bits = reg_to_bits(rd);
6362        let base_bits = reg_to_bits(base);
6363        // STRH.W Rd, [Rn, #imm12]: 1111 1000 1010 Rn | Rt imm12
6364        let hw1: u16 = (0xF8A0 | base_bits) as u16;
6365        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6366        let mut bytes = hw1.to_le_bytes().to_vec();
6367        bytes.extend_from_slice(&hw2.to_le_bytes());
6368        Ok(bytes)
6369    }
6370
6371    /// Encode Thumb-2 32-bit STRH with register: STRH.W Rd, [Rn, Rm]
6372    fn encode_thumb32_strh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6373        let rd_bits = reg_to_bits(rd);
6374        let base_bits = reg_to_bits(base);
6375        let rm_bits = reg_to_bits(offset_reg);
6376        // STRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0010 Rn | Rt 0000 00 imm2 Rm
6377        let hw1: u16 = (0xF820 | base_bits) as u16;
6378        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6379        let mut bytes = hw1.to_le_bytes().to_vec();
6380        bytes.extend_from_slice(&hw2.to_le_bytes());
6381        Ok(bytes)
6382    }
6383
6384    /// Encode Thumb-2 32-bit ADD with immediate: ADD.W Rd, Rn, #imm
6385    fn encode_thumb32_add_imm(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6386        let rd_bits = reg_to_bits(rd);
6387        let rn_bits = reg_to_bits(rn);
6388
6389        // For small immediates, use ADD.W Rd, Rn, #imm12
6390        // Encoding: 1111 0 i 0 1 0 0 0 S Rn | 0 imm3 Rd imm8
6391        // S = 0 (don't update flags)
6392        // The 12-bit immediate is encoded as: i:imm3:imm8
6393        // For simplicity, we only support imm <= 0xFFF (direct encoding)
6394        if imm <= 0xFFF {
6395            let i_bit = (imm >> 11) & 1;
6396            let imm3 = (imm >> 8) & 0x7;
6397            let imm8 = imm & 0xFF;
6398
6399            let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
6400            let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6401
6402            let mut bytes = hw1.to_le_bytes().to_vec();
6403            bytes.extend_from_slice(&hw2.to_le_bytes());
6404            Ok(bytes)
6405        } else {
6406            // For larger immediates, would need MOVW/MOVT + ADD
6407            // For now, return error
6408            Err(synth_core::Error::synthesis(
6409                "ADD immediate too large for single instruction",
6410            ))
6411        }
6412    }
6413
6414    // === Raw encoding helpers for POPCNT (take register numbers directly) ===
6415
6416    /// Encode Thumb-2 32-bit MOVW (16-bit immediate) - raw version
6417    ///
6418    /// # Contract (Verus-style)
6419    /// ```text
6420    /// requires rd <= 14, imm16 <= 0xFFFF
6421    /// ensures result.len() == 4
6422    /// ```
6423    fn encode_thumb32_movw_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6424        reg_bits_checked(rd)?;
6425        encoding_contracts::verify_imm16(imm16);
6426        // MOVW Rd, #imm16
6427        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6428        let imm16 = imm16 & 0xFFFF;
6429        let imm4 = (imm16 >> 12) & 0xF;
6430        let i_bit = (imm16 >> 11) & 1;
6431        let imm3 = (imm16 >> 8) & 0x7;
6432        let imm8 = imm16 & 0xFF;
6433
6434        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6435        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6436
6437        let mut bytes = hw1.to_le_bytes().to_vec();
6438        bytes.extend_from_slice(&hw2.to_le_bytes());
6439        encoding_contracts::verify_thumb32(&bytes);
6440        Ok(bytes)
6441    }
6442
6443    /// Encode Thumb-2 32-bit MOVT (move top 16 bits) - raw version
6444    ///
6445    /// # Contract (Verus-style)
6446    /// ```text
6447    /// requires rd <= 14, imm16 <= 0xFFFF
6448    /// ensures result.len() == 4
6449    /// ```
6450    fn encode_thumb32_movt_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6451        reg_bits_checked(rd)?;
6452        encoding_contracts::verify_imm16(imm16);
6453        // MOVT Rd, #imm16
6454        // 1111 0 i 10 1 1 0 0 imm4 | 0 imm3 Rd imm8
6455        let imm16 = imm16 & 0xFFFF;
6456        let imm4 = (imm16 >> 12) & 0xF;
6457        let i_bit = (imm16 >> 11) & 1;
6458        let imm3 = (imm16 >> 8) & 0x7;
6459        let imm8 = imm16 & 0xFF;
6460
6461        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
6462        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6463
6464        let mut bytes = hw1.to_le_bytes().to_vec();
6465        bytes.extend_from_slice(&hw2.to_le_bytes());
6466        encoding_contracts::verify_thumb32(&bytes);
6467        Ok(bytes)
6468    }
6469
6470    /// Encode Thumb-2 32-bit LSR (logical shift right) with immediate - raw version
6471    fn encode_thumb32_lsr_raw(&self, rd: u32, rm: u32, shift: u32) -> Result<Vec<u8>> {
6472        // MOV.W Rd, Rm, LSR #imm
6473        // EA4F 0 imm3 Rd imm2 01 Rm
6474        let imm5 = shift & 0x1F;
6475        let imm2 = imm5 & 0x3;
6476        let imm3 = (imm5 >> 2) & 0x7;
6477
6478        let hw1: u16 = 0xEA4F;
6479        let hw2: u16 = ((imm3 << 12) | (rd << 8) | (imm2 << 6) | (0b01 << 4) | rm) as u16;
6480
6481        let mut bytes = hw1.to_le_bytes().to_vec();
6482        bytes.extend_from_slice(&hw2.to_le_bytes());
6483        Ok(bytes)
6484    }
6485
6486    /// Encode Thumb-2 32-bit AND (register) - raw version
6487    fn encode_thumb32_and_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6488        // AND.W Rd, Rn, Rm
6489        // EA00 Rn | 0 Rd 00 00 Rm
6490        let hw1: u16 = (0xEA00 | rn) as u16;
6491        let hw2: u16 = ((rd << 8) | rm) as u16;
6492
6493        let mut bytes = hw1.to_le_bytes().to_vec();
6494        bytes.extend_from_slice(&hw2.to_le_bytes());
6495        Ok(bytes)
6496    }
6497
6498    /// Encode Thumb-2 32-bit AND with immediate - raw version
6499    fn encode_thumb32_and_imm_raw(&self, rd: u32, rn: u32, imm: u32) -> Result<Vec<u8>> {
6500        // AND.W Rd, Rn, #<modified_immediate>
6501        // For small immediates (0-255), the encoding is simpler
6502        // F0 00 Rn | 0 imm3 Rd imm8
6503        let i_bit = (imm >> 11) & 1;
6504        let imm3 = (imm >> 8) & 0x7;
6505        let imm8 = imm & 0xFF;
6506
6507        let hw1: u16 = (0xF000 | (i_bit << 10) | rn) as u16;
6508        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6509
6510        let mut bytes = hw1.to_le_bytes().to_vec();
6511        bytes.extend_from_slice(&hw2.to_le_bytes());
6512        Ok(bytes)
6513    }
6514
6515    /// Encode Thumb-2 32-bit SUB (register) - raw version
6516    fn encode_thumb32_sub_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6517        // SUB.W Rd, Rn, Rm
6518        // EBA0 Rn | 0 Rd 00 00 Rm
6519        let hw1: u16 = (0xEBA0 | rn) as u16;
6520        let hw2: u16 = ((rd << 8) | rm) as u16;
6521
6522        let mut bytes = hw1.to_le_bytes().to_vec();
6523        bytes.extend_from_slice(&hw2.to_le_bytes());
6524        Ok(bytes)
6525    }
6526
6527    /// Encode Thumb-2 32-bit ADD (register) - raw version
6528    fn encode_thumb32_add_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6529        // ADD.W Rd, Rn, Rm
6530        // EB00 Rn | 0 Rd 00 00 Rm
6531        let hw1: u16 = (0xEB00 | rn) as u16;
6532        let hw2: u16 = ((rd << 8) | rm) as u16;
6533
6534        let mut bytes = hw1.to_le_bytes().to_vec();
6535        bytes.extend_from_slice(&hw2.to_le_bytes());
6536        Ok(bytes)
6537    }
6538
6539    /// Encode Thumb-2 32-bit ADDS (register, flag-setting) - raw version.
6540    /// Used as the high-register fallback for `ArmOp::Adds` (i64 low-word add)
6541    /// so R8-R11 pair operands don't overflow the 16-bit field — #178/#180.
6542    fn encode_thumb32_adds_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6543        // ADDS.W Rd, Rn, Rm (T3, S=1): EB10 Rn | 0 Rd 00 00 Rm
6544        let hw1: u16 = (0xEB10 | rn) as u16;
6545        let hw2: u16 = ((rd << 8) | rm) as u16;
6546        let mut bytes = hw1.to_le_bytes().to_vec();
6547        bytes.extend_from_slice(&hw2.to_le_bytes());
6548        Ok(bytes)
6549    }
6550
6551    /// Encode Thumb-2 32-bit SUBS (register, flag-setting) - raw version.
6552    /// High-register fallback for `ArmOp::Subs` (i64 low-word subtract) — #178/#180.
6553    fn encode_thumb32_subs_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6554        // SUBS.W Rd, Rn, Rm (T3, S=1): EBB0 Rn | 0 Rd 00 00 Rm
6555        let hw1: u16 = (0xEBB0 | rn) as u16;
6556        let hw2: u16 = ((rd << 8) | rm) as u16;
6557        let mut bytes = hw1.to_le_bytes().to_vec();
6558        bytes.extend_from_slice(&hw2.to_le_bytes());
6559        Ok(bytes)
6560    }
6561
6562    /// Encode a sequence of ARM instructions
6563    pub fn encode_sequence(&self, ops: &[ArmOp]) -> Result<Vec<u8>> {
6564        let mut code = Vec::new();
6565
6566        for op in ops {
6567            let encoded = self.encode(op)?;
6568            code.extend_from_slice(&encoded);
6569        }
6570
6571        Ok(code)
6572    }
6573}
6574
6575/// Convert register to bit encoding (0-15)
6576fn reg_to_bits(reg: &Reg) -> u32 {
6577    match reg {
6578        Reg::R0 => 0,
6579        Reg::R1 => 1,
6580        Reg::R2 => 2,
6581        Reg::R3 => 3,
6582        Reg::R4 => 4,
6583        Reg::R5 => 5,
6584        Reg::R6 => 6,
6585        Reg::R7 => 7,
6586        Reg::R8 => 8,
6587        Reg::R9 => 9,
6588        Reg::R10 => 10,
6589        Reg::R11 => 11,
6590        Reg::R12 => 12,
6591        Reg::SP => 13,
6592        Reg::LR => 14,
6593        Reg::PC => 15,
6594    }
6595}
6596
6597/// Fallible form of the `verify_reg_bits` contract. PC (R15) is not a valid
6598/// data operand for the Thumb-2 encodings that use this guard (SDIV/UDIV/MLS/…
6599/// are UNPREDICTABLE with PC). Synth's own codegen never emits PC there, but
6600/// the encoder must stay *total* over arbitrary `ArmOp` inputs — the fuzz
6601/// harness (`encoder_no_panic`) requires Ok-or-Err, never a panic. Pre-fix, the
6602/// `debug_assert` in `verify_reg_bits` aborted under `-Cdebug-assertions`.
6603/// Returns a typed Err instead. See #185.
6604fn reg_bits_checked(bits: u32) -> Result<()> {
6605    if bits > 14 {
6606        return Err(synth_core::Error::synthesis(format!(
6607            "register bits {bits} (PC/R15) is not a valid operand for this Thumb-2 encoding"
6608        )));
6609    }
6610    Ok(())
6611}
6612
/// Try to express `val` as an ARM (A32) rotated immediate: an 8-bit value
/// rotated right by an even amount (`imm8 ROR 2*rot4`).
///
/// Returns `Some((rot4 << 8 | imm8, 1))` for the smallest `rot4` that works —
/// the second tuple element is always 1 — or `None` when no rotation leaves
/// an 8-bit value.
fn try_encode_rotated_imm(val: u32) -> Option<(u32, u32)> {
    // Rotating left by 2*rot undoes the ROR; zero is covered by rot == 0.
    (0..16u32).find_map(|rot| {
        let unrotated = val.rotate_left(rot * 2);
        (unrotated <= 0xFF).then(|| ((rot << 8) | unrotated, 1))
    })
}
6630
6631/// Encode operand2 field and return (bits, immediate_flag).
6632/// For ARM32 mode, immediates use the rotated-immediate encoding (imm8 ROR 2*rot4).
6633/// Panics if an immediate value cannot be represented. Callers that need large
6634/// immediates should use MOVW/MOVT instead of Operand2::Imm.
6635fn encode_operand2(op2: &Operand2) -> (u32, u32) {
6636    match op2 {
6637        Operand2::Imm(val) => {
6638            let uval = *val as u32;
6639            // Attempt rotated-immediate encoding (ARM32 Operand2)
6640            if let Some(encoded) = try_encode_rotated_imm(uval) {
6641                encoded
6642            } else {
6643                // Fallback: mask to 8 bits (legacy behavior for values that
6644                // cannot be represented). This should not be reached for
6645                // correctly-selected instructions; the instruction selector
6646                // must use MOVW/MOVT for large constants.
6647                let imm = uval & 0xFF;
6648                (imm, 1)
6649            }
6650        }
6651
6652        Operand2::Reg(reg) => {
6653            let reg_bits = reg_to_bits(reg);
6654            (reg_bits, 0) // I=0 for register
6655        }
6656
6657        Operand2::RegShift {
6658            rm,
6659            shift: _,
6660            amount,
6661        } => {
6662            // Simplified encoding with shift
6663            let rm_bits = reg_to_bits(rm);
6664            let shift_bits = (*amount & 0x1F) << 7;
6665            (shift_bits | rm_bits, 0)
6666        }
6667    }
6668}
6669
6670/// Encode memory address to (base_reg, offset)
6671fn encode_mem_addr(addr: &MemAddr) -> (u32, u32) {
6672    let base_bits = reg_to_bits(&addr.base);
6673    let offset_bits = (addr.offset as u32) & 0xFFF; // 12-bit offset
6674    (base_bits, offset_bits)
6675}
6676
6677/// S-register number: S0=0, S1=1, ..., S31=31
6678fn vfp_sreg_to_num(reg: &VfpReg) -> Result<u32> {
6679    match reg {
6680        VfpReg::S0 => Ok(0),
6681        VfpReg::S1 => Ok(1),
6682        VfpReg::S2 => Ok(2),
6683        VfpReg::S3 => Ok(3),
6684        VfpReg::S4 => Ok(4),
6685        VfpReg::S5 => Ok(5),
6686        VfpReg::S6 => Ok(6),
6687        VfpReg::S7 => Ok(7),
6688        VfpReg::S8 => Ok(8),
6689        VfpReg::S9 => Ok(9),
6690        VfpReg::S10 => Ok(10),
6691        VfpReg::S11 => Ok(11),
6692        VfpReg::S12 => Ok(12),
6693        VfpReg::S13 => Ok(13),
6694        VfpReg::S14 => Ok(14),
6695        VfpReg::S15 => Ok(15),
6696        VfpReg::S16 => Ok(16),
6697        VfpReg::S17 => Ok(17),
6698        VfpReg::S18 => Ok(18),
6699        VfpReg::S19 => Ok(19),
6700        VfpReg::S20 => Ok(20),
6701        VfpReg::S21 => Ok(21),
6702        VfpReg::S22 => Ok(22),
6703        VfpReg::S23 => Ok(23),
6704        VfpReg::S24 => Ok(24),
6705        VfpReg::S25 => Ok(25),
6706        VfpReg::S26 => Ok(26),
6707        VfpReg::S27 => Ok(27),
6708        VfpReg::S28 => Ok(28),
6709        VfpReg::S29 => Ok(29),
6710        VfpReg::S30 => Ok(30),
6711        VfpReg::S31 => Ok(31),
6712        // D-registers are not used in F32 single-precision encodings
6713        _ => Err(synth_core::Error::SynthesisError(
6714            "D-register not supported in single-precision VFP encoding".to_string(),
6715        )),
6716    }
6717}
6718
6719/// D-register number: D0=0, D1=1, ..., D15=15
6720fn vfp_dreg_to_num(reg: &VfpReg) -> Result<u32> {
6721    match reg {
6722        VfpReg::D0 => Ok(0),
6723        VfpReg::D1 => Ok(1),
6724        VfpReg::D2 => Ok(2),
6725        VfpReg::D3 => Ok(3),
6726        VfpReg::D4 => Ok(4),
6727        VfpReg::D5 => Ok(5),
6728        VfpReg::D6 => Ok(6),
6729        VfpReg::D7 => Ok(7),
6730        VfpReg::D8 => Ok(8),
6731        VfpReg::D9 => Ok(9),
6732        VfpReg::D10 => Ok(10),
6733        VfpReg::D11 => Ok(11),
6734        VfpReg::D12 => Ok(12),
6735        VfpReg::D13 => Ok(13),
6736        VfpReg::D14 => Ok(14),
6737        VfpReg::D15 => Ok(15),
6738        // S-registers are not used in F64 double-precision encodings
6739        _ => Err(synth_core::Error::SynthesisError(
6740            "S-register not supported in double-precision VFP encoding".to_string(),
6741        )),
6742    }
6743}
6744
/// Split an S-register number into `(Vx[3:0], qualifier_bit)`.
///
/// For S-register `s`, `Vx = s >> 1` and the qualifier is the low bit
/// `s & 1`. Depending on the operand's role the qualifier is placed in
/// D (bit 22), N (bit 7) or M (bit 5) by the caller.
fn encode_sreg(s: u32) -> (u32, u32) {
    let vx = s >> 1;
    let qualifier = s & 1;
    (vx, qualifier)
}
6751
/// Split a D-register number into `(Vx[3:0], qualifier_bit)` for
/// double-precision VFP encodings.
///
/// `Vx` is the low four bits of `d`; the qualifier is bit 4 of `d`, which is
/// always 0 for D0-D15.
fn encode_dreg(d: u32) -> (u32, u32) {
    let vx = d & 0xF;
    let qualifier = (d >> 4) & 1;
    (vx, qualifier)
}
6758
6759/// Encode a VFP 3-register arithmetic instruction (VADD.F32, VSUB.F32, VMUL.F32, VDIV.F32).
6760/// Returns the full 32-bit instruction word.
6761///
6762/// VFP encoding: [cond 1110] [D opc1 Vn] [Vd 101 sz] [N opc2 M 0 Vm]
6763/// For single-precision (sz=0), coprocessor = 0xA (bits[11:8]).
6764fn encode_vfp_3reg(base: u32, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<u32> {
6765    let sd_num = vfp_sreg_to_num(sd)?;
6766    let sn_num = vfp_sreg_to_num(sn)?;
6767    let sm_num = vfp_sreg_to_num(sm)?;
6768    let (vd, d) = encode_sreg(sd_num);
6769    let (vn, n) = encode_sreg(sn_num);
6770    let (vm, m) = encode_sreg(sm_num);
6771
6772    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
6773}
6774
6775/// Encode a VFP 2-register instruction (VNEG.F32, VABS.F32, VSQRT.F32).
6776/// Returns the full 32-bit instruction word.
6777fn encode_vfp_2reg(base: u32, sd: &VfpReg, sm: &VfpReg) -> Result<u32> {
6778    let sd_num = vfp_sreg_to_num(sd)?;
6779    let sm_num = vfp_sreg_to_num(sm)?;
6780    let (vd, d) = encode_sreg(sd_num);
6781    let (vm, m) = encode_sreg(sm_num);
6782
6783    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
6784}
6785
6786/// Encode a VFP load/store (VLDR.F32 / VSTR.F32).
6787/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
6788/// U bit (bit 23) controls add/subtract offset.
6789fn encode_vfp_ldst(base: u32, sd: &VfpReg, addr: &MemAddr) -> Result<u32> {
6790    let sd_num = vfp_sreg_to_num(sd)?;
6791    let (vd, d) = encode_sreg(sd_num);
6792    let rn = reg_to_bits(&addr.base);
6793
6794    let offset = addr.offset;
6795    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6796    let abs_offset = offset.unsigned_abs();
6797    let imm8 = (abs_offset / 4) & 0xFF;
6798
6799    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
6800}
6801
6802/// Encode VMOV between core register and S-register.
6803/// VMOV Sn, Rt: 0xEE00_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
6804/// VMOV Rt, Sn: 0xEE10_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
6805fn encode_vmov_core_sreg(to_sreg: bool, sreg: &VfpReg, core: &Reg) -> Result<u32> {
6806    let s_num = vfp_sreg_to_num(sreg)?;
6807    let (vn, n) = encode_sreg(s_num);
6808    let rt = reg_to_bits(core);
6809
6810    let base = if to_sreg { 0xEE000A10 } else { 0xEE100A10 };
6811    Ok(base | (vn << 16) | (rt << 12) | (n << 7))
6812}
6813
6814/// Encode a VFP 3-register double-precision instruction (VADD.F64, VSUB.F64, etc.).
6815/// For double-precision (sz=1), coprocessor = 0xB (bits[11:8]).
6816/// The base should have bit 8 = 1 for F64 (0xB suffix instead of 0xA).
6817fn encode_vfp_3reg_f64(base: u32, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<u32> {
6818    let dd_num = vfp_dreg_to_num(dd)?;
6819    let dn_num = vfp_dreg_to_num(dn)?;
6820    let dm_num = vfp_dreg_to_num(dm)?;
6821    let (vd, d) = encode_dreg(dd_num);
6822    let (vn, n) = encode_dreg(dn_num);
6823    let (vm, m) = encode_dreg(dm_num);
6824
6825    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
6826}
6827
6828/// Encode a VFP 2-register double-precision instruction (VNEG.F64, VABS.F64, VSQRT.F64).
6829fn encode_vfp_2reg_f64(base: u32, dd: &VfpReg, dm: &VfpReg) -> Result<u32> {
6830    let dd_num = vfp_dreg_to_num(dd)?;
6831    let dm_num = vfp_dreg_to_num(dm)?;
6832    let (vd, d) = encode_dreg(dd_num);
6833    let (vm, m) = encode_dreg(dm_num);
6834
6835    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
6836}
6837
6838/// Encode a VFP load/store for double-precision (VLDR.64 / VSTR.64).
6839/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
6840fn encode_vfp_ldst_f64(base: u32, dd: &VfpReg, addr: &MemAddr) -> Result<u32> {
6841    let dd_num = vfp_dreg_to_num(dd)?;
6842    let (vd, d) = encode_dreg(dd_num);
6843    let rn = reg_to_bits(&addr.base);
6844
6845    let offset = addr.offset;
6846    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6847    let abs_offset = offset.unsigned_abs();
6848    let imm8 = (abs_offset / 4) & 0xFF;
6849
6850    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
6851}
6852
6853/// Encode VMOV between two core registers and a D-register.
6854/// VMOV Dm, Rt, Rt2: 0xEC40_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
6855/// VMOV Rt, Rt2, Dm: 0xEC50_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
6856fn encode_vmov_core_dreg(
6857    to_dreg: bool,
6858    dreg: &VfpReg,
6859    core_lo: &Reg,
6860    core_hi: &Reg,
6861) -> Result<u32> {
6862    let d_num = vfp_dreg_to_num(dreg)?;
6863    let (vm, m) = encode_dreg(d_num);
6864    let rt = reg_to_bits(core_lo);
6865    let rt2 = reg_to_bits(core_hi);
6866
6867    let base = if to_dreg { 0xEC400B10 } else { 0xEC500B10 };
6868    Ok(base | (rt2 << 16) | (rt << 12) | (m << 5) | vm)
6869}
6870
/// Serialize a 32-bit VFP instruction word as Thumb-2 bytes.
///
/// The upper halfword is emitted first, then the lower halfword, each in
/// little-endian byte order.
fn vfp_to_thumb_bytes(instr: u32) -> Vec<u8> {
    let [hw1_msb, hw1_lsb, hw2_msb, hw2_lsb] = instr.to_be_bytes();
    vec![hw1_lsb, hw1_msb, hw2_lsb, hw2_msb]
}
6879
6880// ============================================================================
6881// Helium MVE encoding helpers
6882// ============================================================================
6883
6884/// Q-register number: Q0=0, Q1=1, ..., Q7=7
6885fn qreg_to_num(reg: &QReg) -> u32 {
6886    match reg {
6887        QReg::Q0 => 0,
6888        QReg::Q1 => 1,
6889        QReg::Q2 => 2,
6890        QReg::Q3 => 3,
6891        QReg::Q4 => 4,
6892        QReg::Q5 => 5,
6893        QReg::Q6 => 6,
6894        QReg::Q7 => 7,
6895    }
6896}
6897
6898/// MVE element size to encoding bits: S8=0b00, S16=0b01, S32=0b10
6899fn mve_size_bits(size: &MveSize) -> u32 {
6900    match size {
6901        MveSize::S8 => 0b00,
6902        MveSize::S16 => 0b01,
6903        MveSize::S32 => 0b10,
6904    }
6905}
6906
6907/// Encode MVE 3-register instruction.
6908/// Q-registers are encoded as D-register pairs: Q0=D0:D1, Q1=D2:D3, etc.
6909/// In NEON/MVE encoding, the Q-register uses D-register number = Qn * 2.
6910fn encode_mve_3reg(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
6911    let d = qreg_to_num(qd) * 2;
6912    let n = qreg_to_num(qn) * 2;
6913    let m = qreg_to_num(qm) * 2;
6914
6915    // Standard NEON/MVE 3-register encoding:
6916    // D bit (bit 22) = Vd[4], Vd[3:0] = bits [15:12]
6917    // N bit (bit 7)  = Vn[4], Vn[3:0] = bits [19:16]
6918    // M bit (bit 5)  = Vm[4], Vm[3:0] = bits [3:0]
6919    let vd = d & 0xF;
6920    let d_bit = (d >> 4) & 1;
6921    let vn = n & 0xF;
6922    let n_bit = (n >> 4) & 1;
6923    let vm = m & 0xF;
6924    let m_bit = (m >> 4) & 1;
6925
6926    base | (d_bit << 22) | (vn << 16) | (vd << 12) | (n_bit << 7) | (m_bit << 5) | vm
6927}
6928
6929/// Encode MVE 3-register bitwise instruction (VAND, VORR, VEOR, VBIC).
6930fn encode_mve_3reg_bitwise(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
6931    encode_mve_3reg(base, qd, qn, qm)
6932}
6933
6934/// Encode MVE VLDRW.32 Qd, [Rn, #offset]
6935/// Format: EC9x xxxx - contiguous load, word-sized elements
6936fn encode_mve_vldrw(qd: &QReg, addr: &MemAddr) -> u32 {
6937    let qd_enc = qreg_to_num(qd) * 2;
6938    let rn = reg_to_bits(&addr.base);
6939    let offset = addr.offset;
6940    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6941    let abs_offset = offset.unsigned_abs();
6942    let imm7 = (abs_offset / 4) & 0x7F; // 7-bit word-aligned offset
6943
6944    // VLDRW.32 Qd, [Rn, #imm]: ED10 xx80 variant
6945    0xED100E80
6946        | (u_bit << 23)
6947        | ((qd_enc >> 4) << 22)
6948        | (rn << 16)
6949        | ((qd_enc & 0xF) << 12)
6950        | (imm7 & 0x7F)
6951}
6952
6953/// Encode MVE VSTRW.32 Qd, [Rn, #offset]
6954fn encode_mve_vstrw(qd: &QReg, addr: &MemAddr) -> u32 {
6955    let qd_enc = qreg_to_num(qd) * 2;
6956    let rn = reg_to_bits(&addr.base);
6957    let offset = addr.offset;
6958    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6959    let abs_offset = offset.unsigned_abs();
6960    let imm7 = (abs_offset / 4) & 0x7F;
6961
6962    0xED000E80
6963        | (u_bit << 23)
6964        | ((qd_enc >> 4) << 22)
6965        | (rn << 16)
6966        | ((qd_enc & 0xF) << 12)
6967        | (imm7 & 0x7F)
6968}
6969
6970impl ArmEncoder {
6971    /// Encode MVE constant load: MOVW+MOVT+VMOV for each 32-bit word, then assemble Q-register
6972    fn encode_thumb_mve_const(&self, qd: &QReg, bytes: &[u8; 16]) -> Result<Vec<u8>> {
6973        let mut result = Vec::new();
6974        let qd_num = qreg_to_num(qd);
6975
6976        // Load each 32-bit word into R12 (temp) then VMOV into S-register
6977        for i in 0..4 {
6978            let word = u32::from_le_bytes([
6979                bytes[i * 4],
6980                bytes[i * 4 + 1],
6981                bytes[i * 4 + 2],
6982                bytes[i * 4 + 3],
6983            ]);
6984            let lo16 = word & 0xFFFF;
6985            let hi16 = (word >> 16) & 0xFFFF;
6986
6987            // MOVW R12, #lo16
6988            result.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
6989            // MOVT R12, #hi16
6990            if hi16 != 0 {
6991                result.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
6992            }
6993
6994            // VMOV Sn, R12 where Sn = Qd*4 + i
6995            let s_num = qd_num * 4 + i as u32;
6996            let (vn, n) = encode_sreg(s_num);
6997            let vmov: u32 = 0xEE000A10 | (vn << 16) | (12 << 12) | (n << 7);
6998            result.extend_from_slice(&vfp_to_thumb_bytes(vmov));
6999        }
7000
7001        Ok(result)
7002    }
7003
7004    /// Encode lane-wise f32 binary operation (VDIV, etc.) via S-register extraction
7005    fn encode_thumb_mve_lane_wise_f32_binop(
7006        &self,
7007        qd: &QReg,
7008        qn: &QReg,
7009        qm: &QReg,
7010        vfp_base: u32,
7011    ) -> Result<Vec<u8>> {
7012        let mut result = Vec::new();
7013        let qd_num = qreg_to_num(qd);
7014        let qn_num = qreg_to_num(qn);
7015        let qm_num = qreg_to_num(qm);
7016
7017        // For each lane 0..3: use S-registers directly (Q aliasing)
7018        for i in 0..4u32 {
7019            let sd = qd_num * 4 + i;
7020            let sn = qn_num * 4 + i;
7021            let sm = qm_num * 4 + i;
7022
7023            let (vd, d) = encode_sreg(sd);
7024            let (vn, n) = encode_sreg(sn);
7025            let (vm, m) = encode_sreg(sm);
7026
7027            let instr = vfp_base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm;
7028            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
7029        }
7030
7031        Ok(result)
7032    }
7033
7034    /// Encode lane-wise f32 VSQRT via S-register extraction
7035    fn encode_thumb_mve_lane_wise_f32_sqrt(&self, qd: &QReg, qm: &QReg) -> Result<Vec<u8>> {
7036        let mut result = Vec::new();
7037        let qd_num = qreg_to_num(qd);
7038        let qm_num = qreg_to_num(qm);
7039
7040        // VSQRT.F32 base: 0xEEB10AC0
7041        for i in 0..4u32 {
7042            let sd = qd_num * 4 + i;
7043            let sm = qm_num * 4 + i;
7044
7045            let (vd, d) = encode_sreg(sd);
7046            let (vm, m) = encode_sreg(sm);
7047
7048            let instr: u32 = 0xEEB10AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
7049            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
7050        }
7051
7052        Ok(result)
7053    }
7054}
7055
7056#[cfg(test)]
7057mod tests {
7058    use super::*;
7059
7060    #[test]
7061    fn test_encoder_creation() {
7062        let encoder_arm = ArmEncoder::new_arm32();
7063        assert!(!encoder_arm.thumb_mode);
7064
7065        let encoder_thumb = ArmEncoder::new_thumb2();
7066        assert!(encoder_thumb.thumb_mode);
7067    }
7068
7069    /// #204 WAKE-path regression: `SetCond` materialized 0/1 with the 16-bit
7070    /// `MOVS Rd,#imm` (T1), whose Rd field is 3 bits (R0–R7). For a high Rd
7071    /// (R8–R12) `rd_bits << 8` overflows bit 11, flipping the opcode MOVS→CMP
7072    /// (`0x2c00`), so the boolean was never written — gale's `has_waiter` kept a
7073    /// stale value and the binary-sem WAKE dispatch read garbage. High Rd must
7074    /// use the 32-bit `MOV.W` (T2). Verify the bytes, not the IR.
7075    #[test]
7076    fn test_encode_setcond_high_reg_uses_mov_w_204() {
7077        use synth_synthesis::{ArmOp, Condition, Reg};
7078        let enc = ArmEncoder::new_thumb2();
7079        // R12 (high): must be ITE + MOV.W #1 + MOV.W #0, never a 16-bit MOVS/CMP.
7080        let hi = enc
7081            .encode(&ArmOp::SetCond {
7082                rd: Reg::R12,
7083                cond: Condition::NE,
7084            })
7085            .unwrap();
7086        assert_eq!(hi.len(), 10, "ITE(2) + MOV.W(4) + MOV.W(4): {hi:02x?}");
7087        // both value halfwords are MOV.W (0xF04F) — NOT the corrupt CMP (0x2c..).
7088        assert_eq!(&hi[2..4], &[0x4F, 0xF0], "then = MOV.W: {hi:02x?}");
7089        assert_eq!(&hi[6..8], &[0x4F, 0xF0], "else = MOV.W: {hi:02x?}");
7090        assert_eq!(hi[4] & 0x0F, 0x01, "then imm = #1");
7091        assert_eq!(hi[8] & 0x0F, 0x00, "else imm = #0");
7092        // Low Rd keeps the compact 16-bit MOVS form.
7093        let lo = enc
7094            .encode(&ArmOp::SetCond {
7095                rd: Reg::R0,
7096                cond: Condition::NE,
7097            })
7098            .unwrap();
7099        assert_eq!(lo.len(), 6, "ITE(2) + MOVS(2) + MOVS(2): {lo:02x?}");
7100        assert_eq!(lo[2..4], [0x01, 0x20], "then = MOVS R0,#1");
7101        assert_eq!(lo[4..6], [0x00, 0x20], "else = MOVS R0,#0");
7102    }
7103
7104    /// #209 Opt 1b: UMULL RdLo, RdHi, Rn, Rm encodes correctly on both ISAs.
7105    /// Thumb-2 T1: 1111 1011 1010 Rn | RdLo RdHi 0000 Rm.
7106    /// A32:        cond 0000 1000 RdHi RdLo Rm 1001 Rn.
7107    #[test]
7108    fn test_encode_umull_209b() {
7109        use synth_synthesis::{ArmOp, Reg};
7110        let op = ArmOp::Umull {
7111            rdlo: Reg::R4,
7112            rdhi: Reg::R5,
7113            rn: Reg::R0,
7114            rm: Reg::R3,
7115        };
7116        // Thumb-2: hw1 = 0xFBA0 | 0 = 0xFBA0; hw2 = (4<<12)|(5<<8)|3 = 0x4503.
7117        let t = ArmEncoder::new_thumb2().encode(&op).unwrap();
7118        assert_eq!(
7119            t,
7120            vec![0xA0, 0xFB, 0x03, 0x45],
7121            "umull r4,r5,r0,r3 (T2): {t:02x?}"
7122        );
7123        // A32: 0xE0800090 | (5<<16) | (4<<12) | (3<<8) | 0 = 0xE0854390.
7124        let a = ArmEncoder::new_arm32().encode(&op).unwrap();
7125        assert_eq!(
7126            a,
7127            0xE085_4390u32.to_le_bytes().to_vec(),
7128            "umull (A32): {a:02x?}"
7129        );
7130    }
7131
7132    /// #206 regression: the ARM32 (A32) `Ldr`/`Str` encoders fed `addr` through
7133    /// `encode_mem_addr`, which returns only the 12-bit immediate — so a register
7134    /// offset (`[rn, rm, #off]`) was silently dropped to `[rn, #off]`, sending
7135    /// the access to the wrong runtime address (silent miscompile on the default
7136    /// `--target arm`). A register offset must materialize `ip = rn + rm` and
7137    /// load from `[ip, #off]`. Verify the bytes.
7138    #[test]
7139    fn test_encode_arm32_indexed_load_keeps_index_206() {
7140        use synth_synthesis::{ArmOp, MemAddr, Reg};
7141        let enc = ArmEncoder::new_arm32();
7142        // ldr r0, [r11, r1, #8]  must NOT collapse to a single immediate ldr.
7143        let bytes = enc
7144            .encode(&ArmOp::Ldr {
7145                rd: Reg::R0,
7146                addr: MemAddr::reg_imm(Reg::R11, Reg::R1, 8),
7147            })
7148            .unwrap();
7149        assert_eq!(
7150            bytes.len(),
7151            8,
7152            "expected ADD ip + LDR (2 words): {bytes:02x?}"
7153        );
7154        let add = u32::from_le_bytes(bytes[0..4].try_into().unwrap());
7155        let ldr = u32::from_le_bytes(bytes[4..8].try_into().unwrap());
7156        // ADD ip, r11, r1  = 0xE08BC001
7157        assert_eq!(add, 0xE08B_C001, "ADD ip,r11,r1: {add:#010x}");
7158        // LDR r0, [ip, #8] = 0xE59C0008
7159        assert_eq!(ldr, 0xE59C_0008, "LDR r0,[ip,#8]: {ldr:#010x}");
7160        // A bare immediate ldr (the bug) would be 0xE59B0008 (base=r11) — reject.
7161        assert_ne!(ldr, 0xE59B_0008, "index must not be dropped");
7162    }
7163
7164    /// #178/#180 regression: the Thumb `Add`/`Adds`/`Subs` reg-forms used the
7165    /// 16-bit encoding unconditionally. For high registers (R12 base scratch,
7166    /// R8-R11 i64 pairs) the 3-bit register fields overflow and corrupt the
7167    /// operands — `add ip,ip,r0` came out as `adds r4,r5,r1` (0x186C), silently
7168    /// dropping the address operand and miscompiling every optimized memory
7169    /// access. High registers must use the 32-bit `.W` forms.
7170    #[test]
7171    fn test_encode_thumb_add_high_reg_uses_add_w_178_180() {
7172        let encoder = ArmEncoder::new_thumb2();
7173
7174        // add ip, ip, r0  — the exact MemLoad/MemStore base+addr op.
7175        let code = encoder
7176            .encode(&ArmOp::Add {
7177                rd: Reg::R12,
7178                rn: Reg::R12,
7179                op2: Operand2::Reg(Reg::R0),
7180            })
7181            .unwrap();
7182        // ADD.W ip, ip, r0 = EB0C 0C00 (little-endian halfwords).
7183        assert_eq!(
7184            code,
7185            vec![0x0C, 0xEB, 0x00, 0x0C],
7186            "high-reg Thumb ADD must be 32-bit ADD.W (EB0C 0C00), not corrupt 16-bit; got {code:02X?}"
7187        );
7188        // Must NOT be the buggy 16-bit 0x186C (`adds r4,r5,r1`).
7189        assert_ne!(code, vec![0x6C, 0x18], "regressed to corrupt 16-bit ADDS");
7190
7191        // Low-register add stays 16-bit (no regression for the common case).
7192        let lo = encoder
7193            .encode(&ArmOp::Add {
7194                rd: Reg::R1,
7195                rn: Reg::R2,
7196                op2: Operand2::Reg(Reg::R3),
7197            })
7198            .unwrap();
7199        assert_eq!(
7200            lo.len(),
7201            2,
7202            "low-reg ADD should remain 16-bit, got {lo:02X?}"
7203        );
7204    }
7205
7206    /// #178/#180 sibling: i64 low-word `Adds`/`Subs` can land in R8-R11 pairs;
7207    /// those must fall back to 32-bit ADDS.W/SUBS.W (flag-setting preserved).
7208    #[test]
7209    fn test_encode_thumb_adds_subs_high_reg_use_32bit_178_180() {
7210        let encoder = ArmEncoder::new_thumb2();
7211
7212        // adds r10, r10, r8  → ADDS.W = EB1A 0A08
7213        let adds = encoder
7214            .encode(&ArmOp::Adds {
7215                rd: Reg::R10,
7216                rn: Reg::R10,
7217                op2: Operand2::Reg(Reg::R8),
7218            })
7219            .unwrap();
7220        assert_eq!(
7221            adds,
7222            vec![0x1A, 0xEB, 0x08, 0x0A],
7223            "high-reg ADDS must be 32-bit ADDS.W (EB1A 0A08); got {adds:02X?}"
7224        );
7225
7226        // subs r10, r10, r8  → SUBS.W = EBBA 0A08
7227        let subs = encoder
7228            .encode(&ArmOp::Subs {
7229                rd: Reg::R10,
7230                rn: Reg::R10,
7231                op2: Operand2::Reg(Reg::R8),
7232            })
7233            .unwrap();
7234        assert_eq!(
7235            subs,
7236            vec![0xBA, 0xEB, 0x08, 0x0A],
7237            "high-reg SUBS must be 32-bit SUBS.W (EBBA 0A08); got {subs:02X?}"
7238        );
7239    }
7240
7241    /// #184 (sibling of #180): 16-bit CMN (T1) only encodes R0-R7. High registers
7242    /// must use 32-bit CMN.W, not the corrupt truncated 16-bit form.
7243    #[test]
7244    fn test_encode_thumb_cmn_high_reg_uses_cmn_w_184() {
7245        let encoder = ArmEncoder::new_thumb2();
7246
7247        // cmn r10, r8  → CMN.W = EB1A 0F08 (ADD.W S=1, Rd=PC discarded).
7248        let cmn = encoder
7249            .encode(&ArmOp::Cmn {
7250                rn: Reg::R10,
7251                op2: Operand2::Reg(Reg::R8),
7252            })
7253            .unwrap();
7254        assert_eq!(
7255            cmn,
7256            vec![0x1A, 0xEB, 0x08, 0x0F],
7257            "high-reg CMN must be 32-bit CMN.W (EB1A 0F08); got {cmn:02X?}"
7258        );
7259
7260        // Low registers stay 16-bit: cmn r1, r2 = 0x42D1.
7261        let lo = encoder
7262            .encode(&ArmOp::Cmn {
7263                rn: Reg::R1,
7264                op2: Operand2::Reg(Reg::R2),
7265            })
7266            .unwrap();
7267        assert_eq!(
7268            lo.len(),
7269            2,
7270            "low-reg CMN should remain 16-bit, got {lo:02X?}"
7271        );
7272        assert_eq!(lo, vec![0xD1, 0x42], "low-reg CMN bytes wrong: {lo:02X?}");
7273    }
7274
7275    /// #185 regression: feeding PC (R15) as a data operand to a Thumb-2 op that
7276    /// guards its registers must return Err, not panic under debug-assertions.
7277    /// (Synth never emits PC here; the fuzz harness requires encode() be total.)
7278    #[test]
7279    fn test_encode_pc_operand_returns_err_not_panic_185() {
7280        let encoder = ArmEncoder::new_thumb2();
7281        for op in [
7282            ArmOp::Sdiv {
7283                rd: Reg::PC,
7284                rn: Reg::R0,
7285                rm: Reg::R1,
7286            },
7287            ArmOp::Udiv {
7288                rd: Reg::R0,
7289                rn: Reg::PC,
7290                rm: Reg::R1,
7291            },
7292            ArmOp::Sdiv {
7293                rd: Reg::R0,
7294                rn: Reg::R1,
7295                rm: Reg::PC,
7296            },
7297        ] {
7298            let r = encoder.encode(&op);
7299            assert!(
7300                r.is_err(),
7301                "encode({op:?}) must return Err for a PC operand, got {r:?}"
7302            );
7303        }
7304        // Valid registers still encode fine (no false rejection).
7305        assert!(
7306            encoder
7307                .encode(&ArmOp::Sdiv {
7308                    rd: Reg::R0,
7309                    rn: Reg::R1,
7310                    rm: Reg::R2
7311                })
7312                .is_ok()
7313        );
7314    }
7315
7316    #[test]
7317    fn test_encode_nop_arm32() {
7318        let encoder = ArmEncoder::new_arm32();
7319        let code = encoder.encode(&ArmOp::Nop).unwrap();
7320
7321        assert_eq!(code.len(), 4); // ARM32 instructions are 4 bytes
7322        assert_eq!(code, vec![0x00, 0x00, 0xA0, 0xE1]); // MOV R0, R0
7323    }
7324
7325    #[test]
7326    fn test_encode_nop_thumb() {
7327        let encoder = ArmEncoder::new_thumb2();
7328        let code = encoder.encode(&ArmOp::Nop).unwrap();
7329
7330        assert_eq!(code.len(), 2); // Thumb instructions are 2 bytes
7331        assert_eq!(code, vec![0x00, 0xBF]); // NOP
7332    }
7333
7334    #[test]
7335    fn test_encode_mov_immediate_arm32() {
7336        let encoder = ArmEncoder::new_arm32();
7337        let op = ArmOp::Mov {
7338            rd: Reg::R0,
7339            op2: Operand2::Imm(42),
7340        };
7341
7342        let code = encoder.encode(&op).unwrap();
7343        assert_eq!(code.len(), 4);
7344
7345        // Verify it's a MOV instruction (bits should have immediate flag set)
7346        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7347        assert_eq!(instr & 0x0E000000, 0x02000000); // Check I bit is set
7348    }
7349
7350    #[test]
7351    fn test_encode_add_registers_arm32() {
7352        let encoder = ArmEncoder::new_arm32();
7353        let op = ArmOp::Add {
7354            rd: Reg::R0,
7355            rn: Reg::R1,
7356            op2: Operand2::Reg(Reg::R2),
7357        };
7358
7359        let code = encoder.encode(&op).unwrap();
7360        assert_eq!(code.len(), 4);
7361
7362        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7363        // Verify it's an ADD instruction with correct opcode
7364        assert_eq!(instr & 0x0FE00000, 0x00800000);
7365    }
7366
7367    #[test]
7368    fn test_encode_ldr_arm32() {
7369        let encoder = ArmEncoder::new_arm32();
7370        let op = ArmOp::Ldr {
7371            rd: Reg::R0,
7372            addr: MemAddr::imm(Reg::R1, 4),
7373        };
7374
7375        let code = encoder.encode(&op).unwrap();
7376        assert_eq!(code.len(), 4);
7377
7378        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7379        // Verify load bit is set
7380        assert_eq!(instr & 0x00100000, 0x00100000);
7381    }
7382
7383    #[test]
7384    fn test_encode_str_arm32() {
7385        let encoder = ArmEncoder::new_arm32();
7386        let op = ArmOp::Str {
7387            rd: Reg::R0,
7388            addr: MemAddr::imm(Reg::SP, 0),
7389        };
7390
7391        let code = encoder.encode(&op).unwrap();
7392        assert_eq!(code.len(), 4);
7393    }
7394
7395    #[test]
7396    fn test_encode_branch_arm32() {
7397        let encoder = ArmEncoder::new_arm32();
7398        let op = ArmOp::Bl {
7399            label: "main".to_string(),
7400        };
7401
7402        let code = encoder.encode(&op).unwrap();
7403        assert_eq!(code.len(), 4);
7404
7405        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7406        // Verify BL opcode
7407        assert_eq!(instr & 0x0F000000, 0x0B000000);
7408    }
7409
7410    /// Regression test for #167 + #174: the Thumb-2 BL relocatable placeholder
7411    /// must carry a -4 addend so an R_ARM_THM_CALL nets to exactly the symbol S.
7412    /// The correct encoding is what `gas` emits for `bl <extern>`: f7ff fffe
7413    /// (hw1=0xF7FF, hw2=0xFFFE), little-endian bytes FF F7 FE FF.
7414    ///   - 0xD000 (J1=J2=0) → ~+0x600000 garbage addend: `bl c0000c` / truncated
7415    ///     to fit (#167).
7416    ///   - 0xF800 (addend 0) → lands at S+4, one instruction past the callee
7417    ///     entry (#174).
7418    ///   - 0xFFFE (addend -4) → lands at S. Correct.
7419    #[test]
7420    fn test_encode_thumb_bl_placeholder_addend_167_174() {
7421        let encoder = ArmEncoder::new_thumb2();
7422        let op = ArmOp::Bl {
7423            label: "callee".to_string(),
7424        };
7425
7426        let code = encoder.encode(&op).unwrap();
7427        assert_eq!(code.len(), 4, "Thumb-2 BL is 32-bit");
7428
7429        let hw1 = u16::from_le_bytes([code[0], code[1]]);
7430        let hw2 = u16::from_le_bytes([code[2], code[3]]);
7431        assert_eq!(hw1, 0xF7FF, "BL first halfword (matches gas `bl <extern>`)");
7432        assert_eq!(
7433            hw2, 0xFFFE,
7434            "BL second halfword must be 0xFFFE (-4 addend → nets to S), not 0xF800 (→ S+4, #174) or 0xD000 (#167)"
7435        );
7436        assert_ne!(hw2, 0xF800, "0xF800 (addend 0) lands at S+4 (#174)");
7437        assert_ne!(hw2, 0xD000, "0xD000 bakes in a ~+0x600000 addend (#167)");
7438    }
7439
7440    #[test]
7441    fn test_encode_sequence() {
7442        let encoder = ArmEncoder::new_arm32();
7443        let ops = vec![
7444            ArmOp::Mov {
7445                rd: Reg::R0,
7446                op2: Operand2::Imm(42),
7447            },
7448            ArmOp::Mov {
7449                rd: Reg::R1,
7450                op2: Operand2::Imm(10),
7451            },
7452            ArmOp::Add {
7453                rd: Reg::R2,
7454                rn: Reg::R0,
7455                op2: Operand2::Reg(Reg::R1),
7456            },
7457        ];
7458
7459        let code = encoder.encode_sequence(&ops).unwrap();
7460        assert_eq!(code.len(), 12); // 3 instructions * 4 bytes
7461    }
7462
7463    #[test]
7464    fn test_reg_to_bits() {
7465        assert_eq!(reg_to_bits(&Reg::R0), 0);
7466        assert_eq!(reg_to_bits(&Reg::R7), 7);
7467        assert_eq!(reg_to_bits(&Reg::SP), 13);
7468        assert_eq!(reg_to_bits(&Reg::LR), 14);
7469        assert_eq!(reg_to_bits(&Reg::PC), 15);
7470    }
7471
7472    #[test]
7473    fn test_encode_bitwise_operations() {
7474        let encoder = ArmEncoder::new_arm32();
7475
7476        let and_op = ArmOp::And {
7477            rd: Reg::R0,
7478            rn: Reg::R1,
7479            op2: Operand2::Reg(Reg::R2),
7480        };
7481        let and_code = encoder.encode(&and_op).unwrap();
7482        assert_eq!(and_code.len(), 4);
7483
7484        let orr_op = ArmOp::Orr {
7485            rd: Reg::R0,
7486            rn: Reg::R1,
7487            op2: Operand2::Reg(Reg::R2),
7488        };
7489        let orr_code = encoder.encode(&orr_op).unwrap();
7490        assert_eq!(orr_code.len(), 4);
7491
7492        let eor_op = ArmOp::Eor {
7493            rd: Reg::R0,
7494            rn: Reg::R1,
7495            op2: Operand2::Reg(Reg::R2),
7496        };
7497        let eor_code = encoder.encode(&eor_op).unwrap();
7498        assert_eq!(eor_code.len(), 4);
7499    }
7500
7501    // === Thumb-2 32-bit encoding tests ===
7502
7503    #[test]
7504    fn test_encode_sdiv_thumb2() {
7505        let encoder = ArmEncoder::new_thumb2();
7506        let op = ArmOp::Sdiv {
7507            rd: Reg::R0,
7508            rn: Reg::R1,
7509            rm: Reg::R2,
7510        };
7511
7512        let code = encoder.encode(&op).unwrap();
7513        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7514
7515        // SDIV R0, R1, R2: 0xFB91 0xF0F2
7516        // First halfword: 0xFB90 | Rn(1) = 0xFB91
7517        // Second halfword: 0xF0F0 | Rd(0)<<8 | Rm(2) = 0xF0F2
7518        // Little-endian: [0x91, 0xFB, 0xF2, 0xF0]
7519        assert_eq!(code[0], 0x91);
7520        assert_eq!(code[1], 0xFB);
7521        assert_eq!(code[2], 0xF2);
7522        assert_eq!(code[3], 0xF0);
7523    }
7524
7525    #[test]
7526    fn test_encode_udiv_thumb2() {
7527        let encoder = ArmEncoder::new_thumb2();
7528        let op = ArmOp::Udiv {
7529            rd: Reg::R0,
7530            rn: Reg::R1,
7531            rm: Reg::R2,
7532        };
7533
7534        let code = encoder.encode(&op).unwrap();
7535        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7536
7537        // UDIV R0, R1, R2: 0xFBB1 0xF0F2
7538        // Little-endian: [0xB1, 0xFB, 0xF2, 0xF0]
7539        assert_eq!(code[0], 0xB1);
7540        assert_eq!(code[1], 0xFB);
7541        assert_eq!(code[2], 0xF2);
7542        assert_eq!(code[3], 0xF0);
7543    }
7544
7545    #[test]
7546    fn test_encode_mul_thumb2() {
7547        let encoder = ArmEncoder::new_thumb2();
7548        let op = ArmOp::Mul {
7549            rd: Reg::R0,
7550            rn: Reg::R1,
7551            rm: Reg::R2,
7552        };
7553
7554        let code = encoder.encode(&op).unwrap();
7555        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7556    }
7557
7558    #[test]
7559    fn test_encode_and_thumb2() {
7560        let encoder = ArmEncoder::new_thumb2();
7561        let op = ArmOp::And {
7562            rd: Reg::R0,
7563            rn: Reg::R1,
7564            op2: Operand2::Reg(Reg::R2),
7565        };
7566
7567        let code = encoder.encode(&op).unwrap();
7568        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7569    }
7570
7571    #[test]
7572    fn test_encode_lsl_thumb2_low_regs() {
7573        let encoder = ArmEncoder::new_thumb2();
7574        let op = ArmOp::Lsl {
7575            rd: Reg::R0,
7576            rn: Reg::R1,
7577            shift: 5,
7578        };
7579
7580        let code = encoder.encode(&op).unwrap();
7581        assert_eq!(code.len(), 2); // 16-bit for low registers
7582    }
7583
7584    #[test]
7585    fn test_encode_clz_thumb2() {
7586        let encoder = ArmEncoder::new_thumb2();
7587        let op = ArmOp::Clz {
7588            rd: Reg::R0,
7589            rm: Reg::R1,
7590        };
7591
7592        let code = encoder.encode(&op).unwrap();
7593        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7594    }
7595
7596    #[test]
7597    fn test_encode_bx_thumb2() {
7598        let encoder = ArmEncoder::new_thumb2();
7599        let op = ArmOp::Bx { rm: Reg::LR };
7600
7601        let code = encoder.encode(&op).unwrap();
7602        assert_eq!(code.len(), 2); // 16-bit instruction
7603
7604        // BX LR: 0x4770
7605        assert_eq!(code, vec![0x70, 0x47]);
7606    }
7607
7608    // ========================================================================
7609    // f32 pseudo-op encoding tests
7610    // ========================================================================
7611
7612    #[test]
7613    fn test_encode_f32_abs_arm32() {
7614        let encoder = ArmEncoder::new_arm32();
7615        let op = ArmOp::F32Abs {
7616            sd: VfpReg::S0,
7617            sm: VfpReg::S2,
7618        };
7619        let code = encoder.encode(&op).unwrap();
7620        assert_eq!(code.len(), 4); // Single VFP instruction
7621    }
7622
7623    #[test]
7624    fn test_encode_f32_neg_arm32() {
7625        let encoder = ArmEncoder::new_arm32();
7626        let op = ArmOp::F32Neg {
7627            sd: VfpReg::S0,
7628            sm: VfpReg::S2,
7629        };
7630        let code = encoder.encode(&op).unwrap();
7631        assert_eq!(code.len(), 4);
7632    }
7633
7634    #[test]
7635    fn test_encode_f32_sqrt_arm32() {
7636        let encoder = ArmEncoder::new_arm32();
7637        let op = ArmOp::F32Sqrt {
7638            sd: VfpReg::S0,
7639            sm: VfpReg::S2,
7640        };
7641        let code = encoder.encode(&op).unwrap();
7642        assert_eq!(code.len(), 4);
7643    }
7644
7645    #[test]
7646    fn test_encode_f32_ceil_arm32() {
7647        let encoder = ArmEncoder::new_arm32();
7648        let op = ArmOp::F32Ceil {
7649            sd: VfpReg::S0,
7650            sm: VfpReg::S2,
7651        };
7652        let code = encoder.encode(&op).unwrap();
7653        // VMRS + BIC + ORR + VMSR + VCVT.S32.F32 + VMRS + BIC + VMSR + VCVT.F32.S32
7654        assert_eq!(code.len(), 36);
7655    }
7656
7657    #[test]
7658    fn test_encode_f32_floor_thumb2() {
7659        let encoder = ArmEncoder::new_thumb2();
7660        let op = ArmOp::F32Floor {
7661            sd: VfpReg::S0,
7662            sm: VfpReg::S2,
7663        };
7664        let code = encoder.encode(&op).unwrap();
7665        // VMRS + BIC.W + ORR.W + VMSR + VCVT + VMRS + BIC.W + VMSR + VCVT.F32.S32
7666        assert_eq!(code.len(), 36);
7667    }
7668
7669    #[test]
7670    fn test_encode_f32_min_arm32() {
7671        let encoder = ArmEncoder::new_arm32();
7672        let op = ArmOp::F32Min {
7673            sd: VfpReg::S0,
7674            sn: VfpReg::S2,
7675            sm: VfpReg::S4,
7676        };
7677        let code = encoder.encode(&op).unwrap();
7678        assert_eq!(code.len(), 16); // VMOV + VCMP + VMRS + conditional VMOV
7679    }
7680
7681    #[test]
7682    fn test_encode_f32_max_thumb2() {
7683        let encoder = ArmEncoder::new_thumb2();
7684        let op = ArmOp::F32Max {
7685            sd: VfpReg::S0,
7686            sn: VfpReg::S2,
7687            sm: VfpReg::S4,
7688        };
7689        let code = encoder.encode(&op).unwrap();
7690        // VMOV(4) + VCMP(4) + VMRS(4) + IT(2) + VMOV(4) = 18
7691        assert_eq!(code.len(), 18);
7692    }
7693
7694    #[test]
7695    fn test_encode_f32_copysign_arm32() {
7696        let encoder = ArmEncoder::new_arm32();
7697        let op = ArmOp::F32Copysign {
7698            sd: VfpReg::S0,
7699            sn: VfpReg::S2,
7700            sm: VfpReg::S4,
7701        };
7702        let code = encoder.encode(&op).unwrap();
7703        // VMOV + VMOV + AND + BIC + ORR + VMOV = 6 * 4 = 24
7704        assert_eq!(code.len(), 24);
7705    }
7706
7707    // ========================================================================
7708    // f64 encoding tests
7709    // ========================================================================
7710
7711    #[test]
7712    fn test_encode_f64_add_arm32() {
7713        let encoder = ArmEncoder::new_arm32();
7714        let op = ArmOp::F64Add {
7715            dd: VfpReg::D0,
7716            dn: VfpReg::D1,
7717            dm: VfpReg::D2,
7718        };
7719        let code = encoder.encode(&op).unwrap();
7720        assert_eq!(code.len(), 4);
7721        // VADD.F64 D0, D1, D2: check coprocessor is cp11 (0xB)
7722        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7723        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
7724    }
7725
7726    #[test]
7727    fn test_encode_f64_sub_thumb2() {
7728        let encoder = ArmEncoder::new_thumb2();
7729        let op = ArmOp::F64Sub {
7730            dd: VfpReg::D0,
7731            dn: VfpReg::D1,
7732            dm: VfpReg::D2,
7733        };
7734        let code = encoder.encode(&op).unwrap();
7735        assert_eq!(code.len(), 4); // 32-bit VFP as two Thumb halfwords
7736    }
7737
7738    #[test]
7739    fn test_encode_f64_mul_arm32() {
7740        let encoder = ArmEncoder::new_arm32();
7741        let op = ArmOp::F64Mul {
7742            dd: VfpReg::D0,
7743            dn: VfpReg::D1,
7744            dm: VfpReg::D2,
7745        };
7746        let code = encoder.encode(&op).unwrap();
7747        assert_eq!(code.len(), 4);
7748    }
7749
7750    #[test]
7751    fn test_encode_f64_div_arm32() {
7752        let encoder = ArmEncoder::new_arm32();
7753        let op = ArmOp::F64Div {
7754            dd: VfpReg::D0,
7755            dn: VfpReg::D1,
7756            dm: VfpReg::D2,
7757        };
7758        let code = encoder.encode(&op).unwrap();
7759        assert_eq!(code.len(), 4);
7760    }
7761
7762    #[test]
7763    fn test_encode_f64_abs_arm32() {
7764        let encoder = ArmEncoder::new_arm32();
7765        let op = ArmOp::F64Abs {
7766            dd: VfpReg::D0,
7767            dm: VfpReg::D2,
7768        };
7769        let code = encoder.encode(&op).unwrap();
7770        assert_eq!(code.len(), 4);
7771    }
7772
7773    #[test]
7774    fn test_encode_f64_neg_arm32() {
7775        let encoder = ArmEncoder::new_arm32();
7776        let op = ArmOp::F64Neg {
7777            dd: VfpReg::D0,
7778            dm: VfpReg::D2,
7779        };
7780        let code = encoder.encode(&op).unwrap();
7781        assert_eq!(code.len(), 4);
7782    }
7783
7784    #[test]
7785    fn test_encode_f64_sqrt_arm32() {
7786        let encoder = ArmEncoder::new_arm32();
7787        let op = ArmOp::F64Sqrt {
7788            dd: VfpReg::D0,
7789            dm: VfpReg::D2,
7790        };
7791        let code = encoder.encode(&op).unwrap();
7792        assert_eq!(code.len(), 4);
7793    }
7794
7795    #[test]
7796    fn test_encode_f64_load_arm32() {
7797        let encoder = ArmEncoder::new_arm32();
7798        let op = ArmOp::F64Load {
7799            dd: VfpReg::D0,
7800            addr: MemAddr::imm(Reg::R0, 8),
7801        };
7802        let code = encoder.encode(&op).unwrap();
7803        assert_eq!(code.len(), 4);
7804        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7805        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11 for F64
7806        assert_eq!(instr & 0xFF, 2); // offset 8 / 4 = 2
7807    }
7808
7809    #[test]
7810    fn test_encode_f64_store_thumb2() {
7811        let encoder = ArmEncoder::new_thumb2();
7812        let op = ArmOp::F64Store {
7813            dd: VfpReg::D0,
7814            addr: MemAddr::imm(Reg::SP, 0),
7815        };
7816        let code = encoder.encode(&op).unwrap();
7817        assert_eq!(code.len(), 4);
7818    }
7819
7820    #[test]
7821    fn test_encode_f64_compare_arm32() {
7822        let encoder = ArmEncoder::new_arm32();
7823        let op = ArmOp::F64Eq {
7824            rd: Reg::R0,
7825            dn: VfpReg::D0,
7826            dm: VfpReg::D1,
7827        };
7828        let code = encoder.encode(&op).unwrap();
7829        assert_eq!(code.len(), 16); // VCMP + VMRS + MOV #0 + MOVcond #1
7830    }
7831
7832    #[test]
7833    fn test_encode_f64_compare_thumb2() {
7834        let encoder = ArmEncoder::new_thumb2();
7835        let op = ArmOp::F64Lt {
7836            rd: Reg::R0,
7837            dn: VfpReg::D0,
7838            dm: VfpReg::D1,
7839        };
7840        let code = encoder.encode(&op).unwrap();
7841        // VCMP(4) + VMRS(4) + MOVS(2) + IT(2) + MOV(2) = 14
7842        assert_eq!(code.len(), 14);
7843    }
7844
7845    #[test]
7846    fn test_encode_f64_const_arm32() {
7847        let encoder = ArmEncoder::new_arm32();
7848        let op = ArmOp::F64Const {
7849            dd: VfpReg::D0,
7850            value: 3.125,
7851        };
7852        let code = encoder.encode(&op).unwrap();
7853        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
7854        assert_eq!(code.len(), 20);
7855    }
7856
7857    #[test]
7858    fn test_encode_f64_const_thumb2() {
7859        let encoder = ArmEncoder::new_thumb2();
7860        let op = ArmOp::F64Const {
7861            dd: VfpReg::D0,
7862            value: 2.5,
7863        };
7864        let code = encoder.encode(&op).unwrap();
7865        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
7866        assert_eq!(code.len(), 20);
7867    }
7868
7869    #[test]
7870    fn test_encode_f64_convert_i32s_arm32() {
7871        let encoder = ArmEncoder::new_arm32();
7872        let op = ArmOp::F64ConvertI32S {
7873            dd: VfpReg::D0,
7874            rm: Reg::R0,
7875        };
7876        let code = encoder.encode(&op).unwrap();
7877        // VMOV(4) + VCVT(4) = 8
7878        assert_eq!(code.len(), 8);
7879    }
7880
7881    #[test]
7882    fn test_encode_f64_promote_f32_arm32() {
7883        let encoder = ArmEncoder::new_arm32();
7884        let op = ArmOp::F64PromoteF32 {
7885            dd: VfpReg::D0,
7886            sm: VfpReg::S0,
7887        };
7888        let code = encoder.encode(&op).unwrap();
7889        assert_eq!(code.len(), 4); // Single VCVT.F64.F32 instruction
7890    }
7891
7892    #[test]
7893    fn test_encode_f64_promote_f32_thumb2() {
7894        let encoder = ArmEncoder::new_thumb2();
7895        let op = ArmOp::F64PromoteF32 {
7896            dd: VfpReg::D0,
7897            sm: VfpReg::S0,
7898        };
7899        let code = encoder.encode(&op).unwrap();
7900        assert_eq!(code.len(), 4);
7901    }
7902
7903    #[test]
7904    fn test_encode_i32_trunc_f64s_arm32() {
7905        let encoder = ArmEncoder::new_arm32();
7906        let op = ArmOp::I32TruncF64S {
7907            rd: Reg::R0,
7908            dm: VfpReg::D0,
7909        };
7910        let code = encoder.encode(&op).unwrap();
7911        // VCVT(4) + VMOV(4) = 8
7912        assert_eq!(code.len(), 8);
7913    }
7914
7915    #[test]
7916    fn test_encode_f64_reinterpret_i64_arm32() {
7917        let encoder = ArmEncoder::new_arm32();
7918        let op = ArmOp::F64ReinterpretI64 {
7919            dd: VfpReg::D0,
7920            rmlo: Reg::R0,
7921            rmhi: Reg::R1,
7922        };
7923        let code = encoder.encode(&op).unwrap();
7924        assert_eq!(code.len(), 4); // Single VMOV instruction
7925    }
7926
7927    #[test]
7928    fn test_encode_i64_reinterpret_f64_thumb2() {
7929        let encoder = ArmEncoder::new_thumb2();
7930        let op = ArmOp::I64ReinterpretF64 {
7931            rdlo: Reg::R0,
7932            rdhi: Reg::R1,
7933            dm: VfpReg::D0,
7934        };
7935        let code = encoder.encode(&op).unwrap();
7936        assert_eq!(code.len(), 4);
7937    }
7938
7939    #[test]
7940    fn test_encode_f64_trunc_thumb2() {
7941        let encoder = ArmEncoder::new_thumb2();
7942        let op = ArmOp::F64Trunc {
7943            dd: VfpReg::D0,
7944            dm: VfpReg::D1,
7945        };
7946        let code = encoder.encode(&op).unwrap();
7947        // Two VFP instructions via Thumb encoding
7948        assert_eq!(code.len(), 8);
7949    }
7950
7951    #[test]
7952    fn test_encode_f64_min_arm32() {
7953        let encoder = ArmEncoder::new_arm32();
7954        let op = ArmOp::F64Min {
7955            dd: VfpReg::D0,
7956            dn: VfpReg::D1,
7957            dm: VfpReg::D2,
7958        };
7959        let code = encoder.encode(&op).unwrap();
7960        // VMOV + VCMP + VMRS + conditional VMOV = 16
7961        assert_eq!(code.len(), 16);
7962    }
7963
7964    #[test]
7965    fn test_f64_cp11_encoding() {
7966        // Verify that F64 instructions use coprocessor 11 (0xB), not 10 (0xA)
7967        let encoder = ArmEncoder::new_arm32();
7968
7969        // F64Add
7970        let code = encoder
7971            .encode(&ArmOp::F64Add {
7972                dd: VfpReg::D0,
7973                dn: VfpReg::D0,
7974                dm: VfpReg::D0,
7975            })
7976            .unwrap();
7977        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7978        assert_eq!((instr >> 8) & 0xF, 0xB, "F64 should use cp11");
7979
7980        // F32Add for comparison
7981        let code = encoder
7982            .encode(&ArmOp::F32Add {
7983                sd: VfpReg::S0,
7984                sn: VfpReg::S0,
7985                sm: VfpReg::S0,
7986            })
7987            .unwrap();
7988        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7989        assert_eq!((instr >> 8) & 0xF, 0xA, "F32 should use cp10");
7990    }
7991
7992    #[test]
7993    fn test_dreg_encoding_higher_registers() {
7994        let encoder = ArmEncoder::new_arm32();
7995
7996        // Test with D15 (highest register)
7997        let op = ArmOp::F64Add {
7998            dd: VfpReg::D15,
7999            dn: VfpReg::D14,
8000            dm: VfpReg::D13,
8001        };
8002        let code = encoder.encode(&op).unwrap();
8003        assert_eq!(code.len(), 4);
8004
8005        // Verify the register encoding worked (instruction is valid)
8006        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8007        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
8008    }
8009
8010    // ========================================================================
8011    // Control flow encoding tests
8012    // ========================================================================
8013
8014    #[test]
8015    fn test_encode_label_emits_no_bytes() {
8016        let encoder = ArmEncoder::new_thumb2();
8017        let op = ArmOp::Label {
8018            name: ".Lblock_end_0".to_string(),
8019        };
8020        let code = encoder.encode(&op).unwrap();
8021        assert!(code.is_empty(), "Label should emit zero bytes");
8022
8023        let encoder32 = ArmEncoder::new_arm32();
8024        let code32 = encoder32.encode(&op).unwrap();
8025        assert!(
8026            code32.is_empty(),
8027            "Label should emit zero bytes in ARM32 too"
8028        );
8029    }
8030
8031    #[test]
8032    fn test_encode_bcc_eq_thumb2() {
8033        use synth_synthesis::Condition;
8034        let encoder = ArmEncoder::new_thumb2();
8035        let op = ArmOp::Bcc {
8036            cond: Condition::EQ,
8037            label: "target".to_string(),
8038        };
8039        let code = encoder.encode(&op).unwrap();
8040        assert_eq!(code.len(), 2); // 16-bit conditional branch
8041
8042        // BEQ with offset 0: 0xD000 in little-endian
8043        assert_eq!(code, vec![0x00, 0xD0]);
8044    }
8045
8046    #[test]
8047    fn test_encode_bcc_ne_thumb2() {
8048        use synth_synthesis::Condition;
8049        let encoder = ArmEncoder::new_thumb2();
8050        let op = ArmOp::Bcc {
8051            cond: Condition::NE,
8052            label: "target".to_string(),
8053        };
8054        let code = encoder.encode(&op).unwrap();
8055        assert_eq!(code.len(), 2);
8056
8057        // BNE with offset 0: 0xD100 in little-endian
8058        assert_eq!(code, vec![0x00, 0xD1]);
8059    }
8060
8061    #[test]
8062    fn test_encode_bcc_arm32() {
8063        use synth_synthesis::Condition;
8064        let encoder = ArmEncoder::new_arm32();
8065        let op = ArmOp::Bcc {
8066            cond: Condition::EQ,
8067            label: "target".to_string(),
8068        };
8069        let code = encoder.encode(&op).unwrap();
8070        assert_eq!(code.len(), 4); // 32-bit ARM instruction
8071
8072        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8073        // BEQ: cond=0x0, opcode=0xA, offset=0
8074        assert_eq!(instr & 0xF0000000, 0x00000000); // EQ condition
8075        assert_eq!(instr & 0x0F000000, 0x0A000000); // Branch opcode
8076    }
8077
8078    #[test]
8079    fn test_encode_udf_thumb2() {
8080        let encoder = ArmEncoder::new_thumb2();
8081        let op = ArmOp::Udf { imm: 0 };
8082        let code = encoder.encode(&op).unwrap();
8083        assert_eq!(code.len(), 2); // 16-bit
8084
8085        // UDF #0: 0xDE00 in little-endian
8086        assert_eq!(code, vec![0x00, 0xDE]);
8087    }
8088
8089    #[test]
8090    fn test_encode_nop_thumb2() {
8091        let encoder = ArmEncoder::new_thumb2();
8092        let op = ArmOp::Nop;
8093        let code = encoder.encode(&op).unwrap();
8094        assert_eq!(code.len(), 2); // 16-bit
8095
8096        // NOP: 0xBF00 in little-endian
8097        assert_eq!(code, vec![0x00, 0xBF]);
8098    }
8099
8100    // =========================================================================
8101    // i64 Thumb-2 encoding tests
8102    // =========================================================================
8103
8104    #[test]
8105    fn test_encode_i64_add_thumb2() {
8106        let encoder = ArmEncoder::new_thumb2();
8107        let op = ArmOp::I64Add {
8108            rdlo: Reg::R0,
8109            rdhi: Reg::R1,
8110            rnlo: Reg::R0,
8111            rnhi: Reg::R1,
8112            rmlo: Reg::R2,
8113            rmhi: Reg::R3,
8114        };
8115        let code = encoder.encode(&op).unwrap();
8116        // Should emit ADDS (2 bytes) + ADC.W (4 bytes) = 6 bytes
8117        assert_eq!(code.len(), 6, "I64Add should be 6 bytes (ADDS + ADC.W)");
8118    }
8119
8120    #[test]
8121    fn test_encode_i64_sub_thumb2() {
8122        let encoder = ArmEncoder::new_thumb2();
8123        let op = ArmOp::I64Sub {
8124            rdlo: Reg::R0,
8125            rdhi: Reg::R1,
8126            rnlo: Reg::R0,
8127            rnhi: Reg::R1,
8128            rmlo: Reg::R2,
8129            rmhi: Reg::R3,
8130        };
8131        let code = encoder.encode(&op).unwrap();
8132        // Should emit SUBS (2 bytes) + SBC.W (4 bytes) = 6 bytes
8133        assert_eq!(code.len(), 6, "I64Sub should be 6 bytes (SUBS + SBC.W)");
8134    }
8135
8136    #[test]
8137    fn test_encode_i64_and_thumb2() {
8138        let encoder = ArmEncoder::new_thumb2();
8139        let op = ArmOp::I64And {
8140            rdlo: Reg::R0,
8141            rdhi: Reg::R1,
8142            rnlo: Reg::R0,
8143            rnhi: Reg::R1,
8144            rmlo: Reg::R2,
8145            rmhi: Reg::R3,
8146        };
8147        let code = encoder.encode(&op).unwrap();
8148        // AND.W (4 bytes) + AND.W (4 bytes) = 8 bytes
8149        assert!(code.len() >= 4, "I64And should emit at least 4 bytes");
8150    }
8151
8152    #[test]
8153    fn test_encode_i64_or_thumb2() {
8154        let encoder = ArmEncoder::new_thumb2();
8155        let op = ArmOp::I64Or {
8156            rdlo: Reg::R0,
8157            rdhi: Reg::R1,
8158            rnlo: Reg::R0,
8159            rnhi: Reg::R1,
8160            rmlo: Reg::R2,
8161            rmhi: Reg::R3,
8162        };
8163        let code = encoder.encode(&op).unwrap();
8164        assert!(code.len() >= 4, "I64Or should emit at least 4 bytes");
8165    }
8166
8167    #[test]
8168    fn test_encode_i64_xor_thumb2() {
8169        let encoder = ArmEncoder::new_thumb2();
8170        let op = ArmOp::I64Xor {
8171            rdlo: Reg::R0,
8172            rdhi: Reg::R1,
8173            rnlo: Reg::R0,
8174            rnhi: Reg::R1,
8175            rmlo: Reg::R2,
8176            rmhi: Reg::R3,
8177        };
8178        let code = encoder.encode(&op).unwrap();
8179        assert!(code.len() >= 4, "I64Xor should emit at least 4 bytes");
8180    }
8181
8182    #[test]
8183    fn test_encode_i64_const_small_thumb2() {
8184        let encoder = ArmEncoder::new_thumb2();
8185        // Small constant: only needs MOVW for each half
8186        let op = ArmOp::I64Const {
8187            rdlo: Reg::R0,
8188            rdhi: Reg::R1,
8189            value: 42,
8190        };
8191        let code = encoder.encode(&op).unwrap();
8192        // MOVW R0, #42 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes minimum
8193        assert!(code.len() >= 8, "I64Const should emit at least 8 bytes");
8194    }
8195
8196    #[test]
8197    fn test_encode_i64_const_large_thumb2() {
8198        let encoder = ArmEncoder::new_thumb2();
8199        // Large constant: needs MOVW+MOVT for each half
8200        let op = ArmOp::I64Const {
8201            rdlo: Reg::R0,
8202            rdhi: Reg::R1,
8203            value: 0x1234_5678_9ABC_DEF0_u64 as i64,
8204        };
8205        let code = encoder.encode(&op).unwrap();
8206        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8207        assert_eq!(
8208            code.len(),
8209            16,
8210            "I64Const with large value should be 16 bytes"
8211        );
8212    }
8213
8214    #[test]
8215    fn test_encode_i64_extend_i32_s_thumb2() {
8216        let encoder = ArmEncoder::new_thumb2();
8217        let op = ArmOp::I64ExtendI32S {
8218            rdlo: Reg::R0,
8219            rdhi: Reg::R1,
8220            rn: Reg::R0,
8221        };
8222        let code = encoder.encode(&op).unwrap();
8223        // When rdlo == rn, only ASR (4 bytes) is emitted
8224        assert_eq!(
8225            code.len(),
8226            4,
8227            "I64ExtendI32S (same reg) should be 4 bytes (ASR only)"
8228        );
8229    }
8230
8231    #[test]
8232    fn test_encode_i64_extend_i32_s_diff_reg_thumb2() {
8233        let encoder = ArmEncoder::new_thumb2();
8234        let op = ArmOp::I64ExtendI32S {
8235            rdlo: Reg::R0,
8236            rdhi: Reg::R1,
8237            rn: Reg::R2,
8238        };
8239        let code = encoder.encode(&op).unwrap();
8240        // MOV rdlo, rn (2 bytes for low regs) + ASR rdhi, rdlo, #31 (4 bytes) = 6 bytes
8241        assert!(
8242            code.len() >= 6,
8243            "I64ExtendI32S (diff reg) should be at least 6 bytes"
8244        );
8245    }
8246
8247    #[test]
8248    fn test_encode_i64_extend_i32_u_thumb2() {
8249        let encoder = ArmEncoder::new_thumb2();
8250        let op = ArmOp::I64ExtendI32U {
8251            rdlo: Reg::R0,
8252            rdhi: Reg::R1,
8253            rn: Reg::R0,
8254        };
8255        let code = encoder.encode(&op).unwrap();
8256        // When rdlo == rn, only MOV rdhi, #0 (2 bytes) is emitted
8257        assert_eq!(
8258            code.len(),
8259            2,
8260            "I64ExtendI32U (same reg) should be 2 bytes (MOV #0 only)"
8261        );
8262    }
8263
8264    #[test]
8265    fn test_encode_i32_wrap_i64_nop_thumb2() {
8266        let encoder = ArmEncoder::new_thumb2();
8267        // When rd == rnlo, should be a NOP
8268        let op = ArmOp::I32WrapI64 {
8269            rd: Reg::R0,
8270            rnlo: Reg::R0,
8271        };
8272        let code = encoder.encode(&op).unwrap();
8273        assert_eq!(code.len(), 2, "I32WrapI64 same reg should be NOP (2 bytes)");
8274        assert_eq!(code, vec![0x00, 0xBF]); // NOP
8275    }
8276
8277    #[test]
8278    fn test_encode_i32_wrap_i64_diff_reg_thumb2() {
8279        let encoder = ArmEncoder::new_thumb2();
8280        let op = ArmOp::I32WrapI64 {
8281            rd: Reg::R2,
8282            rnlo: Reg::R0,
8283        };
8284        let code = encoder.encode(&op).unwrap();
8285        // MOV R2, R0 (2 or 4 bytes)
8286        assert!(
8287            code.len() >= 2,
8288            "I32WrapI64 diff reg should emit at least 2 bytes"
8289        );
8290    }
8291
8292    #[test]
8293    fn test_encode_i64_eqz_thumb2() {
8294        let encoder = ArmEncoder::new_thumb2();
8295        let op = ArmOp::I64Eqz {
8296            rd: Reg::R0,
8297            rnlo: Reg::R0,
8298            rnhi: Reg::R1,
8299        };
8300        let code = encoder.encode(&op).unwrap();
8301        // Delegates to I64SetCondZ which is already encoded
8302        assert!(
8303            code.len() >= 6,
8304            "I64Eqz should emit at least 6 bytes for ORR+ITE+MOV+MOV"
8305        );
8306    }
8307
8308    #[test]
8309    fn test_encode_i64_eq_thumb2() {
8310        let encoder = ArmEncoder::new_thumb2();
8311        let op = ArmOp::I64Eq {
8312            rd: Reg::R0,
8313            rnlo: Reg::R0,
8314            rnhi: Reg::R1,
8315            rmlo: Reg::R2,
8316            rmhi: Reg::R3,
8317        };
8318        let code = encoder.encode(&op).unwrap();
8319        // Delegates to I64SetCond EQ: CMP lo + IT EQ + CMPEQ hi + ITE EQ + MOV 1 + MOV 0
8320        assert!(code.len() >= 10, "I64Eq should emit at least 10 bytes");
8321    }
8322
8323    #[test]
8324    fn test_encode_i64_ldr_thumb2() {
8325        let encoder = ArmEncoder::new_thumb2();
8326        let op = ArmOp::I64Ldr {
8327            rdlo: Reg::R0,
8328            rdhi: Reg::R1,
8329            addr: MemAddr::imm(Reg::SP, 0),
8330        };
8331        let code = encoder.encode(&op).unwrap();
8332        // Two LDR instructions (lo at offset, hi at offset+4)
8333        assert!(code.len() >= 4, "I64Ldr should emit at least 4 bytes");
8334    }
8335
8336    #[test]
8337    fn test_encode_i64_str_thumb2() {
8338        let encoder = ArmEncoder::new_thumb2();
8339        let op = ArmOp::I64Str {
8340            rdlo: Reg::R0,
8341            rdhi: Reg::R1,
8342            addr: MemAddr::imm(Reg::SP, 0),
8343        };
8344        let code = encoder.encode(&op).unwrap();
8345        // Two STR instructions (lo at offset, hi at offset+4)
8346        assert!(code.len() >= 4, "I64Str should emit at least 4 bytes");
8347    }
8348
8349    #[test]
8350    fn test_encode_i64_all_comparisons_thumb2() {
8351        let encoder = ArmEncoder::new_thumb2();
8352
8353        let ops = vec![
8354            ArmOp::I64Ne {
8355                rd: Reg::R0,
8356                rnlo: Reg::R0,
8357                rnhi: Reg::R1,
8358                rmlo: Reg::R2,
8359                rmhi: Reg::R3,
8360            },
8361            ArmOp::I64LtS {
8362                rd: Reg::R0,
8363                rnlo: Reg::R0,
8364                rnhi: Reg::R1,
8365                rmlo: Reg::R2,
8366                rmhi: Reg::R3,
8367            },
8368            ArmOp::I64LtU {
8369                rd: Reg::R0,
8370                rnlo: Reg::R0,
8371                rnhi: Reg::R1,
8372                rmlo: Reg::R2,
8373                rmhi: Reg::R3,
8374            },
8375            ArmOp::I64LeS {
8376                rd: Reg::R0,
8377                rnlo: Reg::R0,
8378                rnhi: Reg::R1,
8379                rmlo: Reg::R2,
8380                rmhi: Reg::R3,
8381            },
8382            ArmOp::I64LeU {
8383                rd: Reg::R0,
8384                rnlo: Reg::R0,
8385                rnhi: Reg::R1,
8386                rmlo: Reg::R2,
8387                rmhi: Reg::R3,
8388            },
8389            ArmOp::I64GtS {
8390                rd: Reg::R0,
8391                rnlo: Reg::R0,
8392                rnhi: Reg::R1,
8393                rmlo: Reg::R2,
8394                rmhi: Reg::R3,
8395            },
8396            ArmOp::I64GtU {
8397                rd: Reg::R0,
8398                rnlo: Reg::R0,
8399                rnhi: Reg::R1,
8400                rmlo: Reg::R2,
8401                rmhi: Reg::R3,
8402            },
8403            ArmOp::I64GeS {
8404                rd: Reg::R0,
8405                rnlo: Reg::R0,
8406                rnhi: Reg::R1,
8407                rmlo: Reg::R2,
8408                rmhi: Reg::R3,
8409            },
8410            ArmOp::I64GeU {
8411                rd: Reg::R0,
8412                rnlo: Reg::R0,
8413                rnhi: Reg::R1,
8414                rmlo: Reg::R2,
8415                rmhi: Reg::R3,
8416            },
8417        ];
8418
8419        for op in &ops {
8420            let code = encoder.encode(op).unwrap();
8421            assert!(
8422                code.len() >= 8,
8423                "i64 comparison {:?} should emit at least 8 bytes, got {}",
8424                op,
8425                code.len()
8426            );
8427        }
8428    }
8429
8430    #[test]
8431    fn test_encode_i64_const_zero_thumb2() {
8432        let encoder = ArmEncoder::new_thumb2();
8433        let op = ArmOp::I64Const {
8434            rdlo: Reg::R0,
8435            rdhi: Reg::R1,
8436            value: 0,
8437        };
8438        let code = encoder.encode(&op).unwrap();
8439        // MOVW R0, #0 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes
8440        assert_eq!(code.len(), 8, "I64Const(0) should be 8 bytes");
8441    }
8442
8443    #[test]
8444    fn test_encode_i64_const_negative_one_thumb2() {
8445        let encoder = ArmEncoder::new_thumb2();
8446        let op = ArmOp::I64Const {
8447            rdlo: Reg::R0,
8448            rdhi: Reg::R1,
8449            value: -1, // 0xFFFF_FFFF_FFFF_FFFF
8450        };
8451        let code = encoder.encode(&op).unwrap();
8452        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8453        assert_eq!(code.len(), 16, "I64Const(-1) should be 16 bytes");
8454    }
8455
8456    // =========================================================================
8457    // Sub-word load/store encoding tests
8458    // =========================================================================
8459
8460    #[test]
8461    fn test_encode_ldrb_arm32() {
8462        let encoder = ArmEncoder::new_arm32();
8463        let op = ArmOp::Ldrb {
8464            rd: Reg::R0,
8465            addr: MemAddr::imm(Reg::R1, 4),
8466        };
8467        let code = encoder.encode(&op).unwrap();
8468        assert_eq!(code.len(), 4, "ARM32 LDRB should be 4 bytes");
8469        // LDRB R0, [R1, #4] = 0xE5D10004
8470        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8471        assert_eq!(encoded, 0xE5D10004, "Should encode LDRB R0, [R1, #4]");
8472    }
8473
8474    #[test]
8475    fn test_encode_strb_arm32() {
8476        let encoder = ArmEncoder::new_arm32();
8477        let op = ArmOp::Strb {
8478            rd: Reg::R0,
8479            addr: MemAddr::imm(Reg::R1, 0),
8480        };
8481        let code = encoder.encode(&op).unwrap();
8482        assert_eq!(code.len(), 4, "ARM32 STRB should be 4 bytes");
8483        // STRB R0, [R1, #0] = 0xE5C10000
8484        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8485        assert_eq!(encoded, 0xE5C10000, "Should encode STRB R0, [R1, #0]");
8486    }
8487
8488    #[test]
8489    fn test_encode_ldrh_arm32() {
8490        let encoder = ArmEncoder::new_arm32();
8491        let op = ArmOp::Ldrh {
8492            rd: Reg::R0,
8493            addr: MemAddr::imm(Reg::R1, 2),
8494        };
8495        let code = encoder.encode(&op).unwrap();
8496        assert_eq!(code.len(), 4, "ARM32 LDRH should be 4 bytes");
8497    }
8498
8499    #[test]
8500    fn test_encode_strh_arm32() {
8501        let encoder = ArmEncoder::new_arm32();
8502        let op = ArmOp::Strh {
8503            rd: Reg::R0,
8504            addr: MemAddr::imm(Reg::R1, 0),
8505        };
8506        let code = encoder.encode(&op).unwrap();
8507        assert_eq!(code.len(), 4, "ARM32 STRH should be 4 bytes");
8508    }
8509
8510    #[test]
8511    fn test_encode_ldrsb_arm32() {
8512        let encoder = ArmEncoder::new_arm32();
8513        let op = ArmOp::Ldrsb {
8514            rd: Reg::R0,
8515            addr: MemAddr::imm(Reg::R1, 0),
8516        };
8517        let code = encoder.encode(&op).unwrap();
8518        assert_eq!(code.len(), 4, "ARM32 LDRSB should be 4 bytes");
8519    }
8520
8521    #[test]
8522    fn test_encode_ldrsh_arm32() {
8523        let encoder = ArmEncoder::new_arm32();
8524        let op = ArmOp::Ldrsh {
8525            rd: Reg::R0,
8526            addr: MemAddr::imm(Reg::R1, 0),
8527        };
8528        let code = encoder.encode(&op).unwrap();
8529        assert_eq!(code.len(), 4, "ARM32 LDRSH should be 4 bytes");
8530    }
8531
8532    #[test]
8533    fn test_encode_ldrb_thumb2_16bit() {
8534        let encoder = ArmEncoder::new_thumb2();
8535        let op = ArmOp::Ldrb {
8536            rd: Reg::R0,
8537            addr: MemAddr::imm(Reg::R1, 4),
8538        };
8539        let code = encoder.encode(&op).unwrap();
8540        // Low registers + small offset -> 16-bit encoding
8541        assert_eq!(
8542            code.len(),
8543            2,
8544            "Thumb-2 LDRB with small offset should be 16-bit"
8545        );
8546    }
8547
8548    #[test]
8549    fn test_encode_ldrb_thumb2_32bit() {
8550        let encoder = ArmEncoder::new_thumb2();
8551        let op = ArmOp::Ldrb {
8552            rd: Reg::R0,
8553            addr: MemAddr::imm(Reg::R1, 100), // offset > 31 needs 32-bit
8554        };
8555        let code = encoder.encode(&op).unwrap();
8556        assert_eq!(
8557            code.len(),
8558            4,
8559            "Thumb-2 LDRB with large offset should be 32-bit"
8560        );
8561    }
8562
8563    #[test]
8564    fn test_encode_strb_thumb2_16bit() {
8565        let encoder = ArmEncoder::new_thumb2();
8566        let op = ArmOp::Strb {
8567            rd: Reg::R0,
8568            addr: MemAddr::imm(Reg::R1, 10),
8569        };
8570        let code = encoder.encode(&op).unwrap();
8571        assert_eq!(
8572            code.len(),
8573            2,
8574            "Thumb-2 STRB with small offset should be 16-bit"
8575        );
8576    }
8577
8578    #[test]
8579    fn test_encode_ldrh_thumb2_16bit() {
8580        let encoder = ArmEncoder::new_thumb2();
8581        let op = ArmOp::Ldrh {
8582            rd: Reg::R0,
8583            addr: MemAddr::imm(Reg::R1, 4), // offset aligned to 2, <= 62
8584        };
8585        let code = encoder.encode(&op).unwrap();
8586        assert_eq!(
8587            code.len(),
8588            2,
8589            "Thumb-2 LDRH with small aligned offset should be 16-bit"
8590        );
8591    }
8592
8593    #[test]
8594    fn test_encode_strh_thumb2_16bit() {
8595        let encoder = ArmEncoder::new_thumb2();
8596        let op = ArmOp::Strh {
8597            rd: Reg::R0,
8598            addr: MemAddr::imm(Reg::R1, 4),
8599        };
8600        let code = encoder.encode(&op).unwrap();
8601        assert_eq!(
8602            code.len(),
8603            2,
8604            "Thumb-2 STRH with small aligned offset should be 16-bit"
8605        );
8606    }
8607
8608    #[test]
8609    fn test_encode_ldrsb_thumb2() {
8610        let encoder = ArmEncoder::new_thumb2();
8611        let op = ArmOp::Ldrsb {
8612            rd: Reg::R0,
8613            addr: MemAddr::imm(Reg::R1, 0),
8614        };
8615        let code = encoder.encode(&op).unwrap();
8616        // LDRSB has no 16-bit immediate form, always 32-bit
8617        assert_eq!(code.len(), 4, "Thumb-2 LDRSB should be 32-bit");
8618    }
8619
8620    #[test]
8621    fn test_encode_ldrsh_thumb2() {
8622        let encoder = ArmEncoder::new_thumb2();
8623        let op = ArmOp::Ldrsh {
8624            rd: Reg::R0,
8625            addr: MemAddr::imm(Reg::R1, 0),
8626        };
8627        let code = encoder.encode(&op).unwrap();
8628        assert_eq!(code.len(), 4, "Thumb-2 LDRSH should be 32-bit");
8629    }
8630
8631    #[test]
8632    fn test_encode_memory_size_thumb2() {
8633        let encoder = ArmEncoder::new_thumb2();
8634        let op = ArmOp::MemorySize { rd: Reg::R0 };
8635        let code = encoder.encode(&op).unwrap();
8636        // R0 and R10 are not both low registers, so this needs careful handling
8637        assert!(!code.is_empty(), "MemorySize should produce code");
8638    }
8639
8640    #[test]
8641    fn test_encode_memory_grow_thumb2() {
8642        let encoder = ArmEncoder::new_thumb2();
8643        let op = ArmOp::MemoryGrow {
8644            rd: Reg::R0,
8645            rn: Reg::R0,
8646        };
8647        let code = encoder.encode(&op).unwrap();
8648        assert_eq!(code.len(), 4, "MemoryGrow (MVN) should be 32-bit Thumb-2");
8649    }
8650
8651    #[test]
8652    fn test_encode_subword_reg_offset_thumb2() {
8653        let encoder = ArmEncoder::new_thumb2();
8654
8655        // LDRB with register offset
8656        let op = ArmOp::Ldrb {
8657            rd: Reg::R0,
8658            addr: MemAddr::reg(Reg::R1, Reg::R2),
8659        };
8660        let code = encoder.encode(&op).unwrap();
8661        assert_eq!(
8662            code.len(),
8663            4,
8664            "Thumb-2 LDRB with reg offset should be 32-bit"
8665        );
8666
8667        // STRB with register offset
8668        let op = ArmOp::Strb {
8669            rd: Reg::R0,
8670            addr: MemAddr::reg(Reg::R1, Reg::R2),
8671        };
8672        let code = encoder.encode(&op).unwrap();
8673        assert_eq!(
8674            code.len(),
8675            4,
8676            "Thumb-2 STRB with reg offset should be 32-bit"
8677        );
8678
8679        // LDRH with register offset
8680        let op = ArmOp::Ldrh {
8681            rd: Reg::R0,
8682            addr: MemAddr::reg(Reg::R1, Reg::R2),
8683        };
8684        let code = encoder.encode(&op).unwrap();
8685        assert_eq!(
8686            code.len(),
8687            4,
8688            "Thumb-2 LDRH with reg offset should be 32-bit"
8689        );
8690
8691        // STRH with register offset
8692        let op = ArmOp::Strh {
8693            rd: Reg::R0,
8694            addr: MemAddr::reg(Reg::R1, Reg::R2),
8695        };
8696        let code = encoder.encode(&op).unwrap();
8697        assert_eq!(
8698            code.len(),
8699            4,
8700            "Thumb-2 STRH with reg offset should be 32-bit"
8701        );
8702    }
8703
8704    #[test]
8705    fn test_encode_subword_reg_imm_offset_thumb2() {
8706        let encoder = ArmEncoder::new_thumb2();
8707
8708        // LDRB with both register and immediate offset
8709        let op = ArmOp::Ldrb {
8710            rd: Reg::R0,
8711            addr: MemAddr::reg_imm(Reg::R1, Reg::R2, 4),
8712        };
8713        let code = encoder.encode(&op).unwrap();
8714        // ADD R12, R2, #4 (4 bytes) + LDRB R0, [R1, R12] (4 bytes) = 8 bytes
8715        assert_eq!(
8716            code.len(),
8717            8,
8718            "Thumb-2 LDRB with reg+imm offset should be 8 bytes"
8719        );
8720    }
8721
8722    // ========================================================================
8723    // Helium MVE encoding tests
8724    // ========================================================================
8725
8726    #[test]
8727    fn test_encode_mve_addi32_thumb2() {
8728        let encoder = ArmEncoder::new_thumb2();
8729        let op = ArmOp::MveAddI {
8730            qd: QReg::Q0,
8731            qn: QReg::Q1,
8732            qm: QReg::Q2,
8733            size: MveSize::S32,
8734        };
8735        let code = encoder.encode(&op).unwrap();
8736        assert_eq!(
8737            code.len(),
8738            4,
8739            "MVE VADD.I32 should be 4 bytes (Thumb-2 32-bit)"
8740        );
8741    }
8742
8743    #[test]
8744    fn test_encode_mve_subi16_thumb2() {
8745        let encoder = ArmEncoder::new_thumb2();
8746        let op = ArmOp::MveSubI {
8747            qd: QReg::Q0,
8748            qn: QReg::Q1,
8749            qm: QReg::Q2,
8750            size: MveSize::S16,
8751        };
8752        let code = encoder.encode(&op).unwrap();
8753        assert_eq!(code.len(), 4, "MVE VSUB.I16 should be 4 bytes");
8754    }
8755
8756    #[test]
8757    fn test_encode_mve_muli8_thumb2() {
8758        let encoder = ArmEncoder::new_thumb2();
8759        let op = ArmOp::MveMulI {
8760            qd: QReg::Q0,
8761            qn: QReg::Q1,
8762            qm: QReg::Q2,
8763            size: MveSize::S8,
8764        };
8765        let code = encoder.encode(&op).unwrap();
8766        assert_eq!(code.len(), 4, "MVE VMUL.I8 should be 4 bytes");
8767    }
8768
8769    #[test]
8770    fn test_encode_mve_bitwise_thumb2() {
8771        let encoder = ArmEncoder::new_thumb2();
8772
8773        let ops = vec![
8774            ArmOp::MveAnd {
8775                qd: QReg::Q0,
8776                qn: QReg::Q1,
8777                qm: QReg::Q2,
8778            },
8779            ArmOp::MveOrr {
8780                qd: QReg::Q0,
8781                qn: QReg::Q1,
8782                qm: QReg::Q2,
8783            },
8784            ArmOp::MveEor {
8785                qd: QReg::Q0,
8786                qn: QReg::Q1,
8787                qm: QReg::Q2,
8788            },
8789            ArmOp::MveBic {
8790                qd: QReg::Q0,
8791                qn: QReg::Q1,
8792                qm: QReg::Q2,
8793            },
8794        ];
8795        for op in ops {
8796            let code = encoder.encode(&op).unwrap();
8797            assert_eq!(code.len(), 4, "MVE bitwise op should be 4 bytes");
8798        }
8799    }
8800
8801    #[test]
8802    fn test_encode_mve_mvn_thumb2() {
8803        let encoder = ArmEncoder::new_thumb2();
8804        let op = ArmOp::MveMvn {
8805            qd: QReg::Q0,
8806            qm: QReg::Q1,
8807        };
8808        let code = encoder.encode(&op).unwrap();
8809        assert_eq!(code.len(), 4, "MVE VMVN should be 4 bytes");
8810    }
8811
8812    #[test]
8813    fn test_encode_mve_load_store_thumb2() {
8814        let encoder = ArmEncoder::new_thumb2();
8815
8816        let load = ArmOp::MveLoad {
8817            qd: QReg::Q0,
8818            addr: MemAddr::imm(Reg::R0, 16),
8819        };
8820        let code = encoder.encode(&load).unwrap();
8821        assert_eq!(code.len(), 4, "MVE VLDRW.32 should be 4 bytes");
8822
8823        let store = ArmOp::MveStore {
8824            qd: QReg::Q1,
8825            addr: MemAddr::imm(Reg::R1, 0),
8826        };
8827        let code = encoder.encode(&store).unwrap();
8828        assert_eq!(code.len(), 4, "MVE VSTRW.32 should be 4 bytes");
8829    }
8830
8831    #[test]
8832    fn test_encode_mve_const_thumb2() {
8833        let encoder = ArmEncoder::new_thumb2();
8834        let op = ArmOp::MveConst {
8835            qd: QReg::Q0,
8836            bytes: [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0],
8837        };
8838        let code = encoder.encode(&op).unwrap();
8839        // Should be 4 words of (MOVW R12 + VMOV Sn) = 4 * (4+4) = 32 bytes min
8840        // Some words with hi16=0 skip MOVT, so length varies
8841        assert!(
8842            code.len() >= 24,
8843            "MVE const should produce multiple instructions"
8844        );
8845    }
8846
8847    #[test]
8848    fn test_encode_mve_dup_thumb2() {
8849        let encoder = ArmEncoder::new_thumb2();
8850        let op = ArmOp::MveDup {
8851            qd: QReg::Q0,
8852            rn: Reg::R0,
8853            size: MveSize::S32,
8854        };
8855        let code = encoder.encode(&op).unwrap();
8856        assert_eq!(code.len(), 4, "MVE VDUP.32 should be 4 bytes");
8857    }
8858
8859    #[test]
8860    fn test_encode_mve_extract_lane_thumb2() {
8861        let encoder = ArmEncoder::new_thumb2();
8862        let op = ArmOp::MveExtractLane {
8863            rd: Reg::R0,
8864            qn: QReg::Q1,
8865            lane: 2,
8866            size: MveSize::S32,
8867        };
8868        let code = encoder.encode(&op).unwrap();
8869        assert_eq!(code.len(), 4, "MVE extract lane should be 4 bytes");
8870    }
8871
8872    #[test]
8873    fn test_encode_mve_insert_lane_thumb2() {
8874        let encoder = ArmEncoder::new_thumb2();
8875        let op = ArmOp::MveInsertLane {
8876            qd: QReg::Q0,
8877            rn: Reg::R1,
8878            lane: 3,
8879            size: MveSize::S32,
8880        };
8881        let code = encoder.encode(&op).unwrap();
8882        assert_eq!(code.len(), 4, "MVE insert lane should be 4 bytes");
8883    }
8884
8885    #[test]
8886    fn test_encode_mve_addf32_thumb2() {
8887        let encoder = ArmEncoder::new_thumb2();
8888        let op = ArmOp::MveAddF32 {
8889            qd: QReg::Q0,
8890            qn: QReg::Q1,
8891            qm: QReg::Q2,
8892        };
8893        let code = encoder.encode(&op).unwrap();
8894        assert_eq!(code.len(), 4, "MVE VADD.F32 should be 4 bytes");
8895    }
8896
8897    #[test]
8898    fn test_encode_mve_divf32_thumb2() {
8899        let encoder = ArmEncoder::new_thumb2();
8900        let op = ArmOp::MveDivF32 {
8901            qd: QReg::Q0,
8902            qn: QReg::Q1,
8903            qm: QReg::Q2,
8904        };
8905        let code = encoder.encode(&op).unwrap();
8906        // Lane-wise: 4 x VDIV.F32 = 4 x 4 = 16 bytes
8907        assert_eq!(
8908            code.len(),
8909            16,
8910            "MVE VDIV.F32 (lane-wise) should be 16 bytes"
8911        );
8912    }
8913
8914    #[test]
8915    fn test_encode_mve_sqrtf32_thumb2() {
8916        let encoder = ArmEncoder::new_thumb2();
8917        let op = ArmOp::MveSqrtF32 {
8918            qd: QReg::Q0,
8919            qm: QReg::Q1,
8920        };
8921        let code = encoder.encode(&op).unwrap();
8922        // Lane-wise: 4 x VSQRT.F32 = 4 x 4 = 16 bytes
8923        assert_eq!(
8924            code.len(),
8925            16,
8926            "MVE VSQRT.F32 (lane-wise) should be 16 bytes"
8927        );
8928    }
8929
8930    #[test]
8931    fn test_encode_mve_negf32_thumb2() {
8932        let encoder = ArmEncoder::new_thumb2();
8933        let op = ArmOp::MveNegF32 {
8934            qd: QReg::Q0,
8935            qm: QReg::Q1,
8936        };
8937        let code = encoder.encode(&op).unwrap();
8938        assert_eq!(code.len(), 4, "MVE VNEG.F32 should be 4 bytes");
8939    }
8940
8941    #[test]
8942    fn test_encode_mve_absf32_thumb2() {
8943        let encoder = ArmEncoder::new_thumb2();
8944        let op = ArmOp::MveAbsF32 {
8945            qd: QReg::Q0,
8946            qm: QReg::Q1,
8947        };
8948        let code = encoder.encode(&op).unwrap();
8949        assert_eq!(code.len(), 4, "MVE VABS.F32 should be 4 bytes");
8950    }
8951
8952    #[test]
8953    fn test_encode_mve_different_qregs() {
8954        let encoder = ArmEncoder::new_thumb2();
8955
8956        // Test that different Q-register numbers produce different encodings
8957        let op1 = ArmOp::MveAddI {
8958            qd: QReg::Q0,
8959            qn: QReg::Q0,
8960            qm: QReg::Q0,
8961            size: MveSize::S32,
8962        };
8963        let op2 = ArmOp::MveAddI {
8964            qd: QReg::Q3,
8965            qn: QReg::Q5,
8966            qm: QReg::Q7,
8967            size: MveSize::S32,
8968        };
8969        let code1 = encoder.encode(&op1).unwrap();
8970        let code2 = encoder.encode(&op2).unwrap();
8971        assert_ne!(
8972            code1, code2,
8973            "Different Q-registers should produce different encodings"
8974        );
8975    }
8976
8977    #[test]
8978    fn test_encode_mve_arm32_nop() {
8979        // MVE instructions on ARM32 encoder should produce NOP (only Thumb-2 supported)
8980        let encoder = ArmEncoder::new_arm32();
8981        let op = ArmOp::MveAddI {
8982            qd: QReg::Q0,
8983            qn: QReg::Q1,
8984            qm: QReg::Q2,
8985            size: MveSize::S32,
8986        };
8987        let code = encoder.encode(&op).unwrap();
8988        assert_eq!(code.len(), 4, "ARM32 MVE should be 4 bytes (NOP)");
8989        // NOP in ARM32 is 0xE1A00000 (MOV R0, R0)
8990        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8991        assert_eq!(instr, 0xE1A00000, "ARM32 MVE should encode as NOP");
8992    }
8993}