Skip to main content

synth_backend/
arm_encoder.rs

1//! ARM Code Encoder - Converts ARM instructions to binary machine code
2//!
3//! Generates ARM32/Thumb-2 machine code from ARM instruction structures
4
5use synth_core::Result;
6use synth_core::target::FPUPrecision;
7use synth_synthesis::contracts::encoding as encoding_contracts;
8use synth_synthesis::{ArmOp, MemAddr, MveSize, Operand2, QReg, Reg, VfpReg};
9
10/// ARM instruction encoding
11pub struct ArmEncoder {
12    /// Use Thumb mode (vs ARM mode)
13    thumb_mode: bool,
14    /// FPU capability for VFP instruction encoding
15    #[allow(dead_code)]
16    fpu: Option<FPUPrecision>,
17}
18
19impl ArmEncoder {
20    /// Create a new ARM encoder in ARM32 mode
21    pub fn new_arm32() -> Self {
22        Self {
23            thumb_mode: false,
24            fpu: None,
25        }
26    }
27
28    /// Create a new ARM encoder in Thumb-2 mode
29    pub fn new_thumb2() -> Self {
30        Self {
31            thumb_mode: true,
32            fpu: None,
33        }
34    }
35
36    /// Create a new Thumb-2 encoder with FPU capability
37    pub fn new_thumb2_with_fpu(fpu: Option<FPUPrecision>) -> Self {
38        Self {
39            thumb_mode: true,
40            fpu,
41        }
42    }
43
44    /// Encode a single ARM instruction to bytes
45    pub fn encode(&self, op: &ArmOp) -> Result<Vec<u8>> {
46        if self.thumb_mode {
47            self.encode_thumb(op)
48        } else {
49            self.encode_arm(op)
50        }
51    }
52
53    /// Encode an ARM instruction in ARM32 mode (32-bit instructions)
54    fn encode_arm(&self, op: &ArmOp) -> Result<Vec<u8>> {
55        let instr: u32 = match op {
56            // Data processing instructions
57            ArmOp::Add { rd, rn, op2 } => {
58                let rd_bits = reg_to_bits(rd);
59                let rn_bits = reg_to_bits(rn);
60                let (op2_bits, i_flag) = encode_operand2(op2);
61
62                // ADD encoding: cond(4) | 00 | I(1) | 0100 | S(1) | Rn(4) | Rd(4) | operand2(12)
63                0xE0800000 // condition=always(E), opcode=ADD(0100), S=0
64                    | (i_flag << 25)
65                    | (rn_bits << 16)
66                    | (rd_bits << 12)
67                    | op2_bits
68            }
69
70            ArmOp::Sub { rd, rn, op2 } => {
71                let rd_bits = reg_to_bits(rd);
72                let rn_bits = reg_to_bits(rn);
73                let (op2_bits, i_flag) = encode_operand2(op2);
74
75                // SUB encoding: opcode=0010
76                0xE0400000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
77            }
78
79            // i64 support: ADDS, ADC, SUBS, SBC for ARM32
80            ArmOp::Adds { rd, rn, op2 } => {
81                let rd_bits = reg_to_bits(rd);
82                let rn_bits = reg_to_bits(rn);
83                let (op2_bits, i_flag) = encode_operand2(op2);
84
85                // ADDS encoding: opcode=0100, S=1
86                0xE0900000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
87            }
88
89            ArmOp::Adc { rd, rn, op2 } => {
90                let rd_bits = reg_to_bits(rd);
91                let rn_bits = reg_to_bits(rn);
92                let (op2_bits, i_flag) = encode_operand2(op2);
93
94                // ADC encoding: opcode=0101
95                0xE0A00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
96            }
97
98            ArmOp::Subs { rd, rn, op2 } => {
99                let rd_bits = reg_to_bits(rd);
100                let rn_bits = reg_to_bits(rn);
101                let (op2_bits, i_flag) = encode_operand2(op2);
102
103                // SUBS encoding: opcode=0010, S=1
104                0xE0500000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
105            }
106
107            ArmOp::Sbc { rd, rn, op2 } => {
108                let rd_bits = reg_to_bits(rd);
109                let rn_bits = reg_to_bits(rn);
110                let (op2_bits, i_flag) = encode_operand2(op2);
111
112                // SBC encoding: opcode=0110
113                0xE0C00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
114            }
115
116            ArmOp::Mul { rd, rn, rm } => {
117                let rd_bits = reg_to_bits(rd);
118                let rn_bits = reg_to_bits(rn);
119                let rm_bits = reg_to_bits(rm);
120
121                // MUL encoding: cond(4) | 000000 | A(1) | S(1) | Rd(4) | Rn(4) | Rs(4) | 1001 | Rm(4)
122                0xE0000090 | (rd_bits << 16) | (rn_bits << 8) | rm_bits
123            }
124
125            ArmOp::Sdiv { rd, rn, rm } => {
126                let rd_bits = reg_to_bits(rd);
127                let rn_bits = reg_to_bits(rn);
128                let rm_bits = reg_to_bits(rm);
129
130                // SDIV encoding: cond(4) | 01110001 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
131                // ARMv7-M and above
132                0xE710F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
133            }
134
135            ArmOp::Udiv { rd, rn, rm } => {
136                let rd_bits = reg_to_bits(rd);
137                let rn_bits = reg_to_bits(rn);
138                let rm_bits = reg_to_bits(rm);
139
140                // UDIV encoding: cond(4) | 01110011 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
141                // ARMv7-M and above
142                0xE730F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
143            }
144
145            ArmOp::Mls { rd, rn, rm, ra } => {
146                let rd_bits = reg_to_bits(rd);
147                let rn_bits = reg_to_bits(rn);
148                let rm_bits = reg_to_bits(rm);
149                let ra_bits = reg_to_bits(ra);
150
151                // MLS encoding: cond(4) | 00000110 | Rd(4) | Ra(4) | Rm(4) | 1001 | Rn(4)
152                // Rd = Ra - (Rn * Rm)
153                0xE0600090 | (rd_bits << 16) | (ra_bits << 12) | (rm_bits << 8) | rn_bits
154            }
155
156            ArmOp::And { rd, rn, op2 } => {
157                let rd_bits = reg_to_bits(rd);
158                let rn_bits = reg_to_bits(rn);
159                let (op2_bits, i_flag) = encode_operand2(op2);
160
161                // AND encoding: opcode=0000
162                0xE0000000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
163            }
164
165            ArmOp::Orr { rd, rn, op2 } => {
166                let rd_bits = reg_to_bits(rd);
167                let rn_bits = reg_to_bits(rn);
168                let (op2_bits, i_flag) = encode_operand2(op2);
169
170                // ORR encoding: opcode=1100
171                0xE1800000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
172            }
173
174            ArmOp::Eor { rd, rn, op2 } => {
175                let rd_bits = reg_to_bits(rd);
176                let rn_bits = reg_to_bits(rn);
177                let (op2_bits, i_flag) = encode_operand2(op2);
178
179                // EOR encoding: opcode=0001
180                0xE0200000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
181            }
182
183            // Shift instructions
184            ArmOp::Lsl { rd, rn, shift } => {
185                let rd_bits = reg_to_bits(rd);
186                let rn_bits = reg_to_bits(rn);
187                let shift_bits = *shift & 0x1F;
188
189                // LSL encoding: MOV with shift
190                0xE1A00000 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
191            }
192
193            ArmOp::Lsr { rd, rn, shift } => {
194                let rd_bits = reg_to_bits(rd);
195                let rn_bits = reg_to_bits(rn);
196                let shift_bits = *shift & 0x1F;
197
198                // LSR encoding
199                0xE1A00020 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
200            }
201
202            ArmOp::Asr { rd, rn, shift } => {
203                let rd_bits = reg_to_bits(rd);
204                let rn_bits = reg_to_bits(rn);
205                let shift_bits = *shift & 0x1F;
206
207                // ASR encoding
208                0xE1A00040 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
209            }
210
211            ArmOp::Ror { rd, rn, shift } => {
212                let rd_bits = reg_to_bits(rd);
213                let rn_bits = reg_to_bits(rn);
214                let shift_bits = *shift & 0x1F;
215
216                // ROR encoding: MOV with ROR shift
217                0xE1A00060 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
218            }
219
220            // Register-based shifts (ARM32)
221            // LSL Rd, Rn, Rm: cond 0001101S 0000 Rd Rs 0001 Rn
222            ArmOp::LslReg { rd, rn, rm } => {
223                let rd_bits = reg_to_bits(rd);
224                let rn_bits = reg_to_bits(rn);
225                let rm_bits = reg_to_bits(rm);
226                0xE1A00010 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
227            }
228            ArmOp::LsrReg { rd, rn, rm } => {
229                let rd_bits = reg_to_bits(rd);
230                let rn_bits = reg_to_bits(rn);
231                let rm_bits = reg_to_bits(rm);
232                0xE1A00030 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
233            }
234            ArmOp::AsrReg { rd, rn, rm } => {
235                let rd_bits = reg_to_bits(rd);
236                let rn_bits = reg_to_bits(rn);
237                let rm_bits = reg_to_bits(rm);
238                0xE1A00050 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
239            }
240            ArmOp::RorReg { rd, rn, rm } => {
241                let rd_bits = reg_to_bits(rd);
242                let rn_bits = reg_to_bits(rn);
243                let rm_bits = reg_to_bits(rm);
244                0xE1A00070 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
245            }
246
247            // RSB (Reverse Subtract): Rd = imm - Rn
248            ArmOp::Rsb { rd, rn, imm } => {
249                let rd_bits = reg_to_bits(rd);
250                let rn_bits = reg_to_bits(rn);
251                // RSB encoding: cond(4) | 00 1 0011 S | Rn(4) | Rd(4) | imm12
252                // Opcode for RSB = 0011, I=1 (immediate), S=0
253                0xE2600000 | (rn_bits << 16) | (rd_bits << 12) | (*imm & 0xFF)
254            }
255
256            // Bit manipulation instructions
257            ArmOp::Clz { rd, rm } => {
258                let rd_bits = reg_to_bits(rd);
259                let rm_bits = reg_to_bits(rm);
260
261                // CLZ encoding: cond(4) | 00010110 | 1111 | Rd(4) | 1111 | 0001 | Rm(4)
262                // ARMv5T and above
263                0xE16F0F10 | (rd_bits << 12) | rm_bits
264            }
265
266            ArmOp::Rbit { rd, rm } => {
267                let rd_bits = reg_to_bits(rd);
268                let rm_bits = reg_to_bits(rm);
269
270                // RBIT encoding: cond(4) | 01101111 | 1111 | Rd(4) | 1111 | 0011 | Rm(4)
271                // ARMv6T2 and above
272                0xE6FF0F30 | (rd_bits << 12) | rm_bits
273            }
274
275            ArmOp::Sxtb { rd, rm } => {
276                let rd_bits = reg_to_bits(rd);
277                let rm_bits = reg_to_bits(rm);
278
279                // SXTB encoding: cond(4) | 01101010 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
280                // ARMv6 and above. rotate=00 for no rotation
281                0xE6AF0070 | (rd_bits << 12) | rm_bits
282            }
283
284            ArmOp::Sxth { rd, rm } => {
285                let rd_bits = reg_to_bits(rd);
286                let rm_bits = reg_to_bits(rm);
287
288                // SXTH encoding: cond(4) | 01101011 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
289                // ARMv6 and above. rotate=00 for no rotation
290                0xE6BF0070 | (rd_bits << 12) | rm_bits
291            }
292
293            // Move instructions
294            ArmOp::Mov { rd, op2 } => {
295                let rd_bits = reg_to_bits(rd);
296                let (op2_bits, i_flag) = encode_operand2(op2);
297
298                // MOV encoding: opcode=1101
299                0xE1A00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
300            }
301
302            ArmOp::Mvn { rd, op2 } => {
303                let rd_bits = reg_to_bits(rd);
304                let (op2_bits, i_flag) = encode_operand2(op2);
305
306                // MVN encoding: opcode=1111
307                0xE1E00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
308            }
309
310            // MOVW - Move Wide (ARM32)
311            // Encoding: cond(4) | 0011 0000 | imm4(4) | Rd(4) | imm12(12)
312            ArmOp::Movw { rd, imm16 } => {
313                let rd_bits = reg_to_bits(rd);
314                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
315                let imm12 = (*imm16 as u32) & 0xFFF;
316                0xE3000000 | (imm4 << 16) | (rd_bits << 12) | imm12
317            }
318
319            // MOVT - Move Top (ARM32)
320            // Encoding: cond(4) | 0011 0100 | imm4(4) | Rd(4) | imm12(12)
321            ArmOp::Movt { rd, imm16 } => {
322                let rd_bits = reg_to_bits(rd);
323                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
324                let imm12 = (*imm16 as u32) & 0xFFF;
325                0xE3400000 | (imm4 << 16) | (rd_bits << 12) | imm12
326            }
327
328            // Compare
329            ArmOp::Cmp { rn, op2 } => {
330                let rn_bits = reg_to_bits(rn);
331                let (op2_bits, i_flag) = encode_operand2(op2);
332
333                // CMP encoding: opcode=1010, S=1
334                0xE1500000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
335            }
336
337            // Compare Negative (CMN) - computes Rn + op2 and sets flags
338            ArmOp::Cmn { rn, op2 } => {
339                let rn_bits = reg_to_bits(rn);
340                let (op2_bits, i_flag) = encode_operand2(op2);
341
342                // CMN encoding: opcode=1011, S=1
343                0xE1700000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
344            }
345
346            // Load/Store
347            ArmOp::Ldr { rd, addr } => {
348                let rd_bits = reg_to_bits(rd);
349                let (base_bits, offset_bits) = encode_mem_addr(addr);
350
351                // LDR encoding: cond(4) | 01 | I(1) | P(1) | U(1) | B(1) | W(1) | L(1) | Rn(4) | Rd(4) | offset(12)
352                // P=1 (pre-indexed), U=1 (add offset), L=1 (load)
353                0xE5900000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
354            }
355
356            ArmOp::Str { rd, addr } => {
357                let rd_bits = reg_to_bits(rd);
358                let (base_bits, offset_bits) = encode_mem_addr(addr);
359
360                // STR encoding: L=0 (store)
361                0xE5800000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
362            }
363
364            // Sub-word loads (ARM32 encoding)
365            ArmOp::Ldrb { rd, addr } => {
366                let rd_bits = reg_to_bits(rd);
367                let (base_bits, offset_bits) = encode_mem_addr(addr);
368                // LDRB: LDR with B=1 (byte): cond|01|I|P|U|1|W|L|Rn|Rd|offset
369                0xE5D00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
370            }
371
372            ArmOp::Ldrsb { rd, addr } => {
373                let rd_bits = reg_to_bits(rd);
374                let (base_bits, offset_bits) = encode_mem_addr(addr);
375                // LDRSB (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1101|imm4L
376                // Simplified with immediate offset
377                let offset_val = offset_bits & 0xFF;
378                let imm4h = (offset_val >> 4) & 0xF;
379                let imm4l = offset_val & 0xF;
380                0xE1D000D0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
381            }
382
383            ArmOp::Ldrh { rd, addr } => {
384                let rd_bits = reg_to_bits(rd);
385                let (base_bits, offset_bits) = encode_mem_addr(addr);
386                // LDRH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1011|imm4L
387                let offset_val = offset_bits & 0xFF;
388                let imm4h = (offset_val >> 4) & 0xF;
389                let imm4l = offset_val & 0xF;
390                0xE1D000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
391            }
392
393            ArmOp::Ldrsh { rd, addr } => {
394                let rd_bits = reg_to_bits(rd);
395                let (base_bits, offset_bits) = encode_mem_addr(addr);
396                // LDRSH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1111|imm4L
397                let offset_val = offset_bits & 0xFF;
398                let imm4h = (offset_val >> 4) & 0xF;
399                let imm4l = offset_val & 0xF;
400                0xE1D000F0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
401            }
402
403            // Sub-word stores (ARM32 encoding)
404            ArmOp::Strb { rd, addr } => {
405                let rd_bits = reg_to_bits(rd);
406                let (base_bits, offset_bits) = encode_mem_addr(addr);
407                // STRB: STR with B=1 (byte): cond|01|I|P|U|1|W|0|Rn|Rd|offset
408                0xE5C00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
409            }
410
411            ArmOp::Strh { rd, addr } => {
412                let rd_bits = reg_to_bits(rd);
413                let (base_bits, offset_bits) = encode_mem_addr(addr);
414                // STRH (misc store): cond|000|P|U|1|W|0|Rn|Rd|imm4H|1011|imm4L
415                let offset_val = offset_bits & 0xFF;
416                let imm4h = (offset_val >> 4) & 0xF;
417                let imm4l = offset_val & 0xF;
418                0xE1C000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
419            }
420
421            // Memory management (ARM32 encoding)
422            ArmOp::MemorySize { rd } => {
423                let rd_bits = reg_to_bits(rd);
424                // MOV rd, R10, LSR #16  (memory size in bytes / 65536 = pages)
425                // cond|000|1101|S|0000|Rd|shift5|type|0|Rm
426                // LSR #16: shift5=10000, type=01
427                0xE1A00820 | (rd_bits << 12) | 0x0A // Rm=R10, shift=16, LSR
428            }
429
430            ArmOp::MemoryGrow { rd, .. } => {
431                let rd_bits = reg_to_bits(rd);
432                // On embedded, always fail: MOV rd, #-1
433                0xE3E00000 | (rd_bits << 12) // MVN rd, #0 = MOV rd, #-1
434            }
435
436            // Label pseudo-instruction: emits no machine code
437            ArmOp::Label { .. } => {
438                return Ok(Vec::new());
439            }
440
441            // Branch instructions
442            ArmOp::B { label: _ } => {
443                // B encoding: cond(4) | 1010 | offset(24)
444                // Simplified: branch to offset 0 (will be patched by linker/resolver)
445                0xEA000000
446            }
447
448            // Conditional branch to label (generic)
449            ArmOp::Bcc { cond, label: _ } => {
450                use synth_synthesis::Condition;
451                let cond_bits: u32 = match cond {
452                    Condition::EQ => 0x0,
453                    Condition::NE => 0x1,
454                    Condition::HS => 0x2,
455                    Condition::LO => 0x3,
456                    Condition::HI => 0x8,
457                    Condition::LS => 0x9,
458                    Condition::GE => 0xA,
459                    Condition::LT => 0xB,
460                    Condition::GT => 0xC,
461                    Condition::LE => 0xD,
462                };
463                // B<cond> with offset 0 (will be patched)
464                (cond_bits << 28) | 0x0A000000
465            }
466
467            // BHS (Branch if Higher or Same) - used for bounds checking
468            ArmOp::Bhs { label: _ } => {
469                // BHS encoding: cond(2=HS) | 1010 | offset(24)
470                0x2A000000 // BHS with offset 0
471            }
472
473            // BLO (Branch if Lower) - complementary to BHS
474            ArmOp::Blo { label: _ } => {
475                // BLO encoding: cond(3=LO) | 1010 | offset(24)
476                0x3A000000 // BLO with offset 0
477            }
478
479            // Branch with numeric offset (in instructions)
480            // ARM32 B instruction: offset is in instructions, stored as words
481            // The offset is relative to PC+8 (due to ARM pipeline)
482            ArmOp::BOffset { offset } => {
483                // B encoding: cond(4) | 1010 | offset(24)
484                // Offset is signed, in words (4-byte units)
485                // ARM adds PC+8 to the offset, so we need to adjust:
486                // target = PC + 8 + (offset * 4)
487                // For backward branch of N instructions: offset = -(N + 2)
488                let adjusted_offset = *offset - 2; // Account for PC+8
489                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
490                0xEA000000 | offset_bits
491            }
492
493            // Conditional branch with numeric offset
494            ArmOp::BCondOffset { cond, offset } => {
495                use synth_synthesis::Condition;
496                let cond_bits: u32 = match cond {
497                    Condition::EQ => 0x0,
498                    Condition::NE => 0x1,
499                    Condition::HS => 0x2,
500                    Condition::LO => 0x3,
501                    Condition::HI => 0x8,
502                    Condition::LS => 0x9,
503                    Condition::GE => 0xA,
504                    Condition::LT => 0xB,
505                    Condition::GT => 0xC,
506                    Condition::LE => 0xD,
507                };
508                // B<cond> encoding: cond(4) | 1010 | offset(24)
509                let adjusted_offset = *offset - 2; // Account for PC+8
510                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
511                (cond_bits << 28) | 0x0A000000 | offset_bits
512            }
513
514            ArmOp::Bl { label: _ } => {
515                // BL encoding: cond(4) | 1011 | offset(24)
516                0xEB000000
517            }
518
519            ArmOp::Bx { rm } => {
520                let rm_bits = reg_to_bits(rm);
521
522                // BX encoding: cond(4) | 000100101111111111110001 | Rm(4)
523                0xE12FFF10 | rm_bits
524            }
525
526            ArmOp::Blx { rm } => {
527                let rm_bits = reg_to_bits(rm);
528
529                // BLX (register) encoding: cond(4) | 000100101111111111110011 | Rm(4)
530                0xE12FFF30 | rm_bits
531            }
532
533            ArmOp::Push { regs } => {
534                // STMDB SP!, {regs} encoding: cond(4) | 100100 | 10 | 1101 | register_list(16)
535                let mut reg_list: u32 = 0;
536                for r in regs {
537                    reg_list |= 1 << reg_to_bits(r);
538                }
539                0xE92D0000 | reg_list
540            }
541
542            ArmOp::Pop { regs } => {
543                // LDMIA SP!, {regs} encoding: cond(4) | 100010 | 11 | 1101 | register_list(16)
544                let mut reg_list: u32 = 0;
545                for r in regs {
546                    reg_list |= 1 << reg_to_bits(r);
547                }
548                0xE8BD0000 | reg_list
549            }
550
551            ArmOp::Nop => {
552                // NOP encoding: MOV R0, R0
553                0xE1A00000
554            }
555
556            ArmOp::Udf { imm } => {
557                // UDF (Undefined) encoding in ARM: 0xE7F000F0 | (imm12_hi << 8) | imm4_lo
558                // We only use imm8, so split into imm4_hi and imm4_lo
559                let imm8 = *imm as u32;
560                0xE7F000F0 | ((imm8 & 0xF0) << 4) | (imm8 & 0x0F)
561            }
562
563            // Pseudo-instructions for verification - encode as NOP
564            // These are used in formal verification but not actual code generation
565            ArmOp::Popcnt { .. } => {
566                // Population count pseudo-instruction
567                // Not a real ARM instruction, would be expanded to actual code
568                0xE1A00000 // NOP for now
569            }
570
571            ArmOp::SetCond { .. } => {
572                // Condition evaluation pseudo-instruction
573                // Not a real ARM instruction, would be expanded to actual code
574                0xE1A00000 // NOP for now
575            }
576
577            ArmOp::SelectMove { .. } => {
578                // Conditional move pseudo-instruction for ARM32
579                // Would use MOV{cond} instruction
580                0xE1A00000 // NOP for now
581            }
582
583            ArmOp::Select { .. } => {
584                // Select pseudo-instruction
585                // Not a real ARM instruction, would be expanded to conditional moves
586                0xE1A00000 // NOP for now
587            }
588
589            ArmOp::LocalGet { .. } => {
590                // Local variable get pseudo-instruction
591                // Not a real ARM instruction, would be expanded to memory access
592                0xE1A00000 // NOP for now
593            }
594
595            ArmOp::LocalSet { .. } => {
596                // Local variable set pseudo-instruction
597                // Not a real ARM instruction, would be expanded to memory access
598                0xE1A00000 // NOP for now
599            }
600
601            ArmOp::LocalTee { .. } => {
602                // Local variable tee pseudo-instruction
603                // Not a real ARM instruction, would be expanded to memory access
604                0xE1A00000 // NOP for now
605            }
606
607            ArmOp::GlobalGet { .. } => {
608                // Global variable get pseudo-instruction
609                // Not a real ARM instruction, would be expanded to memory access
610                0xE1A00000 // NOP for now
611            }
612
613            ArmOp::GlobalSet { .. } => {
614                // Global variable set pseudo-instruction
615                // Not a real ARM instruction, would be expanded to memory access
616                0xE1A00000 // NOP for now
617            }
618
619            ArmOp::BrTable { .. } => {
620                // Branch table pseudo-instruction
621                // Not a real ARM instruction, would be expanded to jump table
622                0xE1A00000 // NOP for now
623            }
624
625            ArmOp::Call { .. } => {
626                // Function call pseudo-instruction
627                // Not a real ARM instruction, would be expanded to BL
628                0xE1A00000 // NOP for now
629            }
630
631            ArmOp::CallIndirect { .. } => {
632                // Indirect function call pseudo-instruction
633                // Not a real ARM instruction, would be expanded to indirect branch
634                0xE1A00000 // NOP for now
635            }
636
637            // i64 pseudo-instructions (Phase 2) - encode as NOP for now
638            // Real compiler would expand these to multi-instruction sequences
639            ArmOp::I64Add { .. } => 0xE1A00000,        // NOP
640            ArmOp::I64Sub { .. } => 0xE1A00000,        // NOP
641            ArmOp::I64DivS { .. } => 0xE1A00000,       // NOP
642            ArmOp::I64DivU { .. } => 0xE1A00000,       // NOP
643            ArmOp::I64RemS { .. } => 0xE1A00000,       // NOP
644            ArmOp::I64RemU { .. } => 0xE1A00000,       // NOP
645            ArmOp::I64Clz { .. } => 0xE1A00000,        // NOP
646            ArmOp::I64Ctz { .. } => 0xE1A00000,        // NOP
647            ArmOp::I64Popcnt { .. } => 0xE1A00000,     // NOP
648            ArmOp::I64And { .. } => 0xE1A00000,        // NOP
649            ArmOp::I64Or { .. } => 0xE1A00000,         // NOP
650            ArmOp::I64Xor { .. } => 0xE1A00000,        // NOP
651            ArmOp::I64Eqz { .. } => 0xE1A00000,        // NOP
652            ArmOp::I64Eq { .. } => 0xE1A00000,         // NOP
653            ArmOp::I64Ne { .. } => 0xE1A00000,         // NOP
654            ArmOp::I64LtS { .. } => 0xE1A00000,        // NOP
655            ArmOp::I64LtU { .. } => 0xE1A00000,        // NOP
656            ArmOp::I64LeS { .. } => 0xE1A00000,        // NOP
657            ArmOp::I64LeU { .. } => 0xE1A00000,        // NOP
658            ArmOp::I64GtS { .. } => 0xE1A00000,        // NOP
659            ArmOp::I64GtU { .. } => 0xE1A00000,        // NOP
660            ArmOp::I64GeS { .. } => 0xE1A00000,        // NOP
661            ArmOp::I64GeU { .. } => 0xE1A00000,        // NOP
662            ArmOp::I64Const { .. } => 0xE1A00000,      // NOP
663            ArmOp::I64Ldr { .. } => 0xE1A00000,        // NOP
664            ArmOp::I64Str { .. } => 0xE1A00000,        // NOP
665            ArmOp::I64ExtendI32S { .. } => 0xE1A00000, // NOP
666            ArmOp::I64ExtendI32U { .. } => 0xE1A00000, // NOP
667            ArmOp::I64Extend8S { .. } => 0xE1A00000,   // NOP (Thumb-2 only)
668            ArmOp::I64Extend16S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
669            ArmOp::I64Extend32S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
670            ArmOp::I32WrapI64 { .. } => 0xE1A00000,    // NOP
671
672            // f32 VFP single-precision instructions
673            ArmOp::F32Add { sd, sn, sm } => encode_vfp_3reg(0xEE300A00, sd, sn, sm)?,
674            ArmOp::F32Sub { sd, sn, sm } => encode_vfp_3reg(0xEE300A40, sd, sn, sm)?,
675            ArmOp::F32Mul { sd, sn, sm } => encode_vfp_3reg(0xEE200A00, sd, sn, sm)?,
676            ArmOp::F32Div { sd, sn, sm } => encode_vfp_3reg(0xEE800A00, sd, sn, sm)?,
677            ArmOp::F32Abs { sd, sm } => encode_vfp_2reg(0xEEB00AC0, sd, sm)?,
678            ArmOp::F32Neg { sd, sm } => encode_vfp_2reg(0xEEB10A40, sd, sm)?,
679            ArmOp::F32Sqrt { sd, sm } => encode_vfp_2reg(0xEEB10AC0, sd, sm)?,
680
681            // f32 pseudo-ops — multi-instruction sequences
682            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
683            ArmOp::F32Ceil { sd, sm } => {
684                return self.encode_arm_f32_rounding(sd, sm, 0b01); // Round toward +Inf
685            }
686            ArmOp::F32Floor { sd, sm } => {
687                return self.encode_arm_f32_rounding(sd, sm, 0b10); // Round toward -Inf
688            }
689            ArmOp::F32Trunc { sd, sm } => {
690                return self.encode_arm_f32_rounding(sd, sm, 0b11); // VCVT toward zero
691            }
692            ArmOp::F32Nearest { sd, sm } => {
693                return self.encode_arm_f32_rounding(sd, sm, 0b00); // VCVT to nearest
694            }
695            ArmOp::F32Min { sd, sn, sm } => {
696                return self.encode_arm_f32_minmax(sd, sn, sm, true);
697            }
698            ArmOp::F32Max { sd, sn, sm } => {
699                return self.encode_arm_f32_minmax(sd, sn, sm, false);
700            }
701            ArmOp::F32Copysign { sd, sn, sm } => {
702                return self.encode_arm_f32_copysign(sd, sn, sm);
703            }
704
705            // f32 comparisons — multi-instruction: VCMP + VMRS + conditional MOV
706            ArmOp::F32Eq { rd, sn, sm } => {
707                return self.encode_arm_f32_compare(rd, sn, sm, 0x0); // EQ
708            }
709            ArmOp::F32Ne { rd, sn, sm } => {
710                return self.encode_arm_f32_compare(rd, sn, sm, 0x1); // NE
711            }
712            ArmOp::F32Lt { rd, sn, sm } => {
713                return self.encode_arm_f32_compare(rd, sn, sm, 0x4); // MI (less than)
714            }
715            ArmOp::F32Le { rd, sn, sm } => {
716                return self.encode_arm_f32_compare(rd, sn, sm, 0x9); // LS (less or same)
717            }
718            ArmOp::F32Gt { rd, sn, sm } => {
719                return self.encode_arm_f32_compare(rd, sn, sm, 0xC); // GT
720            }
721            ArmOp::F32Ge { rd, sn, sm } => {
722                return self.encode_arm_f32_compare(rd, sn, sm, 0xA); // GE
723            }
724
725            // f32 const — multi-instruction: MOVW + MOVT + VMOV
726            ArmOp::F32Const { sd, value } => {
727                return self.encode_arm_f32_const(sd, *value);
728            }
729
730            ArmOp::F32Load { sd, addr } => encode_vfp_ldst(0xED900A00, sd, addr)?,
731            ArmOp::F32Store { sd, addr } => encode_vfp_ldst(0xED800A00, sd, addr)?,
732
733            // f32 conversions — multi-instruction sequences
734            ArmOp::F32ConvertI32S { sd, rm } => {
735                return self.encode_arm_f32_convert_i32(sd, rm, true);
736            }
737            ArmOp::F32ConvertI32U { sd, rm } => {
738                return self.encode_arm_f32_convert_i32(sd, rm, false);
739            }
740            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
741                return Err(synth_core::Error::synthesis(
742                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
743                ));
744            }
745            ArmOp::F32ReinterpretI32 { sd, rm } => encode_vmov_core_sreg(true, sd, rm)?,
746            ArmOp::I32ReinterpretF32 { rd, sm } => encode_vmov_core_sreg(false, sm, rd)?,
747            ArmOp::I32TruncF32S { rd, sm } => {
748                return self.encode_arm_i32_trunc_f32(rd, sm, true);
749            }
750            ArmOp::I32TruncF32U { rd, sm } => {
751                return self.encode_arm_i32_trunc_f32(rd, sm, false);
752            }
753
754            // f64 VFP double-precision instructions (ARM32)
755            // F64 arithmetic: same as F32 but with sz=1 (bit 8 = 1, cp11 = 0xB)
756            ArmOp::F64Add { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B00, dd, dn, dm)?,
757            ArmOp::F64Sub { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B40, dd, dn, dm)?,
758            ArmOp::F64Mul { dd, dn, dm } => encode_vfp_3reg_f64(0xEE200B00, dd, dn, dm)?,
759            ArmOp::F64Div { dd, dn, dm } => encode_vfp_3reg_f64(0xEE800B00, dd, dn, dm)?,
760            ArmOp::F64Abs { dd, dm } => encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?,
761            ArmOp::F64Neg { dd, dm } => encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?,
762            ArmOp::F64Sqrt { dd, dm } => encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?,
763
764            // f64 pseudo-ops
765            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
766            ArmOp::F64Ceil { dd, dm } => {
767                return self.encode_arm_f64_rounding(dd, dm, 0b01);
768            }
769            ArmOp::F64Floor { dd, dm } => {
770                return self.encode_arm_f64_rounding(dd, dm, 0b10);
771            }
772            ArmOp::F64Trunc { dd, dm } => {
773                return self.encode_arm_f64_rounding(dd, dm, 0b11);
774            }
775            ArmOp::F64Nearest { dd, dm } => {
776                return self.encode_arm_f64_rounding(dd, dm, 0b00);
777            }
778            ArmOp::F64Min { dd, dn, dm } => {
779                return self.encode_arm_f64_minmax(dd, dn, dm, true);
780            }
781            ArmOp::F64Max { dd, dn, dm } => {
782                return self.encode_arm_f64_minmax(dd, dn, dm, false);
783            }
784            ArmOp::F64Copysign { dd, dn, dm } => {
785                return self.encode_arm_f64_copysign(dd, dn, dm);
786            }
787
788            // f64 comparisons
789            ArmOp::F64Eq { rd, dn, dm } => {
790                return self.encode_arm_f64_compare(rd, dn, dm, 0x0);
791            }
792            ArmOp::F64Ne { rd, dn, dm } => {
793                return self.encode_arm_f64_compare(rd, dn, dm, 0x1);
794            }
795            ArmOp::F64Lt { rd, dn, dm } => {
796                return self.encode_arm_f64_compare(rd, dn, dm, 0x4);
797            }
798            ArmOp::F64Le { rd, dn, dm } => {
799                return self.encode_arm_f64_compare(rd, dn, dm, 0x9);
800            }
801            ArmOp::F64Gt { rd, dn, dm } => {
802                return self.encode_arm_f64_compare(rd, dn, dm, 0xC);
803            }
804            ArmOp::F64Ge { rd, dn, dm } => {
805                return self.encode_arm_f64_compare(rd, dn, dm, 0xA);
806            }
807
808            ArmOp::F64Const { dd, value } => {
809                return self.encode_arm_f64_const(dd, *value);
810            }
811
812            ArmOp::F64Load { dd, addr } => encode_vfp_ldst_f64(0xED900B00, dd, addr)?,
813            ArmOp::F64Store { dd, addr } => encode_vfp_ldst_f64(0xED800B00, dd, addr)?,
814
815            ArmOp::F64ConvertI32S { dd, rm } => {
816                return self.encode_arm_f64_convert_i32(dd, rm, true);
817            }
818            ArmOp::F64ConvertI32U { dd, rm } => {
819                return self.encode_arm_f64_convert_i32(dd, rm, false);
820            }
821            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
822                return Err(synth_core::Error::synthesis(
823                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
824                ));
825            }
826            ArmOp::F64PromoteF32 { dd, sm } => {
827                return self.encode_arm_f64_promote_f32(dd, sm);
828            }
829            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => {
830                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?
831            }
832            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => {
833                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?
834            }
835            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
836                return Err(synth_core::Error::synthesis(
837                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
838                ));
839            }
840            ArmOp::I32TruncF64S { rd, dm } => {
841                return self.encode_arm_i32_trunc_f64(rd, dm, true);
842            }
843            ArmOp::I32TruncF64U { rd, dm } => {
844                return self.encode_arm_i32_trunc_f64(rd, dm, false);
845            }
846            // Multi-instruction sequences - only meaningful in Thumb-2 mode
847            ArmOp::I64SetCond { .. }
848            | ArmOp::I64SetCondZ { .. }
849            | ArmOp::I64Mul { .. }
850            | ArmOp::I64Shl { .. }
851            | ArmOp::I64ShrS { .. }
852            | ArmOp::I64ShrU { .. }
853            | ArmOp::I64Rotl { .. }
854            | ArmOp::I64Rotr { .. } => 0xE1A00000, // NOP (Thumb-2 only)
855
856            // MVE instructions — Thumb-2 only (Cortex-M55 is always Thumb-2)
857            ArmOp::MveLoad { .. }
858            | ArmOp::MveStore { .. }
859            | ArmOp::MveConst { .. }
860            | ArmOp::MveAnd { .. }
861            | ArmOp::MveOrr { .. }
862            | ArmOp::MveEor { .. }
863            | ArmOp::MveMvn { .. }
864            | ArmOp::MveBic { .. }
865            | ArmOp::MveAddI { .. }
866            | ArmOp::MveSubI { .. }
867            | ArmOp::MveMulI { .. }
868            | ArmOp::MveNegI { .. }
869            | ArmOp::MveCmpEqI { .. }
870            | ArmOp::MveCmpNeI { .. }
871            | ArmOp::MveCmpLtS { .. }
872            | ArmOp::MveCmpLtU { .. }
873            | ArmOp::MveCmpGtS { .. }
874            | ArmOp::MveCmpGtU { .. }
875            | ArmOp::MveCmpLeS { .. }
876            | ArmOp::MveCmpLeU { .. }
877            | ArmOp::MveCmpGeS { .. }
878            | ArmOp::MveCmpGeU { .. }
879            | ArmOp::MveDup { .. }
880            | ArmOp::MveExtractLane { .. }
881            | ArmOp::MveInsertLane { .. }
882            | ArmOp::MveAddF32 { .. }
883            | ArmOp::MveSubF32 { .. }
884            | ArmOp::MveMulF32 { .. }
885            | ArmOp::MveNegF32 { .. }
886            | ArmOp::MveAbsF32 { .. }
887            | ArmOp::MveCmpEqF32 { .. }
888            | ArmOp::MveCmpNeF32 { .. }
889            | ArmOp::MveCmpLtF32 { .. }
890            | ArmOp::MveCmpLeF32 { .. }
891            | ArmOp::MveCmpGtF32 { .. }
892            | ArmOp::MveCmpGeF32 { .. }
893            | ArmOp::MveDupF32 { .. }
894            | ArmOp::MveExtractLaneF32 { .. }
895            | ArmOp::MveReplaceLaneF32 { .. }
896            | ArmOp::MveDivF32 { .. }
897            | ArmOp::MveSqrtF32 { .. } => 0xE1A00000, // NOP (MVE = Thumb-2 only)
898        };
899
900        // ARM32 instructions are little-endian
901        Ok(instr.to_le_bytes().to_vec())
902    }
903
904    // === ARM32 VFP multi-instruction helpers ===
905
906    /// Encode F32 comparison as ARM32: VCMP.F32 + VMRS + MOV rd,#0 + MOVcond rd,#1
907    fn encode_arm_f32_compare(
908        &self,
909        rd: &Reg,
910        sn: &VfpReg,
911        sm: &VfpReg,
912        cond_code: u32,
913    ) -> Result<Vec<u8>> {
914        let mut bytes = Vec::new();
915
916        // VCMP.F32 Sn, Sm: 0xEEB40A40 with Sn in Vd position, Sm in Vm position
917        let sn_num = vfp_sreg_to_num(sn)?;
918        let sm_num = vfp_sreg_to_num(sm)?;
919        let (vd, d) = encode_sreg(sn_num);
920        let (vm, m) = encode_sreg(sm_num);
921        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
922        bytes.extend_from_slice(&vcmp.to_le_bytes());
923
924        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
925        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
926
927        // MOV rd, #0: 0xE3A0_0000 | (rd << 12)
928        let rd_bits = reg_to_bits(rd);
929        let mov_zero = 0xE3A00000 | (rd_bits << 12);
930        bytes.extend_from_slice(&mov_zero.to_le_bytes());
931
932        // MOVcond rd, #1: cond(4) | 0011 1010 0000 rd(4) 0000 0000 0001
933        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
934        bytes.extend_from_slice(&mov_one.to_le_bytes());
935
936        Ok(bytes)
937    }
938
939    /// Encode F32 constant load as ARM32: MOVW Rt,#lo16 + MOVT Rt,#hi16 + VMOV Sd,Rt
940    fn encode_arm_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
941        let mut bytes = Vec::new();
942        let bits = value.to_bits();
943
944        // Use R12 as temp register for constant loading
945        let rt: u32 = 12; // R12/IP
946
947        // MOVW R12, #lo16: 0xE300_C000 | (imm4 << 16) | imm12
948        let lo16 = bits & 0xFFFF;
949        let movw = 0xE3000000 | (rt << 12) | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
950        bytes.extend_from_slice(&movw.to_le_bytes());
951
952        // MOVT R12, #hi16: 0xE340_C000 | (imm4 << 16) | imm12
953        let hi16 = (bits >> 16) & 0xFFFF;
954        let movt = 0xE3400000 | (rt << 12) | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
955        bytes.extend_from_slice(&movt.to_le_bytes());
956
957        // VMOV Sd, R12
958        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
959        bytes.extend_from_slice(&vmov.to_le_bytes());
960
961        Ok(bytes)
962    }
963
964    /// Encode VMOV + VCVT.F32.S32/U32 as ARM32
965    fn encode_arm_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
966        let mut bytes = Vec::new();
967
968        // VMOV Sd, Rm — move integer to VFP register
969        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
970        bytes.extend_from_slice(&vmov.to_le_bytes());
971
972        // VCVT.F32.S32 Sd, Sd (signed) or VCVT.F32.U32 Sd, Sd (unsigned)
973        // Base: 0xEEB80A40 (signed) or 0xEEB80AC0 (unsigned)
974        let sd_num = vfp_sreg_to_num(sd)?;
975        let (vd, d) = encode_sreg(sd_num);
976        let (vm, m) = encode_sreg(sd_num); // same register as source
977        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
978        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
979        bytes.extend_from_slice(&vcvt.to_le_bytes());
980
981        Ok(bytes)
982    }
983
984    /// Encode F32 rounding pseudo-op as ARM32 via VCVT to integer and back.
985    /// mode: 0b00=nearest, 0b01=floor(-Inf), 0b10=ceil(+Inf), 0b11=trunc(zero)
986    /// Strategy: VCVT.S32.F32 Sd, Sm (toward zero), then VCVT.F32.S32 Sd, Sd
987    /// For ceil/floor/nearest, we use VCVTR (round toward mode) + convert back.
988    /// Simplified: convert to int (toward zero for trunc) then back to float.
989    /// Encode F32 rounding as ARM32.
990    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
991    ///
992    /// For trunc (mode=0b11): uses VCVTR.S32.F32 (always rounds toward zero).
993    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant
994    /// which honours FPSCR rmode), then restores FPSCR.
995    fn encode_arm_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
996        let mut bytes = Vec::new();
997        let sm_num = vfp_sreg_to_num(sm)?;
998        let sd_num = vfp_sreg_to_num(sd)?;
999        let (vd_s, d_s) = encode_sreg(sd_num);
1000        let (vm_s, m_s) = encode_sreg(sm_num);
1001
1002        if mode == 0b11 {
1003            // Trunc (toward zero): VCVTR.S32.F32 — the "R" variant always truncates.
1004            // 0xEEBD0AC0: bit[7]=1 => round toward zero regardless of FPSCR
1005            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1006            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1007        } else {
1008            // ceil/floor/nearest: manipulate FPSCR rounding mode
1009            let rt: u32 = 12; // R12/IP as temp
1010
1011            // VMRS R12, FPSCR
1012            let vmrs = 0xEEF10A10 | (rt << 12);
1013            bytes.extend_from_slice(&vmrs.to_le_bytes());
1014
1015            // BIC R12, R12, #(3 << 22) — clear RMode bits [23:22]
1016            // 3<<22 = 0x00C00000. ARM rotated imm: 0x03 ror 10 (rotation=5, imm8=0x03)
1017            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1018            bytes.extend_from_slice(&bic.to_le_bytes());
1019
1020            // ORR R12, R12, #(mode << 22) — set desired rounding mode
1021            if mode != 0 {
1022                // mode<<22: rotation=5, imm8=mode
1023                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1024                bytes.extend_from_slice(&orr.to_le_bytes());
1025            }
1026
1027            // VMSR FPSCR, R12
1028            let vmsr = 0xEEE10A10 | (rt << 12);
1029            bytes.extend_from_slice(&vmsr.to_le_bytes());
1030
1031            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rounding mode
1032            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1033            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1034
1035            // Restore FPSCR: clear rmode bits back to nearest (default)
1036            bytes.extend_from_slice(&vmrs.to_le_bytes());
1037            bytes.extend_from_slice(&bic.to_le_bytes());
1038            bytes.extend_from_slice(&vmsr.to_le_bytes());
1039        }
1040
1041        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
1042        let (vd2, d2) = encode_sreg(sd_num);
1043        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
1044        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1045
1046        Ok(bytes)
1047    }
1048
1049    /// Encode F32 min/max as ARM32: VCMP + VMRS + conditional VMOV
1050    fn encode_arm_f32_minmax(
1051        &self,
1052        sd: &VfpReg,
1053        sn: &VfpReg,
1054        sm: &VfpReg,
1055        is_min: bool,
1056    ) -> Result<Vec<u8>> {
1057        let mut bytes = Vec::new();
1058        let sn_num = vfp_sreg_to_num(sn)?;
1059        let sm_num = vfp_sreg_to_num(sm)?;
1060        let sd_num = vfp_sreg_to_num(sd)?;
1061
1062        // VMOV Sd, Sn (start with first operand)
1063        let (vd, d) = encode_sreg(sd_num);
1064        let (vn, n) = encode_sreg(sn_num);
1065        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1066        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1067
1068        // VCMP.F32 Sn, Sm
1069        let (vm, m) = encode_sreg(sm_num);
1070        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1071        bytes.extend_from_slice(&vcmp.to_le_bytes());
1072
1073        // VMRS APSR_nzcv, FPSCR
1074        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1075
1076        // For min: if Sn > Sm (GT), use Sm. Condition = GT (0xC)
1077        // For max: if Sn < Sm (MI/LT), use Sm. Condition = MI (0x4)
1078        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1079
1080        // VMOV{cond} Sd, Sm — conditional VMOV
1081        let vmov_cond = (cond << 28) | 0x0EB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1082        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1083
1084        Ok(bytes)
1085    }
1086
1087    /// Encode F32 copysign as ARM32: extract sign from Sm, magnitude from Sn
1088    fn encode_arm_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1089        let mut bytes = Vec::new();
1090
1091        // VMOV R12, Sm (get sign source bits)
1092        let vmov_sm = encode_vmov_core_sreg(false, sm, &Reg::R12)?;
1093        bytes.extend_from_slice(&vmov_sm.to_le_bytes());
1094
1095        // VMOV R0, Sn (get magnitude source bits) — use R0 as temp
1096        let vmov_sn = encode_vmov_core_sreg(false, sn, &Reg::R0)?;
1097        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1098
1099        // AND R12, R12, #0x80000000 (keep only sign bit)
1100        // Thumb-2 constant 0x80000000 needs special encoding; in ARM32 use rotated imm
1101        // 0x80000000 = 0x02 rotated right by 2 (rotation=1, imm8=0x02)
1102        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1103        bytes.extend_from_slice(&and_sign.to_le_bytes());
1104
1105        // BIC R0, R0, #0x80000000 (clear sign bit from magnitude)
1106        // R0 = register 0, so Rn and Rd fields are 0
1107        let bic_sign = 0xE3C00000u32 | (1 << 8) | 0x02;
1108        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1109
1110        // ORR R0, R0, R12 (combine sign + magnitude)
1111        // R0 = register 0, so Rn and Rd fields are 0
1112        let orr = 0xE1800000u32 | 12;
1113        bytes.extend_from_slice(&orr.to_le_bytes());
1114
1115        // VMOV Sd, R0
1116        let vmov_result = encode_vmov_core_sreg(true, sd, &Reg::R0)?;
1117        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1118
1119        Ok(bytes)
1120    }
1121
1122    /// Encode F64 comparison as ARM32: VCMP.F64 + VMRS + MOV rd,#0 + MOVcond rd,#1
1123    fn encode_arm_f64_compare(
1124        &self,
1125        rd: &Reg,
1126        dn: &VfpReg,
1127        dm: &VfpReg,
1128        cond_code: u32,
1129    ) -> Result<Vec<u8>> {
1130        let mut bytes = Vec::new();
1131
1132        // VCMP.F64 Dn, Dm: 0xEEB40B40 with Dn in Vd position, Dm in Vm position
1133        let dn_num = vfp_dreg_to_num(dn)?;
1134        let dm_num = vfp_dreg_to_num(dm)?;
1135        let (vd, d) = encode_dreg(dn_num);
1136        let (vm, m) = encode_dreg(dm_num);
1137        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1138        bytes.extend_from_slice(&vcmp.to_le_bytes());
1139
1140        // VMRS APSR_nzcv, FPSCR
1141        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1142
1143        // MOV rd, #0
1144        let rd_bits = reg_to_bits(rd);
1145        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1146        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1147
1148        // MOVcond rd, #1
1149        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1150        bytes.extend_from_slice(&mov_one.to_le_bytes());
1151
1152        Ok(bytes)
1153    }
1154
1155    /// Encode F64 constant load as ARM32: MOVW + MOVT + MOVW + MOVT + VMOV
1156    fn encode_arm_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
1157        let mut bytes = Vec::new();
1158        let bits = value.to_bits();
1159        let lo32 = bits as u32;
1160        let hi32 = (bits >> 32) as u32;
1161
1162        // Load low 32 bits into R0 (Rd field = 0 for R0)
1163        let lo16 = lo32 & 0xFFFF;
1164        let movw_r0 = 0xE3000000 | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1165        bytes.extend_from_slice(&movw_r0.to_le_bytes());
1166        let hi16 = (lo32 >> 16) & 0xFFFF;
1167        let movt_r0 = 0xE3400000 | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1168        bytes.extend_from_slice(&movt_r0.to_le_bytes());
1169
1170        // Load high 32 bits into R12
1171        let lo16 = hi32 & 0xFFFF;
1172        let movw_r12 = 0xE3000000 | ((lo16 >> 12) << 16) | (12 << 12) | (lo16 & 0xFFF);
1173        bytes.extend_from_slice(&movw_r12.to_le_bytes());
1174        let hi16 = (hi32 >> 16) & 0xFFFF;
1175        let movt_r12 = 0xE3400000 | ((hi16 >> 12) << 16) | (12 << 12) | (hi16 & 0xFFF);
1176        bytes.extend_from_slice(&movt_r12.to_le_bytes());
1177
1178        // VMOV Dd, R0, R12
1179        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
1180        bytes.extend_from_slice(&vmov.to_le_bytes());
1181
1182        Ok(bytes)
1183    }
1184
1185    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as ARM32
1186    fn encode_arm_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1187        let mut bytes = Vec::new();
1188
1189        // Use S0 as intermediate: VMOV S0, Rm
1190        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
1191        bytes.extend_from_slice(&vmov.to_le_bytes());
1192
1193        // VCVT.F64.S32 Dd, S0 (signed) or VCVT.F64.U32 Dd, S0 (unsigned)
1194        // Base: 0xEEB80B40 (signed) or 0xEEB80BC0 (unsigned)
1195        let dd_num = vfp_dreg_to_num(dd)?;
1196        let (vd, d) = encode_dreg(dd_num);
1197        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
1198        // S0 is register 0: Vm=0, M=0
1199        let vcvt = base | (d << 22) | (vd << 12);
1200        bytes.extend_from_slice(&vcvt.to_le_bytes());
1201
1202        Ok(bytes)
1203    }
1204
1205    /// Encode VCVT.F64.F32 Dd, Sm as ARM32 (f32 to f64 promotion)
1206    fn encode_arm_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1207        let dd_num = vfp_dreg_to_num(dd)?;
1208        let sm_num = vfp_sreg_to_num(sm)?;
1209        let (vd, d) = encode_dreg(dd_num);
1210        let (vm, m) = encode_sreg(sm_num);
1211
1212        // VCVT.F64.F32 Dd, Sm: 0xEEB70AC0
1213        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
1214        Ok(vcvt.to_le_bytes().to_vec())
1215    }
1216
1217    /// Encode VCVT.S32/U32.F64 Sd, Dm + VMOV Rd, Sd as ARM32
1218    fn encode_arm_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1219        let mut bytes = Vec::new();
1220        let dm_num = vfp_dreg_to_num(dm)?;
1221        let (vm, m) = encode_dreg(dm_num);
1222
1223        // VCVT.S32.F64 S0, Dm (toward zero) or VCVT.U32.F64 S0, Dm
1224        // S0: Vd=0, D=0
1225        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
1226        let vcvt = base | (m << 5) | vm;
1227        bytes.extend_from_slice(&vcvt.to_le_bytes());
1228
1229        // VMOV Rd, S0
1230        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
1231        bytes.extend_from_slice(&vmov.to_le_bytes());
1232
1233        Ok(bytes)
1234    }
1235
1236    /// Encode F64 rounding pseudo-op as ARM32 via VCVT to integer and back.
1237    /// Encode F64 rounding as ARM32.
1238    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1239    ///
1240    /// For trunc: uses VCVTR.S32.F64 (always truncates).
1241    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F64 (non-R variant),
1242    /// then restores FPSCR.
1243    fn encode_arm_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1244        let mut bytes = Vec::new();
1245        let dm_num = vfp_dreg_to_num(dm)?;
1246        let dd_num = vfp_dreg_to_num(dd)?;
1247        let (vm, m) = encode_dreg(dm_num);
1248        let (vd, d) = encode_dreg(dd_num);
1249
1250        if mode == 0b11 {
1251            // Trunc (toward zero): VCVTR.S32.F64 — bit[7]=1, always truncates
1252            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
1253            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1254        } else {
1255            // ceil/floor/nearest: manipulate FPSCR rounding mode
1256            let rt: u32 = 12;
1257
1258            // VMRS R12, FPSCR
1259            let vmrs = 0xEEF10A10 | (rt << 12);
1260            bytes.extend_from_slice(&vmrs.to_le_bytes());
1261
1262            // BIC R12, R12, #(3 << 22)
1263            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1264            bytes.extend_from_slice(&bic.to_le_bytes());
1265
1266            // ORR R12, R12, #(mode << 22)
1267            if mode != 0 {
1268                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1269                bytes.extend_from_slice(&orr.to_le_bytes());
1270            }
1271
1272            // VMSR FPSCR, R12
1273            let vmsr = 0xEEE10A10 | (rt << 12);
1274            bytes.extend_from_slice(&vmsr.to_le_bytes());
1275
1276            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0), uses FPSCR rmode
1277            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
1278            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1279
1280            // Restore FPSCR
1281            bytes.extend_from_slice(&vmrs.to_le_bytes());
1282            bytes.extend_from_slice(&bic.to_le_bytes());
1283            bytes.extend_from_slice(&vmsr.to_le_bytes());
1284        }
1285
1286        // VCVT.F64.S32 Dd, S0 (convert back to double)
1287        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
1288        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1289
1290        Ok(bytes)
1291    }
1292
1293    /// Encode F64 min/max as ARM32: VMOV + VCMP + VMRS + conditional VMOV
1294    fn encode_arm_f64_minmax(
1295        &self,
1296        dd: &VfpReg,
1297        dn: &VfpReg,
1298        dm: &VfpReg,
1299        is_min: bool,
1300    ) -> Result<Vec<u8>> {
1301        let mut bytes = Vec::new();
1302        let dn_num = vfp_dreg_to_num(dn)?;
1303        let dm_num = vfp_dreg_to_num(dm)?;
1304        let dd_num = vfp_dreg_to_num(dd)?;
1305
1306        // VMOV.F64 Dd, Dn (start with first operand)
1307        let (vd, d) = encode_dreg(dd_num);
1308        let (vn, n) = encode_dreg(dn_num);
1309        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1310        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1311
1312        // VCMP.F64 Dn, Dm
1313        let (vm, m) = encode_dreg(dm_num);
1314        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1315        bytes.extend_from_slice(&vcmp.to_le_bytes());
1316
1317        // VMRS APSR_nzcv, FPSCR
1318        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1319
1320        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1321        let vmov_cond = (cond << 28) | 0x0EB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1322        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1323
1324        Ok(bytes)
1325    }
1326
1327    /// Encode F64 copysign as ARM32
1328    fn encode_arm_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
1329        let mut bytes = Vec::new();
1330
1331        // VMOV R0, R12, Dm (get sign source bits)
1332        let vmov_dm = encode_vmov_core_dreg(false, dm, &Reg::R0, &Reg::R12)?;
1333        bytes.extend_from_slice(&vmov_dm.to_le_bytes());
1334
1335        // VMOV R1, R2, Dn (get magnitude source bits)
1336        // We use R1 (lo) and R2 (hi) for the magnitude
1337        let vmov_dn = encode_vmov_core_dreg(false, dn, &Reg::R1, &Reg::R2)?;
1338        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1339
1340        // AND R12, R12, #0x80000000 (keep only sign bit from hi word)
1341        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1342        bytes.extend_from_slice(&and_sign.to_le_bytes());
1343
1344        // BIC R2, R2, #0x80000000 (clear sign bit from magnitude hi word)
1345        let bic_sign = 0xE3C00000u32 | (2 << 16) | (2 << 12) | (1 << 8) | 0x02;
1346        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1347
1348        // ORR R2, R2, R12 (combine sign + magnitude)
1349        let orr = 0xE1800000u32 | (2 << 16) | (2 << 12) | 12;
1350        bytes.extend_from_slice(&orr.to_le_bytes());
1351
1352        // VMOV Dd, R1, R2
1353        let vmov_result = encode_vmov_core_dreg(true, dd, &Reg::R1, &Reg::R2)?;
1354        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1355
1356        Ok(bytes)
1357    }
1358
1359    /// Encode VCVT.S32/U32.F32 + VMOV as ARM32
1360    fn encode_arm_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1361        let mut bytes = Vec::new();
1362
1363        // VCVT.S32.F32 Sd, Sm (toward zero) or VCVT.U32.F32 Sd, Sm
1364        // We use Sm as both source and destination for the intermediate result
1365        let sm_num = vfp_sreg_to_num(sm)?;
1366        let (vd, d) = encode_sreg(sm_num);
1367        let (vm, m) = encode_sreg(sm_num);
1368        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
1369        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1370        bytes.extend_from_slice(&vcvt.to_le_bytes());
1371
1372        // VMOV Rd, Sm — move result back to core register
1373        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
1374        bytes.extend_from_slice(&vmov.to_le_bytes());
1375
1376        Ok(bytes)
1377    }
1378
1379    /// Encode an ARM instruction in Thumb-2 mode (16-bit or 32-bit instructions)
1380    fn encode_thumb(&self, op: &ArmOp) -> Result<Vec<u8>> {
1381        // Thumb-2 supports both 16-bit and 32-bit instructions
1382        // 32-bit instructions are encoded as two 16-bit halfwords (big-endian order)
1383        match op {
1384            // === 16-bit Thumb encodings ===
1385            ArmOp::Add { rd, rn, op2 } => {
1386                let rd_bits = reg_to_bits(rd) as u16;
1387                let rn_bits = reg_to_bits(rn) as u16;
1388
1389                if let Operand2::Reg(rm) = op2 {
1390                    let rm_bits = reg_to_bits(rm) as u16;
1391                    // 16-bit ADDS only has 3-bit register fields (R0-R7). For
1392                    // high registers (e.g. R12, the MemLoad/MemStore base
1393                    // scratch) the bits overflow into adjacent fields, silently
1394                    // corrupting the operands — issue #178/#180: `add ip,ip,r0`
1395                    // was emitted as `adds r4,r5,r1`. Guard on all three regs
1396                    // being low and fall back to 32-bit ADD.W otherwise, exactly
1397                    // as the Sub handler below does.
1398                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1399                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1400                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1401                        Ok(instr.to_le_bytes().to_vec())
1402                    } else {
1403                        // ADD.W Rd, Rn, Rm (32-bit) for high registers
1404                        self.encode_thumb32_add_reg_raw(
1405                            rd_bits as u32,
1406                            rn_bits as u32,
1407                            rm_bits as u32,
1408                        )
1409                    }
1410                } else if let Operand2::Imm(imm) = op2 {
1411                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1412                        // ADDS Rd, Rn, #imm3 (16-bit): 0001 110 imm3 Rn Rd
1413                        let instr: u16 = 0x1C00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1414                        Ok(instr.to_le_bytes().to_vec())
1415                    } else {
1416                        // Use 32-bit ADD for larger immediates
1417                        self.encode_thumb32_add(rd, rn, *imm as u32)
1418                    }
1419                } else {
1420                    // Fallback to 32-bit encoding
1421                    self.encode_thumb32_add(rd, rn, 0)
1422                }
1423            }
1424
1425            ArmOp::Sub { rd, rn, op2 } => {
1426                let rd_bits = reg_to_bits(rd) as u16;
1427                let rn_bits = reg_to_bits(rn) as u16;
1428
1429                if let Operand2::Reg(rm) = op2 {
1430                    let rm_bits = reg_to_bits(rm) as u16;
1431                    // 16-bit SUBS can only use low registers (R0-R7)
1432                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1433                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1434                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1435                        Ok(instr.to_le_bytes().to_vec())
1436                    } else {
1437                        // Use 32-bit SUB.W for high registers
1438                        self.encode_thumb32_sub_reg_raw(
1439                            rd_bits as u32,
1440                            rn_bits as u32,
1441                            rm_bits as u32,
1442                        )
1443                    }
1444                } else if let Operand2::Imm(imm) = op2 {
1445                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1446                        // SUBS Rd, Rn, #imm3 (16-bit): 0001 111 imm3 Rn Rd
1447                        let instr: u16 = 0x1E00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1448                        Ok(instr.to_le_bytes().to_vec())
1449                    } else {
1450                        self.encode_thumb32_sub(rd, rn, *imm as u32)
1451                    }
1452                } else {
1453                    self.encode_thumb32_sub(rd, rn, 0)
1454                }
1455            }
1456
1457            ArmOp::Mov { rd, op2 } => {
1458                let rd_bits = reg_to_bits(rd) as u16;
1459
1460                if let Operand2::Imm(imm) = op2 {
1461                    if *imm <= 255 && rd_bits < 8 {
1462                        // MOVS Rd, #imm8 (16-bit): 0010 0 Rd imm8
1463                        let imm_bits = (*imm as u16) & 0xFF;
1464                        let instr: u16 = 0x2000 | (rd_bits << 8) | imm_bits;
1465                        Ok(instr.to_le_bytes().to_vec())
1466                    } else {
1467                        // Use 32-bit MOVW for larger immediates
1468                        self.encode_thumb32_movw(rd, *imm as u32)
1469                    }
1470                } else if let Operand2::Reg(rm) = op2 {
1471                    let rm_bits = reg_to_bits(rm) as u16;
1472                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
1473                    // D = Rd[3], Rd[2:0] in lower bits
1474                    let d_bit = (rd_bits >> 3) & 1;
1475                    let instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
1476                    Ok(instr.to_le_bytes().to_vec())
1477                } else {
1478                    let instr: u16 = 0xBF00; // NOP fallback
1479                    Ok(instr.to_le_bytes().to_vec())
1480                }
1481            }
1482
1483            ArmOp::Push { regs } => {
1484                // Thumb-2 PUSH encoding:
1485                // If all regs in R0-R7 + LR, use 16-bit: 1011 010 M rrrrrrrr
1486                // Otherwise use 32-bit: STMDB SP!, {regs} = 1110 1001 0010 1101 | 0M0 reglist(13)
1487                let mut reg_list: u16 = 0;
1488                let mut need_32bit = false;
1489                for r in regs {
1490                    let bit = reg_to_bits(r);
1491                    if bit >= 8 && *r != Reg::LR {
1492                        need_32bit = true;
1493                    }
1494                    reg_list |= 1 << bit;
1495                }
1496                if !need_32bit {
1497                    // 16-bit PUSH: 1011 010 M rrrrrrrr
1498                    let m_bit = if reg_list & (1 << 14) != 0 {
1499                        1u16
1500                    } else {
1501                        0u16
1502                    };
1503                    let low_regs = reg_list & 0xFF;
1504                    let instr: u16 = 0xB400 | (m_bit << 8) | low_regs;
1505                    Ok(instr.to_le_bytes().to_vec())
1506                } else {
1507                    // 32-bit STMDB SP!, {regs}: E92D | reglist(16)
1508                    let hw1: u16 = 0xE92D;
1509                    let hw2: u16 = reg_list;
1510                    let mut bytes = hw1.to_le_bytes().to_vec();
1511                    bytes.extend_from_slice(&hw2.to_le_bytes());
1512                    Ok(bytes)
1513                }
1514            }
1515
1516            ArmOp::Pop { regs } => {
1517                // Thumb-2 POP encoding:
1518                // If all regs in R0-R7 + PC, use 16-bit: 1011 110 P rrrrrrrr
1519                // Otherwise use 32-bit: LDMIA SP!, {regs} = 1110 1000 1011 1101 | PM0 reglist(13)
1520                let mut reg_list: u16 = 0;
1521                let mut need_32bit = false;
1522                for r in regs {
1523                    let bit = reg_to_bits(r);
1524                    if bit >= 8 && *r != Reg::PC {
1525                        need_32bit = true;
1526                    }
1527                    reg_list |= 1 << bit;
1528                }
1529                if !need_32bit {
1530                    // 16-bit POP: 1011 110 P rrrrrrrr
1531                    let p_bit = if reg_list & (1 << 15) != 0 {
1532                        1u16
1533                    } else {
1534                        0u16
1535                    };
1536                    let low_regs = reg_list & 0xFF;
1537                    let instr: u16 = 0xBC00 | (p_bit << 8) | low_regs;
1538                    Ok(instr.to_le_bytes().to_vec())
1539                } else {
1540                    // 32-bit LDMIA SP!, {regs}: E8BD | reglist(16)
1541                    let hw1: u16 = 0xE8BD;
1542                    let hw2: u16 = reg_list;
1543                    let mut bytes = hw1.to_le_bytes().to_vec();
1544                    bytes.extend_from_slice(&hw2.to_le_bytes());
1545                    Ok(bytes)
1546                }
1547            }
1548
1549            ArmOp::Nop => {
1550                let instr: u16 = 0xBF00; // NOP in Thumb-2
1551                Ok(instr.to_le_bytes().to_vec())
1552            }
1553
1554            ArmOp::Udf { imm } => {
1555                // UDF (Undefined) in Thumb-2: 16-bit encoding is 0xDE00 | imm8
1556                // This triggers UsageFault/HardFault, used for WASM traps
1557                let instr: u16 = 0xDE00 | (*imm as u16);
1558                let bytes = instr.to_le_bytes().to_vec();
1559                encoding_contracts::verify_thumb16(&bytes);
1560                Ok(bytes)
1561            }
1562
1563            // i64 support: ADDS, ADC, SUBS, SBC for register pair arithmetic
1564            // ADDS sets flags (carry), ADC uses carry from previous ADDS
1565            ArmOp::Adds { rd, rn, op2 } => {
1566                let rd_bits = reg_to_bits(rd) as u16;
1567                let rn_bits = reg_to_bits(rn) as u16;
1568
1569                if let Operand2::Reg(rm) = op2 {
1570                    let rm_bits = reg_to_bits(rm) as u16;
1571                    // 16-bit ADDS is R0-R7 only; i64 pair allocation can place
1572                    // operands in R8-R11, which would overflow the 3-bit fields
1573                    // and corrupt the operands (#178/#180 class). Guard and fall
1574                    // back to 32-bit ADDS.W for high registers.
1575                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1576                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1577                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1578                        Ok(instr.to_le_bytes().to_vec())
1579                    } else {
1580                        self.encode_thumb32_adds_reg_raw(
1581                            rd_bits as u32,
1582                            rn_bits as u32,
1583                            rm_bits as u32,
1584                        )
1585                    }
1586                } else {
1587                    // 32-bit Thumb-2 ADDS with immediate
1588                    self.encode_thumb32_adds(rd, rn, 0)
1589                }
1590            }
1591
1592            // ADC: Add with Carry (Thumb-2 32-bit)
1593            // ADC.W Rd, Rn, Rm: EB40 Rn | 00 Rd 00 Rm
1594            ArmOp::Adc { rd, rn, op2 } => {
1595                let rd_bits = reg_to_bits(rd);
1596                let rn_bits = reg_to_bits(rn);
1597
1598                if let Operand2::Reg(rm) = op2 {
1599                    let rm_bits = reg_to_bits(rm);
1600                    // ADC.W Rd, Rn, Rm (T2): 1110 1011 0100 Rn | 0 000 Rd 00 00 Rm
1601                    let hw1: u16 = (0xEB40 | rn_bits) as u16;
1602                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1603
1604                    let mut bytes = hw1.to_le_bytes().to_vec();
1605                    bytes.extend_from_slice(&hw2.to_le_bytes());
1606                    Ok(bytes)
1607                } else {
1608                    // ADC with immediate - use 32-bit encoding
1609                    let hw1: u16 = (0xF140 | rn_bits) as u16;
1610                    let hw2: u16 = (rd_bits << 8) as u16;
1611                    let mut bytes = hw1.to_le_bytes().to_vec();
1612                    bytes.extend_from_slice(&hw2.to_le_bytes());
1613                    Ok(bytes)
1614                }
1615            }
1616
1617            // SUBS sets flags (borrow), SBC uses borrow from previous SUBS
1618            ArmOp::Subs { rd, rn, op2 } => {
1619                let rd_bits = reg_to_bits(rd) as u16;
1620                let rn_bits = reg_to_bits(rn) as u16;
1621
1622                if let Operand2::Reg(rm) = op2 {
1623                    let rm_bits = reg_to_bits(rm) as u16;
1624                    // 16-bit SUBS is R0-R7 only; high-register i64 pair operands
1625                    // would overflow the 3-bit fields (#178/#180 class). Guard
1626                    // and fall back to 32-bit SUBS.W for high registers.
1627                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1628                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1629                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1630                        Ok(instr.to_le_bytes().to_vec())
1631                    } else {
1632                        self.encode_thumb32_subs_reg_raw(
1633                            rd_bits as u32,
1634                            rn_bits as u32,
1635                            rm_bits as u32,
1636                        )
1637                    }
1638                } else {
1639                    // 32-bit Thumb-2 SUBS with immediate
1640                    self.encode_thumb32_subs(rd, rn, 0)
1641                }
1642            }
1643
1644            // SBC: Subtract with Carry (Thumb-2 32-bit)
1645            // SBC.W Rd, Rn, Rm: EB60 Rn | 00 Rd 00 Rm
1646            ArmOp::Sbc { rd, rn, op2 } => {
1647                let rd_bits = reg_to_bits(rd);
1648                let rn_bits = reg_to_bits(rn);
1649
1650                if let Operand2::Reg(rm) = op2 {
1651                    let rm_bits = reg_to_bits(rm);
1652                    // SBC.W Rd, Rn, Rm (T2): 1110 1011 0110 Rn | 0 000 Rd 00 00 Rm
1653                    let hw1: u16 = (0xEB60 | rn_bits) as u16;
1654                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1655
1656                    let mut bytes = hw1.to_le_bytes().to_vec();
1657                    bytes.extend_from_slice(&hw2.to_le_bytes());
1658                    Ok(bytes)
1659                } else {
1660                    // SBC with immediate - use 32-bit encoding
1661                    let hw1: u16 = (0xF160 | rn_bits) as u16;
1662                    let hw2: u16 = (rd_bits << 8) as u16;
1663                    let mut bytes = hw1.to_le_bytes().to_vec();
1664                    bytes.extend_from_slice(&hw2.to_le_bytes());
1665                    Ok(bytes)
1666                }
1667            }
1668
1669            // === 32-bit Thumb-2 encodings ===
1670
1671            // SDIV: 11111011 1001 Rn 1111 Rd 1111 Rm
1672            ArmOp::Sdiv { rd, rn, rm } => {
1673                let rd_bits = reg_to_bits(rd);
1674                let rn_bits = reg_to_bits(rn);
1675                let rm_bits = reg_to_bits(rm);
1676                reg_bits_checked(rd_bits)?;
1677                reg_bits_checked(rn_bits)?;
1678                reg_bits_checked(rm_bits)?;
1679
1680                // Thumb-2 SDIV: FB90 F0F0 | Rn<<16 | Rd<<8 | Rm
1681                // First halfword: 1111 1011 1001 Rn = 0xFB90 | Rn
1682                // Second halfword: 1111 Rd 1111 Rm = 0xF0F0 | Rd<<8 | Rm
1683                let hw1: u16 = (0xFB90 | rn_bits) as u16;
1684                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1685
1686                // Thumb-2 32-bit instructions: first halfword, then second halfword (little-endian each)
1687                let mut bytes = hw1.to_le_bytes().to_vec();
1688                bytes.extend_from_slice(&hw2.to_le_bytes());
1689                encoding_contracts::verify_thumb32(&bytes);
1690                Ok(bytes)
1691            }
1692
1693            // UDIV: 11111011 1011 Rn 1111 Rd 1111 Rm
1694            ArmOp::Udiv { rd, rn, rm } => {
1695                let rd_bits = reg_to_bits(rd);
1696                let rn_bits = reg_to_bits(rn);
1697                let rm_bits = reg_to_bits(rm);
1698                reg_bits_checked(rd_bits)?;
1699                reg_bits_checked(rn_bits)?;
1700                reg_bits_checked(rm_bits)?;
1701
1702                // Thumb-2 UDIV: FBB0 F0F0 | Rn<<16 | Rd<<8 | Rm
1703                let hw1: u16 = (0xFBB0 | rn_bits) as u16;
1704                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1705
1706                let mut bytes = hw1.to_le_bytes().to_vec();
1707                bytes.extend_from_slice(&hw2.to_le_bytes());
1708                encoding_contracts::verify_thumb32(&bytes);
1709                Ok(bytes)
1710            }
1711
1712            // MUL (Thumb-2 32-bit): MUL Rd, Rn, Rm
1713            ArmOp::Mul { rd, rn, rm } => {
1714                let rd_bits = reg_to_bits(rd);
1715                let rn_bits = reg_to_bits(rn);
1716                let rm_bits = reg_to_bits(rm);
1717
1718                // Thumb-2 MUL: FB00 F000 | Rn | Rd<<8 | Rm
1719                // 11111011 0000 Rn | 1111 Rd 0000 Rm
1720                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1721                let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
1722
1723                let mut bytes = hw1.to_le_bytes().to_vec();
1724                bytes.extend_from_slice(&hw2.to_le_bytes());
1725                Ok(bytes)
1726            }
1727
1728            // MLS: Rd = Ra - Rn * Rm
1729            ArmOp::Mls { rd, rn, rm, ra } => {
1730                let rd_bits = reg_to_bits(rd);
1731                let rn_bits = reg_to_bits(rn);
1732                let rm_bits = reg_to_bits(rm);
1733                let ra_bits = reg_to_bits(ra);
1734
1735                // Thumb-2 MLS: FB00 Rn | Ra Rd 0001 Rm
1736                // 11111011 0000 Rn | Ra Rd 0001 Rm
1737                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1738                let hw2: u16 = ((ra_bits << 12) | (rd_bits << 8) | 0x10 | rm_bits) as u16;
1739
1740                let mut bytes = hw1.to_le_bytes().to_vec();
1741                bytes.extend_from_slice(&hw2.to_le_bytes());
1742                Ok(bytes)
1743            }
1744
1745            // AND (Thumb-2 32-bit)
1746            ArmOp::And { rd, rn, op2 } => {
1747                if let Operand2::Reg(rm) = op2 {
1748                    let rd_bits = reg_to_bits(rd);
1749                    let rn_bits = reg_to_bits(rn);
1750                    let rm_bits = reg_to_bits(rm);
1751
1752                    // Thumb-2 AND register: EA00 Rn | 0 Rd 00 00 Rm
1753                    let hw1: u16 = (0xEA00 | rn_bits) as u16;
1754                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1755
1756                    let mut bytes = hw1.to_le_bytes().to_vec();
1757                    bytes.extend_from_slice(&hw2.to_le_bytes());
1758                    Ok(bytes)
1759                } else if let Operand2::Imm(imm) = op2 {
1760                    let rd_bits = reg_to_bits(rd);
1761                    let rn_bits = reg_to_bits(rn);
1762                    let imm_val = *imm as u32;
1763
1764                    // Thumb-2 AND.W immediate T1: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
1765                    let i_bit = (imm_val >> 11) & 1;
1766                    let imm3 = (imm_val >> 8) & 0x7;
1767                    let imm8 = imm_val & 0xFF;
1768
1769                    let hw1: u16 = (0xF000 | (i_bit << 10) | rn_bits) as u16;
1770                    let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
1771
1772                    let mut bytes = hw1.to_le_bytes().to_vec();
1773                    bytes.extend_from_slice(&hw2.to_le_bytes());
1774                    Ok(bytes)
1775                } else {
1776                    // RegShift variant - fallback to NOP
1777                    let instr: u16 = 0xBF00;
1778                    Ok(instr.to_le_bytes().to_vec())
1779                }
1780            }
1781
1782            // ORR (Thumb-2 32-bit)
1783            ArmOp::Orr { rd, rn, op2 } => {
1784                if let Operand2::Reg(rm) = op2 {
1785                    let rd_bits = reg_to_bits(rd);
1786                    let rn_bits = reg_to_bits(rn);
1787                    let rm_bits = reg_to_bits(rm);
1788
1789                    // Thumb-2 ORR: EA40 Rn | 0 Rd 00 00 Rm
1790                    let hw1: u16 = (0xEA40 | rn_bits) as u16;
1791                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1792
1793                    let mut bytes = hw1.to_le_bytes().to_vec();
1794                    bytes.extend_from_slice(&hw2.to_le_bytes());
1795                    Ok(bytes)
1796                } else {
1797                    let instr: u16 = 0xBF00;
1798                    Ok(instr.to_le_bytes().to_vec())
1799                }
1800            }
1801
1802            // EOR (Thumb-2 32-bit)
1803            ArmOp::Eor { rd, rn, op2 } => {
1804                if let Operand2::Reg(rm) = op2 {
1805                    let rd_bits = reg_to_bits(rd);
1806                    let rn_bits = reg_to_bits(rn);
1807                    let rm_bits = reg_to_bits(rm);
1808
1809                    // Thumb-2 EOR: EA80 Rn | 0 Rd 00 00 Rm
1810                    let hw1: u16 = (0xEA80 | rn_bits) as u16;
1811                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1812
1813                    let mut bytes = hw1.to_le_bytes().to_vec();
1814                    bytes.extend_from_slice(&hw2.to_le_bytes());
1815                    Ok(bytes)
1816                } else {
1817                    let instr: u16 = 0xBF00;
1818                    Ok(instr.to_le_bytes().to_vec())
1819                }
1820            }
1821
1822            // Shift operations (16-bit for low registers)
1823            ArmOp::Lsl { rd, rn, shift } => {
1824                let rd_bits = reg_to_bits(rd) as u16;
1825                let rn_bits = reg_to_bits(rn) as u16;
1826                let shift_bits = (*shift as u16) & 0x1F;
1827
1828                if rd_bits < 8 && rn_bits < 8 {
1829                    // LSLS Rd, Rm, #imm5 (16-bit): 0000 0 imm5 Rm Rd
1830                    let instr: u16 = (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1831                    Ok(instr.to_le_bytes().to_vec())
1832                } else {
1833                    // Use 32-bit encoding for high registers
1834                    self.encode_thumb32_shift(rd, rn, *shift, 0b00) // LSL type
1835                }
1836            }
1837
1838            ArmOp::Lsr { rd, rn, shift } => {
1839                let rd_bits = reg_to_bits(rd) as u16;
1840                let rn_bits = reg_to_bits(rn) as u16;
1841                let shift_bits = (*shift as u16) & 0x1F;
1842
1843                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
1844                    // LSRS Rd, Rm, #imm5 (16-bit): 0000 1 imm5 Rm Rd
1845                    let instr: u16 = 0x0800 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1846                    Ok(instr.to_le_bytes().to_vec())
1847                } else {
1848                    self.encode_thumb32_shift(rd, rn, *shift, 0b01) // LSR type
1849                }
1850            }
1851
1852            ArmOp::Asr { rd, rn, shift } => {
1853                let rd_bits = reg_to_bits(rd) as u16;
1854                let rn_bits = reg_to_bits(rn) as u16;
1855                let shift_bits = (*shift as u16) & 0x1F;
1856
1857                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
1858                    // ASRS Rd, Rm, #imm5 (16-bit): 0001 0 imm5 Rm Rd
1859                    let instr: u16 = 0x1000 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1860                    Ok(instr.to_le_bytes().to_vec())
1861                } else {
1862                    self.encode_thumb32_shift(rd, rn, *shift, 0b10) // ASR type
1863                }
1864            }
1865
1866            ArmOp::Ror { rd, rn, shift } => {
1867                // ROR doesn't have a 16-bit immediate form, use 32-bit
1868                self.encode_thumb32_shift(rd, rn, *shift, 0b11) // ROR type
1869            }
1870
1871            // Register-based shifts (Thumb-2 32-bit)
1872            // Encoding: 11111010 0xxS Rn 1111 Rd 0000 Rm
1873            // xx = shift type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
1874            ArmOp::LslReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b00),
1875            ArmOp::LsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b01),
1876            ArmOp::AsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b10),
1877            ArmOp::RorReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b11),
1878
1879            // RSB (Reverse Subtract): Rd = imm - Rn
1880            // Thumb-2 T2 encoding: 11110 i 0 1110 S Rn | 0 imm3 Rd imm8
1881            ArmOp::Rsb { rd, rn, imm } => {
1882                let rd_bits = reg_to_bits(rd);
1883                let rn_bits = reg_to_bits(rn);
1884                let imm_val = *imm;
1885
1886                let i_bit = (imm_val >> 11) & 1;
1887                let imm3 = (imm_val >> 8) & 0x7;
1888                let imm8 = imm_val & 0xFF;
1889
1890                // hw1: 11110 i 01110 0 Rn  (S=0)
1891                let hw1: u16 = (0xF1C0 | (i_bit << 10) | rn_bits) as u16;
1892                // hw2: 0 imm3 Rd imm8
1893                let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
1894
1895                let mut bytes = hw1.to_le_bytes().to_vec();
1896                bytes.extend_from_slice(&hw2.to_le_bytes());
1897                Ok(bytes)
1898            }
1899
1900            // CLZ (Thumb-2 32-bit)
1901            ArmOp::Clz { rd, rm } => {
1902                let rd_bits = reg_to_bits(rd);
1903                let rm_bits = reg_to_bits(rm);
1904
1905                // Thumb-2 CLZ: FAB0 | Rm, then F080 | Rd<<8 | Rm
1906                // 11111010 1011 Rm | 1111 Rd 1000 Rm
1907                let hw1: u16 = (0xFAB0 | rm_bits) as u16;
1908                let hw2: u16 = (0xF080 | (rd_bits << 8) | rm_bits) as u16;
1909
1910                let mut bytes = hw1.to_le_bytes().to_vec();
1911                bytes.extend_from_slice(&hw2.to_le_bytes());
1912                Ok(bytes)
1913            }
1914
1915            // RBIT (Thumb-2 32-bit)
1916            ArmOp::Rbit { rd, rm } => {
1917                let rd_bits = reg_to_bits(rd);
1918                let rm_bits = reg_to_bits(rm);
1919
1920                // Thumb-2 RBIT: FA90 Rm | F0 Rd A0 Rm
1921                // 11111010 1001 Rm | 1111 Rd 1010 Rm
1922                let hw1: u16 = (0xFA90 | rm_bits) as u16;
1923                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rm_bits) as u16;
1924
1925                let mut bytes = hw1.to_le_bytes().to_vec();
1926                bytes.extend_from_slice(&hw2.to_le_bytes());
1927                Ok(bytes)
1928            }
1929
1930            // SXTB (16-bit for low registers)
1931            ArmOp::Sxtb { rd, rm } => {
1932                let rd_bits = reg_to_bits(rd) as u16;
1933                let rm_bits = reg_to_bits(rm) as u16;
1934
1935                if rd_bits < 8 && rm_bits < 8 {
1936                    // SXTB Rd, Rm (16-bit): 1011 0010 01 Rm Rd
1937                    let instr: u16 = 0xB240 | (rm_bits << 3) | rd_bits;
1938                    Ok(instr.to_le_bytes().to_vec())
1939                } else {
1940                    // Thumb-2 SXTB.W: FA4F F(rd)80 (rm)
1941                    // 11111010 0100 1111 | 1111 Rd 10 rotate Rm
1942                    let rd_bits32 = rd_bits as u32;
1943                    let rm_bits32 = rm_bits as u32;
1944                    let hw1: u16 = 0xFA4F;
1945                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
1946                    let mut bytes = hw1.to_le_bytes().to_vec();
1947                    bytes.extend_from_slice(&hw2.to_le_bytes());
1948                    Ok(bytes)
1949                }
1950            }
1951
1952            // SXTH (16-bit for low registers)
1953            ArmOp::Sxth { rd, rm } => {
1954                let rd_bits = reg_to_bits(rd) as u16;
1955                let rm_bits = reg_to_bits(rm) as u16;
1956
1957                if rd_bits < 8 && rm_bits < 8 {
1958                    // SXTH Rd, Rm (16-bit): 1011 0010 00 Rm Rd
1959                    let instr: u16 = 0xB200 | (rm_bits << 3) | rd_bits;
1960                    Ok(instr.to_le_bytes().to_vec())
1961                } else {
1962                    // Thumb-2 SXTH.W: FA0F F(rd)80 (rm)
1963                    // 11111010 0000 1111 | 1111 Rd 10 rotate Rm
1964                    let rd_bits32 = rd_bits as u32;
1965                    let rm_bits32 = rm_bits as u32;
1966                    let hw1: u16 = 0xFA0F;
1967                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
1968                    let mut bytes = hw1.to_le_bytes().to_vec();
1969                    bytes.extend_from_slice(&hw2.to_le_bytes());
1970                    Ok(bytes)
1971                }
1972            }
1973
1974            // CMP (can be 16-bit for low registers)
1975            ArmOp::Cmp { rn, op2 } => {
1976                let rn_bits = reg_to_bits(rn) as u16;
1977
1978                if let Operand2::Imm(imm) = op2 {
1979                    // Only use 16-bit encoding for non-negative immediates 0-255
1980                    // Negative immediates must use 32-bit encoding
1981                    if *imm >= 0 && *imm <= 255 && rn_bits < 8 {
1982                        // CMP Rn, #imm8 (16-bit): 0010 1 Rn imm8
1983                        let instr: u16 = 0x2800 | (rn_bits << 8) | (*imm as u16 & 0xFF);
1984                        Ok(instr.to_le_bytes().to_vec())
1985                    } else {
1986                        self.encode_thumb32_cmp_imm(rn, *imm as u32)
1987                    }
1988                } else if let Operand2::Reg(rm) = op2 {
1989                    let rm_bits = reg_to_bits(rm) as u16;
1990                    if rn_bits < 8 && rm_bits < 8 {
1991                        // CMP Rn, Rm (16-bit low): 0100 0010 10 Rm Rn
1992                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
1993                        Ok(instr.to_le_bytes().to_vec())
1994                    } else {
1995                        // CMP Rn, Rm (16-bit high): 0100 0101 N Rm Rn[2:0]
1996                        let n_bit = (rn_bits >> 3) & 1;
1997                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
1998                        Ok(instr.to_le_bytes().to_vec())
1999                    }
2000                } else {
2001                    let instr: u16 = 0xBF00;
2002                    Ok(instr.to_le_bytes().to_vec())
2003                }
2004            }
2005
2006            // CMN (Compare Negative) - computes Rn + op2 and sets flags
2007            // CMN Rn, #1 sets Z flag if Rn == -1 (since -1 + 1 = 0)
2008            ArmOp::Cmn { rn, op2 } => {
2009                let rn_bits = reg_to_bits(rn) as u16;
2010
2011                if let Operand2::Imm(imm) = op2 {
2012                    // CMN.W Rn, #imm (32-bit encoding)
2013                    // Encoding: F110 Rn | 0F00 imm8 (for small immediates 0-255)
2014                    if *imm >= 0 && *imm <= 255 {
2015                        let imm8 = *imm as u16 & 0xFF;
2016                        let hw1: u16 = 0xF110 | rn_bits;
2017                        let hw2: u16 = 0x0F00 | imm8;
2018                        let mut bytes = hw1.to_le_bytes().to_vec();
2019                        bytes.extend_from_slice(&hw2.to_le_bytes());
2020                        Ok(bytes)
2021                    } else {
2022                        // For other immediates, fallback to NOP (should not happen in our use case)
2023                        Ok(vec![0xBF, 0x00])
2024                    }
2025                } else if let Operand2::Reg(rm) = op2 {
2026                    let rm_bits = reg_to_bits(rm) as u16;
2027                    // 16-bit CMN (T1) only encodes R0-R7; high registers overflow
2028                    // the 3-bit fields and corrupt the operands (#184, the #180
2029                    // class). CMN has no high-register 16-bit form, so fall back
2030                    // to 32-bit CMN.W (T2): EB10 Rn | 0F00 Rm (ADD.W with S=1 and
2031                    // Rd discarded as PC/1111).
2032                    if rn_bits < 8 && rm_bits < 8 {
2033                        // CMN Rn, Rm (16-bit): 0100 0010 11 Rm Rn
2034                        let instr: u16 = 0x42C0 | (rm_bits << 3) | rn_bits;
2035                        Ok(instr.to_le_bytes().to_vec())
2036                    } else {
2037                        let hw1: u16 = 0xEB10 | rn_bits;
2038                        let hw2: u16 = 0x0F00 | rm_bits;
2039                        let mut bytes = hw1.to_le_bytes().to_vec();
2040                        bytes.extend_from_slice(&hw2.to_le_bytes());
2041                        Ok(bytes)
2042                    }
2043                } else {
2044                    Ok(vec![0xBF, 0x00])
2045                }
2046            }
2047
2048            // LDR (can be 16-bit for simple cases)
2049            ArmOp::Ldr { rd, addr } => {
2050                let rd_bits = reg_to_bits(rd);
2051                let base_bits = reg_to_bits(&addr.base);
2052
2053                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2054                if let Some(offset_reg) = &addr.offset_reg {
2055                    let rm_bits = reg_to_bits(offset_reg);
2056
2057                    // If there's also an immediate offset, we need to ADD it first
2058                    if addr.offset != 0 {
2059                        // Use R12 (IP) as scratch to avoid clobbering the address register
2060                        // ADD R12, Rm, #offset; LDR Rd, [base, R12]
2061                        let scratch = Reg::R12;
2062                        let mut bytes =
2063                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2064                        bytes.extend(self.encode_thumb32_ldr_reg(rd, &addr.base, &scratch)?);
2065                        return Ok(bytes);
2066                    }
2067
2068                    // Simple register offset: LDR Rd, [Rn, Rm]
2069                    // 16-bit: only if Rd, Rn, Rm < R8
2070                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2071                        // LDR Rd, [Rn, Rm] (16-bit): 0101 100 Rm Rn Rd
2072                        let instr: u16 = 0x5800
2073                            | ((rm_bits as u16) << 6)
2074                            | ((base_bits as u16) << 3)
2075                            | (rd_bits as u16);
2076                        return Ok(instr.to_le_bytes().to_vec());
2077                    }
2078
2079                    // 32-bit register offset
2080                    return self.encode_thumb32_ldr_reg(rd, &addr.base, offset_reg);
2081                }
2082
2083                // Immediate offset mode [base, #imm]
2084                let offset = addr.offset as u32;
2085
2086                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2087                    // LDR Rd, [Rn, #imm5*4] (16-bit): 0110 1 imm5 Rn Rd
2088                    let imm5 = (offset >> 2) as u16;
2089                    let instr: u16 =
2090                        0x6800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2091                    Ok(instr.to_le_bytes().to_vec())
2092                } else {
2093                    self.encode_thumb32_ldr(rd, &addr.base, offset)
2094                }
2095            }
2096
2097            // STR (can be 16-bit for simple cases)
2098            ArmOp::Str { rd, addr } => {
2099                let rd_bits = reg_to_bits(rd);
2100                let base_bits = reg_to_bits(&addr.base);
2101
2102                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2103                if let Some(offset_reg) = &addr.offset_reg {
2104                    let rm_bits = reg_to_bits(offset_reg);
2105
2106                    // If there's also an immediate offset, we need to ADD it first
2107                    if addr.offset != 0 {
2108                        // Use R12 (IP) as scratch to avoid clobbering the address register
2109                        // ADD R12, Rm, #offset; STR Rd, [base, R12]
2110                        let scratch = Reg::R12;
2111                        let mut bytes =
2112                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2113                        bytes.extend(self.encode_thumb32_str_reg(rd, &addr.base, &scratch)?);
2114                        return Ok(bytes);
2115                    }
2116
2117                    // Simple register offset: STR Rd, [Rn, Rm]
2118                    // 16-bit: only if Rd, Rn, Rm < R8
2119                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2120                        // STR Rd, [Rn, Rm] (16-bit): 0101 000 Rm Rn Rd
2121                        let instr: u16 = 0x5000
2122                            | ((rm_bits as u16) << 6)
2123                            | ((base_bits as u16) << 3)
2124                            | (rd_bits as u16);
2125                        return Ok(instr.to_le_bytes().to_vec());
2126                    }
2127
2128                    // 32-bit register offset
2129                    return self.encode_thumb32_str_reg(rd, &addr.base, offset_reg);
2130                }
2131
2132                // Immediate offset mode [base, #imm]
2133                let offset = addr.offset as u32;
2134
2135                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2136                    // STR Rd, [Rn, #imm5*4] (16-bit): 0110 0 imm5 Rn Rd
2137                    let imm5 = (offset >> 2) as u16;
2138                    let instr: u16 =
2139                        0x6000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2140                    Ok(instr.to_le_bytes().to_vec())
2141                } else {
2142                    self.encode_thumb32_str(rd, &addr.base, offset)
2143                }
2144            }
2145
2146            // LDRB (Thumb-2)
2147            ArmOp::Ldrb { rd, addr } => {
2148                let rd_bits = reg_to_bits(rd);
2149                let base_bits = reg_to_bits(&addr.base);
2150
2151                if let Some(offset_reg) = &addr.offset_reg {
2152                    if addr.offset != 0 {
2153                        let scratch = Reg::R12;
2154                        let mut bytes =
2155                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2156                        bytes.extend(self.encode_thumb32_ldrb_reg(rd, &addr.base, &scratch)?);
2157                        return Ok(bytes);
2158                    }
2159                    return self.encode_thumb32_ldrb_reg(rd, &addr.base, offset_reg);
2160                }
2161
2162                let offset = addr.offset as u32;
2163                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2164                    // LDRB Rd, [Rn, #imm5] (16-bit): 0111 1 imm5 Rn Rd
2165                    let instr: u16 = 0x7800
2166                        | ((offset as u16) << 6)
2167                        | ((base_bits as u16) << 3)
2168                        | (rd_bits as u16);
2169                    Ok(instr.to_le_bytes().to_vec())
2170                } else {
2171                    self.encode_thumb32_ldrb_imm(rd, &addr.base, offset)
2172                }
2173            }
2174
2175            // LDRSB (Thumb-2)
2176            ArmOp::Ldrsb { rd, addr } => {
2177                let rd_bits = reg_to_bits(rd);
2178                let base_bits = reg_to_bits(&addr.base);
2179
2180                if let Some(offset_reg) = &addr.offset_reg {
2181                    if addr.offset != 0 {
2182                        let scratch = Reg::R12;
2183                        let mut bytes =
2184                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2185                        bytes.extend(self.encode_thumb32_ldrsb_reg(rd, &addr.base, &scratch)?);
2186                        return Ok(bytes);
2187                    }
2188                    return self.encode_thumb32_ldrsb_reg(rd, &addr.base, offset_reg);
2189                }
2190
2191                let offset = addr.offset as u32;
2192                // LDRSB has no 16-bit immediate form (only register)
2193                // For 16-bit reg form: only if Rd, Rn, Rm < R8
2194                if rd_bits < 8 && base_bits < 8 && offset == 0 {
2195                    // No immediate 16-bit encoding for LDRSB; use 32-bit
2196                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2197                } else {
2198                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2199                }
2200            }
2201
2202            // LDRH (Thumb-2)
2203            ArmOp::Ldrh { rd, addr } => {
2204                let rd_bits = reg_to_bits(rd);
2205                let base_bits = reg_to_bits(&addr.base);
2206
2207                if let Some(offset_reg) = &addr.offset_reg {
2208                    if addr.offset != 0 {
2209                        let scratch = Reg::R12;
2210                        let mut bytes =
2211                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2212                        bytes.extend(self.encode_thumb32_ldrh_reg(rd, &addr.base, &scratch)?);
2213                        return Ok(bytes);
2214                    }
2215                    return self.encode_thumb32_ldrh_reg(rd, &addr.base, offset_reg);
2216                }
2217
2218                let offset = addr.offset as u32;
2219                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2220                    // LDRH Rd, [Rn, #imm5*2] (16-bit): 1000 1 imm5 Rn Rd
2221                    let imm5 = (offset >> 1) as u16;
2222                    let instr: u16 =
2223                        0x8800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2224                    Ok(instr.to_le_bytes().to_vec())
2225                } else {
2226                    self.encode_thumb32_ldrh_imm(rd, &addr.base, offset)
2227                }
2228            }
2229
2230            // LDRSH (Thumb-2)
2231            ArmOp::Ldrsh { rd, addr } => {
2232                if let Some(offset_reg) = &addr.offset_reg {
2233                    if addr.offset != 0 {
2234                        let scratch = Reg::R12;
2235                        let mut bytes =
2236                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2237                        bytes.extend(self.encode_thumb32_ldrsh_reg(rd, &addr.base, &scratch)?);
2238                        return Ok(bytes);
2239                    }
2240                    return self.encode_thumb32_ldrsh_reg(rd, &addr.base, offset_reg);
2241                }
2242
2243                let offset = addr.offset as u32;
2244                self.encode_thumb32_ldrsh_imm(rd, &addr.base, offset)
2245            }
2246
2247            // STRB (Thumb-2)
2248            ArmOp::Strb { rd, addr } => {
2249                let rd_bits = reg_to_bits(rd);
2250                let base_bits = reg_to_bits(&addr.base);
2251
2252                if let Some(offset_reg) = &addr.offset_reg {
2253                    if addr.offset != 0 {
2254                        let scratch = Reg::R12;
2255                        let mut bytes =
2256                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2257                        bytes.extend(self.encode_thumb32_strb_reg(rd, &addr.base, &scratch)?);
2258                        return Ok(bytes);
2259                    }
2260                    return self.encode_thumb32_strb_reg(rd, &addr.base, offset_reg);
2261                }
2262
2263                let offset = addr.offset as u32;
2264                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2265                    // STRB Rd, [Rn, #imm5] (16-bit): 0111 0 imm5 Rn Rd
2266                    let instr: u16 = 0x7000
2267                        | ((offset as u16) << 6)
2268                        | ((base_bits as u16) << 3)
2269                        | (rd_bits as u16);
2270                    Ok(instr.to_le_bytes().to_vec())
2271                } else {
2272                    self.encode_thumb32_strb_imm(rd, &addr.base, offset)
2273                }
2274            }
2275
2276            // STRH (Thumb-2)
2277            ArmOp::Strh { rd, addr } => {
2278                let rd_bits = reg_to_bits(rd);
2279                let base_bits = reg_to_bits(&addr.base);
2280
2281                if let Some(offset_reg) = &addr.offset_reg {
2282                    if addr.offset != 0 {
2283                        let scratch = Reg::R12;
2284                        let mut bytes =
2285                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2286                        bytes.extend(self.encode_thumb32_strh_reg(rd, &addr.base, &scratch)?);
2287                        return Ok(bytes);
2288                    }
2289                    return self.encode_thumb32_strh_reg(rd, &addr.base, offset_reg);
2290                }
2291
2292                let offset = addr.offset as u32;
2293                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2294                    // STRH Rd, [Rn, #imm5*2] (16-bit): 1000 0 imm5 Rn Rd
2295                    let imm5 = (offset >> 1) as u16;
2296                    let instr: u16 =
2297                        0x8000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2298                    Ok(instr.to_le_bytes().to_vec())
2299                } else {
2300                    self.encode_thumb32_strh_imm(rd, &addr.base, offset)
2301                }
2302            }
2303
2304            // MemorySize (Thumb-2)
2305            ArmOp::MemorySize { rd } => {
2306                // LSR rd, R10, #16 — memory size in bytes / 65536 = pages
2307                // Thumb-2 16-bit: LSRS Rd, Rm, #imm5 — 0000 1 imm5 Rm Rd
2308                let rd_bits = reg_to_bits(rd);
2309                let r10_bits = reg_to_bits(&Reg::R10);
2310                if rd_bits < 8 && r10_bits < 8 {
2311                    let instr: u16 =
2312                        0x0800 | (16u16 << 6) | ((r10_bits as u16) << 3) | (rd_bits as u16);
2313                    Ok(instr.to_le_bytes().to_vec())
2314                } else {
2315                    // Thumb-2 32-bit LSR: 1110 1010 010 0 1111 | 0 imm3 Rd imm2 01 Rm
2316                    let imm5: u32 = 16;
2317                    let imm3 = (imm5 >> 2) & 0x7;
2318                    let imm2 = imm5 & 0x3;
2319                    let hw1: u16 = 0xEA4F;
2320                    let hw2: u16 =
2321                        ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | 0x10 | r10_bits) as u16;
2322                    let mut bytes = hw1.to_le_bytes().to_vec();
2323                    bytes.extend_from_slice(&hw2.to_le_bytes());
2324                    Ok(bytes)
2325                }
2326            }
2327
2328            // MemoryGrow (Thumb-2)
2329            ArmOp::MemoryGrow { rd, .. } => {
2330                // On embedded with fixed memory, always return -1 (failure)
2331                // MVN rd, #0 → MOV rd, #-1
2332                // Thumb-2 32-bit: MVN: 1111 0 i 0 0 0 1 1 0 1111 | 0 imm3 Rd imm8
2333                let rd_bits = reg_to_bits(rd);
2334                let hw1: u16 = 0xF06F; // MVN with i=0
2335                let hw2: u16 = (rd_bits << 8) as u16; // imm8=0 → ~0 = 0xFFFFFFFF = -1
2336                let mut bytes = hw1.to_le_bytes().to_vec();
2337                bytes.extend_from_slice(&hw2.to_le_bytes());
2338                Ok(bytes)
2339            }
2340
2341            // BX (16-bit)
2342            ArmOp::Bx { rm } => {
2343                let rm_bits = reg_to_bits(rm) as u16;
2344                // BX Rm (16-bit): 0100 0111 0 Rm 000
2345                let instr: u16 = 0x4700 | (rm_bits << 3);
2346                Ok(instr.to_le_bytes().to_vec())
2347            }
2348
2349            // BLX (16-bit) - Branch with Link and Exchange
2350            // BLX Rm: 0100 0111 1 Rm 000
2351            ArmOp::Blx { rm } => {
2352                let rm_bits = reg_to_bits(rm) as u16;
2353                let instr: u16 = 0x4780 | (rm_bits << 3);
2354                Ok(instr.to_le_bytes().to_vec())
2355            }
2356
2357            // CallIndirect - indirect function call via table lookup
2358            // table_index_reg contains the table index
2359            // Generates: LSL R12, idx, #2; LDR R12, [R12, table_base]; BLX R12
2360            ArmOp::CallIndirect {
2361                rd: _,
2362                type_idx: _,
2363                table_index_reg,
2364            } => {
2365                let idx_reg = reg_to_bits(table_index_reg);
2366                let mut bytes = Vec::new();
2367
2368                // For now, we generate code that:
2369                // 1. Multiplies index by 4 (function pointer size)
2370                // 2. Loads function pointer from table (assumes table base in R11)
2371                // 3. Calls the function via BLX
2372                //
2373                // Table base setup must be done by caller/runtime.
2374                // This is a simplified implementation - full support needs:
2375                // - Table base address resolution
2376                // - Type signature checking
2377                // - Bounds checking
2378
2379                // LSL R12, idx_reg, #2 (multiply index by 4)
2380                // Thumb-2 MOV with shift: 11101010 010 S 1111 | 0 imm3 Rd imm2 type Rm
2381                // LSL: type=00, imm5=2 -> imm3=0, imm2=10
2382                let hw1: u16 = 0xEA4F_u16; // MOV.W R12, Rm, LSL #2
2383                let hw2: u16 = ((0x0C00 | (0b10 << 4)) | idx_reg) as u16;
2384                bytes.extend_from_slice(&hw1.to_le_bytes());
2385                bytes.extend_from_slice(&hw2.to_le_bytes());
2386
2387                // LDR R12, [R11, R12] - load function pointer
2388                // Thumb-2 LDR (register): 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
2389                // Rn=R11, Rt=R12, Rm=R12, imm2=00 (no shift)
2390                let ldr_hw1: u16 = 0xF85B; // LDR.W Rt, [R11, Rm]
2391                let ldr_hw2: u16 = 0xC00C; // Rt=R12, imm2=00, Rm=R12
2392                bytes.extend_from_slice(&ldr_hw1.to_le_bytes());
2393                bytes.extend_from_slice(&ldr_hw2.to_le_bytes());
2394
2395                // BLX R12 (call function indirectly)
2396                // BLX Rm (16-bit): 0100 0111 1 Rm 000
2397                let blx: u16 = 0x47E0; // BLX R12
2398                bytes.extend_from_slice(&blx.to_le_bytes());
2399
2400                Ok(bytes)
2401            }
2402
2403            // Label pseudo-instruction: emits no machine code
2404            ArmOp::Label { .. } => Ok(Vec::new()),
2405
2406            // Conditional branch to label (generic) - offset 0, will be patched
2407            ArmOp::Bcc { cond, label: _ } => {
2408                use synth_synthesis::Condition;
2409                let cond_bits: u16 = match cond {
2410                    Condition::EQ => 0x0,
2411                    Condition::NE => 0x1,
2412                    Condition::HS => 0x2,
2413                    Condition::LO => 0x3,
2414                    Condition::HI => 0x8,
2415                    Condition::LS => 0x9,
2416                    Condition::GE => 0xA,
2417                    Condition::LT => 0xB,
2418                    Condition::GT => 0xC,
2419                    Condition::LE => 0xD,
2420                };
2421                // 16-bit B<cond> with offset 0: 1101 cond imm8
2422                let instr: u16 = 0xD000 | (cond_bits << 8);
2423                Ok(instr.to_le_bytes().to_vec())
2424            }
2425
2426            // Branch instructions
2427            ArmOp::B { label: _ } => {
2428                // Simplified: B.N with offset 0
2429                // For real usage, would need label resolution
2430                let instr: u16 = 0xE000; // B.N #0
2431                Ok(instr.to_le_bytes().to_vec())
2432            }
2433
2434            // BHS (Branch if Higher or Same) - used for bounds checking
2435            // Condition code: 0x2 (C set)
2436            ArmOp::Bhs { label: _ } => {
2437                // 16-bit B<cond> with offset 0: 1101 cond imm8
2438                // cond = 0x2 (HS)
2439                let instr: u16 = 0xD200; // BHS.N #0
2440                Ok(instr.to_le_bytes().to_vec())
2441            }
2442
2443            // BLO (Branch if Lower) - complementary to BHS
2444            // Condition code: 0x3 (C clear)
2445            ArmOp::Blo { label: _ } => {
2446                // 16-bit B<cond> with offset 0: 1101 cond imm8
2447                // cond = 0x3 (LO)
2448                let instr: u16 = 0xD300; // BLO.N #0
2449                Ok(instr.to_le_bytes().to_vec())
2450            }
2451
2452            // Branch with numeric offset (Thumb-2)
2453            // Thumb-2 B.W instruction: 32-bit with +-16MB range
2454            ArmOp::BOffset { offset } => {
2455                // offset is already the halfword displacement: (target - branch - 4) / 2
2456                // This is the raw encoded value, accounting for variable-length instructions
2457                let halfword_offset = *offset;
2458
2459                // 16-bit B.N encoding: 1110 0 imm11 (11-bit signed halfword offset)
2460                // Range: -1024 to +1022 halfwords
2461                if (-1024..=1022).contains(&halfword_offset) {
2462                    // 16-bit B.N encoding: 1110 0 imm11
2463                    let imm11 = (halfword_offset as u16) & 0x7FF;
2464                    let instr: u16 = 0xE000 | imm11;
2465                    Ok(instr.to_le_bytes().to_vec())
2466                } else {
2467                    // 32-bit B.W encoding for larger offsets
2468                    // First halfword: 1111 0 S imm10
2469                    // Second halfword: 10 J1 0 J2 imm11
2470                    // Total offset = SignExtend(S:I1:I2:imm10:imm11:0)
2471                    // where I1 = NOT(J1 XOR S), I2 = NOT(J2 XOR S)
2472
2473                    // The B.W (T4) encoding packs the signed offset as:
2474                    //   S:I1:I2:imm10:imm11:0  (25-bit signed, halfword-aligned)
2475                    // where J1 = NOT(I1 XOR S), J2 = NOT(I2 XOR S)
2476                    // Input halfword_offset already equals (target - PC - 4) / 2,
2477                    // so the full byte offset = halfword_offset << 1.
2478                    // The encoding fields split that 25-bit signed value (including the
2479                    // implicit trailing zero) as: S | imm10 | imm11
2480                    // with I1 = bit 23 and I2 = bit 22 of the signed offset.
2481                    let signed_offset = halfword_offset << 1; // byte offset
2482                    let s = if signed_offset < 0 { 1u32 } else { 0u32 };
2483                    let uoffset = signed_offset as u32;
2484                    let imm10 = (uoffset >> 12) & 0x3FF; // bits [21:12]
2485                    let imm11 = (uoffset >> 1) & 0x7FF; // bits [11:1]
2486                    let i1 = (uoffset >> 23) & 1; // bit 23
2487                    let i2 = (uoffset >> 22) & 1; // bit 22
2488                    let j1 = (!(i1 ^ s)) & 1; // J1 = NOT(I1 XOR S)
2489                    let j2 = (!(i2 ^ s)) & 1; // J2 = NOT(I2 XOR S)
2490
2491                    let hw1: u16 = (0xF000 | (s << 10) | imm10) as u16;
2492                    let hw2: u16 = (0x9000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2493
2494                    let mut bytes = hw1.to_le_bytes().to_vec();
2495                    bytes.extend_from_slice(&hw2.to_le_bytes());
2496                    Ok(bytes)
2497                }
2498            }
2499
2500            // Conditional branch with numeric offset (Thumb-2)
2501            ArmOp::BCondOffset { cond, offset } => {
2502                use synth_synthesis::Condition;
2503                let cond_bits: u16 = match cond {
2504                    Condition::EQ => 0x0,
2505                    Condition::NE => 0x1,
2506                    Condition::HS => 0x2,
2507                    Condition::LO => 0x3,
2508                    Condition::HI => 0x8,
2509                    Condition::LS => 0x9,
2510                    Condition::GE => 0xA,
2511                    Condition::LT => 0xB,
2512                    Condition::GT => 0xC,
2513                    Condition::LE => 0xD,
2514                };
2515
2516                // offset is already the halfword displacement: (target - branch - 4) / 2
2517                // This is the raw imm8 value for 16-bit B<cond> encoding
2518                let halfword_offset = *offset;
2519
2520                // 16-bit B<cond> encoding: 1101 cond imm8
2521                // Range: -256 to +254 halfwords (imm8 is sign-extended and shifted left 1)
2522                if (-128..=127).contains(&halfword_offset) {
2523                    let imm8 = (halfword_offset as u16) & 0xFF;
2524                    let instr: u16 = 0xD000 | (cond_bits << 8) | imm8;
2525                    Ok(instr.to_le_bytes().to_vec())
2526                } else {
2527                    // 32-bit B<cond>.W for larger offsets
2528                    // First halfword: 1111 0 S cond imm6
2529                    // Second halfword: 10 J1 0 J2 imm11
2530                    let offset = halfword_offset >> 1;
2531                    let s = if offset < 0 { 1u32 } else { 0u32 };
2532                    let imm6 = ((offset >> 11) as u32) & 0x3F;
2533                    let imm11 = (offset as u32) & 0x7FF;
2534                    let j1 = if s == 1 { 1 } else { 0 };
2535                    let j2 = if s == 1 { 1 } else { 0 };
2536
2537                    let hw1: u16 = (0xF000 | (s << 10) | ((cond_bits as u32) << 6) | imm6) as u16;
2538                    let hw2: u16 = (0x8000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2539
2540                    let mut bytes = hw1.to_le_bytes().to_vec();
2541                    bytes.extend_from_slice(&hw2.to_le_bytes());
2542                    Ok(bytes)
2543                }
2544            }
2545
2546            ArmOp::Bl { label: _ } => {
2547                // BL is always 32-bit in Thumb-2, encoded here as a relocatable
2548                // placeholder; an R_ARM_THM_CALL relocation patches the target
2549                // (see arm_backend.rs). The placeholder must carry an embedded
2550                // addend of -4 so the relocation nets to exactly the symbol S.
2551                //
2552                // Thumb BL computes `target = (P + 4) + signed_offset`. Under
2553                // R_ARM_THM_CALL the linker resolves using the in-place addend;
2554                // a 0xF800 placeholder (addend 0) lands at S+4 — every call one
2555                // instruction past the callee entry (#174). The correct
2556                // placeholder is what `gas` emits for `bl <extern>`:
2557                //   f7ff fffe  ->  `bl <self>`  (S=1, J1=J2=1, imm = -4 addend),
2558                // i.e. hw1=0xF7FF, hw2=0xFFFE. This nets to S, not S+4.
2559                // (The earlier 0xD000 was worse still — a ~+0x600000 addend,
2560                // the garbage `bl c0000c` and "truncated to fit" of #167.)
2561                let hw1: u16 = 0xF7FF;
2562                let hw2: u16 = 0xFFFE;
2563                let mut bytes = hw1.to_le_bytes().to_vec();
2564                bytes.extend_from_slice(&hw2.to_le_bytes());
2565                Ok(bytes)
2566            }
2567
2568            // MVN
2569            ArmOp::Mvn { rd, op2 } => {
2570                if let Operand2::Reg(rm) = op2 {
2571                    let rd_bits = reg_to_bits(rd) as u16;
2572                    let rm_bits = reg_to_bits(rm) as u16;
2573
2574                    if rd_bits < 8 && rm_bits < 8 {
2575                        // MVNS Rd, Rm (16-bit): 0100 0011 11 Rm Rd
2576                        let instr: u16 = 0x43C0 | (rm_bits << 3) | rd_bits;
2577                        Ok(instr.to_le_bytes().to_vec())
2578                    } else {
2579                        // 32-bit MVN
2580                        let hw1: u16 = 0xEA6F_u16;
2581                        let hw2: u16 = ((reg_to_bits(rd) << 8) | reg_to_bits(rm)) as u16;
2582                        let mut bytes = hw1.to_le_bytes().to_vec();
2583                        bytes.extend_from_slice(&hw2.to_le_bytes());
2584                        Ok(bytes)
2585                    }
2586                } else {
2587                    let instr: u16 = 0xBF00;
2588                    Ok(instr.to_le_bytes().to_vec())
2589                }
2590            }
2591
2592            // MOVW - Move Wide (Thumb-2 32-bit)
2593            ArmOp::Movw { rd, imm16 } => {
2594                self.encode_thumb32_movw_raw(reg_to_bits(rd), *imm16 as u32)
2595            }
2596
2597            // MOVT - Move Top (Thumb-2 32-bit)
2598            ArmOp::Movt { rd, imm16 } => {
2599                self.encode_thumb32_movt_raw(reg_to_bits(rd), *imm16 as u32)
2600            }
2601
2602            // SetCond: Materialize condition flag into register (0 or 1)
2603            // Strategy: ITE <cond>; MOV Rd, #1; MOV Rd, #0
2604            // IMPORTANT: Must use ITE (If-Then-Else) because 16-bit Thumb MOV
2605            // always sets flags (MOVS). We need to evaluate the condition BEFORE
2606            // any MOV instruction clobbers the flags from CMP.
2607            ArmOp::SetCond { rd, cond } => {
2608                let rd_bits = reg_to_bits(rd) as u16;
2609
2610                // Condition code encoding for IT block
2611                use synth_synthesis::Condition;
2612                let cond_bits: u16 = match cond {
2613                    Condition::EQ => 0x0,
2614                    Condition::NE => 0x1,
2615                    Condition::LT => 0xB,
2616                    Condition::LE => 0xD,
2617                    Condition::GT => 0xC,
2618                    Condition::GE => 0xA,
2619                    Condition::LO => 0x3, // CC/LO (unsigned <)
2620                    Condition::LS => 0x9, // LS (unsigned <=)
2621                    Condition::HI => 0x8, // HI (unsigned >)
2622                    Condition::HS => 0x2, // CS/HS (unsigned >=)
2623                };
2624
2625                // ITE <cond>: encodes If-Then-Else block
2626                // The mask field depends on firstcond[0]:
2627                // - If firstcond[0] = 0: mask = 0xC for TE pattern (ITE EQ = BF0C)
2628                // - If firstcond[0] = 1: mask = 0x4 for TE pattern (ITE NE = BF14)
2629                let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2630                let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2631
2632                // MOV Rd, #1 (Then branch - condition true)
2633                let mov_one: u16 = 0x2001 | (rd_bits << 8);
2634
2635                // MOV Rd, #0 (Else branch - condition false)
2636                let mov_zero: u16 = 0x2000 | (rd_bits << 8);
2637
2638                // Emit: ITE, MOV #1 (Then), MOV #0 (Else)
2639                let mut bytes = ite_instr.to_le_bytes().to_vec();
2640                bytes.extend_from_slice(&mov_one.to_le_bytes());
2641                bytes.extend_from_slice(&mov_zero.to_le_bytes());
2642                Ok(bytes)
2643            }
2644
2645            // I64SetCond: Compare two i64 register pairs, result 0/1 in rd
2646            // EQ/NE: CMP lo,lo; IT EQ; CMPEQ hi,hi; ITE <cond>; MOV 1; MOV 0
2647            // LT: CMP lo,lo; SBCS rd,hi,hi; ITE LT; MOV 1; MOV 0
2648            // GT: CMP lo,lo (swapped); SBCS rd,hi,hi (swapped); ITE LT; MOV 1; MOV 0
2649            ArmOp::I64SetCond {
2650                rd,
2651                rn_lo,
2652                rn_hi,
2653                rm_lo,
2654                rm_hi,
2655                cond,
2656            } => {
2657                use synth_synthesis::Condition;
2658                let rd_bits = reg_to_bits(rd) as u16;
2659                let mut bytes = Vec::new();
2660
2661                // Helper: encode CMP Rn, Rm (16-bit)
2662                let encode_cmp_reg = |rn: &synth_synthesis::Reg,
2663                                      rm: &synth_synthesis::Reg|
2664                 -> Vec<u8> {
2665                    let rn_bits = reg_to_bits(rn) as u16;
2666                    let rm_bits = reg_to_bits(rm) as u16;
2667                    if rn_bits < 8 && rm_bits < 8 {
2668                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2669                        instr.to_le_bytes().to_vec()
2670                    } else {
2671                        let n_bit = (rn_bits >> 3) & 1;
2672                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2673                        instr.to_le_bytes().to_vec()
2674                    }
2675                };
2676
2677                // Helper: encode ITE <cond> (2 bytes)
2678                let encode_ite = |cond_bits: u16| -> Vec<u8> {
2679                    let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2680                    let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2681                    ite_instr.to_le_bytes().to_vec()
2682                };
2683
2684                // Helper: encode SetCond (ITE + MOV #1 + MOV #0) for given condition
2685                let encode_setcond = |cond_bits: u16, rd_bits: u16| -> Vec<u8> {
2686                    let mut b = encode_ite(cond_bits);
2687                    let mov_one: u16 = 0x2001 | (rd_bits << 8);
2688                    let mov_zero: u16 = 0x2000 | (rd_bits << 8);
2689                    b.extend_from_slice(&mov_one.to_le_bytes());
2690                    b.extend_from_slice(&mov_zero.to_le_bytes());
2691                    b
2692                };
2693
2694                match cond {
2695                    Condition::EQ | Condition::NE => {
2696                        // CMP rn_lo, rm_lo (compare low words)
2697                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2698
2699                        // IT EQ (execute next instruction only if Z=1)
2700                        let it_eq: u16 = 0xBF08; // IT EQ: cond=0000, mask=1000
2701                        bytes.extend_from_slice(&it_eq.to_le_bytes());
2702
2703                        // CMPEQ rn_hi, rm_hi (compare high words, only if low equal)
2704                        bytes.extend_from_slice(&encode_cmp_reg(rn_hi, rm_hi));
2705
2706                        // ITE <cond>; MOV rd, #1; MOV rd, #0
2707                        let cond_bits: u16 = match cond {
2708                            Condition::EQ => 0x0,
2709                            Condition::NE => 0x1,
2710                            _ => unreachable!(),
2711                        };
2712                        bytes.extend_from_slice(&encode_setcond(cond_bits, rd_bits));
2713                    }
2714
2715                    Condition::LT => {
2716                        // CMP rn_lo, rm_lo (sets C flag for borrow)
2717                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2718
2719                        // SBCS rd, rn_hi, rm_hi (subtract with carry, sets N,V flags)
2720                        // SBCS.W Rd, Rn, Rm: EB70 Rn | 0000 Rd 0000 Rm
2721                        let rn_hi_bits = reg_to_bits(rn_hi);
2722                        let rm_hi_bits = reg_to_bits(rm_hi);
2723                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2724                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2725                        bytes.extend_from_slice(&hw1.to_le_bytes());
2726                        bytes.extend_from_slice(&hw2.to_le_bytes());
2727
2728                        // ITE LT; MOV rd, #1; MOV rd, #0
2729                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
2730                    }
2731
2732                    Condition::GT => {
2733                        // GT(a,b) = LT(b,a): swap operands
2734                        // CMP rm_lo, rn_lo (swapped)
2735                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2736
2737                        // SBCS rd, rm_hi, rn_hi (swapped)
2738                        let rm_hi_bits = reg_to_bits(rm_hi);
2739                        let rn_hi_bits = reg_to_bits(rn_hi);
2740                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2741                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2742                        bytes.extend_from_slice(&hw1.to_le_bytes());
2743                        bytes.extend_from_slice(&hw2.to_le_bytes());
2744
2745                        // ITE LT; MOV rd, #1; MOV rd, #0
2746                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
2747                    }
2748
2749                    Condition::LE => {
2750                        // LE(a,b) = !GT(a,b): use GT logic but invert result
2751                        // GT(a,b) = LT(b,a): so we do CMP(b,a) and check LT, then invert
2752                        // CMP rm_lo, rn_lo (swapped, same as GT)
2753                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2754
2755                        // SBCS rd, rm_hi, rn_hi (swapped)
2756                        let rm_hi_bits = reg_to_bits(rm_hi);
2757                        let rn_hi_bits = reg_to_bits(rn_hi);
2758                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2759                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2760                        bytes.extend_from_slice(&hw1.to_le_bytes());
2761                        bytes.extend_from_slice(&hw2.to_le_bytes());
2762
2763                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT, so inverting GT result)
2764                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
2765                    }
2766
2767                    Condition::GE => {
2768                        // GE(a,b) = !LT(a,b): use LT logic but invert result
2769                        // CMP rn_lo, rm_lo (same as LT)
2770                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2771
2772                        // SBCS rd, rn_hi, rm_hi (same as LT)
2773                        let rn_hi_bits = reg_to_bits(rn_hi);
2774                        let rm_hi_bits = reg_to_bits(rm_hi);
2775                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2776                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2777                        bytes.extend_from_slice(&hw1.to_le_bytes());
2778                        bytes.extend_from_slice(&hw2.to_le_bytes());
2779
2780                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT)
2781                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
2782                    }
2783
2784                    // Unsigned comparisons - same instruction sequence, different conditions
2785                    Condition::LO => {
2786                        // LO (unsigned LT): CMP lo, SBCS hi, check C=0
2787                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2788                        let rn_hi_bits = reg_to_bits(rn_hi);
2789                        let rm_hi_bits = reg_to_bits(rm_hi);
2790                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2791                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2792                        bytes.extend_from_slice(&hw1.to_le_bytes());
2793                        bytes.extend_from_slice(&hw2.to_le_bytes());
2794                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
2795                    }
2796
2797                    Condition::HI => {
2798                        // HI (unsigned GT): swap operands and check LO
2799                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2800                        let rm_hi_bits = reg_to_bits(rm_hi);
2801                        let rn_hi_bits = reg_to_bits(rn_hi);
2802                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2803                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2804                        bytes.extend_from_slice(&hw1.to_le_bytes());
2805                        bytes.extend_from_slice(&hw2.to_le_bytes());
2806                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
2807                    }
2808
2809                    Condition::LS => {
2810                        // LS (unsigned LE): !(a > b) = !(HI), so do HI and invert
2811                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2812                        let rm_hi_bits = reg_to_bits(rm_hi);
2813                        let rn_hi_bits = reg_to_bits(rn_hi);
2814                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2815                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2816                        bytes.extend_from_slice(&hw1.to_le_bytes());
2817                        bytes.extend_from_slice(&hw2.to_le_bytes());
2818                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
2819                    }
2820
2821                    Condition::HS => {
2822                        // HS (unsigned GE): !(a < b) = !(LO)
2823                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2824                        let rn_hi_bits = reg_to_bits(rn_hi);
2825                        let rm_hi_bits = reg_to_bits(rm_hi);
2826                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2827                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2828                        bytes.extend_from_slice(&hw1.to_le_bytes());
2829                        bytes.extend_from_slice(&hw2.to_le_bytes());
2830                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
2831                    }
2832                }
2833
2834                Ok(bytes)
2835            }
2836
2837            // I64SetCondZ: Test if i64 register pair is zero, result 0/1 in rd
2838            // ORR.W rd, rn_lo, rn_hi; CMP rd, #0; ITE EQ; MOV 1; MOV 0
2839            ArmOp::I64SetCondZ { rd, rn_lo, rn_hi } => {
2840                let rd_bits = reg_to_bits(rd);
2841                let rn_lo_bits = reg_to_bits(rn_lo);
2842                let rn_hi_bits = reg_to_bits(rn_hi);
2843                let mut bytes = Vec::new();
2844
2845                // ORR.W rd, rn_lo, rn_hi: EA40 rn_lo | 0000 rd 0000 rn_hi
2846                let hw1: u16 = (0xEA40 | rn_lo_bits) as u16;
2847                let hw2: u16 = ((rd_bits << 8) | rn_hi_bits) as u16;
2848                bytes.extend_from_slice(&hw1.to_le_bytes());
2849                bytes.extend_from_slice(&hw2.to_le_bytes());
2850
2851                // CMP rd, #0 (16-bit): 0010 1 Rd 0000 0000
2852                let cmp_instr: u16 = 0x2800 | ((rd_bits as u16) << 8);
2853                bytes.extend_from_slice(&cmp_instr.to_le_bytes());
2854
2855                // ITE EQ; MOV rd, #1; MOV rd, #0
2856                let mask = 0xC_u16; // ITE EQ mask: firstcond[0]=0, mask=0xC
2857                let ite_instr: u16 = 0xBF00 | mask;
2858                bytes.extend_from_slice(&ite_instr.to_le_bytes());
2859                let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
2860                let mov_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
2861                bytes.extend_from_slice(&mov_one.to_le_bytes());
2862                bytes.extend_from_slice(&mov_zero.to_le_bytes());
2863
2864                Ok(bytes)
2865            }
2866
2867            // I64Mul: 64-bit multiply using UMULL + MLA cross products
2868            // Formula: result = (a_lo * b_lo) + ((a_lo * b_hi + a_hi * b_lo) << 32)
2869            // Uses R12 as scratch register
2870            ArmOp::I64Mul {
2871                rd_lo,
2872                rd_hi,
2873                rn_lo,
2874                rn_hi,
2875                rm_lo,
2876                rm_hi,
2877            } => {
2878                let rd_lo_bits = reg_to_bits(rd_lo);
2879                let rd_hi_bits = reg_to_bits(rd_hi);
2880                let rn_lo_bits = reg_to_bits(rn_lo);
2881                let rn_hi_bits = reg_to_bits(rn_hi);
2882                let rm_lo_bits = reg_to_bits(rm_lo);
2883                let rm_hi_bits = reg_to_bits(rm_hi);
2884                let r12: u32 = 12; // IP scratch register
2885                let mut bytes = Vec::new();
2886
2887                // 1. MUL R12, rn_lo, rm_hi  (R12 = a_lo * b_hi)
2888                // Thumb-2 MUL: hw1=0xFB00|Rn, hw2=0xF000|(Rd<<8)|Rm
2889                let hw1: u16 = (0xFB00 | rn_lo_bits) as u16;
2890                let hw2: u16 = (0xF000 | (r12 << 8) | rm_hi_bits) as u16;
2891                bytes.extend_from_slice(&hw1.to_le_bytes());
2892                bytes.extend_from_slice(&hw2.to_le_bytes());
2893
2894                // 2. MLA R12, rn_hi, rm_lo, R12  (R12 += a_hi * b_lo)
2895                // Thumb-2 MLA: hw1=0xFB00|Rn, hw2=(Ra<<12)|(Rd<<8)|Rm
2896                let hw1: u16 = (0xFB00 | rn_hi_bits) as u16;
2897                let hw2: u16 = ((r12 << 12) | (r12 << 8) | rm_lo_bits) as u16;
2898                bytes.extend_from_slice(&hw1.to_le_bytes());
2899                bytes.extend_from_slice(&hw2.to_le_bytes());
2900
2901                // 3. UMULL rd_lo, rd_hi, rn_lo, rm_lo  (rd_lo:rd_hi = a_lo * b_lo)
2902                // Thumb-2 UMULL: hw1=0xFBA0|Rn, hw2=(RdLo<<12)|(RdHi<<8)|Rm
2903                let hw1: u16 = (0xFBA0 | rn_lo_bits) as u16;
2904                let hw2: u16 = ((rd_lo_bits << 12) | (rd_hi_bits << 8) | rm_lo_bits) as u16;
2905                bytes.extend_from_slice(&hw1.to_le_bytes());
2906                bytes.extend_from_slice(&hw2.to_le_bytes());
2907
2908                // 4. ADD rd_hi, R12  (rd_hi += cross products)
2909                // 16-bit high reg ADD: 01000100 D Rm Rdn[2:0]
2910                let d_bit = (rd_hi_bits >> 3) & 1;
2911                let add_instr: u16 =
2912                    (0x4400 | (d_bit << 7) | (r12 << 3) | (rd_hi_bits & 0x7)) as u16;
2913                bytes.extend_from_slice(&add_instr.to_le_bytes());
2914
2915                Ok(bytes)
2916            }
2917
2918            // I64Shl: 64-bit shift left with branch for n<32 vs n>=32
2919            // rm_hi (R3) is used as temp register
2920            ArmOp::I64Shl {
2921                rd_lo,
2922                rd_hi,
2923                rn_lo,
2924                rn_hi,
2925                rm_lo,
2926                rm_hi,
2927            } => {
2928                let rd_lo_bits = reg_to_bits(rd_lo);
2929                let rd_hi_bits = reg_to_bits(rd_hi);
2930                let rn_lo_bits = reg_to_bits(rn_lo);
2931                let rn_hi_bits = reg_to_bits(rn_hi);
2932                let rm_lo_bits = reg_to_bits(rm_lo);
2933                let rm_hi_bits = reg_to_bits(rm_hi); // temp
2934                let mut bytes = Vec::new();
2935
2936                // AND.W rm_lo, rm_lo, #63  (mask shift amount to 6 bits)
2937                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
2938                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
2939                bytes.extend_from_slice(&hw1.to_le_bytes());
2940                bytes.extend_from_slice(&hw2.to_le_bytes());
2941
2942                // SUBS.W rm_hi, rm_lo, #32  (rm_hi = n-32, sets flags)
2943                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
2944                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
2945                bytes.extend_from_slice(&hw1.to_le_bytes());
2946                bytes.extend_from_slice(&hw2.to_le_bytes());
2947
2948                // BPL .large (branch if n >= 32, offset = +10 halfwords)
2949                let bpl: u16 = 0xD50A;
2950                bytes.extend_from_slice(&bpl.to_le_bytes());
2951
2952                // --- Small shift (n < 32) ---
2953                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
2954                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
2955                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
2956                bytes.extend_from_slice(&hw1.to_le_bytes());
2957                bytes.extend_from_slice(&hw2.to_le_bytes());
2958
2959                // LSR.W rm_hi, rn_lo, rm_hi  (rm_hi = lo >> (32-n), overflow bits)
2960                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
2961                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
2962                bytes.extend_from_slice(&hw1.to_le_bytes());
2963                bytes.extend_from_slice(&hw2.to_le_bytes());
2964
2965                // LSL.W rd_hi, rn_hi, rm_lo  (hi <<= n)
2966                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
2967                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
2968                bytes.extend_from_slice(&hw1.to_le_bytes());
2969                bytes.extend_from_slice(&hw2.to_le_bytes());
2970
2971                // ORR.W rd_hi, rd_hi, rm_hi  (hi |= overflow bits from lo)
2972                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
2973                let hw2: u16 = ((rd_hi_bits << 8) | rm_hi_bits) as u16;
2974                bytes.extend_from_slice(&hw1.to_le_bytes());
2975                bytes.extend_from_slice(&hw2.to_le_bytes());
2976
2977                // LSL.W rd_lo, rn_lo, rm_lo  (lo <<= n)
2978                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
2979                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
2980                bytes.extend_from_slice(&hw1.to_le_bytes());
2981                bytes.extend_from_slice(&hw2.to_le_bytes());
2982
2983                // B .done (skip large shift: +2 halfwords)
2984                let b_done: u16 = 0xE002;
2985                bytes.extend_from_slice(&b_done.to_le_bytes());
2986
2987                // --- Large shift (n >= 32) ---
2988                // LSL.W rd_hi, rn_lo, rm_hi  (hi = lo << (n-32))
2989                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
2990                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_hi_bits) as u16;
2991                bytes.extend_from_slice(&hw1.to_le_bytes());
2992                bytes.extend_from_slice(&hw2.to_le_bytes());
2993
2994                // MOV rd_lo, #0
2995                let mov_zero: u16 = 0x2000 | ((rd_lo_bits as u16) << 8);
2996                bytes.extend_from_slice(&mov_zero.to_le_bytes());
2997
2998                Ok(bytes) // Total: 38 bytes
2999            }
3000
3001            // I64ShrU: 64-bit logical shift right with branch for n<32 vs n>=32
3002            ArmOp::I64ShrU {
3003                rd_lo,
3004                rd_hi,
3005                rn_lo,
3006                rn_hi,
3007                rm_lo,
3008                rm_hi,
3009            } => {
3010                let rd_lo_bits = reg_to_bits(rd_lo);
3011                let rd_hi_bits = reg_to_bits(rd_hi);
3012                let rn_lo_bits = reg_to_bits(rn_lo);
3013                let rn_hi_bits = reg_to_bits(rn_hi);
3014                let rm_lo_bits = reg_to_bits(rm_lo);
3015                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3016                let mut bytes = Vec::new();
3017
3018                // AND.W rm_lo, rm_lo, #63
3019                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3020                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3021                bytes.extend_from_slice(&hw1.to_le_bytes());
3022                bytes.extend_from_slice(&hw2.to_le_bytes());
3023
3024                // SUBS.W rm_hi, rm_lo, #32
3025                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3026                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3027                bytes.extend_from_slice(&hw1.to_le_bytes());
3028                bytes.extend_from_slice(&hw2.to_le_bytes());
3029
3030                // BPL .large (+10 halfwords)
3031                let bpl: u16 = 0xD50A;
3032                bytes.extend_from_slice(&bpl.to_le_bytes());
3033
3034                // --- Small shift (n < 32) ---
3035                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3036                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3037                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3038                bytes.extend_from_slice(&hw1.to_le_bytes());
3039                bytes.extend_from_slice(&hw2.to_le_bytes());
3040
3041                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3042                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3043                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3044                bytes.extend_from_slice(&hw1.to_le_bytes());
3045                bytes.extend_from_slice(&hw2.to_le_bytes());
3046
3047                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n)
3048                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3049                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3050                bytes.extend_from_slice(&hw1.to_le_bytes());
3051                bytes.extend_from_slice(&hw2.to_le_bytes());
3052
3053                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3054                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3055                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3056                bytes.extend_from_slice(&hw1.to_le_bytes());
3057                bytes.extend_from_slice(&hw2.to_le_bytes());
3058
3059                // LSR.W rd_hi, rn_hi, rm_lo  (hi >>= n, logical)
3060                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3061                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3062                bytes.extend_from_slice(&hw1.to_le_bytes());
3063                bytes.extend_from_slice(&hw2.to_le_bytes());
3064
3065                // B .done (+2 halfwords)
3066                let b_done: u16 = 0xE002;
3067                bytes.extend_from_slice(&b_done.to_le_bytes());
3068
3069                // --- Large shift (n >= 32) ---
3070                // LSR.W rd_lo, rn_hi, rm_hi  (lo = hi >> (n-32))
3071                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3072                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3073                bytes.extend_from_slice(&hw1.to_le_bytes());
3074                bytes.extend_from_slice(&hw2.to_le_bytes());
3075
3076                // MOV rd_hi, #0
3077                let mov_zero: u16 = 0x2000 | ((rd_hi_bits as u16) << 8);
3078                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3079
3080                Ok(bytes) // Total: 38 bytes
3081            }
3082
3083            // I64ShrS: 64-bit arithmetic shift right with branch for n<32 vs n>=32
3084            ArmOp::I64ShrS {
3085                rd_lo,
3086                rd_hi,
3087                rn_lo,
3088                rn_hi,
3089                rm_lo,
3090                rm_hi,
3091            } => {
3092                let rd_lo_bits = reg_to_bits(rd_lo);
3093                let rd_hi_bits = reg_to_bits(rd_hi);
3094                let rn_lo_bits = reg_to_bits(rn_lo);
3095                let rn_hi_bits = reg_to_bits(rn_hi);
3096                let rm_lo_bits = reg_to_bits(rm_lo);
3097                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3098                let mut bytes = Vec::new();
3099
3100                // AND.W rm_lo, rm_lo, #63
3101                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3102                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3103                bytes.extend_from_slice(&hw1.to_le_bytes());
3104                bytes.extend_from_slice(&hw2.to_le_bytes());
3105
3106                // SUBS.W rm_hi, rm_lo, #32
3107                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3108                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3109                bytes.extend_from_slice(&hw1.to_le_bytes());
3110                bytes.extend_from_slice(&hw2.to_le_bytes());
3111
3112                // BPL .large (+10 halfwords)
3113                let bpl: u16 = 0xD50A;
3114                bytes.extend_from_slice(&bpl.to_le_bytes());
3115
3116                // --- Small shift (n < 32) ---
3117                // RSB.W rm_hi, rm_lo, #32
3118                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3119                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3120                bytes.extend_from_slice(&hw1.to_le_bytes());
3121                bytes.extend_from_slice(&hw2.to_le_bytes());
3122
3123                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3124                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3125                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3126                bytes.extend_from_slice(&hw1.to_le_bytes());
3127                bytes.extend_from_slice(&hw2.to_le_bytes());
3128
3129                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n, logical for lo word)
3130                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3131                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3132                bytes.extend_from_slice(&hw1.to_le_bytes());
3133                bytes.extend_from_slice(&hw2.to_le_bytes());
3134
3135                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3136                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3137                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3138                bytes.extend_from_slice(&hw1.to_le_bytes());
3139                bytes.extend_from_slice(&hw2.to_le_bytes());
3140
3141                // ASR.W rd_hi, rn_hi, rm_lo  (hi >>= n, arithmetic/sign-extending)
3142                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3143                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3144                bytes.extend_from_slice(&hw1.to_le_bytes());
3145                bytes.extend_from_slice(&hw2.to_le_bytes());
3146
3147                // B .done (+3 halfwords, large shift is 8 bytes)
3148                let b_done: u16 = 0xE003;
3149                bytes.extend_from_slice(&b_done.to_le_bytes());
3150
3151                // --- Large shift (n >= 32) ---
3152                // ASR.W rd_lo, rn_hi, rm_hi  (lo = hi >>> (n-32))
3153                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3154                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3155                bytes.extend_from_slice(&hw1.to_le_bytes());
3156                bytes.extend_from_slice(&hw2.to_le_bytes());
3157
3158                // ASR.W rd_hi, rn_hi, #31  (hi = sign extension, all 0s or all 1s)
3159                // Thumb-2 ASR immediate: hw1=0xEA4F, hw2=imm3:Rd:imm2:10:Rm
3160                // imm5=31=11111 → imm3=111, imm2=11
3161                let hw1: u16 = 0xEA4F;
3162                let hw2: u16 = (0x7000 | (rd_hi_bits << 8) | 0x00E0 | rn_hi_bits) as u16;
3163                bytes.extend_from_slice(&hw1.to_le_bytes());
3164                bytes.extend_from_slice(&hw2.to_le_bytes());
3165
3166                Ok(bytes) // Total: 40 bytes
3167            }
3168
3169            // I64Rotl: 64-bit rotate left
3170            // For n < 32: new_hi = (hi << n) | (lo >> (32-n)), new_lo = (lo << n) | (hi >> (32-n))
3171            // For n >= 32: same formula but with lo/hi conceptually swapped, shift by (n-32)
3172            // Uses R4 (saved/restored) and R12 as scratch
3173            ArmOp::I64Rotl {
3174                rdlo,
3175                rdhi,
3176                rnlo,
3177                rnhi,
3178                shift,
3179            } => {
3180                let rd_lo_bits = reg_to_bits(rdlo);
3181                let rd_hi_bits = reg_to_bits(rdhi);
3182                let rn_lo_bits = reg_to_bits(rnlo);
3183                let rn_hi_bits = reg_to_bits(rnhi);
3184                let shift_bits = reg_to_bits(shift);
3185                let r12: u32 = 12; // IP scratch
3186                let r3: u32 = 3; // Scratch (high word of shift amount, unused)
3187                let r4: u32 = 4; // Scratch (saved/restored)
3188                let mut bytes = Vec::new();
3189
3190                // PUSH {R4}
3191                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3192
3193                // AND.W shift, shift, #63 (mask to 6 bits)
3194                let hw1: u16 = (0xF000 | shift_bits) as u16;
3195                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3196                bytes.extend_from_slice(&hw1.to_le_bytes());
3197                bytes.extend_from_slice(&hw2.to_le_bytes());
3198
3199                // SUBS.W R3, shift, #32 (R3 = n-32, sets flags)
3200                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3201                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3202                bytes.extend_from_slice(&hw1.to_le_bytes());
3203                bytes.extend_from_slice(&hw2.to_le_bytes());
3204
3205                // BPL .large (branch if n >= 32, offset = +14 halfwords)
3206                let bpl: u16 = 0xD50E;
3207                bytes.extend_from_slice(&bpl.to_le_bytes());
3208
3209                // === Small rotation (n < 32) ===
3210                // RSB.W R3, shift, #32 (R3 = 32-n)
3211                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3212                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3213                bytes.extend_from_slice(&hw1.to_le_bytes());
3214                bytes.extend_from_slice(&hw2.to_le_bytes());
3215
3216                // LSR.W R4, rn_lo, R3 (R4 = lo >> (32-n), will go to new_hi)
3217                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3218                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3219                bytes.extend_from_slice(&hw1.to_le_bytes());
3220                bytes.extend_from_slice(&hw2.to_le_bytes());
3221
3222                // LSR.W R12, rn_hi, R3 (R12 = hi >> (32-n), will go to new_lo)
3223                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3224                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3225                bytes.extend_from_slice(&hw1.to_le_bytes());
3226                bytes.extend_from_slice(&hw2.to_le_bytes());
3227
3228                // LSL.W rd_hi, rn_hi, shift (rd_hi = hi << n)
3229                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3230                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3231                bytes.extend_from_slice(&hw1.to_le_bytes());
3232                bytes.extend_from_slice(&hw2.to_le_bytes());
3233
3234                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (hi << n) | (lo >> (32-n)))
3235                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3236                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3237                bytes.extend_from_slice(&hw1.to_le_bytes());
3238                bytes.extend_from_slice(&hw2.to_le_bytes());
3239
3240                // LSL.W rd_lo, rn_lo, shift (rd_lo = lo << n)
3241                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3242                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3243                bytes.extend_from_slice(&hw1.to_le_bytes());
3244                bytes.extend_from_slice(&hw2.to_le_bytes());
3245
3246                // ORR.W rd_lo, rd_lo, R12 (rd_lo = (lo << n) | (hi >> (32-n)))
3247                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3248                let hw2: u16 = ((rd_lo_bits << 8) | r12) as u16;
3249                bytes.extend_from_slice(&hw1.to_le_bytes());
3250                bytes.extend_from_slice(&hw2.to_le_bytes());
3251
3252                // B .done (skip large block, offset = +14 halfwords)
3253                let b_done: u16 = 0xE00E;
3254                bytes.extend_from_slice(&b_done.to_le_bytes());
3255
3256                // === Large rotation (n >= 32) ===
3257                // R3 already has n-32 from the SUBS
3258                // RSB.W R4, R3, #32 (R4 = 32-(n-32) = 64-n)
3259                let hw1: u16 = (0xF1C0 | r3) as u16;
3260                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3261                bytes.extend_from_slice(&hw1.to_le_bytes());
3262                bytes.extend_from_slice(&hw2.to_le_bytes());
3263
3264                // LSR.W R12, rn_hi, R4 (R12 = hi >> (64-n), goes to new_hi low bits)
3265                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3266                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3267                bytes.extend_from_slice(&hw1.to_le_bytes());
3268                bytes.extend_from_slice(&hw2.to_le_bytes());
3269
3270                // LSR.W R4, rn_lo, R4 (R4 = lo >> (64-n), goes to new_lo low bits)
3271                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3272                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3273                bytes.extend_from_slice(&hw1.to_le_bytes());
3274                bytes.extend_from_slice(&hw2.to_le_bytes());
3275
3276                // LSL.W shift, rn_lo, R3 (shift = lo << (n-32), new_hi high bits)
3277                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3278                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3279                bytes.extend_from_slice(&hw1.to_le_bytes());
3280                bytes.extend_from_slice(&hw2.to_le_bytes());
3281
3282                // ORR.W shift, shift, R12 (shift = (lo << (n-32)) | (hi >> (64-n)) = new_hi)
3283                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3284                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3285                bytes.extend_from_slice(&hw1.to_le_bytes());
3286                bytes.extend_from_slice(&hw2.to_le_bytes());
3287
3288                // LSL.W rd_lo, rn_hi, R3 (rd_lo = hi << (n-32), new_lo high bits)
3289                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3290                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | r3) as u16;
3291                bytes.extend_from_slice(&hw1.to_le_bytes());
3292                bytes.extend_from_slice(&hw2.to_le_bytes());
3293
3294                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (hi << (n-32)) | (lo >> (64-n)) = new_lo)
3295                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3296                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3297                bytes.extend_from_slice(&hw1.to_le_bytes());
3298                bytes.extend_from_slice(&hw2.to_le_bytes());
3299
3300                // MOV rd_hi, shift (rd_hi = new_hi)
3301                let d_bit = (rd_hi_bits >> 3) & 1;
3302                let mov_instr: u16 =
3303                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_hi_bits & 0x7)) as u16;
3304                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3305
3306                // POP {R4}
3307                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3308
3309                Ok(bytes) // Total: 74 bytes
3310            }
3311
3312            // I64Rotr: 64-bit rotate right
3313            // rotr(x, n) = rotl(x, 64-n)
3314            // For n < 32: new_lo = (lo >> n) | (hi << (32-n)), new_hi = (hi >> n) | (lo << (32-n))
3315            // For n >= 32: same formula but with lo/hi swapped, shift by (n-32)
3316            ArmOp::I64Rotr {
3317                rdlo,
3318                rdhi,
3319                rnlo,
3320                rnhi,
3321                shift,
3322            } => {
3323                let rd_lo_bits = reg_to_bits(rdlo);
3324                let rd_hi_bits = reg_to_bits(rdhi);
3325                let rn_lo_bits = reg_to_bits(rnlo);
3326                let rn_hi_bits = reg_to_bits(rnhi);
3327                let shift_bits = reg_to_bits(shift);
3328                let r12: u32 = 12;
3329                let r3: u32 = 3;
3330                let r4: u32 = 4;
3331                let mut bytes = Vec::new();
3332
3333                // PUSH {R4}
3334                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3335
3336                // AND.W shift, shift, #63
3337                let hw1: u16 = (0xF000 | shift_bits) as u16;
3338                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3339                bytes.extend_from_slice(&hw1.to_le_bytes());
3340                bytes.extend_from_slice(&hw2.to_le_bytes());
3341
3342                // SUBS.W R3, shift, #32
3343                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3344                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3345                bytes.extend_from_slice(&hw1.to_le_bytes());
3346                bytes.extend_from_slice(&hw2.to_le_bytes());
3347
3348                // BPL .large (+14 halfwords)
3349                let bpl: u16 = 0xD50E;
3350                bytes.extend_from_slice(&bpl.to_le_bytes());
3351
3352                // === Small rotation (n < 32) ===
3353                // RSB.W R3, shift, #32 (R3 = 32-n)
3354                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3355                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3356                bytes.extend_from_slice(&hw1.to_le_bytes());
3357                bytes.extend_from_slice(&hw2.to_le_bytes());
3358
3359                // LSL.W R4, rn_hi, R3 (R4 = hi << (32-n), will go to new_lo)
3360                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3361                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3362                bytes.extend_from_slice(&hw1.to_le_bytes());
3363                bytes.extend_from_slice(&hw2.to_le_bytes());
3364
3365                // LSL.W R12, rn_lo, R3 (R12 = lo << (32-n), will go to new_hi)
3366                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3367                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3368                bytes.extend_from_slice(&hw1.to_le_bytes());
3369                bytes.extend_from_slice(&hw2.to_le_bytes());
3370
3371                // LSR.W rd_lo, rn_lo, shift (rd_lo = lo >> n)
3372                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3373                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3374                bytes.extend_from_slice(&hw1.to_le_bytes());
3375                bytes.extend_from_slice(&hw2.to_le_bytes());
3376
3377                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (lo >> n) | (hi << (32-n)))
3378                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3379                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3380                bytes.extend_from_slice(&hw1.to_le_bytes());
3381                bytes.extend_from_slice(&hw2.to_le_bytes());
3382
3383                // LSR.W rd_hi, rn_hi, shift (rd_hi = hi >> n)
3384                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3385                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3386                bytes.extend_from_slice(&hw1.to_le_bytes());
3387                bytes.extend_from_slice(&hw2.to_le_bytes());
3388
3389                // ORR.W rd_hi, rd_hi, R12 (rd_hi = (hi >> n) | (lo << (32-n)))
3390                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3391                let hw2: u16 = ((rd_hi_bits << 8) | r12) as u16;
3392                bytes.extend_from_slice(&hw1.to_le_bytes());
3393                bytes.extend_from_slice(&hw2.to_le_bytes());
3394
3395                // B .done (+14 halfwords)
3396                let b_done: u16 = 0xE00E;
3397                bytes.extend_from_slice(&b_done.to_le_bytes());
3398
3399                // === Large rotation (n >= 32) ===
3400                // RSB.W R4, R3, #32 (R4 = 64-n)
3401                let hw1: u16 = (0xF1C0 | r3) as u16;
3402                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3403                bytes.extend_from_slice(&hw1.to_le_bytes());
3404                bytes.extend_from_slice(&hw2.to_le_bytes());
3405
3406                // LSL.W R12, rn_lo, R4 (R12 = lo << (64-n), goes to new_lo low bits)
3407                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3408                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3409                bytes.extend_from_slice(&hw1.to_le_bytes());
3410                bytes.extend_from_slice(&hw2.to_le_bytes());
3411
3412                // LSL.W R4, rn_hi, R4 (R4 = hi << (64-n), goes to new_hi low bits)
3413                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3414                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3415                bytes.extend_from_slice(&hw1.to_le_bytes());
3416                bytes.extend_from_slice(&hw2.to_le_bytes());
3417
3418                // LSR.W shift, rn_hi, R3 (shift = hi >> (n-32), new_lo high bits)
3419                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3420                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3421                bytes.extend_from_slice(&hw1.to_le_bytes());
3422                bytes.extend_from_slice(&hw2.to_le_bytes());
3423
3424                // ORR.W shift, shift, R12 (shift = (hi >> (n-32)) | (lo << (64-n)) = new_lo)
3425                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3426                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3427                bytes.extend_from_slice(&hw1.to_le_bytes());
3428                bytes.extend_from_slice(&hw2.to_le_bytes());
3429
3430                // LSR.W rd_hi, rn_lo, R3 (rd_hi = lo >> (n-32), new_hi high bits)
3431                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3432                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | r3) as u16;
3433                bytes.extend_from_slice(&hw1.to_le_bytes());
3434                bytes.extend_from_slice(&hw2.to_le_bytes());
3435
3436                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (lo >> (n-32)) | (hi << (64-n)) = new_hi)
3437                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3438                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3439                bytes.extend_from_slice(&hw1.to_le_bytes());
3440                bytes.extend_from_slice(&hw2.to_le_bytes());
3441
3442                // MOV rd_lo, shift (rd_lo = new_lo)
3443                let d_bit = (rd_lo_bits >> 3) & 1;
3444                let mov_instr: u16 =
3445                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_lo_bits & 0x7)) as u16;
3446                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3447
3448                // POP {R4}
3449                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3450
3451                Ok(bytes) // Total: 74 bytes
3452            }
3453
3454            // I64Clz: Count leading zeros in 64-bit value
3455            // If hi != 0: result = CLZ(hi)
3456            // If hi == 0: result = 32 + CLZ(lo)
3457            //
3458            // Layout (using CMP+BNE approach for consistency):
3459            // 0: CMP.W rnhi, #0 (4 bytes)
3460            // 4: BEQ .hi_zero (2 bytes) - branch forward to offset 14
3461            // 6: CLZ.W rd, rnhi (4 bytes)
3462            // 10: B .done (2 bytes) - branch forward to offset 22
3463            // 12: NOP (2 bytes) - padding for alignment
3464            // 14: .hi_zero: CLZ.W rd, rnlo (4 bytes)
3465            // 18: ADD.W rd, rd, #32 (4 bytes)
3466            // 22: .done
3467            ArmOp::I64Clz { rd, rnlo, rnhi } => {
3468                let rd_bits = reg_to_bits(rd);
3469                let rn_lo_bits = reg_to_bits(rnlo);
3470                let rn_hi_bits = reg_to_bits(rnhi);
3471                let mut bytes = Vec::new();
3472
3473                // CMP.W rnhi, #0 (4 bytes at offset 0)
3474                let hw1: u16 = (0xF1B0 | rn_hi_bits) as u16;
3475                let hw2: u16 = 0x0F00;
3476                bytes.extend_from_slice(&hw1.to_le_bytes());
3477                bytes.extend_from_slice(&hw2.to_le_bytes());
3478
3479                // BEQ .hi_zero (2 bytes at offset 4)
3480                // PC = 4 + 4 = 8, target = 14, offset = 6, imm8 = 3
3481                let beq: u16 = 0xD003;
3482                bytes.extend_from_slice(&beq.to_le_bytes());
3483
3484                // CLZ.W rd, rnhi (4 bytes at offset 6)
3485                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3486                let hw1: u16 = (0xFAB0 | rn_hi_bits) as u16;
3487                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_hi_bits) as u16;
3488                bytes.extend_from_slice(&hw1.to_le_bytes());
3489                bytes.extend_from_slice(&hw2.to_le_bytes());
3490
3491                // B .done (2 bytes at offset 10)
3492                // PC = 10 + 4 = 14, target = 22, offset = 8, imm11 = 4
3493                let b_done: u16 = 0xE004;
3494                bytes.extend_from_slice(&b_done.to_le_bytes());
3495
3496                // NOP (2 bytes at offset 12) - padding
3497                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3498
3499                // .hi_zero: (offset 14)
3500                // CLZ.W rd, rnlo (4 bytes)
3501                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3502                let hw1: u16 = (0xFAB0 | rn_lo_bits) as u16;
3503                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_lo_bits) as u16;
3504                bytes.extend_from_slice(&hw1.to_le_bytes());
3505                bytes.extend_from_slice(&hw2.to_le_bytes());
3506
3507                // ADD.W rd, rd, #32 (4 bytes at offset 18)
3508                let hw1: u16 = (0xF100 | rd_bits) as u16;
3509                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3510                bytes.extend_from_slice(&hw1.to_le_bytes());
3511                bytes.extend_from_slice(&hw2.to_le_bytes());
3512
3513                // .done: (offset 22)
3514                // i64.clz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3515                // MOVS Rn, #0: 0010 0 Rn 00000000
3516                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3517                bytes.extend_from_slice(&mov0.to_le_bytes());
3518
3519                Ok(bytes)
3520            }
3521
3522            // I64Ctz: Count trailing zeros in 64-bit value
3523            // If lo != 0: result = CTZ(lo) = CLZ(RBIT(lo))
3524            // If lo == 0: result = 32 + CTZ(hi) = 32 + CLZ(RBIT(hi))
3525            //
3526            // Layout:
3527            // 0: CMP.W rnlo, #0 (4 bytes)
3528            // 4: BEQ .lo_zero (2 bytes) - branch to offset 18
3529            // 6: RBIT.W rd, rnlo (4 bytes)
3530            // 10: CLZ.W rd, rd (4 bytes)
3531            // 14: B .done (2 bytes) - branch to offset 30
3532            // 16: NOP (2 bytes) - padding
3533            // 18: .lo_zero: RBIT.W rd, rnhi (4 bytes)
3534            // 22: CLZ.W rd, rd (4 bytes)
3535            // 26: ADD.W rd, rd, #32 (4 bytes)
3536            // 30: .done
3537            ArmOp::I64Ctz { rd, rnlo, rnhi } => {
3538                let rd_bits = reg_to_bits(rd);
3539                let rn_lo_bits = reg_to_bits(rnlo);
3540                let rn_hi_bits = reg_to_bits(rnhi);
3541                let mut bytes = Vec::new();
3542
3543                // CMP.W rnlo, #0 (4 bytes at offset 0)
3544                let hw1: u16 = (0xF1B0 | rn_lo_bits) as u16;
3545                let hw2: u16 = 0x0F00;
3546                bytes.extend_from_slice(&hw1.to_le_bytes());
3547                bytes.extend_from_slice(&hw2.to_le_bytes());
3548
3549                // BEQ .lo_zero (2 bytes at offset 4)
3550                // PC = 4 + 4 = 8, target = 18, offset = 10, imm8 = 5
3551                let beq: u16 = 0xD005;
3552                bytes.extend_from_slice(&beq.to_le_bytes());
3553
3554                // RBIT.W rd, rnlo (4 bytes at offset 6)
3555                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3556                let hw1: u16 = (0xFA90 | rn_lo_bits) as u16;
3557                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_lo_bits) as u16;
3558                bytes.extend_from_slice(&hw1.to_le_bytes());
3559                bytes.extend_from_slice(&hw2.to_le_bytes());
3560
3561                // CLZ.W rd, rd (4 bytes at offset 10)
3562                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3563                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3564                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3565                bytes.extend_from_slice(&hw1.to_le_bytes());
3566                bytes.extend_from_slice(&hw2.to_le_bytes());
3567
3568                // B .done (2 bytes at offset 14)
3569                // PC = 14 + 4 = 18, target = 30, offset = 12, imm11 = 6
3570                let b_done: u16 = 0xE006;
3571                bytes.extend_from_slice(&b_done.to_le_bytes());
3572
3573                // NOP (2 bytes at offset 16) - padding
3574                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3575
3576                // .lo_zero: (offset 18)
3577                // RBIT.W rd, rnhi (4 bytes)
3578                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3579                let hw1: u16 = (0xFA90 | rn_hi_bits) as u16;
3580                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_hi_bits) as u16;
3581                bytes.extend_from_slice(&hw1.to_le_bytes());
3582                bytes.extend_from_slice(&hw2.to_le_bytes());
3583
3584                // CLZ.W rd, rd (4 bytes at offset 22)
3585                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3586                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3587                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3588                bytes.extend_from_slice(&hw1.to_le_bytes());
3589                bytes.extend_from_slice(&hw2.to_le_bytes());
3590
3591                // ADD.W rd, rd, #32 (4 bytes at offset 26)
3592                let hw1: u16 = (0xF100 | rd_bits) as u16;
3593                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3594                bytes.extend_from_slice(&hw1.to_le_bytes());
3595                bytes.extend_from_slice(&hw2.to_le_bytes());
3596
3597                // .done: (offset 30)
3598                // i64.ctz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3599                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3600                bytes.extend_from_slice(&mov0.to_le_bytes());
3601
3602                Ok(bytes)
3603            }
3604
3605            // I64Popcnt: Population count of 64-bit value
3606            // result = POPCNT(lo) + POPCNT(hi)
3607            // Using SIMD-style parallel bit counting algorithm
3608            ArmOp::I64Popcnt { rd, rnlo, rnhi } => {
3609                let rd_bits = reg_to_bits(rd);
3610                let rn_lo_bits = reg_to_bits(rnlo);
3611                let rn_hi_bits = reg_to_bits(rnhi);
3612                let r12: u32 = 12; // IP scratch
3613                let r3: u32 = 3; // Scratch for hi popcnt result
3614                let mut bytes = Vec::new();
3615
3616                // PUSH {R3, R4, R5} - save scratch registers
3617                bytes.extend_from_slice(&0xB438u16.to_le_bytes());
3618
3619                // Strategy: compute popcnt(lo) -> R4, popcnt(hi) -> R5, add them -> rd
3620                // Using lookup table approach for each byte would be too large
3621                // Using shift-and-add approach instead
3622
3623                // For simplicity and correctness, use the efficient parallel algorithm
3624                // but implement it as a series of inline operations
3625
3626                // MOV R4, rnlo
3627                let d_bit: u32 = 0; // R4 < 8, so high bit is 0
3628                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_lo_bits << 3) | (4 & 0x7)) as u16;
3629                bytes.extend_from_slice(&mov.to_le_bytes());
3630
3631                // MOV R5, rnhi
3632                let d_bit: u32 = 0; // R5 < 8, so high bit is 0
3633                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_hi_bits << 3) | (5 & 0x7)) as u16;
3634                bytes.extend_from_slice(&mov.to_le_bytes());
3635
3636                // --- POPCNT for R4 (lo word) ---
3637                // Step 1: x = x - ((x >> 1) & 0x55555555)
3638                // LSR.W R12, R4, #1
3639                let hw1: u16 = 0xEA4F;
3640                let hw2: u16 = ((r12 << 8) | 0x50 | 4) as u16;
3641                bytes.extend_from_slice(&hw1.to_le_bytes());
3642                bytes.extend_from_slice(&hw2.to_le_bytes());
3643
3644                // Load 0x55555555 into R3 using MOVW/MOVT
3645                // MOVW R3, #0x5555
3646                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3647                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3648                // MOVT R3, #0x5555
3649                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3650                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3651
3652                // AND.W R12, R12, R3
3653                let hw1: u16 = (0xEA00 | r12) as u16;
3654                let hw2: u16 = ((r12 << 8) | r3) as u16;
3655                bytes.extend_from_slice(&hw1.to_le_bytes());
3656                bytes.extend_from_slice(&hw2.to_le_bytes());
3657
3658                // SUB.W R4, R4, R12
3659                let hw1: u16 = (0xEBA0 | 4) as u16;
3660                let hw2: u16 = ((4 << 8) | r12) as u16;
3661                bytes.extend_from_slice(&hw1.to_le_bytes());
3662                bytes.extend_from_slice(&hw2.to_le_bytes());
3663
3664                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
3665                // Load 0x33333333 into R3
3666                // MOVW R3, #0x3333
3667                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3668                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3669                // MOVT R3, #0x3333
3670                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3671                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3672
3673                // AND.W R12, R4, R3
3674                let hw1: u16 = (0xEA00 | 4) as u16;
3675                let hw2: u16 = ((r12 << 8) | r3) as u16;
3676                bytes.extend_from_slice(&hw1.to_le_bytes());
3677                bytes.extend_from_slice(&hw2.to_le_bytes());
3678
3679                // LSR.W R4, R4, #2
3680                let hw1: u16 = 0xEA4F;
3681                let hw2: u16 = ((4 << 8) | 0x90 | 4) as u16;
3682                bytes.extend_from_slice(&hw1.to_le_bytes());
3683                bytes.extend_from_slice(&hw2.to_le_bytes());
3684
3685                // AND.W R4, R4, R3
3686                let hw1: u16 = (0xEA00 | 4) as u16;
3687                let hw2: u16 = ((4 << 8) | r3) as u16;
3688                bytes.extend_from_slice(&hw1.to_le_bytes());
3689                bytes.extend_from_slice(&hw2.to_le_bytes());
3690
3691                // ADD.W R4, R4, R12
3692                let hw1: u16 = (0xEB00 | 4) as u16;
3693                let hw2: u16 = ((4 << 8) | r12) as u16;
3694                bytes.extend_from_slice(&hw1.to_le_bytes());
3695                bytes.extend_from_slice(&hw2.to_le_bytes());
3696
3697                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
3698                // LSR.W R12, R4, #4
3699                // hw2 = (imm3 << 12) | (Rd << 8) | (imm2 << 6) | (type << 4) | Rm
3700                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3701                let hw1: u16 = 0xEA4F;
3702                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 4) as u16;
3703                bytes.extend_from_slice(&hw1.to_le_bytes());
3704                bytes.extend_from_slice(&hw2.to_le_bytes());
3705
3706                // ADD.W R4, R4, R12
3707                let hw1: u16 = (0xEB00 | 4) as u16;
3708                let hw2: u16 = ((4 << 8) | r12) as u16;
3709                bytes.extend_from_slice(&hw1.to_le_bytes());
3710                bytes.extend_from_slice(&hw2.to_le_bytes());
3711
3712                // Load 0x0F0F0F0F into R3
3713                // MOVW R3, #0x0F0F (imm4=0, i=1, imm3=7, imm8=0x0F)
3714                // hw1 = 11110 1 10 0100 0000 = 0xF640
3715                // hw2 = 0 111 0011 00001111 = 0x730F
3716                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
3717                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3718                // MOVT R3, #0x0F0F
3719                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
3720                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3721
3722                // AND.W R4, R4, R3
3723                let hw1: u16 = (0xEA00 | 4) as u16;
3724                let hw2: u16 = ((4 << 8) | r3) as u16;
3725                bytes.extend_from_slice(&hw1.to_le_bytes());
3726                bytes.extend_from_slice(&hw2.to_le_bytes());
3727
3728                // Step 4: x = x * 0x01010101 >> 24
3729                // Load 0x01010101 into R3
3730                // MOVW R3, #0x0101
3731                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
3732                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3733                // MOVT R3, #0x0101
3734                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
3735                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3736
3737                // MUL R4, R4, R3
3738                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
3739                let hw1: u16 = (0xFB00 | 4) as u16;
3740                let hw2: u16 = (0xF000 | (4 << 8) | r3) as u16;
3741                bytes.extend_from_slice(&hw1.to_le_bytes());
3742                bytes.extend_from_slice(&hw2.to_le_bytes());
3743
3744                // LSR.W R4, R4, #24
3745                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
3746                let hw1: u16 = 0xEA4F;
3747                let hw2: u16 = (0x6000 | (4 << 8) | 0x10 | 4) as u16;
3748                bytes.extend_from_slice(&hw1.to_le_bytes());
3749                bytes.extend_from_slice(&hw2.to_le_bytes());
3750
3751                // --- POPCNT for R5 (hi word) - same algorithm ---
3752                // Step 1
3753                let hw1: u16 = 0xEA4F;
3754                let hw2: u16 = ((r12 << 8) | 0x50 | 5) as u16;
3755                bytes.extend_from_slice(&hw1.to_le_bytes());
3756                bytes.extend_from_slice(&hw2.to_le_bytes());
3757
3758                // Load 0x55555555 into R3
3759                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3760                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3761                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3762                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3763
3764                let hw1: u16 = (0xEA00 | r12) as u16;
3765                let hw2: u16 = ((r12 << 8) | r3) as u16;
3766                bytes.extend_from_slice(&hw1.to_le_bytes());
3767                bytes.extend_from_slice(&hw2.to_le_bytes());
3768
3769                let hw1: u16 = (0xEBA0 | 5) as u16;
3770                let hw2: u16 = ((5 << 8) | r12) as u16;
3771                bytes.extend_from_slice(&hw1.to_le_bytes());
3772                bytes.extend_from_slice(&hw2.to_le_bytes());
3773
3774                // Step 2
3775                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3776                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3777                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3778                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3779
3780                let hw1: u16 = (0xEA00 | 5) as u16;
3781                let hw2: u16 = ((r12 << 8) | r3) as u16;
3782                bytes.extend_from_slice(&hw1.to_le_bytes());
3783                bytes.extend_from_slice(&hw2.to_le_bytes());
3784
3785                let hw1: u16 = 0xEA4F;
3786                let hw2: u16 = ((5 << 8) | 0x90 | 5) as u16;
3787                bytes.extend_from_slice(&hw1.to_le_bytes());
3788                bytes.extend_from_slice(&hw2.to_le_bytes());
3789
3790                let hw1: u16 = (0xEA00 | 5) as u16;
3791                let hw2: u16 = ((5 << 8) | r3) as u16;
3792                bytes.extend_from_slice(&hw1.to_le_bytes());
3793                bytes.extend_from_slice(&hw2.to_le_bytes());
3794
3795                let hw1: u16 = (0xEB00 | 5) as u16;
3796                let hw2: u16 = ((5 << 8) | r12) as u16;
3797                bytes.extend_from_slice(&hw1.to_le_bytes());
3798                bytes.extend_from_slice(&hw2.to_le_bytes());
3799
3800                // Step 3: LSR.W R12, R5, #4
3801                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3802                let hw1: u16 = 0xEA4F;
3803                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 5) as u16;
3804                bytes.extend_from_slice(&hw1.to_le_bytes());
3805                bytes.extend_from_slice(&hw2.to_le_bytes());
3806
3807                let hw1: u16 = (0xEB00 | 5) as u16;
3808                let hw2: u16 = ((5 << 8) | r12) as u16;
3809                bytes.extend_from_slice(&hw1.to_le_bytes());
3810                bytes.extend_from_slice(&hw2.to_le_bytes());
3811
3812                // Load 0x0F0F0F0F into R3 (for hi-word)
3813                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
3814                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3815                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
3816                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3817
3818                let hw1: u16 = (0xEA00 | 5) as u16;
3819                let hw2: u16 = ((5 << 8) | r3) as u16;
3820                bytes.extend_from_slice(&hw1.to_le_bytes());
3821                bytes.extend_from_slice(&hw2.to_le_bytes());
3822
3823                // Step 4
3824                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
3825                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3826                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
3827                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3828
3829                // MUL R5, R5, R3
3830                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
3831                let hw1: u16 = (0xFB00 | 5) as u16;
3832                let hw2: u16 = (0xF000 | (5 << 8) | r3) as u16;
3833                bytes.extend_from_slice(&hw1.to_le_bytes());
3834                bytes.extend_from_slice(&hw2.to_le_bytes());
3835
3836                // LSR.W R5, R5, #24
3837                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
3838                let hw1: u16 = 0xEA4F;
3839                let hw2: u16 = (0x6000 | (5 << 8) | 0x10 | 5) as u16;
3840                bytes.extend_from_slice(&hw1.to_le_bytes());
3841                bytes.extend_from_slice(&hw2.to_le_bytes());
3842
3843                // ADD rd, R4, R5 (combine lo and hi counts)
3844                // ADDS Rd, Rn, Rm (T1): 0001 100 Rm Rn Rd = 0x1800 | (Rm<<6) | (Rn<<3) | Rd
3845                let rd_bits_u16 = rd_bits as u16;
3846                let instr: u16 = 0x1800 | (5 << 6) | (4 << 3) | rd_bits_u16;
3847                bytes.extend_from_slice(&instr.to_le_bytes());
3848
3849                // POP {R3, R4, R5}
3850                bytes.extend_from_slice(&0xBC38u16.to_le_bytes());
3851
3852                // i64.popcnt returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3853                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3854                bytes.extend_from_slice(&mov0.to_le_bytes());
3855
3856                Ok(bytes)
3857            }
3858
3859            // I64Extend8S: Sign-extend low 8 bits to 64 bits
3860            // Result: rdlo = sign_extend_8(rnlo), rdhi = rdlo >> 31
3861            ArmOp::I64Extend8S { rdlo, rdhi, rnlo } => {
3862                let rdlo_bits = reg_to_bits(rdlo);
3863                let rdhi_bits = reg_to_bits(rdhi);
3864                let rnlo_bits = reg_to_bits(rnlo);
3865                let mut bytes = Vec::new();
3866
3867                // SXTB.W rdlo, rnlo (sign-extend byte to 32-bit)
3868                // SXTB T2: hw1 = 0xFA4F, hw2 = 0xF0<Rd><Rm>
3869                let hw1: u16 = 0xFA4F_u16;
3870                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
3871                bytes.extend_from_slice(&hw1.to_le_bytes());
3872                bytes.extend_from_slice(&hw2.to_le_bytes());
3873
3874                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
3875                // ASR (immediate): hw1 = 0xEA4F, hw2 = imm3:Rd:imm2:type:Rm
3876                // For imm5=31: imm3=111, imm2=11, type=10 (ASR)
3877                // hw2 = (7 << 12) | (rdhi << 8) | (3 << 6) | (2 << 4) | rdlo
3878                let hw1: u16 = 0xEA4F;
3879                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
3880                bytes.extend_from_slice(&hw1.to_le_bytes());
3881                bytes.extend_from_slice(&hw2.to_le_bytes());
3882
3883                Ok(bytes)
3884            }
3885
3886            // I64Extend16S: Sign-extend low 16 bits to 64 bits
3887            // Result: rdlo = sign_extend_16(rnlo), rdhi = rdlo >> 31
3888            ArmOp::I64Extend16S { rdlo, rdhi, rnlo } => {
3889                let rdlo_bits = reg_to_bits(rdlo);
3890                let rdhi_bits = reg_to_bits(rdhi);
3891                let rnlo_bits = reg_to_bits(rnlo);
3892                let mut bytes = Vec::new();
3893
3894                // SXTH.W rdlo, rnlo (sign-extend halfword to 32-bit)
3895                // SXTH T2: hw1 = 0xFA0F, hw2 = 0xF0<Rd><Rm>
3896                let hw1: u16 = 0xFA0F_u16;
3897                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
3898                bytes.extend_from_slice(&hw1.to_le_bytes());
3899                bytes.extend_from_slice(&hw2.to_le_bytes());
3900
3901                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
3902                let hw1: u16 = 0xEA4F;
3903                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
3904                bytes.extend_from_slice(&hw1.to_le_bytes());
3905                bytes.extend_from_slice(&hw2.to_le_bytes());
3906
3907                Ok(bytes)
3908            }
3909
3910            // I64Extend32S: Sign-extend low 32 bits to 64 bits
3911            // Result: rdlo = rnlo, rdhi = rnlo >> 31
3912            ArmOp::I64Extend32S { rdlo, rdhi, rnlo } => {
3913                let rdlo_bits = reg_to_bits(rdlo);
3914                let rdhi_bits = reg_to_bits(rdhi);
3915                let rnlo_bits = reg_to_bits(rnlo);
3916                let mut bytes = Vec::new();
3917
3918                // MOV rdlo, rnlo (if different)
3919                if rdlo_bits != rnlo_bits {
3920                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
3921                    let d_bit = ((rdlo_bits >> 3) & 1) as u16;
3922                    let mov: u16 = 0x4600
3923                        | (d_bit << 7)
3924                        | ((rnlo_bits as u16) << 3)
3925                        | ((rdlo_bits & 0x7) as u16);
3926                    bytes.extend_from_slice(&mov.to_le_bytes());
3927                }
3928
3929                // ASR.W rdhi, rnlo, #31 (sign-extend to high word)
3930                let hw1: u16 = 0xEA4F;
3931                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rnlo_bits) as u16;
3932                bytes.extend_from_slice(&hw1.to_le_bytes());
3933                bytes.extend_from_slice(&hw2.to_le_bytes());
3934
3935                Ok(bytes)
3936            }
3937
3938            // SelectMove: IT <cond>; MOV{cond} rd, rm
3939            // Conditional move: only execute MOV if condition is true
3940            ArmOp::SelectMove { rd, rm, cond } => {
3941                let rd_bits = reg_to_bits(rd) as u16;
3942                let rm_bits = reg_to_bits(rm) as u16;
3943
3944                // Condition code encoding for IT block
3945                use synth_synthesis::Condition;
3946                let cond_bits: u16 = match cond {
3947                    Condition::EQ => 0x0, // Equal
3948                    Condition::NE => 0x1, // Not equal
3949                    Condition::HS => 0x2, // Higher or same (unsigned >=)
3950                    Condition::LO => 0x3, // Lower (unsigned <)
3951                    Condition::HI => 0x8, // Higher (unsigned >)
3952                    Condition::LS => 0x9, // Lower or same (unsigned <=)
3953                    Condition::GE => 0xA, // Greater or equal (signed)
3954                    Condition::LT => 0xB, // Less than (signed)
3955                    Condition::GT => 0xC, // Greater than (signed)
3956                    Condition::LE => 0xD, // Less or equal (signed)
3957                };
3958
3959                // IT <cond>: single Then block (mask = 0x8 for T only)
3960                // IT instruction: 1011 1111 firstcond mask
3961                let it_instr: u16 = 0xBF00 | (cond_bits << 4) | 0x8;
3962
3963                // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
3964                // This MOV will only execute if condition is true due to IT block
3965                let d_bit = (rd_bits >> 3) & 1;
3966                let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
3967
3968                // Emit: IT <cond>, MOV rd, rm
3969                let mut bytes = it_instr.to_le_bytes().to_vec();
3970                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3971                Ok(bytes)
3972            }
3973
3974            // Popcnt: Population count (count set bits)
3975            // ARM Cortex-M has no native POPCNT, so we implement the bit manipulation algorithm:
3976            // x = x - ((x >> 1) & 0x55555555);
3977            // x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
3978            // x = (x + (x >> 4)) & 0x0F0F0F0F;
3979            // x = x + (x >> 8);
3980            // x = x + (x >> 16);
3981            // return x & 0x3F;
3982            //
3983            // Uses rd as working register and R12 as scratch for constants
3984            ArmOp::Popcnt { rd, rm } => {
3985                let mut bytes = Vec::new();
3986
3987                // First, move rm to rd if they're different
3988                if rd != rm {
3989                    let rd_bits = reg_to_bits(rd) as u16;
3990                    let rm_bits = reg_to_bits(rm) as u16;
3991                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
3992                    let d_bit = (rd_bits >> 3) & 1;
3993                    let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
3994                    bytes.extend_from_slice(&mov_instr.to_le_bytes());
3995                }
3996
3997                // Step 1: x = x - ((x >> 1) & 0x55555555)
3998                // Load 0x55555555 into R12
3999                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x5555)?);
4000                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x5555)?);
4001
4002                // R12_temp = rd >> 1
4003                // We need a second scratch register. Use R11.
4004                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 1)?);
4005
4006                // R11 = R11 & R12 (R11 = (x >> 1) & 0x55555555)
4007                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(11, 11, 12)?);
4008
4009                // rd = rd - R11
4010                bytes.extend_from_slice(&self.encode_thumb32_sub_reg_raw(
4011                    reg_to_bits(rd),
4012                    reg_to_bits(rd),
4013                    11,
4014                )?);
4015
4016                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
4017                // Load 0x33333333 into R12
4018                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x3333)?);
4019                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x3333)?);
4020
4021                // R11 = rd & R12
4022                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4023                    11,
4024                    reg_to_bits(rd),
4025                    12,
4026                )?);
4027
4028                // rd = rd >> 2
4029                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(
4030                    reg_to_bits(rd),
4031                    reg_to_bits(rd),
4032                    2,
4033                )?);
4034
4035                // rd = rd & R12
4036                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4037                    reg_to_bits(rd),
4038                    reg_to_bits(rd),
4039                    12,
4040                )?);
4041
4042                // rd = rd + R11
4043                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4044                    reg_to_bits(rd),
4045                    reg_to_bits(rd),
4046                    11,
4047                )?);
4048
4049                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
4050                // R11 = rd >> 4
4051                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 4)?);
4052
4053                // rd = rd + R11
4054                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4055                    reg_to_bits(rd),
4056                    reg_to_bits(rd),
4057                    11,
4058                )?);
4059
4060                // Load 0x0F0F0F0F into R12
4061                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x0F0F)?);
4062                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x0F0F)?);
4063
4064                // rd = rd & R12
4065                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4066                    reg_to_bits(rd),
4067                    reg_to_bits(rd),
4068                    12,
4069                )?);
4070
4071                // Step 4: x = x + (x >> 8)
4072                // R11 = rd >> 8
4073                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 8)?);
4074
4075                // rd = rd + R11
4076                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4077                    reg_to_bits(rd),
4078                    reg_to_bits(rd),
4079                    11,
4080                )?);
4081
4082                // Step 5: x = x + (x >> 16)
4083                // R11 = rd >> 16
4084                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 16)?);
4085
4086                // rd = rd + R11
4087                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4088                    reg_to_bits(rd),
4089                    reg_to_bits(rd),
4090                    11,
4091                )?);
4092
4093                // Step 6: return x & 0x3F
4094                // AND with 0x3F (small immediate, can use BIC or AND with immediate)
4095                bytes.extend_from_slice(&self.encode_thumb32_and_imm_raw(
4096                    reg_to_bits(rd),
4097                    reg_to_bits(rd),
4098                    0x3F,
4099                )?);
4100
4101                Ok(bytes)
4102            }
4103
4104            // I64DivU: 64-bit unsigned division using binary long division
4105            // Input: R0:R1 = dividend, R2:R3 = divisor
4106            // Output: R0:R1 = quotient
4107            // Uses: R4-R7, R12 as loop counter (avoid R8 for Renode compatibility)
4108            ArmOp::I64DivU {
4109                rdlo: _,
4110                rdhi: _,
4111                rnlo: _,
4112                rnhi: _,
4113                rmlo: _,
4114                rmhi: _,
4115            } => {
4116                let mut bytes = Vec::new();
4117
4118                // PUSH {R4-R7} - save scratch registers (NO LR — this is inline code)
4119                // 16-bit PUSH: 1011 010 M rrrrrrrr where M=0 (no LR), r=R4-R7 = 0xF0
4120                // Encoding: 1011 0100 1111 0000 = 0xB4F0
4121                bytes.extend_from_slice(&0xB4F0u16.to_le_bytes());
4122
4123                // Initialize quotient (R4:R5) = 0
4124                bytes.extend_from_slice(&0x2400u16.to_le_bytes()); // MOV R4, #0
4125                bytes.extend_from_slice(&0x2500u16.to_le_bytes()); // MOV R5, #0
4126
4127                // Initialize remainder (R6:R7) = 0
4128                bytes.extend_from_slice(&0x2600u16.to_le_bytes()); // MOV R6, #0
4129                bytes.extend_from_slice(&0x2700u16.to_le_bytes()); // MOV R7, #0
4130
4131                // Initialize loop counter R12 = 64 (use R12 scratch instead of R8)
4132                // MOV.W R12, #64: F04F 0C40
4133                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4134                bytes.extend_from_slice(&0x0C40u16.to_le_bytes());
4135
4136                // Loop start
4137                let loop_start = bytes.len();
4138
4139                // === Loop body: process one bit ===
4140
4141                // 1. Shift quotient R4:R5 left by 1
4142                // LSLS R5, R5, #1 (16-bit: 0000 0010 1010 1101 = 0x006D -> actually 0x002D for LSL R5,R5,#1)
4143                // LSL Rd, Rm, #imm5: 000 00 imm5 Rm Rd = 000 00 00001 101 101 = 0x006D
4144                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4145                // Get carry from R4 into R5: ORR R5, R5, R4 LSR #31
4146                // Thumb-2 ORR with shifted register: EA45 75D4 = ORR.W R5, R5, R4, LSR #31
4147                // 11101010 010 S Rn | 0 imm3 Rd imm2 type Rm
4148                // type=01 (LSR), imm5=31 (imm3=111, imm2=11)
4149                bytes.extend_from_slice(&0xEA45u16.to_le_bytes());
4150                bytes.extend_from_slice(&0x75D4u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4151                // LSLS R4, R4, #1: 000 00 00001 100 100 = 0x0064
4152                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4153
4154                // 2. Shift remainder R6:R7 left by 1, OR in MSB of dividend R1
4155                // LSLS R7, R7, #1
4156                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4157                // ORR.W R7, R7, R6, LSR #31
4158                bytes.extend_from_slice(&0xEA47u16.to_le_bytes());
4159                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4160                // LSLS R6, R6, #1
4161                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4162                // ORR.W R6, R6, R1, LSR #31 (bring in MSB of dividend high)
4163                bytes.extend_from_slice(&0xEA46u16.to_le_bytes());
4164                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4165
4166                // 3. Shift dividend R0:R1 left by 1
4167                // LSLS R1, R1, #1
4168                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4169                // ORR.W R1, R1, R0, LSR #31
4170                bytes.extend_from_slice(&0xEA41u16.to_le_bytes());
4171                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4172                // LSLS R0, R0, #1
4173                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4174
4175                // 4. Compare remainder >= divisor (64-bit unsigned comparison)
4176                // Compare high words first: CMP R7, R3
4177                // CMP Rn, Rm encoding: 0x4280 | (Rm << 3) | Rn
4178                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3 (16-bit)
4179                // BHI means R7 > R3 (unsigned) - definitely subtract
4180                // BLO means R7 < R3 - definitely don't subtract
4181                // BEQ means need to check low words
4182
4183                // If high > divisor high: branch to subtract (forward +offset)
4184                // BHI.N +6 (skip CMP, skip BLO, do subtract)
4185                // BHI: 1101 1000 offset8 where cond=1000 (HI)
4186                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4 (to subtract block)
4187
4188                // If high < divisor high: branch past subtract
4189                // BLO.N +10 (skip to decrement)
4190                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BLO/BCC +12 (past subtract)
4191
4192                // High words equal, compare low: CMP R6, R2
4193                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2 (16-bit)
4194                // BLO/BCC past subtract (skip SUBS+SBC.W+ORR.W = 10 bytes = 4 halfwords from PC+4)
4195                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords (past subtract)
4196
4197                // === Subtract block: remainder -= divisor, quotient |= 1 ===
4198                // SUBS R6, R6, R2
4199                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2 (16-bit)
4200                // SBC R7, R7, R3 (with borrow)
4201                // Thumb-2 SBC.W: EB67 0703 = SBC.W R7, R7, R3
4202                bytes.extend_from_slice(&0xEB67u16.to_le_bytes());
4203                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4204                // ORR R4, R4, #1 (set bit 0 of quotient low)
4205                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4206                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4207
4208                // === Decrement counter and loop ===
4209                // SUBS.W R12, R12, #1 (decrement loop counter)
4210                // SUBS.W R12, R12, #1: F1BC 0C01
4211                bytes.extend_from_slice(&0xF1BCu16.to_le_bytes());
4212                bytes.extend_from_slice(&0x0C01u16.to_le_bytes());
4213
4214                // BNE back to loop_start
4215                let branch_offset_bytes = bytes.len() - loop_start + 4; // +4 for pipeline
4216                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4217                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4218                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4219
4220                // === Loop done, move quotient to R0:R1 ===
4221                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4222                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4223
4224                // POP {R4-R7} - restore scratch registers (NO PC — inline code continues)
4225                // 16-bit POP: 1011 110 P rrrrrrrr where P=0 (no PC), r=R4-R7 = 0xF0
4226                // Encoding: 1011 1100 1111 0000 = 0xBCF0
4227                bytes.extend_from_slice(&0xBCF0u16.to_le_bytes());
4228
4229                Ok(bytes)
4230            }
4231
4232            // I64DivS: 64-bit signed division
4233            // Converts to unsigned, divides, then applies sign
4234            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4235            // Output: R0:R1 = quotient (signed)
4236            ArmOp::I64DivS {
4237                rdlo: _,
4238                rdhi: _,
4239                rnlo: _,
4240                rnhi: _,
4241                rmlo: _,
4242                rmhi: _,
4243            } => {
4244                let mut bytes = Vec::new();
4245
4246                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4247                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4248                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4249
4250                // Save result sign in R9: R9 = R1 XOR R3 (sign bit = MSB)
4251                // EOR.W R9, R1, R3
4252                bytes.extend_from_slice(&0xEA81u16.to_le_bytes());
4253                bytes.extend_from_slice(&0x0903u16.to_le_bytes());
4254
4255                // If dividend negative (R1 MSB set), negate it
4256                // TST R1, R1 (check sign)
4257                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4258                // BPL skip_neg_dividend (+10 bytes = 5 halfwords)
4259                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4260
4261                // Negate R0:R1 (64-bit): RSBS R0, R0, #0; SBC R1, R1, R1 LSL #1
4262                // Actually: MVN R0, R0; MVN R1, R1; ADDS R0, R0, #1; ADC R1, R1, #0
4263                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4264                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4265                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4266                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4267                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4268
4269                // If divisor negative (R3 MSB set), negate it
4270                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4271                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4272
4273                // Negate R2:R3
4274                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4275                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4276                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4277                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4278                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4279
4280                // === Now do unsigned division (same as I64DivU) ===
4281                // Initialize quotient (R4:R5) = 0
4282                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4283                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4284                // Initialize remainder (R6:R7) = 0
4285                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4286                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4287                // Initialize loop counter R8 = 64
4288                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4289                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4290
4291                let loop_start = bytes.len();
4292
4293                // Shift quotient left
4294                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4295                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4296                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4297                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4298
4299                // Shift remainder left, OR in MSB of dividend
4300                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4301                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4302                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4303                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4304                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4305                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4306
4307                // Shift dividend left
4308                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4309                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4310                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4311                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4312
4313                // Compare and conditionally subtract
4314                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4315                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4316                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4317                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4318                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4319
4320                // Subtract and set quotient bit
4321                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4322                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4323                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4324                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4325                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4326
4327                // Decrement and loop
4328                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4329                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4330
4331                let branch_offset_bytes = bytes.len() - loop_start + 4;
4332                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4333                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4334                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4335
4336                // Move quotient to R0:R1
4337                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4338                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4339
4340                // If result should be negative (R9 MSB set), negate R0:R1
4341                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9 (check MSB)
4342                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4343                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8 (skip negation)
4344
4345                // Negate result R0:R1
4346                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4347                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4348                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4349                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4350                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4351
4352                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4353                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4354                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4355
4356                Ok(bytes)
4357            }
4358
4359            // I64RemU: 64-bit unsigned remainder using binary long division
4360            // Same algorithm as I64DivU but returns remainder instead of quotient
4361            // Input: R0:R1 = dividend, R2:R3 = divisor
4362            // Output: R0:R1 = remainder
4363            ArmOp::I64RemU {
4364                rdlo: _,
4365                rdhi: _,
4366                rnlo: _,
4367                rnhi: _,
4368                rmlo: _,
4369                rmhi: _,
4370            } => {
4371                let mut bytes = Vec::new();
4372
4373                // PUSH {R4-R8} - save scratch registers (NO LR — inline code)
4374                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4375                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4376
4377                // Initialize quotient (R4:R5) = 0 (computed but not returned)
4378                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4379                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4380                // Initialize remainder (R6:R7) = 0
4381                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4382                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4383                // Initialize loop counter R8 = 64
4384                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4385                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4386
4387                let loop_start = bytes.len();
4388
4389                // Shift quotient left (not needed for result, but keeps algorithm same)
4390                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4391                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4392                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4393                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4394
4395                // Shift remainder left, OR in MSB of dividend
4396                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4397                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4398                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4399                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4400                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4401                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4402
4403                // Shift dividend left
4404                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4405                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4406                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4407                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4408
4409                // Compare and conditionally subtract
4410                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4411                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4412                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4413                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4414                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4415
4416                // Subtract and set quotient bit
4417                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4418                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4419                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4420                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4421                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4422
4423                // Decrement and loop
4424                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4425                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4426
4427                let branch_offset_bytes = bytes.len() - loop_start + 4;
4428                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4429                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4430                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4431
4432                // Move REMAINDER to R0:R1 (difference from I64DivU)
4433                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4434                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4435
4436                // POP {R4-R8} - restore scratch registers (NO PC — inline code continues)
4437                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4438                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4439
4440                Ok(bytes)
4441            }
4442
4443            // I64RemS: 64-bit signed remainder
4444            // Remainder sign follows dividend sign (not quotient rule)
4445            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4446            // Output: R0:R1 = remainder (signed, same sign as dividend)
4447            ArmOp::I64RemS {
4448                rdlo: _,
4449                rdhi: _,
4450                rnlo: _,
4451                rnhi: _,
4452                rmlo: _,
4453                rmhi: _,
4454            } => {
4455                let mut bytes = Vec::new();
4456
4457                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4458                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4459                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4460
4461                // Save dividend sign in R9 (remainder sign = dividend sign)
4462                // MOV R9, R1 (just need the sign bit)
4463                bytes.extend_from_slice(&0x4689u16.to_le_bytes()); // MOV R9, R1
4464
4465                // If dividend negative (R1 MSB set), negate it
4466                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4467                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4468
4469                // Negate R0:R1
4470                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4471                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4472                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4473                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4474                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4475
4476                // If divisor negative (R3 MSB set), negate it
4477                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4478                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4479
4480                // Negate R2:R3
4481                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4482                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4483                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4484                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4485                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4486
4487                // === Unsigned division algorithm ===
4488                // Initialize quotient (R4:R5) = 0
4489                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4490                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4491                // Initialize remainder (R6:R7) = 0
4492                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4493                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4494                // Initialize loop counter R8 = 64
4495                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4496                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4497
4498                let loop_start = bytes.len();
4499
4500                // Shift quotient left
4501                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4502                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4503                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4504                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4505
4506                // Shift remainder left, OR in MSB of dividend
4507                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4508                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4509                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4510                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4511                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4512                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4513
4514                // Shift dividend left
4515                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4516                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4517                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4518                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4519
4520                // Compare and conditionally subtract
4521                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4522                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4523                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4524                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4525                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4526
4527                // Subtract and set quotient bit
4528                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4529                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4530                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4531                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4532                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4533
4534                // Decrement and loop
4535                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4536                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4537
4538                let branch_offset_bytes = bytes.len() - loop_start + 4;
4539                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4540                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4541                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4542
4543                // Move remainder to R0:R1
4544                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4545                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4546
4547                // If original dividend was negative (R9 MSB set), negate remainder
4548                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9
4549                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4550                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4551
4552                // Negate result R0:R1
4553                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4554                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4555                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4556                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4557                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4558
4559                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4560                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4561                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4562
4563                Ok(bytes)
4564            }
4565
4566            // === F32 VFP single-precision Thumb-2 encodings ===
4567            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4568            ArmOp::F32Add { sd, sn, sm } => {
4569                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A00, sd, sn, sm)?))
4570            }
4571            ArmOp::F32Sub { sd, sn, sm } => {
4572                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A40, sd, sn, sm)?))
4573            }
4574            ArmOp::F32Mul { sd, sn, sm } => {
4575                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE200A00, sd, sn, sm)?))
4576            }
4577            ArmOp::F32Div { sd, sn, sm } => {
4578                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE800A00, sd, sn, sm)?))
4579            }
4580            ArmOp::F32Abs { sd, sm } => {
4581                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB00AC0, sd, sm)?))
4582            }
4583            ArmOp::F32Neg { sd, sm } => {
4584                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10A40, sd, sm)?))
4585            }
4586            ArmOp::F32Sqrt { sd, sm } => {
4587                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10AC0, sd, sm)?))
4588            }
4589
4590            // f32 pseudo-ops — multi-instruction sequences
4591            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4592            ArmOp::F32Ceil { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b01),
4593            ArmOp::F32Floor { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b10),
4594            ArmOp::F32Trunc { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b11),
4595            ArmOp::F32Nearest { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b00),
4596            ArmOp::F32Min { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, true),
4597            ArmOp::F32Max { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, false),
4598            ArmOp::F32Copysign { sd, sn, sm } => self.encode_thumb_f32_copysign(sd, sn, sm),
4599
4600            // f32 comparisons — VCMP + VMRS + MOV #0 + IT + MOV #1
4601            ArmOp::F32Eq { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x0),
4602            ArmOp::F32Ne { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x1),
4603            ArmOp::F32Lt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x4),
4604            ArmOp::F32Le { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x9),
4605            ArmOp::F32Gt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xC),
4606            ArmOp::F32Ge { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xA),
4607
4608            ArmOp::F32Const { sd, value } => self.encode_thumb_f32_const(sd, *value),
4609
4610            ArmOp::F32Load { sd, addr } => {
4611                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED900A00, sd, addr)?))
4612            }
4613            ArmOp::F32Store { sd, addr } => {
4614                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED800A00, sd, addr)?))
4615            }
4616
4617            ArmOp::F32ConvertI32S { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, true),
4618            ArmOp::F32ConvertI32U { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, false),
4619            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
4620                Err(synth_core::Error::synthesis(
4621                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4622                ))
4623            }
4624            ArmOp::F32ReinterpretI32 { sd, rm } => {
4625                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(true, sd, rm)?))
4626            }
4627            ArmOp::I32ReinterpretF32 { rd, sm } => {
4628                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(false, sm, rd)?))
4629            }
4630            ArmOp::I32TruncF32S { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, true),
4631            ArmOp::I32TruncF32U { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, false),
4632
4633            // === F64 VFP double-precision Thumb-2 encodings ===
4634            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4635            ArmOp::F64Add { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4636                0xEE300B00, dd, dn, dm,
4637            )?)),
4638            ArmOp::F64Sub { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4639                0xEE300B40, dd, dn, dm,
4640            )?)),
4641            ArmOp::F64Mul { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4642                0xEE200B00, dd, dn, dm,
4643            )?)),
4644            ArmOp::F64Div { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4645                0xEE800B00, dd, dn, dm,
4646            )?)),
4647            ArmOp::F64Abs { dd, dm } => {
4648                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?))
4649            }
4650            ArmOp::F64Neg { dd, dm } => {
4651                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?))
4652            }
4653            ArmOp::F64Sqrt { dd, dm } => {
4654                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?))
4655            }
4656
4657            // f64 pseudo-ops
4658            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4659            ArmOp::F64Ceil { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b01),
4660            ArmOp::F64Floor { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b10),
4661            ArmOp::F64Trunc { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b11),
4662            ArmOp::F64Nearest { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b00),
4663            ArmOp::F64Min { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, true),
4664            ArmOp::F64Max { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, false),
4665            ArmOp::F64Copysign { dd, dn, dm } => self.encode_thumb_f64_copysign(dd, dn, dm),
4666
4667            // f64 comparisons
4668            ArmOp::F64Eq { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x0),
4669            ArmOp::F64Ne { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x1),
4670            ArmOp::F64Lt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x4),
4671            ArmOp::F64Le { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x9),
4672            ArmOp::F64Gt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xC),
4673            ArmOp::F64Ge { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xA),
4674
4675            ArmOp::F64Const { dd, value } => self.encode_thumb_f64_const(dd, *value),
4676
4677            ArmOp::F64Load { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4678                0xED900B00, dd, addr,
4679            )?)),
4680            ArmOp::F64Store { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4681                0xED800B00, dd, addr,
4682            )?)),
4683
4684            ArmOp::F64ConvertI32S { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, true),
4685            ArmOp::F64ConvertI32U { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, false),
4686            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
4687                Err(synth_core::Error::synthesis(
4688                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4689                ))
4690            }
4691            ArmOp::F64PromoteF32 { dd, sm } => self.encode_thumb_f64_promote_f32(dd, sm),
4692            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => Ok(vfp_to_thumb_bytes(
4693                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?,
4694            )),
4695            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => Ok(vfp_to_thumb_bytes(
4696                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?,
4697            )),
4698            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
4699                Err(synth_core::Error::synthesis(
4700                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
4701                ))
4702            }
4703            ArmOp::I32TruncF64S { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, true),
4704            ArmOp::I32TruncF64U { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, false),
4705
4706            // ===== i64 operations: encode as multi-instruction Thumb-2 sequences =====
4707
4708            // I64Add: ADDS rdlo, rnlo, rmlo; ADC.W rdhi, rnhi, rmhi
4709            ArmOp::I64Add {
4710                rdlo,
4711                rdhi,
4712                rnlo,
4713                rnhi,
4714                rmlo,
4715                rmhi,
4716            } => {
4717                let mut bytes = Vec::new();
4718                // ADDS rdlo, rnlo, rmlo (16-bit)
4719                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adds {
4720                    rd: *rdlo,
4721                    rn: *rnlo,
4722                    op2: Operand2::Reg(*rmlo),
4723                })?);
4724                // ADC.W rdhi, rnhi, rmhi (32-bit)
4725                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adc {
4726                    rd: *rdhi,
4727                    rn: *rnhi,
4728                    op2: Operand2::Reg(*rmhi),
4729                })?);
4730                Ok(bytes)
4731            }
4732
4733            // I64Sub: SUBS rdlo, rnlo, rmlo; SBC.W rdhi, rnhi, rmhi
4734            ArmOp::I64Sub {
4735                rdlo,
4736                rdhi,
4737                rnlo,
4738                rnhi,
4739                rmlo,
4740                rmhi,
4741            } => {
4742                let mut bytes = Vec::new();
4743                // SUBS rdlo, rnlo, rmlo (16-bit)
4744                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Subs {
4745                    rd: *rdlo,
4746                    rn: *rnlo,
4747                    op2: Operand2::Reg(*rmlo),
4748                })?);
4749                // SBC.W rdhi, rnhi, rmhi (32-bit)
4750                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Sbc {
4751                    rd: *rdhi,
4752                    rn: *rnhi,
4753                    op2: Operand2::Reg(*rmhi),
4754                })?);
4755                Ok(bytes)
4756            }
4757
4758            // I64And: AND rdlo, rnlo, rmlo; AND rdhi, rnhi, rmhi
4759            ArmOp::I64And {
4760                rdlo,
4761                rdhi,
4762                rnlo,
4763                rnhi,
4764                rmlo,
4765                rmhi,
4766            } => {
4767                let mut bytes = Vec::new();
4768                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
4769                    rd: *rdlo,
4770                    rn: *rnlo,
4771                    op2: Operand2::Reg(*rmlo),
4772                })?);
4773                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
4774                    rd: *rdhi,
4775                    rn: *rnhi,
4776                    op2: Operand2::Reg(*rmhi),
4777                })?);
4778                Ok(bytes)
4779            }
4780
4781            // I64Or: ORR rdlo, rnlo, rmlo; ORR rdhi, rnhi, rmhi
4782            ArmOp::I64Or {
4783                rdlo,
4784                rdhi,
4785                rnlo,
4786                rnhi,
4787                rmlo,
4788                rmhi,
4789            } => {
4790                let mut bytes = Vec::new();
4791                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
4792                    rd: *rdlo,
4793                    rn: *rnlo,
4794                    op2: Operand2::Reg(*rmlo),
4795                })?);
4796                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
4797                    rd: *rdhi,
4798                    rn: *rnhi,
4799                    op2: Operand2::Reg(*rmhi),
4800                })?);
4801                Ok(bytes)
4802            }
4803
4804            // I64Xor: EOR rdlo, rnlo, rmlo; EOR rdhi, rnhi, rmhi
4805            ArmOp::I64Xor {
4806                rdlo,
4807                rdhi,
4808                rnlo,
4809                rnhi,
4810                rmlo,
4811                rmhi,
4812            } => {
4813                let mut bytes = Vec::new();
4814                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
4815                    rd: *rdlo,
4816                    rn: *rnlo,
4817                    op2: Operand2::Reg(*rmlo),
4818                })?);
4819                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
4820                    rd: *rdhi,
4821                    rn: *rnhi,
4822                    op2: Operand2::Reg(*rmhi),
4823                })?);
4824                Ok(bytes)
4825            }
4826
4827            // I64Eqz: ORR scratch, lo, hi; ITE EQ; MOV rd, #1; MOV rd, #0
4828            ArmOp::I64Eqz { rd, rnlo, rnhi } => self.encode_thumb(&ArmOp::I64SetCondZ {
4829                rd: *rd,
4830                rn_lo: *rnlo,
4831                rn_hi: *rnhi,
4832            }),
4833
4834            // I64 comparisons: delegate to I64SetCond
4835            ArmOp::I64Eq {
4836                rd,
4837                rnlo,
4838                rnhi,
4839                rmlo,
4840                rmhi,
4841            } => self.encode_thumb(&ArmOp::I64SetCond {
4842                rd: *rd,
4843                rn_lo: *rnlo,
4844                rn_hi: *rnhi,
4845                rm_lo: *rmlo,
4846                rm_hi: *rmhi,
4847                cond: synth_synthesis::Condition::EQ,
4848            }),
4849
4850            ArmOp::I64Ne {
4851                rd,
4852                rnlo,
4853                rnhi,
4854                rmlo,
4855                rmhi,
4856            } => self.encode_thumb(&ArmOp::I64SetCond {
4857                rd: *rd,
4858                rn_lo: *rnlo,
4859                rn_hi: *rnhi,
4860                rm_lo: *rmlo,
4861                rm_hi: *rmhi,
4862                cond: synth_synthesis::Condition::NE,
4863            }),
4864
4865            ArmOp::I64LtS {
4866                rd,
4867                rnlo,
4868                rnhi,
4869                rmlo,
4870                rmhi,
4871            } => self.encode_thumb(&ArmOp::I64SetCond {
4872                rd: *rd,
4873                rn_lo: *rnlo,
4874                rn_hi: *rnhi,
4875                rm_lo: *rmlo,
4876                rm_hi: *rmhi,
4877                cond: synth_synthesis::Condition::LT,
4878            }),
4879
4880            ArmOp::I64LtU {
4881                rd,
4882                rnlo,
4883                rnhi,
4884                rmlo,
4885                rmhi,
4886            } => self.encode_thumb(&ArmOp::I64SetCond {
4887                rd: *rd,
4888                rn_lo: *rnlo,
4889                rn_hi: *rnhi,
4890                rm_lo: *rmlo,
4891                rm_hi: *rmhi,
4892                cond: synth_synthesis::Condition::LO,
4893            }),
4894
4895            ArmOp::I64LeS {
4896                rd,
4897                rnlo,
4898                rnhi,
4899                rmlo,
4900                rmhi,
4901            } => self.encode_thumb(&ArmOp::I64SetCond {
4902                rd: *rd,
4903                rn_lo: *rnlo,
4904                rn_hi: *rnhi,
4905                rm_lo: *rmlo,
4906                rm_hi: *rmhi,
4907                cond: synth_synthesis::Condition::LE,
4908            }),
4909
4910            ArmOp::I64LeU {
4911                rd,
4912                rnlo,
4913                rnhi,
4914                rmlo,
4915                rmhi,
4916            } => self.encode_thumb(&ArmOp::I64SetCond {
4917                rd: *rd,
4918                rn_lo: *rnlo,
4919                rn_hi: *rnhi,
4920                rm_lo: *rmlo,
4921                rm_hi: *rmhi,
4922                cond: synth_synthesis::Condition::LS,
4923            }),
4924
4925            ArmOp::I64GtS {
4926                rd,
4927                rnlo,
4928                rnhi,
4929                rmlo,
4930                rmhi,
4931            } => self.encode_thumb(&ArmOp::I64SetCond {
4932                rd: *rd,
4933                rn_lo: *rnlo,
4934                rn_hi: *rnhi,
4935                rm_lo: *rmlo,
4936                rm_hi: *rmhi,
4937                cond: synth_synthesis::Condition::GT,
4938            }),
4939
4940            ArmOp::I64GtU {
4941                rd,
4942                rnlo,
4943                rnhi,
4944                rmlo,
4945                rmhi,
4946            } => self.encode_thumb(&ArmOp::I64SetCond {
4947                rd: *rd,
4948                rn_lo: *rnlo,
4949                rn_hi: *rnhi,
4950                rm_lo: *rmlo,
4951                rm_hi: *rmhi,
4952                cond: synth_synthesis::Condition::HI,
4953            }),
4954
4955            ArmOp::I64GeS {
4956                rd,
4957                rnlo,
4958                rnhi,
4959                rmlo,
4960                rmhi,
4961            } => self.encode_thumb(&ArmOp::I64SetCond {
4962                rd: *rd,
4963                rn_lo: *rnlo,
4964                rn_hi: *rnhi,
4965                rm_lo: *rmlo,
4966                rm_hi: *rmhi,
4967                cond: synth_synthesis::Condition::GE,
4968            }),
4969
4970            ArmOp::I64GeU {
4971                rd,
4972                rnlo,
4973                rnhi,
4974                rmlo,
4975                rmhi,
4976            } => self.encode_thumb(&ArmOp::I64SetCond {
4977                rd: *rd,
4978                rn_lo: *rnlo,
4979                rn_hi: *rnhi,
4980                rm_lo: *rmlo,
4981                rm_hi: *rmhi,
4982                cond: synth_synthesis::Condition::HS,
4983            }),
4984
4985            // I64Const: MOVW rdlo, lo16; MOVT rdlo, hi16; MOVW rdhi, lo16_hi; MOVT rdhi, hi16_hi
4986            ArmOp::I64Const { rdlo, rdhi, value } => {
4987                let lo32 = *value as u32;
4988                let hi32 = (*value >> 32) as u32;
4989                let mut bytes = Vec::new();
4990                // Load low 32 bits into rdlo
4991                bytes.extend_from_slice(
4992                    &self.encode_thumb32_movw_raw(reg_to_bits(rdlo), lo32 & 0xFFFF)?,
4993                );
4994                if lo32 > 0xFFFF {
4995                    bytes.extend_from_slice(
4996                        &self.encode_thumb32_movt_raw(reg_to_bits(rdlo), lo32 >> 16)?,
4997                    );
4998                }
4999                // Load high 32 bits into rdhi
5000                bytes.extend_from_slice(
5001                    &self.encode_thumb32_movw_raw(reg_to_bits(rdhi), hi32 & 0xFFFF)?,
5002                );
5003                if hi32 > 0xFFFF {
5004                    bytes.extend_from_slice(
5005                        &self.encode_thumb32_movt_raw(reg_to_bits(rdhi), hi32 >> 16)?,
5006                    );
5007                }
5008                Ok(bytes)
5009            }
5010
5011            // I64Ldr: LDR rdlo, [base, offset]; LDR rdhi, [base, offset+4]
5012            ArmOp::I64Ldr { rdlo, rdhi, addr } => {
5013                let mut bytes = Vec::new();
5014                let offset = if addr.offset < 0 {
5015                    0u32
5016                } else {
5017                    addr.offset as u32
5018                };
5019                bytes.extend_from_slice(&self.encode_thumb32_ldr(rdlo, &addr.base, offset)?);
5020                bytes.extend_from_slice(&self.encode_thumb32_ldr(
5021                    rdhi,
5022                    &addr.base,
5023                    offset.wrapping_add(4),
5024                )?);
5025                Ok(bytes)
5026            }
5027
5028            // I64Str: STR rdlo, [base, offset]; STR rdhi, [base, offset+4]
5029            ArmOp::I64Str { rdlo, rdhi, addr } => {
5030                let mut bytes = Vec::new();
5031                let offset = if addr.offset < 0 {
5032                    0u32
5033                } else {
5034                    addr.offset as u32
5035                };
5036                bytes.extend_from_slice(&self.encode_thumb32_str(rdlo, &addr.base, offset)?);
5037                bytes.extend_from_slice(&self.encode_thumb32_str(
5038                    rdhi,
5039                    &addr.base,
5040                    offset.wrapping_add(4),
5041                )?);
5042                Ok(bytes)
5043            }
5044
5045            // I64ExtendI32S: MOV rdlo, rn; ASR rdhi, rdlo, #31 (sign-extend)
5046            ArmOp::I64ExtendI32S { rdlo, rdhi, rn } => {
5047                let mut bytes = Vec::new();
5048                if rdlo != rn {
5049                    // MOV rdlo, rn (16-bit)
5050                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5051                        rd: *rdlo,
5052                        op2: Operand2::Reg(*rn),
5053                    })?);
5054                }
5055                // ASR rdhi, rdlo, #31 (sign-extend: fill high word with sign bit)
5056                bytes.extend_from_slice(
5057                    &self.encode_thumb32_shift(rdhi, rdlo, 31, 0b10)?, // ASR type
5058                );
5059                Ok(bytes)
5060            }
5061
5062            // I64ExtendI32U: MOV rdlo, rn; MOV rdhi, #0
5063            ArmOp::I64ExtendI32U { rdlo, rdhi, rn } => {
5064                let mut bytes = Vec::new();
5065                if rdlo != rn {
5066                    // MOV rdlo, rn
5067                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5068                        rd: *rdlo,
5069                        op2: Operand2::Reg(*rn),
5070                    })?);
5071                }
5072                // MOV rdhi, #0 (16-bit: MOVS Rd, #0)
5073                let rdhi_bits = reg_to_bits(rdhi) as u16;
5074                let instr: u16 = 0x2000 | (rdhi_bits << 8);
5075                bytes.extend_from_slice(&instr.to_le_bytes());
5076                Ok(bytes)
5077            }
5078
5079            // I32WrapI64: MOV rd, rnlo (just take low 32 bits)
5080            ArmOp::I32WrapI64 { rd, rnlo } => {
5081                if rd == rnlo {
5082                    // No-op: already in the right register
5083                    let instr: u16 = 0xBF00; // NOP
5084                    Ok(instr.to_le_bytes().to_vec())
5085                } else {
5086                    // MOV rd, rnlo
5087                    self.encode_thumb(&ArmOp::Mov {
5088                        rd: *rd,
5089                        op2: Operand2::Reg(*rnlo),
5090                    })
5091                }
5092            }
5093
5094            // ===== Helium MVE operations (Thumb-2 encoding) =====
5095            ArmOp::MveLoad { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vldrw(qd, addr))),
5096            ArmOp::MveStore { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vstrw(qd, addr))),
5097            ArmOp::MveConst { qd, bytes } => self.encode_thumb_mve_const(qd, bytes),
5098            ArmOp::MveAnd { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5099                0xEF000150, qd, qn, qm,
5100            ))),
5101            ArmOp::MveOrr { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5102                0xEF200150, qd, qn, qm,
5103            ))),
5104            ArmOp::MveEor { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5105                0xFF000150, qd, qn, qm,
5106            ))),
5107            ArmOp::MveMvn { qd, qm } => {
5108                // VMVN Qd, Qm: 0xFFB005C0 | Qd<<12 | Qm
5109                let qd_enc = qreg_to_num(qd);
5110                let qm_enc = qreg_to_num(qm);
5111                let instr: u32 = 0xFFB005C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5112                Ok(vfp_to_thumb_bytes(instr))
5113            }
5114            ArmOp::MveBic { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5115                0xEF100150, qd, qn, qm,
5116            ))),
5117            ArmOp::MveAddI { qd, qn, qm, size } => {
5118                let sz = mve_size_bits(size);
5119                let base: u32 = 0xEF000840 | (sz << 20);
5120                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5121            }
5122            ArmOp::MveSubI { qd, qn, qm, size } => {
5123                let sz = mve_size_bits(size);
5124                let base: u32 = 0xFF000840 | (sz << 20);
5125                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5126            }
5127            ArmOp::MveMulI { qd, qn, qm, size } => {
5128                let sz = mve_size_bits(size);
5129                let base: u32 = 0xEF000950 | (sz << 20);
5130                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5131            }
5132            ArmOp::MveNegI { qd, qm, size } => {
5133                let sz = mve_size_bits(size);
5134                // VNEG.Sx Qd, Qm
5135                let qd_enc = qreg_to_num(qd);
5136                let qm_enc = qreg_to_num(qm);
5137                let base: u32 = 0xFFB103C0 | (sz << 18);
5138                let instr = base | ((qd_enc * 2) << 12) | (qm_enc * 2);
5139                Ok(vfp_to_thumb_bytes(instr))
5140            }
5141            ArmOp::MveDup { qd, rn, size } => {
5142                let sz = mve_size_bits(size);
5143                let qd_enc = qreg_to_num(qd);
5144                let rn_bits = reg_to_bits(rn);
5145                // VDUP.sz Qd, Rn: EEA0 0B10 variant
5146                // size encoding: 00=32, 01=16, 10=8
5147                let be = match sz {
5148                    0 => 0b00u32, // 8-bit
5149                    1 => 0b01,    // 16-bit
5150                    _ => 0b00,    // 32-bit (default)
5151                };
5152                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12) | (be << 5);
5153                Ok(vfp_to_thumb_bytes(instr))
5154            }
5155            ArmOp::MveExtractLane { rd, qn, lane, size } => {
5156                let qn_enc = qreg_to_num(qn);
5157                let rd_bits = reg_to_bits(rd);
5158                // VMOV.sz Rd, Dn[x] — extract from Q-register lane
5159                // For 32-bit: VMOV Rd, Dn — where Dn is the appropriate D-register
5160                let d_reg = qn_enc * 2 + ((*lane as u32) >> 1);
5161                let lane_in_d = (*lane as u32) & 1;
5162                let _sz = mve_size_bits(size);
5163                // VMOV Rd, Dn[x]: EE10 0B10 for 32-bit
5164                let instr: u32 = 0xEE100B10 | (d_reg << 16) | (rd_bits << 12) | (lane_in_d << 21);
5165                Ok(vfp_to_thumb_bytes(instr))
5166            }
5167            ArmOp::MveInsertLane { qd, rn, lane, size } => {
5168                let qd_enc = qreg_to_num(qd);
5169                let rn_bits = reg_to_bits(rn);
5170                let d_reg = qd_enc * 2 + ((*lane as u32) >> 1);
5171                let lane_in_d = (*lane as u32) & 1;
5172                let _sz = mve_size_bits(size);
5173                // VMOV Dn[x], Rn: EE00 0B10 for 32-bit
5174                let instr: u32 = 0xEE000B10 | (d_reg << 16) | (rn_bits << 12) | (lane_in_d << 21);
5175                Ok(vfp_to_thumb_bytes(instr))
5176            }
5177
5178            // MVE float comparisons — emit VCMP + VPSEL sequence (simplified: just VCMP)
5179            ArmOp::MveCmpEqI { qd, qn, qm, size }
5180            | ArmOp::MveCmpNeI { qd, qn, qm, size }
5181            | ArmOp::MveCmpLtS { qd, qn, qm, size }
5182            | ArmOp::MveCmpLtU { qd, qn, qm, size }
5183            | ArmOp::MveCmpGtS { qd, qn, qm, size }
5184            | ArmOp::MveCmpGtU { qd, qn, qm, size }
5185            | ArmOp::MveCmpLeS { qd, qn, qm, size }
5186            | ArmOp::MveCmpLeU { qd, qn, qm, size }
5187            | ArmOp::MveCmpGeS { qd, qn, qm, size }
5188            | ArmOp::MveCmpGeU { qd, qn, qm, size } => {
5189                // Encode as VADD (placeholder encoding — real implementation
5190                // would use VCMP + VPSEL pair)
5191                let sz = mve_size_bits(size);
5192                let base: u32 = 0xEF000840 | (sz << 20);
5193                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5194            }
5195
5196            // f32x4 MVE arithmetic
5197            ArmOp::MveAddF32 { qd, qn, qm } => {
5198                // VADD.F32 Qd, Qn, Qm (MVE): 0xEF000D40
5199                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5200            }
5201            ArmOp::MveSubF32 { qd, qn, qm } => {
5202                // VSUB.F32 Qd, Qn, Qm (MVE): 0xEF200D40
5203                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF200D40, qd, qn, qm)))
5204            }
5205            ArmOp::MveMulF32 { qd, qn, qm } => {
5206                // VMUL.F32 Qd, Qn, Qm (MVE): 0xFF000D50
5207                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xFF000D50, qd, qn, qm)))
5208            }
5209            ArmOp::MveNegF32 { qd, qm } => {
5210                let qd_enc = qreg_to_num(qd);
5211                let qm_enc = qreg_to_num(qm);
5212                // VNEG.F32 Qd, Qm: FFB907C0
5213                let instr: u32 = 0xFFB907C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5214                Ok(vfp_to_thumb_bytes(instr))
5215            }
5216            ArmOp::MveAbsF32 { qd, qm } => {
5217                let qd_enc = qreg_to_num(qd);
5218                let qm_enc = qreg_to_num(qm);
5219                // VABS.F32 Qd, Qm: FFB90740
5220                let instr: u32 = 0xFFB90740 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5221                Ok(vfp_to_thumb_bytes(instr))
5222            }
5223            ArmOp::MveCmpEqF32 { qd, qn, qm }
5224            | ArmOp::MveCmpNeF32 { qd, qn, qm }
5225            | ArmOp::MveCmpLtF32 { qd, qn, qm }
5226            | ArmOp::MveCmpLeF32 { qd, qn, qm }
5227            | ArmOp::MveCmpGtF32 { qd, qn, qm }
5228            | ArmOp::MveCmpGeF32 { qd, qn, qm } => {
5229                // Placeholder: encode as VADD.F32 (real impl needs VCMP.F32 + VPSEL)
5230                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5231            }
5232            ArmOp::MveDupF32 { qd, rn } => {
5233                let qd_enc = qreg_to_num(qd);
5234                let rn_bits = reg_to_bits(rn);
5235                // VDUP.32 Qd, Rn (same encoding as integer VDUP.32)
5236                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12);
5237                Ok(vfp_to_thumb_bytes(instr))
5238            }
5239            ArmOp::MveExtractLaneF32 { rd, qn, lane } => {
5240                let qn_enc = qreg_to_num(qn);
5241                let rd_bits = reg_to_bits(rd);
5242                // VMOV Rd, Sn where Sn = Q*4 + lane
5243                let s_num = qn_enc * 4 + (*lane as u32);
5244                let (vn, n) = encode_sreg(s_num);
5245                let instr: u32 = 0xEE100A10 | (vn << 16) | (rd_bits << 12) | (n << 7);
5246                Ok(vfp_to_thumb_bytes(instr))
5247            }
5248            ArmOp::MveReplaceLaneF32 { qd, rn, lane } => {
5249                let qd_enc = qreg_to_num(qd);
5250                let rn_bits = reg_to_bits(rn);
5251                // VMOV Sn, Rn where Sn = Q*4 + lane
5252                let s_num = qd_enc * 4 + (*lane as u32);
5253                let (vn, n) = encode_sreg(s_num);
5254                let instr: u32 = 0xEE000A10 | (vn << 16) | (rn_bits << 12) | (n << 7);
5255                Ok(vfp_to_thumb_bytes(instr))
5256            }
5257            ArmOp::MveDivF32 { qd, qn, qm } => {
5258                // Lane-wise: extract 4 S-regs, VDIV, insert back
5259                self.encode_thumb_mve_lane_wise_f32_binop(qd, qn, qm, 0xEE800A00)
5260            }
5261            ArmOp::MveSqrtF32 { qd, qm } => {
5262                // Lane-wise: extract 4 S-regs, VSQRT, insert back
5263                self.encode_thumb_mve_lane_wise_f32_sqrt(qd, qm)
5264            }
5265
5266            // Catch-all for any remaining ops
5267            _ => {
5268                let instr: u16 = 0xBF00; // NOP
5269                Ok(instr.to_le_bytes().to_vec())
5270            }
5271        }
5272    }
5273
5274    // === Thumb-2 VFP multi-instruction helpers ===
5275
5276    /// Encode F32 comparison as Thumb-2: VCMP.F32 + VMRS + MOVS rd,#0 + IT + MOV rd,#1
5277    fn encode_thumb_f32_compare(
5278        &self,
5279        rd: &Reg,
5280        sn: &VfpReg,
5281        sm: &VfpReg,
5282        cond_code: u32,
5283    ) -> Result<Vec<u8>> {
5284        let mut bytes = Vec::new();
5285        let rd_bits = reg_to_bits(rd);
5286
5287        // VCMP.F32 Sn, Sm
5288        let sn_num = vfp_sreg_to_num(sn)?;
5289        let sm_num = vfp_sreg_to_num(sm)?;
5290        let (vd, d) = encode_sreg(sn_num);
5291        let (vm, m) = encode_sreg(sm_num);
5292        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5293        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5294
5295        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
5296        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5297
5298        // MOVS Rd, #0 (16-bit): 0010 0 Rd(3) 0000 0000
5299        if rd_bits < 8 {
5300            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5301            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5302        } else {
5303            // MOV.W Rd, #0 (32-bit Thumb-2)
5304            let hw1: u16 = 0xF04F;
5305            let hw2: u16 = (rd_bits as u16) << 8;
5306            bytes.extend_from_slice(&hw1.to_le_bytes());
5307            bytes.extend_from_slice(&hw2.to_le_bytes());
5308        }
5309
5310        // IT<cond> — If-Then for conditional MOV
5311        // IT encoding: 1011 1111 cond(4) mask(4)
5312        // mask = 0x8 for single "then" (IT)
5313        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5314        bytes.extend_from_slice(&it.to_le_bytes());
5315
5316        // MOV Rd, #1 (16-bit, conditional due to IT): 0010 0 Rd(3) 0000 0001
5317        if rd_bits < 8 {
5318            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5319            bytes.extend_from_slice(&mov_one.to_le_bytes());
5320        } else {
5321            // MOV.W Rd, #1 (32-bit)
5322            let hw1: u16 = 0xF04F;
5323            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5324            bytes.extend_from_slice(&hw1.to_le_bytes());
5325            bytes.extend_from_slice(&hw2.to_le_bytes());
5326        }
5327
5328        Ok(bytes)
5329    }
5330
5331    /// Encode F32 constant load as Thumb-2: MOVW + MOVT + VMOV
5332    fn encode_thumb_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
5333        let mut bytes = Vec::new();
5334        let bits = value.to_bits();
5335        let rt: u32 = 12; // R12/IP as temp
5336
5337        // MOVW R12, #lo16
5338        // Thumb-2 MOVW: 11110 i 10 0100 imm4 | 0 imm3 Rd imm8
5339        let lo16 = bits & 0xFFFF;
5340        let imm4 = (lo16 >> 12) & 0xF;
5341        let i_bit = (lo16 >> 11) & 1;
5342        let imm3 = (lo16 >> 8) & 0x7;
5343        let imm8 = lo16 & 0xFF;
5344        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
5345        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5346        bytes.extend_from_slice(&hw1.to_le_bytes());
5347        bytes.extend_from_slice(&hw2.to_le_bytes());
5348
5349        // MOVT R12, #hi16
5350        let hi16 = (bits >> 16) & 0xFFFF;
5351        let imm4 = (hi16 >> 12) & 0xF;
5352        let i_bit = (hi16 >> 11) & 1;
5353        let imm3 = (hi16 >> 8) & 0x7;
5354        let imm8 = hi16 & 0xFF;
5355        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
5356        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5357        bytes.extend_from_slice(&hw1.to_le_bytes());
5358        bytes.extend_from_slice(&hw2.to_le_bytes());
5359
5360        // VMOV Sd, R12
5361        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
5362        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5363
5364        Ok(bytes)
5365    }
5366
5367    /// Encode VMOV + VCVT.F32.xS32 as Thumb-2
5368    fn encode_thumb_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5369        let mut bytes = Vec::new();
5370
5371        // VMOV Sd, Rm
5372        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
5373        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5374
5375        // VCVT.F32.S32/U32 Sd, Sd
5376        let sd_num = vfp_sreg_to_num(sd)?;
5377        let (vd, d) = encode_sreg(sd_num);
5378        let (vm, m) = encode_sreg(sd_num);
5379        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
5380        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5381        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5382
5383        Ok(bytes)
5384    }
5385
5386    /// Encode F32 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5387    /// Encode F32 rounding as Thumb-2.
5388    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5389    ///
5390    /// For trunc: uses VCVTR.S32.F32 (always truncates).
5391    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant),
5392    /// then restores FPSCR.
5393    fn encode_thumb_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5394        let mut bytes = Vec::new();
5395        let sm_num = vfp_sreg_to_num(sm)?;
5396        let sd_num = vfp_sreg_to_num(sd)?;
5397        let (vd_s, d_s) = encode_sreg(sd_num);
5398        let (vm_s, m_s) = encode_sreg(sm_num);
5399
5400        if mode == 0b11 {
5401            // Trunc (toward zero): VCVTR.S32.F32 — bit[7]=1, always truncates
5402            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5403            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5404        } else {
5405            // ceil/floor/nearest: manipulate FPSCR rounding mode
5406            let rt: u32 = 12; // R12/IP as temp
5407
5408            // VMRS R12, FPSCR
5409            let vmrs = 0xEEF10A10 | (rt << 12);
5410            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5411
5412            // BIC.W R12, R12, #(3 << 22) — clear RMode bits [23:22]
5413            // Thumb-2 modified immediate for 3<<22 = 0x00C00000:
5414            // BIC.W encoding: 11110 i 0 0001 S Rn | 0 imm3 Rd imm8
5415            // 0x00C00000 = 0x03 shifted left by 22 => Thumb mod-imm: i=0, imm3=0b101, imm8=0x03
5416            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF); // BIC, Rn=R12
5417            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5418            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5419            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5420
5421            // ORR.W R12, R12, #(mode << 22)
5422            if mode != 0 {
5423                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF); // ORR, Rn=R12
5424                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5425                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5426                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5427            }
5428
5429            // VMSR FPSCR, R12
5430            let vmsr = 0xEEE10A10 | (rt << 12);
5431            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5432
5433            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rmode
5434            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5435            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5436
5437            // Restore FPSCR: clear rmode bits back to nearest (default)
5438            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5439            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5440            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5441            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5442        }
5443
5444        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
5445        let (vd2, d2) = encode_sreg(sd_num);
5446        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
5447        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5448
5449        Ok(bytes)
5450    }
5451
5452    /// Encode F32 min/max as Thumb-2: VMOV + VCMP + VMRS + IT + VMOV
5453    fn encode_thumb_f32_minmax(
5454        &self,
5455        sd: &VfpReg,
5456        sn: &VfpReg,
5457        sm: &VfpReg,
5458        is_min: bool,
5459    ) -> Result<Vec<u8>> {
5460        let mut bytes = Vec::new();
5461        let sn_num = vfp_sreg_to_num(sn)?;
5462        let sm_num = vfp_sreg_to_num(sm)?;
5463        let sd_num = vfp_sreg_to_num(sd)?;
5464
5465        // VMOV.F32 Sd, Sn
5466        let (vd, d) = encode_sreg(sd_num);
5467        let (vn, n) = encode_sreg(sn_num);
5468        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5469        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sn));
5470
5471        // VCMP.F32 Sn, Sm
5472        let (vm, m) = encode_sreg(sm_num);
5473        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5474        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5475
5476        // VMRS APSR_nzcv, FPSCR
5477        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5478
5479        // IT GT (for min) or IT MI (for max)
5480        let cond: u16 = if is_min { 0xC } else { 0x4 };
5481        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5482        bytes.extend_from_slice(&it.to_le_bytes());
5483
5484        // VMOV{cond}.F32 Sd, Sm — conditional VMOV in IT block
5485        let vmov_sm = 0xEEB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5486        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sm));
5487
5488        Ok(bytes)
5489    }
5490
5491    /// Encode F32 copysign as Thumb-2
5492    fn encode_thumb_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5493        let mut bytes = Vec::new();
5494
5495        // VMOV R12, Sm (get sign source bits)
5496        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5497            false,
5498            sm,
5499            &Reg::R12,
5500        )?));
5501
5502        // VMOV R0, Sn (get magnitude source bits)
5503        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5504            false,
5505            sn,
5506            &Reg::R0,
5507        )?));
5508
5509        // AND.W R12, R12, #0x80000000
5510        // Thumb-2 modified immediate: 0x80000000 = constant 0x80 with rotation
5511        // Using T1 encoding: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
5512        // 0x80000000: i=0, imm3=0b001, imm8=0x00 (rotation=4, value=0x80)
5513        // Actually encoding #0x80000000 as modified constant:
5514        // bit pattern 1 followed by 31 zeros: enc = 0b0100_00000000 = 0x0100? No.
5515        // ARM modified immediate: abcdefgh rotated. 0x80000000 = 0x80 ROR 2 = enc 0x0102
5516        // Actually: value = abcdefgh ROR (2*rot). 0x80 = 10000000, ROR 2 gives 0x20000000.
5517        // For 0x80000000: 0x02 ROR 2 = 0x80000000. So imm12 = (1<<8) | 0x02 = 0x102
5518        let hw1: u16 = 0xF000 | 12; // AND.W R12, R12, #modified_const (i=0, Rn=R12)
5519        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02; // imm3=1, Rd=R12, imm8=0x02
5520        bytes.extend_from_slice(&hw1.to_le_bytes());
5521        bytes.extend_from_slice(&hw2.to_le_bytes());
5522
5523        // BIC.W R0, R0, #0x80000000 (R0 = register 0, fields are zero)
5524        let hw1: u16 = 0xF020; // BIC.W R0, R0, #modified_const (i=0, Rn=R0)
5525        let hw2: u16 = (0x1 << 12) | 0x02; // imm3=1, Rd=R0, imm8=0x02
5526        bytes.extend_from_slice(&hw1.to_le_bytes());
5527        bytes.extend_from_slice(&hw2.to_le_bytes());
5528
5529        // ORR.W R0, R0, R12 (R0 = register 0)
5530        let hw1: u16 = 0xEA40; // ORR.W R0, R0, R12 (Rn=R0)
5531        let hw2: u16 = 12; // Rd=R0, Rm=R12
5532        bytes.extend_from_slice(&hw1.to_le_bytes());
5533        bytes.extend_from_slice(&hw2.to_le_bytes());
5534
5535        // VMOV Sd, R0
5536        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5537            true,
5538            sd,
5539            &Reg::R0,
5540        )?));
5541
5542        Ok(bytes)
5543    }
5544
5545    /// Encode F64 comparison as Thumb-2: VCMP.F64 + VMRS + MOV #0 + IT + MOV #1
5546    fn encode_thumb_f64_compare(
5547        &self,
5548        rd: &Reg,
5549        dn: &VfpReg,
5550        dm: &VfpReg,
5551        cond_code: u32,
5552    ) -> Result<Vec<u8>> {
5553        let mut bytes = Vec::new();
5554        let rd_bits = reg_to_bits(rd);
5555
5556        // VCMP.F64 Dn, Dm
5557        let dn_num = vfp_dreg_to_num(dn)?;
5558        let dm_num = vfp_dreg_to_num(dm)?;
5559        let (vd, d) = encode_dreg(dn_num);
5560        let (vm, m) = encode_dreg(dm_num);
5561        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5562        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5563
5564        // VMRS APSR_nzcv, FPSCR
5565        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5566
5567        // MOVS Rd, #0
5568        if rd_bits < 8 {
5569            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5570            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5571        } else {
5572            let hw1: u16 = 0xF04F;
5573            let hw2: u16 = (rd_bits as u16) << 8;
5574            bytes.extend_from_slice(&hw1.to_le_bytes());
5575            bytes.extend_from_slice(&hw2.to_le_bytes());
5576        }
5577
5578        // IT<cond>
5579        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5580        bytes.extend_from_slice(&it.to_le_bytes());
5581
5582        // MOV Rd, #1
5583        if rd_bits < 8 {
5584            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5585            bytes.extend_from_slice(&mov_one.to_le_bytes());
5586        } else {
5587            let hw1: u16 = 0xF04F;
5588            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5589            bytes.extend_from_slice(&hw1.to_le_bytes());
5590            bytes.extend_from_slice(&hw2.to_le_bytes());
5591        }
5592
5593        Ok(bytes)
5594    }
5595
5596    /// Encode F64 constant load as Thumb-2: MOVW+MOVT (lo32 into R0) + MOVW+MOVT (hi32 into R12) + VMOV Dd, R0, R12
5597    fn encode_thumb_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
5598        let mut bytes = Vec::new();
5599        let bits = value.to_bits();
5600        let lo32 = bits as u32;
5601        let hi32 = (bits >> 32) as u32;
5602
5603        // MOVW R0, #lo16(lo32)
5604        let lo16 = lo32 & 0xFFFF;
5605        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(0, lo16)?);
5606
5607        // MOVT R0, #hi16(lo32)
5608        let hi16 = (lo32 >> 16) & 0xFFFF;
5609        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(0, hi16)?);
5610
5611        // MOVW R12, #lo16(hi32)
5612        let lo16 = hi32 & 0xFFFF;
5613        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
5614
5615        // MOVT R12, #hi16(hi32)
5616        let hi16 = (hi32 >> 16) & 0xFFFF;
5617        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
5618
5619        // VMOV Dd, R0, R12
5620        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
5621        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5622
5623        Ok(bytes)
5624    }
5625
5626    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as Thumb-2
5627    fn encode_thumb_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5628        let mut bytes = Vec::new();
5629
5630        // VMOV S0, Rm
5631        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
5632        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5633
5634        // VCVT.F64.S32 Dd, S0 or VCVT.F64.U32 Dd, S0
5635        let dd_num = vfp_dreg_to_num(dd)?;
5636        let (vd, d) = encode_dreg(dd_num);
5637        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
5638        let vcvt = base | (d << 22) | (vd << 12);
5639        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5640
5641        Ok(bytes)
5642    }
5643
5644    /// Encode VCVT.F64.F32 Dd, Sm as Thumb-2
5645    fn encode_thumb_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5646        let dd_num = vfp_dreg_to_num(dd)?;
5647        let sm_num = vfp_sreg_to_num(sm)?;
5648        let (vd, d) = encode_dreg(dd_num);
5649        let (vm, m) = encode_sreg(sm_num);
5650
5651        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
5652        Ok(vfp_to_thumb_bytes(vcvt))
5653    }
5654
5655    /// Encode VCVT.S32/U32.F64 S0, Dm + VMOV Rd, S0 as Thumb-2
5656    fn encode_thumb_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5657        let mut bytes = Vec::new();
5658        let dm_num = vfp_dreg_to_num(dm)?;
5659        let (vm, m) = encode_dreg(dm_num);
5660
5661        // VCVT.S32.F64 S0, Dm or VCVT.U32.F64 S0, Dm
5662        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
5663        let vcvt = base | (m << 5) | vm;
5664        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5665
5666        // VMOV Rd, S0
5667        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
5668        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5669
5670        Ok(bytes)
5671    }
5672
5673    /// Encode F64 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5674    /// Encode F64 rounding as Thumb-2.
5675    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5676    fn encode_thumb_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5677        let mut bytes = Vec::new();
5678        let dm_num = vfp_dreg_to_num(dm)?;
5679        let dd_num = vfp_dreg_to_num(dd)?;
5680        let (vm, m) = encode_dreg(dm_num);
5681        let (vd, d) = encode_dreg(dd_num);
5682
5683        if mode == 0b11 {
5684            // Trunc: VCVTR.S32.F64 — bit[7]=1, always truncates
5685            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
5686            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5687        } else {
5688            let rt: u32 = 12;
5689
5690            // VMRS R12, FPSCR
5691            let vmrs = 0xEEF10A10 | (rt << 12);
5692            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5693
5694            // BIC.W R12, R12, #(3 << 22)
5695            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF);
5696            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5697            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5698            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5699
5700            // ORR.W R12, R12, #(mode << 22)
5701            if mode != 0 {
5702                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF);
5703                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5704                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5705                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5706            }
5707
5708            // VMSR FPSCR, R12
5709            let vmsr = 0xEEE10A10 | (rt << 12);
5710            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5711
5712            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0)
5713            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
5714            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5715
5716            // Restore FPSCR
5717            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5718            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5719            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5720            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5721        }
5722
5723        // VCVT.F64.S32 Dd, S0
5724        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
5725        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5726
5727        Ok(bytes)
5728    }
5729
5730    /// Encode F64 min/max as Thumb-2
5731    fn encode_thumb_f64_minmax(
5732        &self,
5733        dd: &VfpReg,
5734        dn: &VfpReg,
5735        dm: &VfpReg,
5736        is_min: bool,
5737    ) -> Result<Vec<u8>> {
5738        let mut bytes = Vec::new();
5739        let dn_num = vfp_dreg_to_num(dn)?;
5740        let dm_num = vfp_dreg_to_num(dm)?;
5741        let dd_num = vfp_dreg_to_num(dd)?;
5742
5743        // VMOV.F64 Dd, Dn
5744        let (vd, d) = encode_dreg(dd_num);
5745        let (vn, n) = encode_dreg(dn_num);
5746        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5747        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dn));
5748
5749        // VCMP.F64 Dn, Dm
5750        let (vm, m) = encode_dreg(dm_num);
5751        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5752        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5753
5754        // VMRS APSR_nzcv, FPSCR
5755        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5756
5757        // IT GT (for min) or IT MI (for max)
5758        let cond: u16 = if is_min { 0xC } else { 0x4 };
5759        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5760        bytes.extend_from_slice(&it.to_le_bytes());
5761
5762        // VMOV{cond}.F64 Dd, Dm
5763        let vmov_dm = 0xEEB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5764        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dm));
5765
5766        Ok(bytes)
5767    }
5768
5769    /// Encode F64 copysign as Thumb-2
5770    fn encode_thumb_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
5771        let mut bytes = Vec::new();
5772
5773        // VMOV R0, R12, Dm (get sign source)
5774        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5775            false,
5776            dm,
5777            &Reg::R0,
5778            &Reg::R12,
5779        )?));
5780
5781        // VMOV R1, R2, Dn (get magnitude source)
5782        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5783            false,
5784            dn,
5785            &Reg::R1,
5786            &Reg::R2,
5787        )?));
5788
5789        // AND.W R12, R12, #0x80000000 (i=0, Rn=R12)
5790        let hw1: u16 = 0xF000 | 12;
5791        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02;
5792        bytes.extend_from_slice(&hw1.to_le_bytes());
5793        bytes.extend_from_slice(&hw2.to_le_bytes());
5794
5795        // BIC.W R2, R2, #0x80000000 (i=0, Rn=R2)
5796        let hw1: u16 = 0xF020 | 2;
5797        let hw2: u16 = (0x1 << 12) | (2 << 8) | 0x02;
5798        bytes.extend_from_slice(&hw1.to_le_bytes());
5799        bytes.extend_from_slice(&hw2.to_le_bytes());
5800
5801        // ORR.W R2, R2, R12
5802        let hw1: u16 = 0xEA40 | 2;
5803        let hw2: u16 = (2 << 8) | 12;
5804        bytes.extend_from_slice(&hw1.to_le_bytes());
5805        bytes.extend_from_slice(&hw2.to_le_bytes());
5806
5807        // VMOV Dd, R1, R2
5808        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5809            true,
5810            dd,
5811            &Reg::R1,
5812            &Reg::R2,
5813        )?));
5814
5815        Ok(bytes)
5816    }
5817
5818    /// Encode VCVT.S32/U32.F32 + VMOV as Thumb-2
5819    fn encode_thumb_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5820        let mut bytes = Vec::new();
5821
5822        let sm_num = vfp_sreg_to_num(sm)?;
5823        let (vd, d) = encode_sreg(sm_num);
5824        let (vm, m) = encode_sreg(sm_num);
5825        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
5826        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5827        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5828
5829        // VMOV Rd, Sm
5830        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
5831        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5832
5833        Ok(bytes)
5834    }
5835
5836    // === Thumb-2 32-bit encoding helpers ===
5837
5838    /// Encode Thumb-2 32-bit ADD with immediate
5839    fn encode_thumb32_add(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5840        let rd_bits = reg_to_bits(rd);
5841        let rn_bits = reg_to_bits(rn);
5842
5843        // ADD.W Rd, Rn, #imm12
5844        // First halfword: 1111 0 i 0 1000 S Rn
5845        // Second halfword: 0 imm3 Rd imm8
5846        let i_bit = (imm >> 11) & 1;
5847        let imm3 = (imm >> 8) & 0x7;
5848        let imm8 = imm & 0xFF;
5849
5850        let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
5851        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5852
5853        let mut bytes = hw1.to_le_bytes().to_vec();
5854        bytes.extend_from_slice(&hw2.to_le_bytes());
5855        Ok(bytes)
5856    }
5857
5858    /// Encode Thumb-2 32-bit SUB with immediate
5859    fn encode_thumb32_sub(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5860        let rd_bits = reg_to_bits(rd);
5861        let rn_bits = reg_to_bits(rn);
5862
5863        let i_bit = (imm >> 11) & 1;
5864        let imm3 = (imm >> 8) & 0x7;
5865        let imm8 = imm & 0xFF;
5866
5867        let hw1: u16 = (0xF1A0 | (i_bit << 10) | rn_bits) as u16;
5868        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5869
5870        let mut bytes = hw1.to_le_bytes().to_vec();
5871        bytes.extend_from_slice(&hw2.to_le_bytes());
5872        Ok(bytes)
5873    }
5874
5875    /// Encode Thumb-2 32-bit ADDS with immediate (sets flags)
5876    fn encode_thumb32_adds(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5877        let rd_bits = reg_to_bits(rd);
5878        let rn_bits = reg_to_bits(rn);
5879
5880        let i_bit = (imm >> 11) & 1;
5881        let imm3 = (imm >> 8) & 0x7;
5882        let imm8 = imm & 0xFF;
5883
5884        // ADDS.W Rd, Rn, #imm (with S=1)
5885        // First halfword: 1111 0 i 0 1000 1 Rn = F110 | i<<10 | Rn
5886        let hw1: u16 = (0xF110 | (i_bit << 10) | rn_bits) as u16;
5887        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5888
5889        let mut bytes = hw1.to_le_bytes().to_vec();
5890        bytes.extend_from_slice(&hw2.to_le_bytes());
5891        Ok(bytes)
5892    }
5893
5894    /// Encode Thumb-2 32-bit SUBS with immediate (sets flags)
5895    fn encode_thumb32_subs(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5896        let rd_bits = reg_to_bits(rd);
5897        let rn_bits = reg_to_bits(rn);
5898
5899        let i_bit = (imm >> 11) & 1;
5900        let imm3 = (imm >> 8) & 0x7;
5901        let imm8 = imm & 0xFF;
5902
5903        // SUBS.W Rd, Rn, #imm (with S=1)
5904        // First halfword: 1111 0 i 0 1101 1 Rn = F1B0 | i<<10 | Rn
5905        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
5906        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5907
5908        let mut bytes = hw1.to_le_bytes().to_vec();
5909        bytes.extend_from_slice(&hw2.to_le_bytes());
5910        Ok(bytes)
5911    }
5912
5913    /// Encode Thumb-2 32-bit MOVW (16-bit immediate)
5914    ///
5915    /// # Contract (Verus-style)
5916    /// ```text
5917    /// requires rd <= R14
5918    /// ensures result.len() == 4
5919    /// ensures (imm & 0xFFFF) can be reconstructed from the encoding
5920    /// ```
5921    fn encode_thumb32_movw(&self, rd: &Reg, imm: u32) -> Result<Vec<u8>> {
5922        let rd_bits = reg_to_bits(rd);
5923        reg_bits_checked(rd_bits)?;
5924        let imm16 = imm & 0xFFFF;
5925
5926        // MOVW Rd, #imm16
5927        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
5928        let imm4 = (imm16 >> 12) & 0xF;
5929        let i_bit = (imm16 >> 11) & 1;
5930        let imm3 = (imm16 >> 8) & 0x7;
5931        let imm8 = imm16 & 0xFF;
5932
5933        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
5934        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5935
5936        let mut bytes = hw1.to_le_bytes().to_vec();
5937        bytes.extend_from_slice(&hw2.to_le_bytes());
5938        encoding_contracts::verify_thumb32(&bytes);
5939        Ok(bytes)
5940    }
5941
5942    /// Encode Thumb-2 32-bit shift with immediate
5943    ///
5944    /// # Contract (Verus-style)
5945    /// ```text
5946    /// requires rd <= R14, rm <= R14
5947    /// ensures result.len() == 4
5948    /// ```
5949    fn encode_thumb32_shift(
5950        &self,
5951        rd: &Reg,
5952        rm: &Reg,
5953        shift: u32,
5954        shift_type: u8,
5955    ) -> Result<Vec<u8>> {
5956        let rd_bits = reg_to_bits(rd);
5957        let rm_bits = reg_to_bits(rm);
5958        reg_bits_checked(rd_bits)?;
5959        reg_bits_checked(rm_bits)?;
5960        let imm5 = shift & 0x1F;
5961        let imm2 = imm5 & 0x3;
5962        let imm3 = (imm5 >> 2) & 0x7;
5963
5964        // MOV.W Rd, Rm, <shift> #imm
5965        // EA4F 0 imm3 Rd imm2 type Rm
5966        let hw1: u16 = 0xEA4F;
5967        let hw2: u16 =
5968            ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | ((shift_type as u32) << 4) | rm_bits)
5969                as u16;
5970
5971        let mut bytes = hw1.to_le_bytes().to_vec();
5972        bytes.extend_from_slice(&hw2.to_le_bytes());
5973        Ok(bytes)
5974    }
5975
5976    /// Encode Thumb-2 32-bit shift by register
5977    /// Encoding: 11111010 0xx0 Rn | 1111 Rd 0000 Rm
5978    /// shift_type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
5979    fn encode_thumb32_shift_reg(
5980        &self,
5981        rd: &Reg,
5982        rn: &Reg,
5983        rm: &Reg,
5984        shift_type: u8,
5985    ) -> Result<Vec<u8>> {
5986        let rd_bits = reg_to_bits(rd);
5987        let rn_bits = reg_to_bits(rn);
5988        let rm_bits = reg_to_bits(rm);
5989
5990        // hw1: 1111 1010 0xx0 Rn
5991        let hw1: u16 = (0xFA00 | ((shift_type as u32) << 5) | rn_bits) as u16;
5992        // hw2: 1111 Rd 0000 Rm
5993        let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
5994
5995        let mut bytes = hw1.to_le_bytes().to_vec();
5996        bytes.extend_from_slice(&hw2.to_le_bytes());
5997        Ok(bytes)
5998    }
5999
6000    /// Encode Thumb-2 32-bit CMP with immediate
6001    fn encode_thumb32_cmp_imm(&self, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6002        let rn_bits = reg_to_bits(rn);
6003
6004        let i_bit = (imm >> 11) & 1;
6005        let imm3 = (imm >> 8) & 0x7;
6006        let imm8 = imm & 0xFF;
6007
6008        // CMP.W Rn, #imm
6009        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6010        let hw2: u16 = ((imm3 << 12) | 0x0F00 | imm8) as u16;
6011
6012        let mut bytes = hw1.to_le_bytes().to_vec();
6013        bytes.extend_from_slice(&hw2.to_le_bytes());
6014        Ok(bytes)
6015    }
6016
6017    /// Encode Thumb-2 32-bit LDR
6018    fn encode_thumb32_ldr(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6019        let rd_bits = reg_to_bits(rd);
6020        let base_bits = reg_to_bits(base);
6021
6022        // LDR.W Rd, [Rn, #imm12]
6023        let hw1: u16 = (0xF8D0 | base_bits) as u16;
6024        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6025
6026        let mut bytes = hw1.to_le_bytes().to_vec();
6027        bytes.extend_from_slice(&hw2.to_le_bytes());
6028        Ok(bytes)
6029    }
6030
6031    /// Encode Thumb-2 32-bit STR
6032    fn encode_thumb32_str(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6033        let rd_bits = reg_to_bits(rd);
6034        let base_bits = reg_to_bits(base);
6035
6036        // STR.W Rd, [Rn, #imm12]
6037        let hw1: u16 = (0xF8C0 | base_bits) as u16;
6038        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6039
6040        let mut bytes = hw1.to_le_bytes().to_vec();
6041        bytes.extend_from_slice(&hw2.to_le_bytes());
6042        Ok(bytes)
6043    }
6044
6045    /// Encode Thumb-2 32-bit LDR with register offset: LDR.W Rd, [Rn, Rm]
6046    fn encode_thumb32_ldr_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6047        let rd_bits = reg_to_bits(rd);
6048        let base_bits = reg_to_bits(base);
6049        let rm_bits = reg_to_bits(offset_reg);
6050
6051        // LDR.W Rd, [Rn, Rm, LSL #0]
6052        // Encoding: 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
6053        // imm2 = 00 for no shift (LSL #0)
6054        let hw1: u16 = (0xF850 | base_bits) as u16;
6055        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6056
6057        let mut bytes = hw1.to_le_bytes().to_vec();
6058        bytes.extend_from_slice(&hw2.to_le_bytes());
6059        Ok(bytes)
6060    }
6061
6062    /// Encode Thumb-2 32-bit STR with register offset: STR.W Rd, [Rn, Rm]
6063    fn encode_thumb32_str_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6064        let rd_bits = reg_to_bits(rd);
6065        let base_bits = reg_to_bits(base);
6066        let rm_bits = reg_to_bits(offset_reg);
6067
6068        // STR.W Rd, [Rn, Rm, LSL #0]
6069        // Encoding: 1111 1000 0100 Rn | Rt 0000 00 imm2 Rm
6070        // imm2 = 00 for no shift (LSL #0)
6071        let hw1: u16 = (0xF840 | base_bits) as u16;
6072        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6073
6074        let mut bytes = hw1.to_le_bytes().to_vec();
6075        bytes.extend_from_slice(&hw2.to_le_bytes());
6076        Ok(bytes)
6077    }
6078
6079    // === Sub-word load/store Thumb-2 encoding helpers ===
6080
6081    /// Encode Thumb-2 32-bit LDRB with immediate: LDRB.W Rd, [Rn, #imm12]
6082    fn encode_thumb32_ldrb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6083        let rd_bits = reg_to_bits(rd);
6084        let base_bits = reg_to_bits(base);
6085        // LDRB.W Rd, [Rn, #imm12]: 1111 1000 1001 Rn | Rt imm12
6086        let hw1: u16 = (0xF890 | base_bits) as u16;
6087        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6088        let mut bytes = hw1.to_le_bytes().to_vec();
6089        bytes.extend_from_slice(&hw2.to_le_bytes());
6090        Ok(bytes)
6091    }
6092
6093    /// Encode Thumb-2 32-bit LDRB with register: LDRB.W Rd, [Rn, Rm]
6094    fn encode_thumb32_ldrb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6095        let rd_bits = reg_to_bits(rd);
6096        let base_bits = reg_to_bits(base);
6097        let rm_bits = reg_to_bits(offset_reg);
6098        // LDRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0001 Rn | Rt 0000 00 imm2 Rm
6099        let hw1: u16 = (0xF810 | base_bits) as u16;
6100        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6101        let mut bytes = hw1.to_le_bytes().to_vec();
6102        bytes.extend_from_slice(&hw2.to_le_bytes());
6103        Ok(bytes)
6104    }
6105
6106    /// Encode Thumb-2 32-bit LDRSB with immediate: LDRSB.W Rd, [Rn, #imm12]
6107    fn encode_thumb32_ldrsb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6108        let rd_bits = reg_to_bits(rd);
6109        let base_bits = reg_to_bits(base);
6110        // LDRSB.W Rd, [Rn, #imm12]: 1111 1001 1001 Rn | Rt imm12
6111        let hw1: u16 = (0xF990 | base_bits) as u16;
6112        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6113        let mut bytes = hw1.to_le_bytes().to_vec();
6114        bytes.extend_from_slice(&hw2.to_le_bytes());
6115        Ok(bytes)
6116    }
6117
6118    /// Encode Thumb-2 32-bit LDRSB with register: LDRSB.W Rd, [Rn, Rm]
6119    fn encode_thumb32_ldrsb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6120        let rd_bits = reg_to_bits(rd);
6121        let base_bits = reg_to_bits(base);
6122        let rm_bits = reg_to_bits(offset_reg);
6123        // LDRSB.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0001 Rn | Rt 0000 00 imm2 Rm
6124        let hw1: u16 = (0xF910 | base_bits) as u16;
6125        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6126        let mut bytes = hw1.to_le_bytes().to_vec();
6127        bytes.extend_from_slice(&hw2.to_le_bytes());
6128        Ok(bytes)
6129    }
6130
6131    /// Encode Thumb-2 32-bit LDRH with immediate: LDRH.W Rd, [Rn, #imm12]
6132    fn encode_thumb32_ldrh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6133        let rd_bits = reg_to_bits(rd);
6134        let base_bits = reg_to_bits(base);
6135        // LDRH.W Rd, [Rn, #imm12]: 1111 1000 1011 Rn | Rt imm12
6136        let hw1: u16 = (0xF8B0 | base_bits) as u16;
6137        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6138        let mut bytes = hw1.to_le_bytes().to_vec();
6139        bytes.extend_from_slice(&hw2.to_le_bytes());
6140        Ok(bytes)
6141    }
6142
6143    /// Encode Thumb-2 32-bit LDRH with register: LDRH.W Rd, [Rn, Rm]
6144    fn encode_thumb32_ldrh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6145        let rd_bits = reg_to_bits(rd);
6146        let base_bits = reg_to_bits(base);
6147        let rm_bits = reg_to_bits(offset_reg);
6148        // LDRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0011 Rn | Rt 0000 00 imm2 Rm
6149        let hw1: u16 = (0xF830 | base_bits) as u16;
6150        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6151        let mut bytes = hw1.to_le_bytes().to_vec();
6152        bytes.extend_from_slice(&hw2.to_le_bytes());
6153        Ok(bytes)
6154    }
6155
6156    /// Encode Thumb-2 32-bit LDRSH with immediate: LDRSH.W Rd, [Rn, #imm12]
6157    fn encode_thumb32_ldrsh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6158        let rd_bits = reg_to_bits(rd);
6159        let base_bits = reg_to_bits(base);
6160        // LDRSH.W Rd, [Rn, #imm12]: 1111 1001 1011 Rn | Rt imm12
6161        let hw1: u16 = (0xF9B0 | base_bits) as u16;
6162        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6163        let mut bytes = hw1.to_le_bytes().to_vec();
6164        bytes.extend_from_slice(&hw2.to_le_bytes());
6165        Ok(bytes)
6166    }
6167
6168    /// Encode Thumb-2 32-bit LDRSH with register: LDRSH.W Rd, [Rn, Rm]
6169    fn encode_thumb32_ldrsh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6170        let rd_bits = reg_to_bits(rd);
6171        let base_bits = reg_to_bits(base);
6172        let rm_bits = reg_to_bits(offset_reg);
6173        // LDRSH.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0011 Rn | Rt 0000 00 imm2 Rm
6174        let hw1: u16 = (0xF930 | base_bits) as u16;
6175        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6176        let mut bytes = hw1.to_le_bytes().to_vec();
6177        bytes.extend_from_slice(&hw2.to_le_bytes());
6178        Ok(bytes)
6179    }
6180
6181    /// Encode Thumb-2 32-bit STRB with immediate: STRB.W Rd, [Rn, #imm12]
6182    fn encode_thumb32_strb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6183        let rd_bits = reg_to_bits(rd);
6184        let base_bits = reg_to_bits(base);
6185        // STRB.W Rd, [Rn, #imm12]: 1111 1000 1000 Rn | Rt imm12
6186        let hw1: u16 = (0xF880 | base_bits) as u16;
6187        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6188        let mut bytes = hw1.to_le_bytes().to_vec();
6189        bytes.extend_from_slice(&hw2.to_le_bytes());
6190        Ok(bytes)
6191    }
6192
6193    /// Encode Thumb-2 32-bit STRB with register: STRB.W Rd, [Rn, Rm]
6194    fn encode_thumb32_strb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6195        let rd_bits = reg_to_bits(rd);
6196        let base_bits = reg_to_bits(base);
6197        let rm_bits = reg_to_bits(offset_reg);
6198        // STRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0000 Rn | Rt 0000 00 imm2 Rm
6199        let hw1: u16 = (0xF800 | base_bits) as u16;
6200        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6201        let mut bytes = hw1.to_le_bytes().to_vec();
6202        bytes.extend_from_slice(&hw2.to_le_bytes());
6203        Ok(bytes)
6204    }
6205
6206    /// Encode Thumb-2 32-bit STRH with immediate: STRH.W Rd, [Rn, #imm12]
6207    fn encode_thumb32_strh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6208        let rd_bits = reg_to_bits(rd);
6209        let base_bits = reg_to_bits(base);
6210        // STRH.W Rd, [Rn, #imm12]: 1111 1000 1010 Rn | Rt imm12
6211        let hw1: u16 = (0xF8A0 | base_bits) as u16;
6212        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6213        let mut bytes = hw1.to_le_bytes().to_vec();
6214        bytes.extend_from_slice(&hw2.to_le_bytes());
6215        Ok(bytes)
6216    }
6217
6218    /// Encode Thumb-2 32-bit STRH with register: STRH.W Rd, [Rn, Rm]
6219    fn encode_thumb32_strh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6220        let rd_bits = reg_to_bits(rd);
6221        let base_bits = reg_to_bits(base);
6222        let rm_bits = reg_to_bits(offset_reg);
6223        // STRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0010 Rn | Rt 0000 00 imm2 Rm
6224        let hw1: u16 = (0xF820 | base_bits) as u16;
6225        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6226        let mut bytes = hw1.to_le_bytes().to_vec();
6227        bytes.extend_from_slice(&hw2.to_le_bytes());
6228        Ok(bytes)
6229    }
6230
6231    /// Encode Thumb-2 32-bit ADD with immediate: ADD.W Rd, Rn, #imm
6232    fn encode_thumb32_add_imm(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6233        let rd_bits = reg_to_bits(rd);
6234        let rn_bits = reg_to_bits(rn);
6235
6236        // For small immediates, use ADD.W Rd, Rn, #imm12
6237        // Encoding: 1111 0 i 0 1 0 0 0 S Rn | 0 imm3 Rd imm8
6238        // S = 0 (don't update flags)
6239        // The 12-bit immediate is encoded as: i:imm3:imm8
6240        // For simplicity, we only support imm <= 0xFFF (direct encoding)
6241        if imm <= 0xFFF {
6242            let i_bit = (imm >> 11) & 1;
6243            let imm3 = (imm >> 8) & 0x7;
6244            let imm8 = imm & 0xFF;
6245
6246            let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
6247            let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6248
6249            let mut bytes = hw1.to_le_bytes().to_vec();
6250            bytes.extend_from_slice(&hw2.to_le_bytes());
6251            Ok(bytes)
6252        } else {
6253            // For larger immediates, would need MOVW/MOVT + ADD
6254            // For now, return error
6255            Err(synth_core::Error::synthesis(
6256                "ADD immediate too large for single instruction",
6257            ))
6258        }
6259    }
6260
6261    // === Raw encoding helpers for POPCNT (take register numbers directly) ===
6262
6263    /// Encode Thumb-2 32-bit MOVW (16-bit immediate) - raw version
6264    ///
6265    /// # Contract (Verus-style)
6266    /// ```text
6267    /// requires rd <= 14, imm16 <= 0xFFFF
6268    /// ensures result.len() == 4
6269    /// ```
6270    fn encode_thumb32_movw_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6271        reg_bits_checked(rd)?;
6272        encoding_contracts::verify_imm16(imm16);
6273        // MOVW Rd, #imm16
6274        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6275        let imm16 = imm16 & 0xFFFF;
6276        let imm4 = (imm16 >> 12) & 0xF;
6277        let i_bit = (imm16 >> 11) & 1;
6278        let imm3 = (imm16 >> 8) & 0x7;
6279        let imm8 = imm16 & 0xFF;
6280
6281        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6282        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6283
6284        let mut bytes = hw1.to_le_bytes().to_vec();
6285        bytes.extend_from_slice(&hw2.to_le_bytes());
6286        encoding_contracts::verify_thumb32(&bytes);
6287        Ok(bytes)
6288    }
6289
6290    /// Encode Thumb-2 32-bit MOVT (move top 16 bits) - raw version
6291    ///
6292    /// # Contract (Verus-style)
6293    /// ```text
6294    /// requires rd <= 14, imm16 <= 0xFFFF
6295    /// ensures result.len() == 4
6296    /// ```
6297    fn encode_thumb32_movt_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6298        reg_bits_checked(rd)?;
6299        encoding_contracts::verify_imm16(imm16);
6300        // MOVT Rd, #imm16
6301        // 1111 0 i 10 1 1 0 0 imm4 | 0 imm3 Rd imm8
6302        let imm16 = imm16 & 0xFFFF;
6303        let imm4 = (imm16 >> 12) & 0xF;
6304        let i_bit = (imm16 >> 11) & 1;
6305        let imm3 = (imm16 >> 8) & 0x7;
6306        let imm8 = imm16 & 0xFF;
6307
6308        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
6309        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6310
6311        let mut bytes = hw1.to_le_bytes().to_vec();
6312        bytes.extend_from_slice(&hw2.to_le_bytes());
6313        encoding_contracts::verify_thumb32(&bytes);
6314        Ok(bytes)
6315    }
6316
6317    /// Encode Thumb-2 32-bit LSR (logical shift right) with immediate - raw version
6318    fn encode_thumb32_lsr_raw(&self, rd: u32, rm: u32, shift: u32) -> Result<Vec<u8>> {
6319        // MOV.W Rd, Rm, LSR #imm
6320        // EA4F 0 imm3 Rd imm2 01 Rm
6321        let imm5 = shift & 0x1F;
6322        let imm2 = imm5 & 0x3;
6323        let imm3 = (imm5 >> 2) & 0x7;
6324
6325        let hw1: u16 = 0xEA4F;
6326        let hw2: u16 = ((imm3 << 12) | (rd << 8) | (imm2 << 6) | (0b01 << 4) | rm) as u16;
6327
6328        let mut bytes = hw1.to_le_bytes().to_vec();
6329        bytes.extend_from_slice(&hw2.to_le_bytes());
6330        Ok(bytes)
6331    }
6332
6333    /// Encode Thumb-2 32-bit AND (register) - raw version
6334    fn encode_thumb32_and_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6335        // AND.W Rd, Rn, Rm
6336        // EA00 Rn | 0 Rd 00 00 Rm
6337        let hw1: u16 = (0xEA00 | rn) as u16;
6338        let hw2: u16 = ((rd << 8) | rm) as u16;
6339
6340        let mut bytes = hw1.to_le_bytes().to_vec();
6341        bytes.extend_from_slice(&hw2.to_le_bytes());
6342        Ok(bytes)
6343    }
6344
6345    /// Encode Thumb-2 32-bit AND with immediate - raw version
6346    fn encode_thumb32_and_imm_raw(&self, rd: u32, rn: u32, imm: u32) -> Result<Vec<u8>> {
6347        // AND.W Rd, Rn, #<modified_immediate>
6348        // For small immediates (0-255), the encoding is simpler
6349        // F0 00 Rn | 0 imm3 Rd imm8
6350        let i_bit = (imm >> 11) & 1;
6351        let imm3 = (imm >> 8) & 0x7;
6352        let imm8 = imm & 0xFF;
6353
6354        let hw1: u16 = (0xF000 | (i_bit << 10) | rn) as u16;
6355        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6356
6357        let mut bytes = hw1.to_le_bytes().to_vec();
6358        bytes.extend_from_slice(&hw2.to_le_bytes());
6359        Ok(bytes)
6360    }
6361
6362    /// Encode Thumb-2 32-bit SUB (register) - raw version
6363    fn encode_thumb32_sub_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6364        // SUB.W Rd, Rn, Rm
6365        // EBA0 Rn | 0 Rd 00 00 Rm
6366        let hw1: u16 = (0xEBA0 | rn) as u16;
6367        let hw2: u16 = ((rd << 8) | rm) as u16;
6368
6369        let mut bytes = hw1.to_le_bytes().to_vec();
6370        bytes.extend_from_slice(&hw2.to_le_bytes());
6371        Ok(bytes)
6372    }
6373
6374    /// Encode Thumb-2 32-bit ADD (register) - raw version
6375    fn encode_thumb32_add_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6376        // ADD.W Rd, Rn, Rm
6377        // EB00 Rn | 0 Rd 00 00 Rm
6378        let hw1: u16 = (0xEB00 | rn) as u16;
6379        let hw2: u16 = ((rd << 8) | rm) as u16;
6380
6381        let mut bytes = hw1.to_le_bytes().to_vec();
6382        bytes.extend_from_slice(&hw2.to_le_bytes());
6383        Ok(bytes)
6384    }
6385
6386    /// Encode Thumb-2 32-bit ADDS (register, flag-setting) - raw version.
6387    /// Used as the high-register fallback for `ArmOp::Adds` (i64 low-word add)
6388    /// so R8-R11 pair operands don't overflow the 16-bit field — #178/#180.
6389    fn encode_thumb32_adds_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6390        // ADDS.W Rd, Rn, Rm (T3, S=1): EB10 Rn | 0 Rd 00 00 Rm
6391        let hw1: u16 = (0xEB10 | rn) as u16;
6392        let hw2: u16 = ((rd << 8) | rm) as u16;
6393        let mut bytes = hw1.to_le_bytes().to_vec();
6394        bytes.extend_from_slice(&hw2.to_le_bytes());
6395        Ok(bytes)
6396    }
6397
6398    /// Encode Thumb-2 32-bit SUBS (register, flag-setting) - raw version.
6399    /// High-register fallback for `ArmOp::Subs` (i64 low-word subtract) — #178/#180.
6400    fn encode_thumb32_subs_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6401        // SUBS.W Rd, Rn, Rm (T3, S=1): EBB0 Rn | 0 Rd 00 00 Rm
6402        let hw1: u16 = (0xEBB0 | rn) as u16;
6403        let hw2: u16 = ((rd << 8) | rm) as u16;
6404        let mut bytes = hw1.to_le_bytes().to_vec();
6405        bytes.extend_from_slice(&hw2.to_le_bytes());
6406        Ok(bytes)
6407    }
6408
6409    /// Encode a sequence of ARM instructions
6410    pub fn encode_sequence(&self, ops: &[ArmOp]) -> Result<Vec<u8>> {
6411        let mut code = Vec::new();
6412
6413        for op in ops {
6414            let encoded = self.encode(op)?;
6415            code.extend_from_slice(&encoded);
6416        }
6417
6418        Ok(code)
6419    }
6420}
6421
6422/// Convert register to bit encoding (0-15)
6423fn reg_to_bits(reg: &Reg) -> u32 {
6424    match reg {
6425        Reg::R0 => 0,
6426        Reg::R1 => 1,
6427        Reg::R2 => 2,
6428        Reg::R3 => 3,
6429        Reg::R4 => 4,
6430        Reg::R5 => 5,
6431        Reg::R6 => 6,
6432        Reg::R7 => 7,
6433        Reg::R8 => 8,
6434        Reg::R9 => 9,
6435        Reg::R10 => 10,
6436        Reg::R11 => 11,
6437        Reg::R12 => 12,
6438        Reg::SP => 13,
6439        Reg::LR => 14,
6440        Reg::PC => 15,
6441    }
6442}
6443
6444/// Fallible form of the `verify_reg_bits` contract. PC (R15) is not a valid
6445/// data operand for the Thumb-2 encodings that use this guard (SDIV/UDIV/MLS/…
6446/// are UNPREDICTABLE with PC). Synth's own codegen never emits PC there, but
6447/// the encoder must stay *total* over arbitrary `ArmOp` inputs — the fuzz
6448/// harness (`encoder_no_panic`) requires Ok-or-Err, never a panic. Pre-fix, the
6449/// `debug_assert` in `verify_reg_bits` aborted under `-Cdebug-assertions`.
6450/// Returns a typed Err instead. See #185.
6451fn reg_bits_checked(bits: u32) -> Result<()> {
6452    if bits > 14 {
6453        return Err(synth_core::Error::synthesis(format!(
6454            "register bits {bits} (PC/R15) is not a valid operand for this Thumb-2 encoding"
6455        )));
6456    }
6457    Ok(())
6458}
6459
/// Try to express a 32-bit value as an ARM rotated immediate
/// (`imm8 ROR 2*rot4`).
///
/// Returns `Some((rot4 << 8 | imm8, 1))` using the smallest rotation that
/// works, or `None` when no even rotation brings the value into 8 bits.
/// Zero is representable with rotation 0 and yields `Some((0, 1))`.
fn try_encode_rotated_imm(val: u32) -> Option<(u32, u32)> {
    (0..16u32).find_map(|rot| {
        // Rotating left by 2*rot undoes the ROR the hardware would apply.
        let imm8 = val.rotate_left(rot * 2);
        if imm8 <= 0xFF {
            Some(((rot << 8) | imm8, 1))
        } else {
            None
        }
    })
}
6477
6478/// Encode operand2 field and return (bits, immediate_flag).
6479/// For ARM32 mode, immediates use the rotated-immediate encoding (imm8 ROR 2*rot4).
6480/// Panics if an immediate value cannot be represented. Callers that need large
6481/// immediates should use MOVW/MOVT instead of Operand2::Imm.
6482fn encode_operand2(op2: &Operand2) -> (u32, u32) {
6483    match op2 {
6484        Operand2::Imm(val) => {
6485            let uval = *val as u32;
6486            // Attempt rotated-immediate encoding (ARM32 Operand2)
6487            if let Some(encoded) = try_encode_rotated_imm(uval) {
6488                encoded
6489            } else {
6490                // Fallback: mask to 8 bits (legacy behavior for values that
6491                // cannot be represented). This should not be reached for
6492                // correctly-selected instructions; the instruction selector
6493                // must use MOVW/MOVT for large constants.
6494                let imm = uval & 0xFF;
6495                (imm, 1)
6496            }
6497        }
6498
6499        Operand2::Reg(reg) => {
6500            let reg_bits = reg_to_bits(reg);
6501            (reg_bits, 0) // I=0 for register
6502        }
6503
6504        Operand2::RegShift {
6505            rm,
6506            shift: _,
6507            amount,
6508        } => {
6509            // Simplified encoding with shift
6510            let rm_bits = reg_to_bits(rm);
6511            let shift_bits = (*amount & 0x1F) << 7;
6512            (shift_bits | rm_bits, 0)
6513        }
6514    }
6515}
6516
6517/// Encode memory address to (base_reg, offset)
6518fn encode_mem_addr(addr: &MemAddr) -> (u32, u32) {
6519    let base_bits = reg_to_bits(&addr.base);
6520    let offset_bits = (addr.offset as u32) & 0xFFF; // 12-bit offset
6521    (base_bits, offset_bits)
6522}
6523
6524/// S-register number: S0=0, S1=1, ..., S31=31
6525fn vfp_sreg_to_num(reg: &VfpReg) -> Result<u32> {
6526    match reg {
6527        VfpReg::S0 => Ok(0),
6528        VfpReg::S1 => Ok(1),
6529        VfpReg::S2 => Ok(2),
6530        VfpReg::S3 => Ok(3),
6531        VfpReg::S4 => Ok(4),
6532        VfpReg::S5 => Ok(5),
6533        VfpReg::S6 => Ok(6),
6534        VfpReg::S7 => Ok(7),
6535        VfpReg::S8 => Ok(8),
6536        VfpReg::S9 => Ok(9),
6537        VfpReg::S10 => Ok(10),
6538        VfpReg::S11 => Ok(11),
6539        VfpReg::S12 => Ok(12),
6540        VfpReg::S13 => Ok(13),
6541        VfpReg::S14 => Ok(14),
6542        VfpReg::S15 => Ok(15),
6543        VfpReg::S16 => Ok(16),
6544        VfpReg::S17 => Ok(17),
6545        VfpReg::S18 => Ok(18),
6546        VfpReg::S19 => Ok(19),
6547        VfpReg::S20 => Ok(20),
6548        VfpReg::S21 => Ok(21),
6549        VfpReg::S22 => Ok(22),
6550        VfpReg::S23 => Ok(23),
6551        VfpReg::S24 => Ok(24),
6552        VfpReg::S25 => Ok(25),
6553        VfpReg::S26 => Ok(26),
6554        VfpReg::S27 => Ok(27),
6555        VfpReg::S28 => Ok(28),
6556        VfpReg::S29 => Ok(29),
6557        VfpReg::S30 => Ok(30),
6558        VfpReg::S31 => Ok(31),
6559        // D-registers are not used in F32 single-precision encodings
6560        _ => Err(synth_core::Error::SynthesisError(
6561            "D-register not supported in single-precision VFP encoding".to_string(),
6562        )),
6563    }
6564}
6565
6566/// D-register number: D0=0, D1=1, ..., D15=15
6567fn vfp_dreg_to_num(reg: &VfpReg) -> Result<u32> {
6568    match reg {
6569        VfpReg::D0 => Ok(0),
6570        VfpReg::D1 => Ok(1),
6571        VfpReg::D2 => Ok(2),
6572        VfpReg::D3 => Ok(3),
6573        VfpReg::D4 => Ok(4),
6574        VfpReg::D5 => Ok(5),
6575        VfpReg::D6 => Ok(6),
6576        VfpReg::D7 => Ok(7),
6577        VfpReg::D8 => Ok(8),
6578        VfpReg::D9 => Ok(9),
6579        VfpReg::D10 => Ok(10),
6580        VfpReg::D11 => Ok(11),
6581        VfpReg::D12 => Ok(12),
6582        VfpReg::D13 => Ok(13),
6583        VfpReg::D14 => Ok(14),
6584        VfpReg::D15 => Ok(15),
6585        // S-registers are not used in F64 double-precision encodings
6586        _ => Err(synth_core::Error::SynthesisError(
6587            "S-register not supported in double-precision VFP encoding".to_string(),
6588        )),
6589    }
6590}
6591
/// Split an S-register number into `(Vx[3:0], qualifier_bit)` for VFP encoding.
///
/// For register number `s`, `Vx` is the upper part (`s / 2`) and the qualifier
/// is the lowest bit (`s % 2`). Depending on the register's role, the
/// qualifier goes to D (bit 22), N (bit 7) or M (bit 5).
fn encode_sreg(s: u32) -> (u32, u32) {
    let vx = s / 2;
    let qualifier = s % 2;
    (vx, qualifier)
}
6598
/// Split a D-register number into `(Vx[3:0], qualifier_bit)` for VFP
/// double-precision encoding.
///
/// For register number `d`, `Vx` is the low four bits and the qualifier is
/// bit 4. For D0-D15 the qualifier is therefore always 0.
fn encode_dreg(d: u32) -> (u32, u32) {
    let vx = d % 16;
    let qualifier = (d / 16) % 2;
    (vx, qualifier)
}
6605
6606/// Encode a VFP 3-register arithmetic instruction (VADD.F32, VSUB.F32, VMUL.F32, VDIV.F32).
6607/// Returns the full 32-bit instruction word.
6608///
6609/// VFP encoding: [cond 1110] [D opc1 Vn] [Vd 101 sz] [N opc2 M 0 Vm]
6610/// For single-precision (sz=0), coprocessor = 0xA (bits[11:8]).
6611fn encode_vfp_3reg(base: u32, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<u32> {
6612    let sd_num = vfp_sreg_to_num(sd)?;
6613    let sn_num = vfp_sreg_to_num(sn)?;
6614    let sm_num = vfp_sreg_to_num(sm)?;
6615    let (vd, d) = encode_sreg(sd_num);
6616    let (vn, n) = encode_sreg(sn_num);
6617    let (vm, m) = encode_sreg(sm_num);
6618
6619    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
6620}
6621
6622/// Encode a VFP 2-register instruction (VNEG.F32, VABS.F32, VSQRT.F32).
6623/// Returns the full 32-bit instruction word.
6624fn encode_vfp_2reg(base: u32, sd: &VfpReg, sm: &VfpReg) -> Result<u32> {
6625    let sd_num = vfp_sreg_to_num(sd)?;
6626    let sm_num = vfp_sreg_to_num(sm)?;
6627    let (vd, d) = encode_sreg(sd_num);
6628    let (vm, m) = encode_sreg(sm_num);
6629
6630    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
6631}
6632
6633/// Encode a VFP load/store (VLDR.F32 / VSTR.F32).
6634/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
6635/// U bit (bit 23) controls add/subtract offset.
6636fn encode_vfp_ldst(base: u32, sd: &VfpReg, addr: &MemAddr) -> Result<u32> {
6637    let sd_num = vfp_sreg_to_num(sd)?;
6638    let (vd, d) = encode_sreg(sd_num);
6639    let rn = reg_to_bits(&addr.base);
6640
6641    let offset = addr.offset;
6642    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6643    let abs_offset = offset.unsigned_abs();
6644    let imm8 = (abs_offset / 4) & 0xFF;
6645
6646    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
6647}
6648
6649/// Encode VMOV between core register and S-register.
6650/// VMOV Sn, Rt: 0xEE00_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
6651/// VMOV Rt, Sn: 0xEE10_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
6652fn encode_vmov_core_sreg(to_sreg: bool, sreg: &VfpReg, core: &Reg) -> Result<u32> {
6653    let s_num = vfp_sreg_to_num(sreg)?;
6654    let (vn, n) = encode_sreg(s_num);
6655    let rt = reg_to_bits(core);
6656
6657    let base = if to_sreg { 0xEE000A10 } else { 0xEE100A10 };
6658    Ok(base | (vn << 16) | (rt << 12) | (n << 7))
6659}
6660
6661/// Encode a VFP 3-register double-precision instruction (VADD.F64, VSUB.F64, etc.).
6662/// For double-precision (sz=1), coprocessor = 0xB (bits[11:8]).
6663/// The base should have bit 8 = 1 for F64 (0xB suffix instead of 0xA).
6664fn encode_vfp_3reg_f64(base: u32, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<u32> {
6665    let dd_num = vfp_dreg_to_num(dd)?;
6666    let dn_num = vfp_dreg_to_num(dn)?;
6667    let dm_num = vfp_dreg_to_num(dm)?;
6668    let (vd, d) = encode_dreg(dd_num);
6669    let (vn, n) = encode_dreg(dn_num);
6670    let (vm, m) = encode_dreg(dm_num);
6671
6672    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
6673}
6674
6675/// Encode a VFP 2-register double-precision instruction (VNEG.F64, VABS.F64, VSQRT.F64).
6676fn encode_vfp_2reg_f64(base: u32, dd: &VfpReg, dm: &VfpReg) -> Result<u32> {
6677    let dd_num = vfp_dreg_to_num(dd)?;
6678    let dm_num = vfp_dreg_to_num(dm)?;
6679    let (vd, d) = encode_dreg(dd_num);
6680    let (vm, m) = encode_dreg(dm_num);
6681
6682    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
6683}
6684
6685/// Encode a VFP load/store for double-precision (VLDR.64 / VSTR.64).
6686/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
6687fn encode_vfp_ldst_f64(base: u32, dd: &VfpReg, addr: &MemAddr) -> Result<u32> {
6688    let dd_num = vfp_dreg_to_num(dd)?;
6689    let (vd, d) = encode_dreg(dd_num);
6690    let rn = reg_to_bits(&addr.base);
6691
6692    let offset = addr.offset;
6693    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6694    let abs_offset = offset.unsigned_abs();
6695    let imm8 = (abs_offset / 4) & 0xFF;
6696
6697    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
6698}
6699
6700/// Encode VMOV between two core registers and a D-register.
6701/// VMOV Dm, Rt, Rt2: 0xEC40_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
6702/// VMOV Rt, Rt2, Dm: 0xEC50_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
6703fn encode_vmov_core_dreg(
6704    to_dreg: bool,
6705    dreg: &VfpReg,
6706    core_lo: &Reg,
6707    core_hi: &Reg,
6708) -> Result<u32> {
6709    let d_num = vfp_dreg_to_num(dreg)?;
6710    let (vm, m) = encode_dreg(d_num);
6711    let rt = reg_to_bits(core_lo);
6712    let rt2 = reg_to_bits(core_hi);
6713
6714    let base = if to_dreg { 0xEC400B10 } else { 0xEC500B10 };
6715    Ok(base | (rt2 << 16) | (rt << 12) | (m << 5) | vm)
6716}
6717
/// Serialise a 32-bit VFP instruction word in Thumb-2 byte order.
///
/// Thumb-2 stores a 32-bit instruction as two halfwords: the upper halfword
/// of `instr` comes first, then the lower one, each written little-endian.
fn vfp_to_thumb_bytes(instr: u32) -> Vec<u8> {
    // Little-endian bytes of the whole word: lower halfword first, upper second.
    let [lo0, lo1, hi0, hi1] = instr.to_le_bytes();
    vec![hi0, hi1, lo0, lo1]
}
6726
6727// ============================================================================
6728// Helium MVE encoding helpers
6729// ============================================================================
6730
6731/// Q-register number: Q0=0, Q1=1, ..., Q7=7
6732fn qreg_to_num(reg: &QReg) -> u32 {
6733    match reg {
6734        QReg::Q0 => 0,
6735        QReg::Q1 => 1,
6736        QReg::Q2 => 2,
6737        QReg::Q3 => 3,
6738        QReg::Q4 => 4,
6739        QReg::Q5 => 5,
6740        QReg::Q6 => 6,
6741        QReg::Q7 => 7,
6742    }
6743}
6744
6745/// MVE element size to encoding bits: S8=0b00, S16=0b01, S32=0b10
6746fn mve_size_bits(size: &MveSize) -> u32 {
6747    match size {
6748        MveSize::S8 => 0b00,
6749        MveSize::S16 => 0b01,
6750        MveSize::S32 => 0b10,
6751    }
6752}
6753
6754/// Encode MVE 3-register instruction.
6755/// Q-registers are encoded as D-register pairs: Q0=D0:D1, Q1=D2:D3, etc.
6756/// In NEON/MVE encoding, the Q-register uses D-register number = Qn * 2.
6757fn encode_mve_3reg(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
6758    let d = qreg_to_num(qd) * 2;
6759    let n = qreg_to_num(qn) * 2;
6760    let m = qreg_to_num(qm) * 2;
6761
6762    // Standard NEON/MVE 3-register encoding:
6763    // D bit (bit 22) = Vd[4], Vd[3:0] = bits [15:12]
6764    // N bit (bit 7)  = Vn[4], Vn[3:0] = bits [19:16]
6765    // M bit (bit 5)  = Vm[4], Vm[3:0] = bits [3:0]
6766    let vd = d & 0xF;
6767    let d_bit = (d >> 4) & 1;
6768    let vn = n & 0xF;
6769    let n_bit = (n >> 4) & 1;
6770    let vm = m & 0xF;
6771    let m_bit = (m >> 4) & 1;
6772
6773    base | (d_bit << 22) | (vn << 16) | (vd << 12) | (n_bit << 7) | (m_bit << 5) | vm
6774}
6775
6776/// Encode MVE 3-register bitwise instruction (VAND, VORR, VEOR, VBIC).
6777fn encode_mve_3reg_bitwise(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
6778    encode_mve_3reg(base, qd, qn, qm)
6779}
6780
6781/// Encode MVE VLDRW.32 Qd, [Rn, #offset]
6782/// Format: EC9x xxxx - contiguous load, word-sized elements
6783fn encode_mve_vldrw(qd: &QReg, addr: &MemAddr) -> u32 {
6784    let qd_enc = qreg_to_num(qd) * 2;
6785    let rn = reg_to_bits(&addr.base);
6786    let offset = addr.offset;
6787    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6788    let abs_offset = offset.unsigned_abs();
6789    let imm7 = (abs_offset / 4) & 0x7F; // 7-bit word-aligned offset
6790
6791    // VLDRW.32 Qd, [Rn, #imm]: ED10 xx80 variant
6792    0xED100E80
6793        | (u_bit << 23)
6794        | ((qd_enc >> 4) << 22)
6795        | (rn << 16)
6796        | ((qd_enc & 0xF) << 12)
6797        | (imm7 & 0x7F)
6798}
6799
6800/// Encode MVE VSTRW.32 Qd, [Rn, #offset]
6801fn encode_mve_vstrw(qd: &QReg, addr: &MemAddr) -> u32 {
6802    let qd_enc = qreg_to_num(qd) * 2;
6803    let rn = reg_to_bits(&addr.base);
6804    let offset = addr.offset;
6805    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6806    let abs_offset = offset.unsigned_abs();
6807    let imm7 = (abs_offset / 4) & 0x7F;
6808
6809    0xED000E80
6810        | (u_bit << 23)
6811        | ((qd_enc >> 4) << 22)
6812        | (rn << 16)
6813        | ((qd_enc & 0xF) << 12)
6814        | (imm7 & 0x7F)
6815}
6816
6817impl ArmEncoder {
6818    /// Encode MVE constant load: MOVW+MOVT+VMOV for each 32-bit word, then assemble Q-register
6819    fn encode_thumb_mve_const(&self, qd: &QReg, bytes: &[u8; 16]) -> Result<Vec<u8>> {
6820        let mut result = Vec::new();
6821        let qd_num = qreg_to_num(qd);
6822
6823        // Load each 32-bit word into R12 (temp) then VMOV into S-register
6824        for i in 0..4 {
6825            let word = u32::from_le_bytes([
6826                bytes[i * 4],
6827                bytes[i * 4 + 1],
6828                bytes[i * 4 + 2],
6829                bytes[i * 4 + 3],
6830            ]);
6831            let lo16 = word & 0xFFFF;
6832            let hi16 = (word >> 16) & 0xFFFF;
6833
6834            // MOVW R12, #lo16
6835            result.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
6836            // MOVT R12, #hi16
6837            if hi16 != 0 {
6838                result.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
6839            }
6840
6841            // VMOV Sn, R12 where Sn = Qd*4 + i
6842            let s_num = qd_num * 4 + i as u32;
6843            let (vn, n) = encode_sreg(s_num);
6844            let vmov: u32 = 0xEE000A10 | (vn << 16) | (12 << 12) | (n << 7);
6845            result.extend_from_slice(&vfp_to_thumb_bytes(vmov));
6846        }
6847
6848        Ok(result)
6849    }
6850
6851    /// Encode lane-wise f32 binary operation (VDIV, etc.) via S-register extraction
6852    fn encode_thumb_mve_lane_wise_f32_binop(
6853        &self,
6854        qd: &QReg,
6855        qn: &QReg,
6856        qm: &QReg,
6857        vfp_base: u32,
6858    ) -> Result<Vec<u8>> {
6859        let mut result = Vec::new();
6860        let qd_num = qreg_to_num(qd);
6861        let qn_num = qreg_to_num(qn);
6862        let qm_num = qreg_to_num(qm);
6863
6864        // For each lane 0..3: use S-registers directly (Q aliasing)
6865        for i in 0..4u32 {
6866            let sd = qd_num * 4 + i;
6867            let sn = qn_num * 4 + i;
6868            let sm = qm_num * 4 + i;
6869
6870            let (vd, d) = encode_sreg(sd);
6871            let (vn, n) = encode_sreg(sn);
6872            let (vm, m) = encode_sreg(sm);
6873
6874            let instr = vfp_base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm;
6875            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
6876        }
6877
6878        Ok(result)
6879    }
6880
6881    /// Encode lane-wise f32 VSQRT via S-register extraction
6882    fn encode_thumb_mve_lane_wise_f32_sqrt(&self, qd: &QReg, qm: &QReg) -> Result<Vec<u8>> {
6883        let mut result = Vec::new();
6884        let qd_num = qreg_to_num(qd);
6885        let qm_num = qreg_to_num(qm);
6886
6887        // VSQRT.F32 base: 0xEEB10AC0
6888        for i in 0..4u32 {
6889            let sd = qd_num * 4 + i;
6890            let sm = qm_num * 4 + i;
6891
6892            let (vd, d) = encode_sreg(sd);
6893            let (vm, m) = encode_sreg(sm);
6894
6895            let instr: u32 = 0xEEB10AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
6896            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
6897        }
6898
6899        Ok(result)
6900    }
6901}
6902
6903#[cfg(test)]
6904mod tests {
6905    use super::*;
6906
6907    #[test]
6908    fn test_encoder_creation() {
6909        let encoder_arm = ArmEncoder::new_arm32();
6910        assert!(!encoder_arm.thumb_mode);
6911
6912        let encoder_thumb = ArmEncoder::new_thumb2();
6913        assert!(encoder_thumb.thumb_mode);
6914    }
6915
6916    /// #178/#180 regression: the Thumb `Add`/`Adds`/`Subs` reg-forms used the
6917    /// 16-bit encoding unconditionally. For high registers (R12 base scratch,
6918    /// R8-R11 i64 pairs) the 3-bit register fields overflow and corrupt the
6919    /// operands — `add ip,ip,r0` came out as `adds r4,r5,r1` (0x186C), silently
6920    /// dropping the address operand and miscompiling every optimized memory
6921    /// access. High registers must use the 32-bit `.W` forms.
6922    #[test]
6923    fn test_encode_thumb_add_high_reg_uses_add_w_178_180() {
6924        let encoder = ArmEncoder::new_thumb2();
6925
6926        // add ip, ip, r0  — the exact MemLoad/MemStore base+addr op.
6927        let code = encoder
6928            .encode(&ArmOp::Add {
6929                rd: Reg::R12,
6930                rn: Reg::R12,
6931                op2: Operand2::Reg(Reg::R0),
6932            })
6933            .unwrap();
6934        // ADD.W ip, ip, r0 = EB0C 0C00 (little-endian halfwords).
6935        assert_eq!(
6936            code,
6937            vec![0x0C, 0xEB, 0x00, 0x0C],
6938            "high-reg Thumb ADD must be 32-bit ADD.W (EB0C 0C00), not corrupt 16-bit; got {code:02X?}"
6939        );
6940        // Must NOT be the buggy 16-bit 0x186C (`adds r4,r5,r1`).
6941        assert_ne!(code, vec![0x6C, 0x18], "regressed to corrupt 16-bit ADDS");
6942
6943        // Low-register add stays 16-bit (no regression for the common case).
6944        let lo = encoder
6945            .encode(&ArmOp::Add {
6946                rd: Reg::R1,
6947                rn: Reg::R2,
6948                op2: Operand2::Reg(Reg::R3),
6949            })
6950            .unwrap();
6951        assert_eq!(
6952            lo.len(),
6953            2,
6954            "low-reg ADD should remain 16-bit, got {lo:02X?}"
6955        );
6956    }
6957
6958    /// #178/#180 sibling: i64 low-word `Adds`/`Subs` can land in R8-R11 pairs;
6959    /// those must fall back to 32-bit ADDS.W/SUBS.W (flag-setting preserved).
6960    #[test]
6961    fn test_encode_thumb_adds_subs_high_reg_use_32bit_178_180() {
6962        let encoder = ArmEncoder::new_thumb2();
6963
6964        // adds r10, r10, r8  → ADDS.W = EB1A 0A08
6965        let adds = encoder
6966            .encode(&ArmOp::Adds {
6967                rd: Reg::R10,
6968                rn: Reg::R10,
6969                op2: Operand2::Reg(Reg::R8),
6970            })
6971            .unwrap();
6972        assert_eq!(
6973            adds,
6974            vec![0x1A, 0xEB, 0x08, 0x0A],
6975            "high-reg ADDS must be 32-bit ADDS.W (EB1A 0A08); got {adds:02X?}"
6976        );
6977
6978        // subs r10, r10, r8  → SUBS.W = EBBA 0A08
6979        let subs = encoder
6980            .encode(&ArmOp::Subs {
6981                rd: Reg::R10,
6982                rn: Reg::R10,
6983                op2: Operand2::Reg(Reg::R8),
6984            })
6985            .unwrap();
6986        assert_eq!(
6987            subs,
6988            vec![0xBA, 0xEB, 0x08, 0x0A],
6989            "high-reg SUBS must be 32-bit SUBS.W (EBBA 0A08); got {subs:02X?}"
6990        );
6991    }
6992
6993    /// #184 (sibling of #180): 16-bit CMN (T1) only encodes R0-R7. High registers
6994    /// must use 32-bit CMN.W, not the corrupt truncated 16-bit form.
6995    #[test]
6996    fn test_encode_thumb_cmn_high_reg_uses_cmn_w_184() {
6997        let encoder = ArmEncoder::new_thumb2();
6998
6999        // cmn r10, r8  → CMN.W = EB1A 0F08 (ADD.W S=1, Rd=PC discarded).
7000        let cmn = encoder
7001            .encode(&ArmOp::Cmn {
7002                rn: Reg::R10,
7003                op2: Operand2::Reg(Reg::R8),
7004            })
7005            .unwrap();
7006        assert_eq!(
7007            cmn,
7008            vec![0x1A, 0xEB, 0x08, 0x0F],
7009            "high-reg CMN must be 32-bit CMN.W (EB1A 0F08); got {cmn:02X?}"
7010        );
7011
7012        // Low registers stay 16-bit: cmn r1, r2 = 0x42D1.
7013        let lo = encoder
7014            .encode(&ArmOp::Cmn {
7015                rn: Reg::R1,
7016                op2: Operand2::Reg(Reg::R2),
7017            })
7018            .unwrap();
7019        assert_eq!(
7020            lo.len(),
7021            2,
7022            "low-reg CMN should remain 16-bit, got {lo:02X?}"
7023        );
7024        assert_eq!(lo, vec![0xD1, 0x42], "low-reg CMN bytes wrong: {lo:02X?}");
7025    }
7026
7027    /// #185 regression: feeding PC (R15) as a data operand to a Thumb-2 op that
7028    /// guards its registers must return Err, not panic under debug-assertions.
7029    /// (Synth never emits PC here; the fuzz harness requires encode() be total.)
7030    #[test]
7031    fn test_encode_pc_operand_returns_err_not_panic_185() {
7032        let encoder = ArmEncoder::new_thumb2();
7033        for op in [
7034            ArmOp::Sdiv {
7035                rd: Reg::PC,
7036                rn: Reg::R0,
7037                rm: Reg::R1,
7038            },
7039            ArmOp::Udiv {
7040                rd: Reg::R0,
7041                rn: Reg::PC,
7042                rm: Reg::R1,
7043            },
7044            ArmOp::Sdiv {
7045                rd: Reg::R0,
7046                rn: Reg::R1,
7047                rm: Reg::PC,
7048            },
7049        ] {
7050            let r = encoder.encode(&op);
7051            assert!(
7052                r.is_err(),
7053                "encode({op:?}) must return Err for a PC operand, got {r:?}"
7054            );
7055        }
7056        // Valid registers still encode fine (no false rejection).
7057        assert!(
7058            encoder
7059                .encode(&ArmOp::Sdiv {
7060                    rd: Reg::R0,
7061                    rn: Reg::R1,
7062                    rm: Reg::R2
7063                })
7064                .is_ok()
7065        );
7066    }
7067
7068    #[test]
7069    fn test_encode_nop_arm32() {
7070        let encoder = ArmEncoder::new_arm32();
7071        let code = encoder.encode(&ArmOp::Nop).unwrap();
7072
7073        assert_eq!(code.len(), 4); // ARM32 instructions are 4 bytes
7074        assert_eq!(code, vec![0x00, 0x00, 0xA0, 0xE1]); // MOV R0, R0
7075    }
7076
7077    #[test]
7078    fn test_encode_nop_thumb() {
7079        let encoder = ArmEncoder::new_thumb2();
7080        let code = encoder.encode(&ArmOp::Nop).unwrap();
7081
7082        assert_eq!(code.len(), 2); // Thumb instructions are 2 bytes
7083        assert_eq!(code, vec![0x00, 0xBF]); // NOP
7084    }
7085
7086    #[test]
7087    fn test_encode_mov_immediate_arm32() {
7088        let encoder = ArmEncoder::new_arm32();
7089        let op = ArmOp::Mov {
7090            rd: Reg::R0,
7091            op2: Operand2::Imm(42),
7092        };
7093
7094        let code = encoder.encode(&op).unwrap();
7095        assert_eq!(code.len(), 4);
7096
7097        // Verify it's a MOV instruction (bits should have immediate flag set)
7098        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7099        assert_eq!(instr & 0x0E000000, 0x02000000); // Check I bit is set
7100    }
7101
7102    #[test]
7103    fn test_encode_add_registers_arm32() {
7104        let encoder = ArmEncoder::new_arm32();
7105        let op = ArmOp::Add {
7106            rd: Reg::R0,
7107            rn: Reg::R1,
7108            op2: Operand2::Reg(Reg::R2),
7109        };
7110
7111        let code = encoder.encode(&op).unwrap();
7112        assert_eq!(code.len(), 4);
7113
7114        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7115        // Verify it's an ADD instruction with correct opcode
7116        assert_eq!(instr & 0x0FE00000, 0x00800000);
7117    }
7118
7119    #[test]
7120    fn test_encode_ldr_arm32() {
7121        let encoder = ArmEncoder::new_arm32();
7122        let op = ArmOp::Ldr {
7123            rd: Reg::R0,
7124            addr: MemAddr::imm(Reg::R1, 4),
7125        };
7126
7127        let code = encoder.encode(&op).unwrap();
7128        assert_eq!(code.len(), 4);
7129
7130        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7131        // Verify load bit is set
7132        assert_eq!(instr & 0x00100000, 0x00100000);
7133    }
7134
7135    #[test]
7136    fn test_encode_str_arm32() {
7137        let encoder = ArmEncoder::new_arm32();
7138        let op = ArmOp::Str {
7139            rd: Reg::R0,
7140            addr: MemAddr::imm(Reg::SP, 0),
7141        };
7142
7143        let code = encoder.encode(&op).unwrap();
7144        assert_eq!(code.len(), 4);
7145    }
7146
7147    #[test]
7148    fn test_encode_branch_arm32() {
7149        let encoder = ArmEncoder::new_arm32();
7150        let op = ArmOp::Bl {
7151            label: "main".to_string(),
7152        };
7153
7154        let code = encoder.encode(&op).unwrap();
7155        assert_eq!(code.len(), 4);
7156
7157        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7158        // Verify BL opcode
7159        assert_eq!(instr & 0x0F000000, 0x0B000000);
7160    }
7161
7162    /// Regression test for #167 + #174: the Thumb-2 BL relocatable placeholder
7163    /// must carry a -4 addend so an R_ARM_THM_CALL nets to exactly the symbol S.
7164    /// The correct encoding is what `gas` emits for `bl <extern>`: f7ff fffe
7165    /// (hw1=0xF7FF, hw2=0xFFFE), little-endian bytes FF F7 FE FF.
7166    ///   - 0xD000 (J1=J2=0) → ~+0x600000 garbage addend: `bl c0000c` / truncated
7167    ///     to fit (#167).
7168    ///   - 0xF800 (addend 0) → lands at S+4, one instruction past the callee
7169    ///     entry (#174).
7170    ///   - 0xFFFE (addend -4) → lands at S. Correct.
7171    #[test]
7172    fn test_encode_thumb_bl_placeholder_addend_167_174() {
7173        let encoder = ArmEncoder::new_thumb2();
7174        let op = ArmOp::Bl {
7175            label: "callee".to_string(),
7176        };
7177
7178        let code = encoder.encode(&op).unwrap();
7179        assert_eq!(code.len(), 4, "Thumb-2 BL is 32-bit");
7180
7181        let hw1 = u16::from_le_bytes([code[0], code[1]]);
7182        let hw2 = u16::from_le_bytes([code[2], code[3]]);
7183        assert_eq!(hw1, 0xF7FF, "BL first halfword (matches gas `bl <extern>`)");
7184        assert_eq!(
7185            hw2, 0xFFFE,
7186            "BL second halfword must be 0xFFFE (-4 addend → nets to S), not 0xF800 (→ S+4, #174) or 0xD000 (#167)"
7187        );
7188        assert_ne!(hw2, 0xF800, "0xF800 (addend 0) lands at S+4 (#174)");
7189        assert_ne!(hw2, 0xD000, "0xD000 bakes in a ~+0x600000 addend (#167)");
7190    }
7191
7192    #[test]
7193    fn test_encode_sequence() {
7194        let encoder = ArmEncoder::new_arm32();
7195        let ops = vec![
7196            ArmOp::Mov {
7197                rd: Reg::R0,
7198                op2: Operand2::Imm(42),
7199            },
7200            ArmOp::Mov {
7201                rd: Reg::R1,
7202                op2: Operand2::Imm(10),
7203            },
7204            ArmOp::Add {
7205                rd: Reg::R2,
7206                rn: Reg::R0,
7207                op2: Operand2::Reg(Reg::R1),
7208            },
7209        ];
7210
7211        let code = encoder.encode_sequence(&ops).unwrap();
7212        assert_eq!(code.len(), 12); // 3 instructions * 4 bytes
7213    }
7214
7215    #[test]
7216    fn test_reg_to_bits() {
7217        assert_eq!(reg_to_bits(&Reg::R0), 0);
7218        assert_eq!(reg_to_bits(&Reg::R7), 7);
7219        assert_eq!(reg_to_bits(&Reg::SP), 13);
7220        assert_eq!(reg_to_bits(&Reg::LR), 14);
7221        assert_eq!(reg_to_bits(&Reg::PC), 15);
7222    }
7223
7224    #[test]
7225    fn test_encode_bitwise_operations() {
7226        let encoder = ArmEncoder::new_arm32();
7227
7228        let and_op = ArmOp::And {
7229            rd: Reg::R0,
7230            rn: Reg::R1,
7231            op2: Operand2::Reg(Reg::R2),
7232        };
7233        let and_code = encoder.encode(&and_op).unwrap();
7234        assert_eq!(and_code.len(), 4);
7235
7236        let orr_op = ArmOp::Orr {
7237            rd: Reg::R0,
7238            rn: Reg::R1,
7239            op2: Operand2::Reg(Reg::R2),
7240        };
7241        let orr_code = encoder.encode(&orr_op).unwrap();
7242        assert_eq!(orr_code.len(), 4);
7243
7244        let eor_op = ArmOp::Eor {
7245            rd: Reg::R0,
7246            rn: Reg::R1,
7247            op2: Operand2::Reg(Reg::R2),
7248        };
7249        let eor_code = encoder.encode(&eor_op).unwrap();
7250        assert_eq!(eor_code.len(), 4);
7251    }
7252
7253    // === Thumb-2 32-bit encoding tests ===
7254
7255    #[test]
7256    fn test_encode_sdiv_thumb2() {
7257        let encoder = ArmEncoder::new_thumb2();
7258        let op = ArmOp::Sdiv {
7259            rd: Reg::R0,
7260            rn: Reg::R1,
7261            rm: Reg::R2,
7262        };
7263
7264        let code = encoder.encode(&op).unwrap();
7265        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7266
7267        // SDIV R0, R1, R2: 0xFB91 0xF0F2
7268        // First halfword: 0xFB90 | Rn(1) = 0xFB91
7269        // Second halfword: 0xF0F0 | Rd(0)<<8 | Rm(2) = 0xF0F2
7270        // Little-endian: [0x91, 0xFB, 0xF2, 0xF0]
7271        assert_eq!(code[0], 0x91);
7272        assert_eq!(code[1], 0xFB);
7273        assert_eq!(code[2], 0xF2);
7274        assert_eq!(code[3], 0xF0);
7275    }
7276
7277    #[test]
7278    fn test_encode_udiv_thumb2() {
7279        let encoder = ArmEncoder::new_thumb2();
7280        let op = ArmOp::Udiv {
7281            rd: Reg::R0,
7282            rn: Reg::R1,
7283            rm: Reg::R2,
7284        };
7285
7286        let code = encoder.encode(&op).unwrap();
7287        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7288
7289        // UDIV R0, R1, R2: 0xFBB1 0xF0F2
7290        // Little-endian: [0xB1, 0xFB, 0xF2, 0xF0]
7291        assert_eq!(code[0], 0xB1);
7292        assert_eq!(code[1], 0xFB);
7293        assert_eq!(code[2], 0xF2);
7294        assert_eq!(code[3], 0xF0);
7295    }
7296
7297    #[test]
7298    fn test_encode_mul_thumb2() {
7299        let encoder = ArmEncoder::new_thumb2();
7300        let op = ArmOp::Mul {
7301            rd: Reg::R0,
7302            rn: Reg::R1,
7303            rm: Reg::R2,
7304        };
7305
7306        let code = encoder.encode(&op).unwrap();
7307        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7308    }
7309
7310    #[test]
7311    fn test_encode_and_thumb2() {
7312        let encoder = ArmEncoder::new_thumb2();
7313        let op = ArmOp::And {
7314            rd: Reg::R0,
7315            rn: Reg::R1,
7316            op2: Operand2::Reg(Reg::R2),
7317        };
7318
7319        let code = encoder.encode(&op).unwrap();
7320        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7321    }
7322
7323    #[test]
7324    fn test_encode_lsl_thumb2_low_regs() {
7325        let encoder = ArmEncoder::new_thumb2();
7326        let op = ArmOp::Lsl {
7327            rd: Reg::R0,
7328            rn: Reg::R1,
7329            shift: 5,
7330        };
7331
7332        let code = encoder.encode(&op).unwrap();
7333        assert_eq!(code.len(), 2); // 16-bit for low registers
7334    }
7335
7336    #[test]
7337    fn test_encode_clz_thumb2() {
7338        let encoder = ArmEncoder::new_thumb2();
7339        let op = ArmOp::Clz {
7340            rd: Reg::R0,
7341            rm: Reg::R1,
7342        };
7343
7344        let code = encoder.encode(&op).unwrap();
7345        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7346    }
7347
7348    #[test]
7349    fn test_encode_bx_thumb2() {
7350        let encoder = ArmEncoder::new_thumb2();
7351        let op = ArmOp::Bx { rm: Reg::LR };
7352
7353        let code = encoder.encode(&op).unwrap();
7354        assert_eq!(code.len(), 2); // 16-bit instruction
7355
7356        // BX LR: 0x4770
7357        assert_eq!(code, vec![0x70, 0x47]);
7358    }
7359
7360    // ========================================================================
7361    // f32 pseudo-op encoding tests
7362    // ========================================================================
7363
7364    #[test]
7365    fn test_encode_f32_abs_arm32() {
7366        let encoder = ArmEncoder::new_arm32();
7367        let op = ArmOp::F32Abs {
7368            sd: VfpReg::S0,
7369            sm: VfpReg::S2,
7370        };
7371        let code = encoder.encode(&op).unwrap();
7372        assert_eq!(code.len(), 4); // Single VFP instruction
7373    }
7374
7375    #[test]
7376    fn test_encode_f32_neg_arm32() {
7377        let encoder = ArmEncoder::new_arm32();
7378        let op = ArmOp::F32Neg {
7379            sd: VfpReg::S0,
7380            sm: VfpReg::S2,
7381        };
7382        let code = encoder.encode(&op).unwrap();
7383        assert_eq!(code.len(), 4);
7384    }
7385
7386    #[test]
7387    fn test_encode_f32_sqrt_arm32() {
7388        let encoder = ArmEncoder::new_arm32();
7389        let op = ArmOp::F32Sqrt {
7390            sd: VfpReg::S0,
7391            sm: VfpReg::S2,
7392        };
7393        let code = encoder.encode(&op).unwrap();
7394        assert_eq!(code.len(), 4);
7395    }
7396
7397    #[test]
7398    fn test_encode_f32_ceil_arm32() {
7399        let encoder = ArmEncoder::new_arm32();
7400        let op = ArmOp::F32Ceil {
7401            sd: VfpReg::S0,
7402            sm: VfpReg::S2,
7403        };
7404        let code = encoder.encode(&op).unwrap();
7405        // VMRS + BIC + ORR + VMSR + VCVT.S32.F32 + VMRS + BIC + VMSR + VCVT.F32.S32
7406        assert_eq!(code.len(), 36);
7407    }
7408
7409    #[test]
7410    fn test_encode_f32_floor_thumb2() {
7411        let encoder = ArmEncoder::new_thumb2();
7412        let op = ArmOp::F32Floor {
7413            sd: VfpReg::S0,
7414            sm: VfpReg::S2,
7415        };
7416        let code = encoder.encode(&op).unwrap();
7417        // VMRS + BIC.W + ORR.W + VMSR + VCVT + VMRS + BIC.W + VMSR + VCVT.F32.S32
7418        assert_eq!(code.len(), 36);
7419    }
7420
7421    #[test]
7422    fn test_encode_f32_min_arm32() {
7423        let encoder = ArmEncoder::new_arm32();
7424        let op = ArmOp::F32Min {
7425            sd: VfpReg::S0,
7426            sn: VfpReg::S2,
7427            sm: VfpReg::S4,
7428        };
7429        let code = encoder.encode(&op).unwrap();
7430        assert_eq!(code.len(), 16); // VMOV + VCMP + VMRS + conditional VMOV
7431    }
7432
7433    #[test]
7434    fn test_encode_f32_max_thumb2() {
7435        let encoder = ArmEncoder::new_thumb2();
7436        let op = ArmOp::F32Max {
7437            sd: VfpReg::S0,
7438            sn: VfpReg::S2,
7439            sm: VfpReg::S4,
7440        };
7441        let code = encoder.encode(&op).unwrap();
7442        // VMOV(4) + VCMP(4) + VMRS(4) + IT(2) + VMOV(4) = 18
7443        assert_eq!(code.len(), 18);
7444    }
7445
7446    #[test]
7447    fn test_encode_f32_copysign_arm32() {
7448        let encoder = ArmEncoder::new_arm32();
7449        let op = ArmOp::F32Copysign {
7450            sd: VfpReg::S0,
7451            sn: VfpReg::S2,
7452            sm: VfpReg::S4,
7453        };
7454        let code = encoder.encode(&op).unwrap();
7455        // VMOV + VMOV + AND + BIC + ORR + VMOV = 6 * 4 = 24
7456        assert_eq!(code.len(), 24);
7457    }
7458
7459    // ========================================================================
7460    // f64 encoding tests
7461    // ========================================================================
7462
7463    #[test]
7464    fn test_encode_f64_add_arm32() {
7465        let encoder = ArmEncoder::new_arm32();
7466        let op = ArmOp::F64Add {
7467            dd: VfpReg::D0,
7468            dn: VfpReg::D1,
7469            dm: VfpReg::D2,
7470        };
7471        let code = encoder.encode(&op).unwrap();
7472        assert_eq!(code.len(), 4);
7473        // VADD.F64 D0, D1, D2: check coprocessor is cp11 (0xB)
7474        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7475        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
7476    }
7477
7478    #[test]
7479    fn test_encode_f64_sub_thumb2() {
7480        let encoder = ArmEncoder::new_thumb2();
7481        let op = ArmOp::F64Sub {
7482            dd: VfpReg::D0,
7483            dn: VfpReg::D1,
7484            dm: VfpReg::D2,
7485        };
7486        let code = encoder.encode(&op).unwrap();
7487        assert_eq!(code.len(), 4); // 32-bit VFP as two Thumb halfwords
7488    }
7489
7490    #[test]
7491    fn test_encode_f64_mul_arm32() {
7492        let encoder = ArmEncoder::new_arm32();
7493        let op = ArmOp::F64Mul {
7494            dd: VfpReg::D0,
7495            dn: VfpReg::D1,
7496            dm: VfpReg::D2,
7497        };
7498        let code = encoder.encode(&op).unwrap();
7499        assert_eq!(code.len(), 4);
7500    }
7501
7502    #[test]
7503    fn test_encode_f64_div_arm32() {
7504        let encoder = ArmEncoder::new_arm32();
7505        let op = ArmOp::F64Div {
7506            dd: VfpReg::D0,
7507            dn: VfpReg::D1,
7508            dm: VfpReg::D2,
7509        };
7510        let code = encoder.encode(&op).unwrap();
7511        assert_eq!(code.len(), 4);
7512    }
7513
7514    #[test]
7515    fn test_encode_f64_abs_arm32() {
7516        let encoder = ArmEncoder::new_arm32();
7517        let op = ArmOp::F64Abs {
7518            dd: VfpReg::D0,
7519            dm: VfpReg::D2,
7520        };
7521        let code = encoder.encode(&op).unwrap();
7522        assert_eq!(code.len(), 4);
7523    }
7524
7525    #[test]
7526    fn test_encode_f64_neg_arm32() {
7527        let encoder = ArmEncoder::new_arm32();
7528        let op = ArmOp::F64Neg {
7529            dd: VfpReg::D0,
7530            dm: VfpReg::D2,
7531        };
7532        let code = encoder.encode(&op).unwrap();
7533        assert_eq!(code.len(), 4);
7534    }
7535
7536    #[test]
7537    fn test_encode_f64_sqrt_arm32() {
7538        let encoder = ArmEncoder::new_arm32();
7539        let op = ArmOp::F64Sqrt {
7540            dd: VfpReg::D0,
7541            dm: VfpReg::D2,
7542        };
7543        let code = encoder.encode(&op).unwrap();
7544        assert_eq!(code.len(), 4);
7545    }
7546
7547    #[test]
7548    fn test_encode_f64_load_arm32() {
7549        let encoder = ArmEncoder::new_arm32();
7550        let op = ArmOp::F64Load {
7551            dd: VfpReg::D0,
7552            addr: MemAddr::imm(Reg::R0, 8),
7553        };
7554        let code = encoder.encode(&op).unwrap();
7555        assert_eq!(code.len(), 4);
7556        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7557        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11 for F64
7558        assert_eq!(instr & 0xFF, 2); // offset 8 / 4 = 2
7559    }
7560
7561    #[test]
7562    fn test_encode_f64_store_thumb2() {
7563        let encoder = ArmEncoder::new_thumb2();
7564        let op = ArmOp::F64Store {
7565            dd: VfpReg::D0,
7566            addr: MemAddr::imm(Reg::SP, 0),
7567        };
7568        let code = encoder.encode(&op).unwrap();
7569        assert_eq!(code.len(), 4);
7570    }
7571
7572    #[test]
7573    fn test_encode_f64_compare_arm32() {
7574        let encoder = ArmEncoder::new_arm32();
7575        let op = ArmOp::F64Eq {
7576            rd: Reg::R0,
7577            dn: VfpReg::D0,
7578            dm: VfpReg::D1,
7579        };
7580        let code = encoder.encode(&op).unwrap();
7581        assert_eq!(code.len(), 16); // VCMP + VMRS + MOV #0 + MOVcond #1
7582    }
7583
7584    #[test]
7585    fn test_encode_f64_compare_thumb2() {
7586        let encoder = ArmEncoder::new_thumb2();
7587        let op = ArmOp::F64Lt {
7588            rd: Reg::R0,
7589            dn: VfpReg::D0,
7590            dm: VfpReg::D1,
7591        };
7592        let code = encoder.encode(&op).unwrap();
7593        // VCMP(4) + VMRS(4) + MOVS(2) + IT(2) + MOV(2) = 14
7594        assert_eq!(code.len(), 14);
7595    }
7596
7597    #[test]
7598    fn test_encode_f64_const_arm32() {
7599        let encoder = ArmEncoder::new_arm32();
7600        let op = ArmOp::F64Const {
7601            dd: VfpReg::D0,
7602            value: 3.125,
7603        };
7604        let code = encoder.encode(&op).unwrap();
7605        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
7606        assert_eq!(code.len(), 20);
7607    }
7608
7609    #[test]
7610    fn test_encode_f64_const_thumb2() {
7611        let encoder = ArmEncoder::new_thumb2();
7612        let op = ArmOp::F64Const {
7613            dd: VfpReg::D0,
7614            value: 2.5,
7615        };
7616        let code = encoder.encode(&op).unwrap();
7617        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
7618        assert_eq!(code.len(), 20);
7619    }
7620
7621    #[test]
7622    fn test_encode_f64_convert_i32s_arm32() {
7623        let encoder = ArmEncoder::new_arm32();
7624        let op = ArmOp::F64ConvertI32S {
7625            dd: VfpReg::D0,
7626            rm: Reg::R0,
7627        };
7628        let code = encoder.encode(&op).unwrap();
7629        // VMOV(4) + VCVT(4) = 8
7630        assert_eq!(code.len(), 8);
7631    }
7632
7633    #[test]
7634    fn test_encode_f64_promote_f32_arm32() {
7635        let encoder = ArmEncoder::new_arm32();
7636        let op = ArmOp::F64PromoteF32 {
7637            dd: VfpReg::D0,
7638            sm: VfpReg::S0,
7639        };
7640        let code = encoder.encode(&op).unwrap();
7641        assert_eq!(code.len(), 4); // Single VCVT.F64.F32 instruction
7642    }
7643
7644    #[test]
7645    fn test_encode_f64_promote_f32_thumb2() {
7646        let encoder = ArmEncoder::new_thumb2();
7647        let op = ArmOp::F64PromoteF32 {
7648            dd: VfpReg::D0,
7649            sm: VfpReg::S0,
7650        };
7651        let code = encoder.encode(&op).unwrap();
7652        assert_eq!(code.len(), 4);
7653    }
7654
7655    #[test]
7656    fn test_encode_i32_trunc_f64s_arm32() {
7657        let encoder = ArmEncoder::new_arm32();
7658        let op = ArmOp::I32TruncF64S {
7659            rd: Reg::R0,
7660            dm: VfpReg::D0,
7661        };
7662        let code = encoder.encode(&op).unwrap();
7663        // VCVT(4) + VMOV(4) = 8
7664        assert_eq!(code.len(), 8);
7665    }
7666
7667    #[test]
7668    fn test_encode_f64_reinterpret_i64_arm32() {
7669        let encoder = ArmEncoder::new_arm32();
7670        let op = ArmOp::F64ReinterpretI64 {
7671            dd: VfpReg::D0,
7672            rmlo: Reg::R0,
7673            rmhi: Reg::R1,
7674        };
7675        let code = encoder.encode(&op).unwrap();
7676        assert_eq!(code.len(), 4); // Single VMOV instruction
7677    }
7678
7679    #[test]
7680    fn test_encode_i64_reinterpret_f64_thumb2() {
7681        let encoder = ArmEncoder::new_thumb2();
7682        let op = ArmOp::I64ReinterpretF64 {
7683            rdlo: Reg::R0,
7684            rdhi: Reg::R1,
7685            dm: VfpReg::D0,
7686        };
7687        let code = encoder.encode(&op).unwrap();
7688        assert_eq!(code.len(), 4);
7689    }
7690
7691    #[test]
7692    fn test_encode_f64_trunc_thumb2() {
7693        let encoder = ArmEncoder::new_thumb2();
7694        let op = ArmOp::F64Trunc {
7695            dd: VfpReg::D0,
7696            dm: VfpReg::D1,
7697        };
7698        let code = encoder.encode(&op).unwrap();
7699        // Two VFP instructions via Thumb encoding
7700        assert_eq!(code.len(), 8);
7701    }
7702
7703    #[test]
7704    fn test_encode_f64_min_arm32() {
7705        let encoder = ArmEncoder::new_arm32();
7706        let op = ArmOp::F64Min {
7707            dd: VfpReg::D0,
7708            dn: VfpReg::D1,
7709            dm: VfpReg::D2,
7710        };
7711        let code = encoder.encode(&op).unwrap();
7712        // VMOV + VCMP + VMRS + conditional VMOV = 16
7713        assert_eq!(code.len(), 16);
7714    }
7715
7716    #[test]
7717    fn test_f64_cp11_encoding() {
7718        // Verify that F64 instructions use coprocessor 11 (0xB), not 10 (0xA)
7719        let encoder = ArmEncoder::new_arm32();
7720
7721        // F64Add
7722        let code = encoder
7723            .encode(&ArmOp::F64Add {
7724                dd: VfpReg::D0,
7725                dn: VfpReg::D0,
7726                dm: VfpReg::D0,
7727            })
7728            .unwrap();
7729        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7730        assert_eq!((instr >> 8) & 0xF, 0xB, "F64 should use cp11");
7731
7732        // F32Add for comparison
7733        let code = encoder
7734            .encode(&ArmOp::F32Add {
7735                sd: VfpReg::S0,
7736                sn: VfpReg::S0,
7737                sm: VfpReg::S0,
7738            })
7739            .unwrap();
7740        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7741        assert_eq!((instr >> 8) & 0xF, 0xA, "F32 should use cp10");
7742    }
7743
7744    #[test]
7745    fn test_dreg_encoding_higher_registers() {
7746        let encoder = ArmEncoder::new_arm32();
7747
7748        // Test with D15 (highest register)
7749        let op = ArmOp::F64Add {
7750            dd: VfpReg::D15,
7751            dn: VfpReg::D14,
7752            dm: VfpReg::D13,
7753        };
7754        let code = encoder.encode(&op).unwrap();
7755        assert_eq!(code.len(), 4);
7756
7757        // Verify the register encoding worked (instruction is valid)
7758        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7759        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
7760    }
7761
7762    // ========================================================================
7763    // Control flow encoding tests
7764    // ========================================================================
7765
7766    #[test]
7767    fn test_encode_label_emits_no_bytes() {
7768        let encoder = ArmEncoder::new_thumb2();
7769        let op = ArmOp::Label {
7770            name: ".Lblock_end_0".to_string(),
7771        };
7772        let code = encoder.encode(&op).unwrap();
7773        assert!(code.is_empty(), "Label should emit zero bytes");
7774
7775        let encoder32 = ArmEncoder::new_arm32();
7776        let code32 = encoder32.encode(&op).unwrap();
7777        assert!(
7778            code32.is_empty(),
7779            "Label should emit zero bytes in ARM32 too"
7780        );
7781    }
7782
7783    #[test]
7784    fn test_encode_bcc_eq_thumb2() {
7785        use synth_synthesis::Condition;
7786        let encoder = ArmEncoder::new_thumb2();
7787        let op = ArmOp::Bcc {
7788            cond: Condition::EQ,
7789            label: "target".to_string(),
7790        };
7791        let code = encoder.encode(&op).unwrap();
7792        assert_eq!(code.len(), 2); // 16-bit conditional branch
7793
7794        // BEQ with offset 0: 0xD000 in little-endian
7795        assert_eq!(code, vec![0x00, 0xD0]);
7796    }
7797
7798    #[test]
7799    fn test_encode_bcc_ne_thumb2() {
7800        use synth_synthesis::Condition;
7801        let encoder = ArmEncoder::new_thumb2();
7802        let op = ArmOp::Bcc {
7803            cond: Condition::NE,
7804            label: "target".to_string(),
7805        };
7806        let code = encoder.encode(&op).unwrap();
7807        assert_eq!(code.len(), 2);
7808
7809        // BNE with offset 0: 0xD100 in little-endian
7810        assert_eq!(code, vec![0x00, 0xD1]);
7811    }
7812
7813    #[test]
7814    fn test_encode_bcc_arm32() {
7815        use synth_synthesis::Condition;
7816        let encoder = ArmEncoder::new_arm32();
7817        let op = ArmOp::Bcc {
7818            cond: Condition::EQ,
7819            label: "target".to_string(),
7820        };
7821        let code = encoder.encode(&op).unwrap();
7822        assert_eq!(code.len(), 4); // 32-bit ARM instruction
7823
7824        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7825        // BEQ: cond=0x0, opcode=0xA, offset=0
7826        assert_eq!(instr & 0xF0000000, 0x00000000); // EQ condition
7827        assert_eq!(instr & 0x0F000000, 0x0A000000); // Branch opcode
7828    }
7829
7830    #[test]
7831    fn test_encode_udf_thumb2() {
7832        let encoder = ArmEncoder::new_thumb2();
7833        let op = ArmOp::Udf { imm: 0 };
7834        let code = encoder.encode(&op).unwrap();
7835        assert_eq!(code.len(), 2); // 16-bit
7836
7837        // UDF #0: 0xDE00 in little-endian
7838        assert_eq!(code, vec![0x00, 0xDE]);
7839    }
7840
7841    #[test]
7842    fn test_encode_nop_thumb2() {
7843        let encoder = ArmEncoder::new_thumb2();
7844        let op = ArmOp::Nop;
7845        let code = encoder.encode(&op).unwrap();
7846        assert_eq!(code.len(), 2); // 16-bit
7847
7848        // NOP: 0xBF00 in little-endian
7849        assert_eq!(code, vec![0x00, 0xBF]);
7850    }
7851
7852    // =========================================================================
7853    // i64 Thumb-2 encoding tests
7854    // =========================================================================
7855
7856    #[test]
7857    fn test_encode_i64_add_thumb2() {
7858        let encoder = ArmEncoder::new_thumb2();
7859        let op = ArmOp::I64Add {
7860            rdlo: Reg::R0,
7861            rdhi: Reg::R1,
7862            rnlo: Reg::R0,
7863            rnhi: Reg::R1,
7864            rmlo: Reg::R2,
7865            rmhi: Reg::R3,
7866        };
7867        let code = encoder.encode(&op).unwrap();
7868        // Should emit ADDS (2 bytes) + ADC.W (4 bytes) = 6 bytes
7869        assert_eq!(code.len(), 6, "I64Add should be 6 bytes (ADDS + ADC.W)");
7870    }
7871
7872    #[test]
7873    fn test_encode_i64_sub_thumb2() {
7874        let encoder = ArmEncoder::new_thumb2();
7875        let op = ArmOp::I64Sub {
7876            rdlo: Reg::R0,
7877            rdhi: Reg::R1,
7878            rnlo: Reg::R0,
7879            rnhi: Reg::R1,
7880            rmlo: Reg::R2,
7881            rmhi: Reg::R3,
7882        };
7883        let code = encoder.encode(&op).unwrap();
7884        // Should emit SUBS (2 bytes) + SBC.W (4 bytes) = 6 bytes
7885        assert_eq!(code.len(), 6, "I64Sub should be 6 bytes (SUBS + SBC.W)");
7886    }
7887
7888    #[test]
7889    fn test_encode_i64_and_thumb2() {
7890        let encoder = ArmEncoder::new_thumb2();
7891        let op = ArmOp::I64And {
7892            rdlo: Reg::R0,
7893            rdhi: Reg::R1,
7894            rnlo: Reg::R0,
7895            rnhi: Reg::R1,
7896            rmlo: Reg::R2,
7897            rmhi: Reg::R3,
7898        };
7899        let code = encoder.encode(&op).unwrap();
7900        // AND.W (4 bytes) + AND.W (4 bytes) = 8 bytes
7901        assert!(code.len() >= 4, "I64And should emit at least 4 bytes");
7902    }
7903
7904    #[test]
7905    fn test_encode_i64_or_thumb2() {
7906        let encoder = ArmEncoder::new_thumb2();
7907        let op = ArmOp::I64Or {
7908            rdlo: Reg::R0,
7909            rdhi: Reg::R1,
7910            rnlo: Reg::R0,
7911            rnhi: Reg::R1,
7912            rmlo: Reg::R2,
7913            rmhi: Reg::R3,
7914        };
7915        let code = encoder.encode(&op).unwrap();
7916        assert!(code.len() >= 4, "I64Or should emit at least 4 bytes");
7917    }
7918
7919    #[test]
7920    fn test_encode_i64_xor_thumb2() {
7921        let encoder = ArmEncoder::new_thumb2();
7922        let op = ArmOp::I64Xor {
7923            rdlo: Reg::R0,
7924            rdhi: Reg::R1,
7925            rnlo: Reg::R0,
7926            rnhi: Reg::R1,
7927            rmlo: Reg::R2,
7928            rmhi: Reg::R3,
7929        };
7930        let code = encoder.encode(&op).unwrap();
7931        assert!(code.len() >= 4, "I64Xor should emit at least 4 bytes");
7932    }
7933
7934    #[test]
7935    fn test_encode_i64_const_small_thumb2() {
7936        let encoder = ArmEncoder::new_thumb2();
7937        // Small constant: only needs MOVW for each half
7938        let op = ArmOp::I64Const {
7939            rdlo: Reg::R0,
7940            rdhi: Reg::R1,
7941            value: 42,
7942        };
7943        let code = encoder.encode(&op).unwrap();
7944        // MOVW R0, #42 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes minimum
7945        assert!(code.len() >= 8, "I64Const should emit at least 8 bytes");
7946    }
7947
7948    #[test]
7949    fn test_encode_i64_const_large_thumb2() {
7950        let encoder = ArmEncoder::new_thumb2();
7951        // Large constant: needs MOVW+MOVT for each half
7952        let op = ArmOp::I64Const {
7953            rdlo: Reg::R0,
7954            rdhi: Reg::R1,
7955            value: 0x1234_5678_9ABC_DEF0_u64 as i64,
7956        };
7957        let code = encoder.encode(&op).unwrap();
7958        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
7959        assert_eq!(
7960            code.len(),
7961            16,
7962            "I64Const with large value should be 16 bytes"
7963        );
7964    }
7965
7966    #[test]
7967    fn test_encode_i64_extend_i32_s_thumb2() {
7968        let encoder = ArmEncoder::new_thumb2();
7969        let op = ArmOp::I64ExtendI32S {
7970            rdlo: Reg::R0,
7971            rdhi: Reg::R1,
7972            rn: Reg::R0,
7973        };
7974        let code = encoder.encode(&op).unwrap();
7975        // When rdlo == rn, only ASR (4 bytes) is emitted
7976        assert_eq!(
7977            code.len(),
7978            4,
7979            "I64ExtendI32S (same reg) should be 4 bytes (ASR only)"
7980        );
7981    }
7982
7983    #[test]
7984    fn test_encode_i64_extend_i32_s_diff_reg_thumb2() {
7985        let encoder = ArmEncoder::new_thumb2();
7986        let op = ArmOp::I64ExtendI32S {
7987            rdlo: Reg::R0,
7988            rdhi: Reg::R1,
7989            rn: Reg::R2,
7990        };
7991        let code = encoder.encode(&op).unwrap();
7992        // MOV rdlo, rn (2 bytes for low regs) + ASR rdhi, rdlo, #31 (4 bytes) = 6 bytes
7993        assert!(
7994            code.len() >= 6,
7995            "I64ExtendI32S (diff reg) should be at least 6 bytes"
7996        );
7997    }
7998
7999    #[test]
8000    fn test_encode_i64_extend_i32_u_thumb2() {
8001        let encoder = ArmEncoder::new_thumb2();
8002        let op = ArmOp::I64ExtendI32U {
8003            rdlo: Reg::R0,
8004            rdhi: Reg::R1,
8005            rn: Reg::R0,
8006        };
8007        let code = encoder.encode(&op).unwrap();
8008        // When rdlo == rn, only MOV rdhi, #0 (2 bytes) is emitted
8009        assert_eq!(
8010            code.len(),
8011            2,
8012            "I64ExtendI32U (same reg) should be 2 bytes (MOV #0 only)"
8013        );
8014    }
8015
8016    #[test]
8017    fn test_encode_i32_wrap_i64_nop_thumb2() {
8018        let encoder = ArmEncoder::new_thumb2();
8019        // When rd == rnlo, should be a NOP
8020        let op = ArmOp::I32WrapI64 {
8021            rd: Reg::R0,
8022            rnlo: Reg::R0,
8023        };
8024        let code = encoder.encode(&op).unwrap();
8025        assert_eq!(code.len(), 2, "I32WrapI64 same reg should be NOP (2 bytes)");
8026        assert_eq!(code, vec![0x00, 0xBF]); // NOP
8027    }
8028
8029    #[test]
8030    fn test_encode_i32_wrap_i64_diff_reg_thumb2() {
8031        let encoder = ArmEncoder::new_thumb2();
8032        let op = ArmOp::I32WrapI64 {
8033            rd: Reg::R2,
8034            rnlo: Reg::R0,
8035        };
8036        let code = encoder.encode(&op).unwrap();
8037        // MOV R2, R0 (2 or 4 bytes)
8038        assert!(
8039            code.len() >= 2,
8040            "I32WrapI64 diff reg should emit at least 2 bytes"
8041        );
8042    }
8043
8044    #[test]
8045    fn test_encode_i64_eqz_thumb2() {
8046        let encoder = ArmEncoder::new_thumb2();
8047        let op = ArmOp::I64Eqz {
8048            rd: Reg::R0,
8049            rnlo: Reg::R0,
8050            rnhi: Reg::R1,
8051        };
8052        let code = encoder.encode(&op).unwrap();
8053        // Delegates to I64SetCondZ which is already encoded
8054        assert!(
8055            code.len() >= 6,
8056            "I64Eqz should emit at least 6 bytes for ORR+ITE+MOV+MOV"
8057        );
8058    }
8059
8060    #[test]
8061    fn test_encode_i64_eq_thumb2() {
8062        let encoder = ArmEncoder::new_thumb2();
8063        let op = ArmOp::I64Eq {
8064            rd: Reg::R0,
8065            rnlo: Reg::R0,
8066            rnhi: Reg::R1,
8067            rmlo: Reg::R2,
8068            rmhi: Reg::R3,
8069        };
8070        let code = encoder.encode(&op).unwrap();
8071        // Delegates to I64SetCond EQ: CMP lo + IT EQ + CMPEQ hi + ITE EQ + MOV 1 + MOV 0
8072        assert!(code.len() >= 10, "I64Eq should emit at least 10 bytes");
8073    }
8074
8075    #[test]
8076    fn test_encode_i64_ldr_thumb2() {
8077        let encoder = ArmEncoder::new_thumb2();
8078        let op = ArmOp::I64Ldr {
8079            rdlo: Reg::R0,
8080            rdhi: Reg::R1,
8081            addr: MemAddr::imm(Reg::SP, 0),
8082        };
8083        let code = encoder.encode(&op).unwrap();
8084        // Two LDR instructions (lo at offset, hi at offset+4)
8085        assert!(code.len() >= 4, "I64Ldr should emit at least 4 bytes");
8086    }
8087
8088    #[test]
8089    fn test_encode_i64_str_thumb2() {
8090        let encoder = ArmEncoder::new_thumb2();
8091        let op = ArmOp::I64Str {
8092            rdlo: Reg::R0,
8093            rdhi: Reg::R1,
8094            addr: MemAddr::imm(Reg::SP, 0),
8095        };
8096        let code = encoder.encode(&op).unwrap();
8097        // Two STR instructions (lo at offset, hi at offset+4)
8098        assert!(code.len() >= 4, "I64Str should emit at least 4 bytes");
8099    }
8100
8101    #[test]
8102    fn test_encode_i64_all_comparisons_thumb2() {
8103        let encoder = ArmEncoder::new_thumb2();
8104
8105        let ops = vec![
8106            ArmOp::I64Ne {
8107                rd: Reg::R0,
8108                rnlo: Reg::R0,
8109                rnhi: Reg::R1,
8110                rmlo: Reg::R2,
8111                rmhi: Reg::R3,
8112            },
8113            ArmOp::I64LtS {
8114                rd: Reg::R0,
8115                rnlo: Reg::R0,
8116                rnhi: Reg::R1,
8117                rmlo: Reg::R2,
8118                rmhi: Reg::R3,
8119            },
8120            ArmOp::I64LtU {
8121                rd: Reg::R0,
8122                rnlo: Reg::R0,
8123                rnhi: Reg::R1,
8124                rmlo: Reg::R2,
8125                rmhi: Reg::R3,
8126            },
8127            ArmOp::I64LeS {
8128                rd: Reg::R0,
8129                rnlo: Reg::R0,
8130                rnhi: Reg::R1,
8131                rmlo: Reg::R2,
8132                rmhi: Reg::R3,
8133            },
8134            ArmOp::I64LeU {
8135                rd: Reg::R0,
8136                rnlo: Reg::R0,
8137                rnhi: Reg::R1,
8138                rmlo: Reg::R2,
8139                rmhi: Reg::R3,
8140            },
8141            ArmOp::I64GtS {
8142                rd: Reg::R0,
8143                rnlo: Reg::R0,
8144                rnhi: Reg::R1,
8145                rmlo: Reg::R2,
8146                rmhi: Reg::R3,
8147            },
8148            ArmOp::I64GtU {
8149                rd: Reg::R0,
8150                rnlo: Reg::R0,
8151                rnhi: Reg::R1,
8152                rmlo: Reg::R2,
8153                rmhi: Reg::R3,
8154            },
8155            ArmOp::I64GeS {
8156                rd: Reg::R0,
8157                rnlo: Reg::R0,
8158                rnhi: Reg::R1,
8159                rmlo: Reg::R2,
8160                rmhi: Reg::R3,
8161            },
8162            ArmOp::I64GeU {
8163                rd: Reg::R0,
8164                rnlo: Reg::R0,
8165                rnhi: Reg::R1,
8166                rmlo: Reg::R2,
8167                rmhi: Reg::R3,
8168            },
8169        ];
8170
8171        for op in &ops {
8172            let code = encoder.encode(op).unwrap();
8173            assert!(
8174                code.len() >= 8,
8175                "i64 comparison {:?} should emit at least 8 bytes, got {}",
8176                op,
8177                code.len()
8178            );
8179        }
8180    }
8181
8182    #[test]
8183    fn test_encode_i64_const_zero_thumb2() {
8184        let encoder = ArmEncoder::new_thumb2();
8185        let op = ArmOp::I64Const {
8186            rdlo: Reg::R0,
8187            rdhi: Reg::R1,
8188            value: 0,
8189        };
8190        let code = encoder.encode(&op).unwrap();
8191        // MOVW R0, #0 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes
8192        assert_eq!(code.len(), 8, "I64Const(0) should be 8 bytes");
8193    }
8194
8195    #[test]
8196    fn test_encode_i64_const_negative_one_thumb2() {
8197        let encoder = ArmEncoder::new_thumb2();
8198        let op = ArmOp::I64Const {
8199            rdlo: Reg::R0,
8200            rdhi: Reg::R1,
8201            value: -1, // 0xFFFF_FFFF_FFFF_FFFF
8202        };
8203        let code = encoder.encode(&op).unwrap();
8204        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8205        assert_eq!(code.len(), 16, "I64Const(-1) should be 16 bytes");
8206    }
8207
8208    // =========================================================================
8209    // Sub-word load/store encoding tests
8210    // =========================================================================
8211
8212    #[test]
8213    fn test_encode_ldrb_arm32() {
8214        let encoder = ArmEncoder::new_arm32();
8215        let op = ArmOp::Ldrb {
8216            rd: Reg::R0,
8217            addr: MemAddr::imm(Reg::R1, 4),
8218        };
8219        let code = encoder.encode(&op).unwrap();
8220        assert_eq!(code.len(), 4, "ARM32 LDRB should be 4 bytes");
8221        // LDRB R0, [R1, #4] = 0xE5D10004
8222        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8223        assert_eq!(encoded, 0xE5D10004, "Should encode LDRB R0, [R1, #4]");
8224    }
8225
8226    #[test]
8227    fn test_encode_strb_arm32() {
8228        let encoder = ArmEncoder::new_arm32();
8229        let op = ArmOp::Strb {
8230            rd: Reg::R0,
8231            addr: MemAddr::imm(Reg::R1, 0),
8232        };
8233        let code = encoder.encode(&op).unwrap();
8234        assert_eq!(code.len(), 4, "ARM32 STRB should be 4 bytes");
8235        // STRB R0, [R1, #0] = 0xE5C10000
8236        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8237        assert_eq!(encoded, 0xE5C10000, "Should encode STRB R0, [R1, #0]");
8238    }
8239
8240    #[test]
8241    fn test_encode_ldrh_arm32() {
8242        let encoder = ArmEncoder::new_arm32();
8243        let op = ArmOp::Ldrh {
8244            rd: Reg::R0,
8245            addr: MemAddr::imm(Reg::R1, 2),
8246        };
8247        let code = encoder.encode(&op).unwrap();
8248        assert_eq!(code.len(), 4, "ARM32 LDRH should be 4 bytes");
8249    }
8250
8251    #[test]
8252    fn test_encode_strh_arm32() {
8253        let encoder = ArmEncoder::new_arm32();
8254        let op = ArmOp::Strh {
8255            rd: Reg::R0,
8256            addr: MemAddr::imm(Reg::R1, 0),
8257        };
8258        let code = encoder.encode(&op).unwrap();
8259        assert_eq!(code.len(), 4, "ARM32 STRH should be 4 bytes");
8260    }
8261
8262    #[test]
8263    fn test_encode_ldrsb_arm32() {
8264        let encoder = ArmEncoder::new_arm32();
8265        let op = ArmOp::Ldrsb {
8266            rd: Reg::R0,
8267            addr: MemAddr::imm(Reg::R1, 0),
8268        };
8269        let code = encoder.encode(&op).unwrap();
8270        assert_eq!(code.len(), 4, "ARM32 LDRSB should be 4 bytes");
8271    }
8272
8273    #[test]
8274    fn test_encode_ldrsh_arm32() {
8275        let encoder = ArmEncoder::new_arm32();
8276        let op = ArmOp::Ldrsh {
8277            rd: Reg::R0,
8278            addr: MemAddr::imm(Reg::R1, 0),
8279        };
8280        let code = encoder.encode(&op).unwrap();
8281        assert_eq!(code.len(), 4, "ARM32 LDRSH should be 4 bytes");
8282    }
8283
8284    #[test]
8285    fn test_encode_ldrb_thumb2_16bit() {
8286        let encoder = ArmEncoder::new_thumb2();
8287        let op = ArmOp::Ldrb {
8288            rd: Reg::R0,
8289            addr: MemAddr::imm(Reg::R1, 4),
8290        };
8291        let code = encoder.encode(&op).unwrap();
8292        // Low registers + small offset -> 16-bit encoding
8293        assert_eq!(
8294            code.len(),
8295            2,
8296            "Thumb-2 LDRB with small offset should be 16-bit"
8297        );
8298    }
8299
8300    #[test]
8301    fn test_encode_ldrb_thumb2_32bit() {
8302        let encoder = ArmEncoder::new_thumb2();
8303        let op = ArmOp::Ldrb {
8304            rd: Reg::R0,
8305            addr: MemAddr::imm(Reg::R1, 100), // offset > 31 needs 32-bit
8306        };
8307        let code = encoder.encode(&op).unwrap();
8308        assert_eq!(
8309            code.len(),
8310            4,
8311            "Thumb-2 LDRB with large offset should be 32-bit"
8312        );
8313    }
8314
8315    #[test]
8316    fn test_encode_strb_thumb2_16bit() {
8317        let encoder = ArmEncoder::new_thumb2();
8318        let op = ArmOp::Strb {
8319            rd: Reg::R0,
8320            addr: MemAddr::imm(Reg::R1, 10),
8321        };
8322        let code = encoder.encode(&op).unwrap();
8323        assert_eq!(
8324            code.len(),
8325            2,
8326            "Thumb-2 STRB with small offset should be 16-bit"
8327        );
8328    }
8329
8330    #[test]
8331    fn test_encode_ldrh_thumb2_16bit() {
8332        let encoder = ArmEncoder::new_thumb2();
8333        let op = ArmOp::Ldrh {
8334            rd: Reg::R0,
8335            addr: MemAddr::imm(Reg::R1, 4), // offset aligned to 2, <= 62
8336        };
8337        let code = encoder.encode(&op).unwrap();
8338        assert_eq!(
8339            code.len(),
8340            2,
8341            "Thumb-2 LDRH with small aligned offset should be 16-bit"
8342        );
8343    }
8344
8345    #[test]
8346    fn test_encode_strh_thumb2_16bit() {
8347        let encoder = ArmEncoder::new_thumb2();
8348        let op = ArmOp::Strh {
8349            rd: Reg::R0,
8350            addr: MemAddr::imm(Reg::R1, 4),
8351        };
8352        let code = encoder.encode(&op).unwrap();
8353        assert_eq!(
8354            code.len(),
8355            2,
8356            "Thumb-2 STRH with small aligned offset should be 16-bit"
8357        );
8358    }
8359
8360    #[test]
8361    fn test_encode_ldrsb_thumb2() {
8362        let encoder = ArmEncoder::new_thumb2();
8363        let op = ArmOp::Ldrsb {
8364            rd: Reg::R0,
8365            addr: MemAddr::imm(Reg::R1, 0),
8366        };
8367        let code = encoder.encode(&op).unwrap();
8368        // LDRSB has no 16-bit immediate form, always 32-bit
8369        assert_eq!(code.len(), 4, "Thumb-2 LDRSB should be 32-bit");
8370    }
8371
8372    #[test]
8373    fn test_encode_ldrsh_thumb2() {
8374        let encoder = ArmEncoder::new_thumb2();
8375        let op = ArmOp::Ldrsh {
8376            rd: Reg::R0,
8377            addr: MemAddr::imm(Reg::R1, 0),
8378        };
8379        let code = encoder.encode(&op).unwrap();
8380        assert_eq!(code.len(), 4, "Thumb-2 LDRSH should be 32-bit");
8381    }
8382
8383    #[test]
8384    fn test_encode_memory_size_thumb2() {
8385        let encoder = ArmEncoder::new_thumb2();
8386        let op = ArmOp::MemorySize { rd: Reg::R0 };
8387        let code = encoder.encode(&op).unwrap();
8388        // R0 and R10 are not both low registers, so this needs careful handling
8389        assert!(!code.is_empty(), "MemorySize should produce code");
8390    }
8391
8392    #[test]
8393    fn test_encode_memory_grow_thumb2() {
8394        let encoder = ArmEncoder::new_thumb2();
8395        let op = ArmOp::MemoryGrow {
8396            rd: Reg::R0,
8397            rn: Reg::R0,
8398        };
8399        let code = encoder.encode(&op).unwrap();
8400        assert_eq!(code.len(), 4, "MemoryGrow (MVN) should be 32-bit Thumb-2");
8401    }
8402
8403    #[test]
8404    fn test_encode_subword_reg_offset_thumb2() {
8405        let encoder = ArmEncoder::new_thumb2();
8406
8407        // LDRB with register offset
8408        let op = ArmOp::Ldrb {
8409            rd: Reg::R0,
8410            addr: MemAddr::reg(Reg::R1, Reg::R2),
8411        };
8412        let code = encoder.encode(&op).unwrap();
8413        assert_eq!(
8414            code.len(),
8415            4,
8416            "Thumb-2 LDRB with reg offset should be 32-bit"
8417        );
8418
8419        // STRB with register offset
8420        let op = ArmOp::Strb {
8421            rd: Reg::R0,
8422            addr: MemAddr::reg(Reg::R1, Reg::R2),
8423        };
8424        let code = encoder.encode(&op).unwrap();
8425        assert_eq!(
8426            code.len(),
8427            4,
8428            "Thumb-2 STRB with reg offset should be 32-bit"
8429        );
8430
8431        // LDRH with register offset
8432        let op = ArmOp::Ldrh {
8433            rd: Reg::R0,
8434            addr: MemAddr::reg(Reg::R1, Reg::R2),
8435        };
8436        let code = encoder.encode(&op).unwrap();
8437        assert_eq!(
8438            code.len(),
8439            4,
8440            "Thumb-2 LDRH with reg offset should be 32-bit"
8441        );
8442
8443        // STRH with register offset
8444        let op = ArmOp::Strh {
8445            rd: Reg::R0,
8446            addr: MemAddr::reg(Reg::R1, Reg::R2),
8447        };
8448        let code = encoder.encode(&op).unwrap();
8449        assert_eq!(
8450            code.len(),
8451            4,
8452            "Thumb-2 STRH with reg offset should be 32-bit"
8453        );
8454    }
8455
8456    #[test]
8457    fn test_encode_subword_reg_imm_offset_thumb2() {
8458        let encoder = ArmEncoder::new_thumb2();
8459
8460        // LDRB with both register and immediate offset
8461        let op = ArmOp::Ldrb {
8462            rd: Reg::R0,
8463            addr: MemAddr::reg_imm(Reg::R1, Reg::R2, 4),
8464        };
8465        let code = encoder.encode(&op).unwrap();
8466        // ADD R12, R2, #4 (4 bytes) + LDRB R0, [R1, R12] (4 bytes) = 8 bytes
8467        assert_eq!(
8468            code.len(),
8469            8,
8470            "Thumb-2 LDRB with reg+imm offset should be 8 bytes"
8471        );
8472    }
8473
8474    // ========================================================================
8475    // Helium MVE encoding tests
8476    // ========================================================================
8477
8478    #[test]
8479    fn test_encode_mve_addi32_thumb2() {
8480        let encoder = ArmEncoder::new_thumb2();
8481        let op = ArmOp::MveAddI {
8482            qd: QReg::Q0,
8483            qn: QReg::Q1,
8484            qm: QReg::Q2,
8485            size: MveSize::S32,
8486        };
8487        let code = encoder.encode(&op).unwrap();
8488        assert_eq!(
8489            code.len(),
8490            4,
8491            "MVE VADD.I32 should be 4 bytes (Thumb-2 32-bit)"
8492        );
8493    }
8494
8495    #[test]
8496    fn test_encode_mve_subi16_thumb2() {
8497        let encoder = ArmEncoder::new_thumb2();
8498        let op = ArmOp::MveSubI {
8499            qd: QReg::Q0,
8500            qn: QReg::Q1,
8501            qm: QReg::Q2,
8502            size: MveSize::S16,
8503        };
8504        let code = encoder.encode(&op).unwrap();
8505        assert_eq!(code.len(), 4, "MVE VSUB.I16 should be 4 bytes");
8506    }
8507
8508    #[test]
8509    fn test_encode_mve_muli8_thumb2() {
8510        let encoder = ArmEncoder::new_thumb2();
8511        let op = ArmOp::MveMulI {
8512            qd: QReg::Q0,
8513            qn: QReg::Q1,
8514            qm: QReg::Q2,
8515            size: MveSize::S8,
8516        };
8517        let code = encoder.encode(&op).unwrap();
8518        assert_eq!(code.len(), 4, "MVE VMUL.I8 should be 4 bytes");
8519    }
8520
8521    #[test]
8522    fn test_encode_mve_bitwise_thumb2() {
8523        let encoder = ArmEncoder::new_thumb2();
8524
8525        let ops = vec![
8526            ArmOp::MveAnd {
8527                qd: QReg::Q0,
8528                qn: QReg::Q1,
8529                qm: QReg::Q2,
8530            },
8531            ArmOp::MveOrr {
8532                qd: QReg::Q0,
8533                qn: QReg::Q1,
8534                qm: QReg::Q2,
8535            },
8536            ArmOp::MveEor {
8537                qd: QReg::Q0,
8538                qn: QReg::Q1,
8539                qm: QReg::Q2,
8540            },
8541            ArmOp::MveBic {
8542                qd: QReg::Q0,
8543                qn: QReg::Q1,
8544                qm: QReg::Q2,
8545            },
8546        ];
8547        for op in ops {
8548            let code = encoder.encode(&op).unwrap();
8549            assert_eq!(code.len(), 4, "MVE bitwise op should be 4 bytes");
8550        }
8551    }
8552
8553    #[test]
8554    fn test_encode_mve_mvn_thumb2() {
8555        let encoder = ArmEncoder::new_thumb2();
8556        let op = ArmOp::MveMvn {
8557            qd: QReg::Q0,
8558            qm: QReg::Q1,
8559        };
8560        let code = encoder.encode(&op).unwrap();
8561        assert_eq!(code.len(), 4, "MVE VMVN should be 4 bytes");
8562    }
8563
8564    #[test]
8565    fn test_encode_mve_load_store_thumb2() {
8566        let encoder = ArmEncoder::new_thumb2();
8567
8568        let load = ArmOp::MveLoad {
8569            qd: QReg::Q0,
8570            addr: MemAddr::imm(Reg::R0, 16),
8571        };
8572        let code = encoder.encode(&load).unwrap();
8573        assert_eq!(code.len(), 4, "MVE VLDRW.32 should be 4 bytes");
8574
8575        let store = ArmOp::MveStore {
8576            qd: QReg::Q1,
8577            addr: MemAddr::imm(Reg::R1, 0),
8578        };
8579        let code = encoder.encode(&store).unwrap();
8580        assert_eq!(code.len(), 4, "MVE VSTRW.32 should be 4 bytes");
8581    }
8582
8583    #[test]
8584    fn test_encode_mve_const_thumb2() {
8585        let encoder = ArmEncoder::new_thumb2();
8586        let op = ArmOp::MveConst {
8587            qd: QReg::Q0,
8588            bytes: [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0],
8589        };
8590        let code = encoder.encode(&op).unwrap();
8591        // Should be 4 words of (MOVW R12 + VMOV Sn) = 4 * (4+4) = 32 bytes min
8592        // Some words with hi16=0 skip MOVT, so length varies
8593        assert!(
8594            code.len() >= 24,
8595            "MVE const should produce multiple instructions"
8596        );
8597    }
8598
8599    #[test]
8600    fn test_encode_mve_dup_thumb2() {
8601        let encoder = ArmEncoder::new_thumb2();
8602        let op = ArmOp::MveDup {
8603            qd: QReg::Q0,
8604            rn: Reg::R0,
8605            size: MveSize::S32,
8606        };
8607        let code = encoder.encode(&op).unwrap();
8608        assert_eq!(code.len(), 4, "MVE VDUP.32 should be 4 bytes");
8609    }
8610
8611    #[test]
8612    fn test_encode_mve_extract_lane_thumb2() {
8613        let encoder = ArmEncoder::new_thumb2();
8614        let op = ArmOp::MveExtractLane {
8615            rd: Reg::R0,
8616            qn: QReg::Q1,
8617            lane: 2,
8618            size: MveSize::S32,
8619        };
8620        let code = encoder.encode(&op).unwrap();
8621        assert_eq!(code.len(), 4, "MVE extract lane should be 4 bytes");
8622    }
8623
8624    #[test]
8625    fn test_encode_mve_insert_lane_thumb2() {
8626        let encoder = ArmEncoder::new_thumb2();
8627        let op = ArmOp::MveInsertLane {
8628            qd: QReg::Q0,
8629            rn: Reg::R1,
8630            lane: 3,
8631            size: MveSize::S32,
8632        };
8633        let code = encoder.encode(&op).unwrap();
8634        assert_eq!(code.len(), 4, "MVE insert lane should be 4 bytes");
8635    }
8636
8637    #[test]
8638    fn test_encode_mve_addf32_thumb2() {
8639        let encoder = ArmEncoder::new_thumb2();
8640        let op = ArmOp::MveAddF32 {
8641            qd: QReg::Q0,
8642            qn: QReg::Q1,
8643            qm: QReg::Q2,
8644        };
8645        let code = encoder.encode(&op).unwrap();
8646        assert_eq!(code.len(), 4, "MVE VADD.F32 should be 4 bytes");
8647    }
8648
8649    #[test]
8650    fn test_encode_mve_divf32_thumb2() {
8651        let encoder = ArmEncoder::new_thumb2();
8652        let op = ArmOp::MveDivF32 {
8653            qd: QReg::Q0,
8654            qn: QReg::Q1,
8655            qm: QReg::Q2,
8656        };
8657        let code = encoder.encode(&op).unwrap();
8658        // Lane-wise: 4 x VDIV.F32 = 4 x 4 = 16 bytes
8659        assert_eq!(
8660            code.len(),
8661            16,
8662            "MVE VDIV.F32 (lane-wise) should be 16 bytes"
8663        );
8664    }
8665
8666    #[test]
8667    fn test_encode_mve_sqrtf32_thumb2() {
8668        let encoder = ArmEncoder::new_thumb2();
8669        let op = ArmOp::MveSqrtF32 {
8670            qd: QReg::Q0,
8671            qm: QReg::Q1,
8672        };
8673        let code = encoder.encode(&op).unwrap();
8674        // Lane-wise: 4 x VSQRT.F32 = 4 x 4 = 16 bytes
8675        assert_eq!(
8676            code.len(),
8677            16,
8678            "MVE VSQRT.F32 (lane-wise) should be 16 bytes"
8679        );
8680    }
8681
8682    #[test]
8683    fn test_encode_mve_negf32_thumb2() {
8684        let encoder = ArmEncoder::new_thumb2();
8685        let op = ArmOp::MveNegF32 {
8686            qd: QReg::Q0,
8687            qm: QReg::Q1,
8688        };
8689        let code = encoder.encode(&op).unwrap();
8690        assert_eq!(code.len(), 4, "MVE VNEG.F32 should be 4 bytes");
8691    }
8692
8693    #[test]
8694    fn test_encode_mve_absf32_thumb2() {
8695        let encoder = ArmEncoder::new_thumb2();
8696        let op = ArmOp::MveAbsF32 {
8697            qd: QReg::Q0,
8698            qm: QReg::Q1,
8699        };
8700        let code = encoder.encode(&op).unwrap();
8701        assert_eq!(code.len(), 4, "MVE VABS.F32 should be 4 bytes");
8702    }
8703
8704    #[test]
8705    fn test_encode_mve_different_qregs() {
8706        let encoder = ArmEncoder::new_thumb2();
8707
8708        // Test that different Q-register numbers produce different encodings
8709        let op1 = ArmOp::MveAddI {
8710            qd: QReg::Q0,
8711            qn: QReg::Q0,
8712            qm: QReg::Q0,
8713            size: MveSize::S32,
8714        };
8715        let op2 = ArmOp::MveAddI {
8716            qd: QReg::Q3,
8717            qn: QReg::Q5,
8718            qm: QReg::Q7,
8719            size: MveSize::S32,
8720        };
8721        let code1 = encoder.encode(&op1).unwrap();
8722        let code2 = encoder.encode(&op2).unwrap();
8723        assert_ne!(
8724            code1, code2,
8725            "Different Q-registers should produce different encodings"
8726        );
8727    }
8728
8729    #[test]
8730    fn test_encode_mve_arm32_nop() {
8731        // MVE instructions on ARM32 encoder should produce NOP (only Thumb-2 supported)
8732        let encoder = ArmEncoder::new_arm32();
8733        let op = ArmOp::MveAddI {
8734            qd: QReg::Q0,
8735            qn: QReg::Q1,
8736            qm: QReg::Q2,
8737            size: MveSize::S32,
8738        };
8739        let code = encoder.encode(&op).unwrap();
8740        assert_eq!(code.len(), 4, "ARM32 MVE should be 4 bytes (NOP)");
8741        // NOP in ARM32 is 0xE1A00000 (MOV R0, R0)
8742        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8743        assert_eq!(instr, 0xE1A00000, "ARM32 MVE should encode as NOP");
8744    }
8745}